This commit is contained in:
ryan-mangeno 2025-10-01 14:07:47 -04:00
commit 86adde66b2
635 changed files with 53891 additions and 30009 deletions

View File

@ -17,6 +17,7 @@ Checks: >
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
-performance-enum-size,
portability-*,
-portability-simd-intrinsics,
misc-*,

View File

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc4.2.0
ARG MUSA_VERSION=rc4.3.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

123
.devops/s390x.Dockerfile Normal file
View File

@ -0,0 +1,123 @@
ARG GCC_VERSION=15.2.0
ARG UBUNTU_VERSION=24.04
### Build Llama.cpp stage
FROM gcc:${GCC_VERSION} AS build
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
apt update -y && \
apt upgrade -y && \
apt install -y --no-install-recommends \
git cmake ccache ninja-build \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
libopenblas-dev libcurl4-openssl-dev && \
rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY . .
RUN --mount=type=cache,target=/root/.ccache \
--mount=type=cache,target=/app/build \
cmake -S . -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DLLAMA_BUILD_TESTS=OFF \
-DGGML_BACKEND_DL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS && \
cmake --build build --config Release -j $(nproc) && \
cmake --install build --prefix /opt/llama.cpp
COPY *.py /opt/llama.cpp/bin
COPY .devops/tools.sh /opt/llama.cpp/bin
COPY gguf-py /opt/llama.cpp/gguf-py
COPY requirements.txt /opt/llama.cpp/gguf-py
COPY requirements /opt/llama.cpp/gguf-py/requirements
### Collect all llama.cpp binaries, libraries and distro libraries
FROM scratch AS collector
# Copy llama.cpp binaries and libraries
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
### Base image
FROM ubuntu:${UBUNTU_VERSION} AS base
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
apt update -y && \
apt install -y --no-install-recommends \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
# See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
curl libgomp1 libopenblas-dev && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
# Copy llama.cpp libraries
COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
### Full
FROM base AS full
ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /app
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
apt update -y && \
apt install -y \
git cmake libjpeg-dev \
python3 python3-pip python3-dev && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
COPY --from=collector /llama.cpp/bin /app
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
RUN pip install --no-cache-dir --break-system-packages \
-r /app/gguf-py/requirements.txt
ENTRYPOINT [ "/app/tools.sh" ]
### CLI Only
FROM base AS light
WORKDIR /llama.cpp/bin
# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
### Server
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
WORKDIR /llama.cpp/bin
# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
EXPOSE 8080
ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]

View File

@ -52,3 +52,11 @@ insert_final_newline = unset
[vendor/miniaudio/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset
[tools/server/webui/**]
indent_style = unset
indent_size = unset
end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset

View File

@ -141,97 +141,6 @@ jobs:
# cmake --build build --config Release -j $(nproc)
ubuntu-24-ppc64el-cpu-cross:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- name: Setup PowerPC64le
run: |
sudo dpkg --add-architecture ppc64el
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
# ubuntu-24-ppc64el-vulkan-cross:
# runs-on: ubuntu-24.04
# steps:
# - uses: actions/checkout@v4
# - name: Setup PowerPC64le
# run: |
# sudo dpkg --add-architecture ppc64el
# # Add arch-specific repositories for non-amd64 architectures
# cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
# EOF
# sudo apt-get update || true ;# Prevent failure due to missing URLs.
# sudo apt-get install -y --no-install-recommends \
# build-essential \
# glslc \
# gcc-14-powerpc64le-linux-gnu \
# g++-14-powerpc64le-linux-gnu \
# libvulkan-dev:ppc64el
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
# -DLLAMA_BUILD_TOOLS=ON \
# -DLLAMA_BUILD_TESTS=OFF \
# -DCMAKE_SYSTEM_NAME=Linux \
# -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
# -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
# -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
# cmake --build build --config Release -j $(nproc)
debian-13-loongarch64-cpu-cross:
runs-on: ubuntu-24.04
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671

View File

@ -6,7 +6,7 @@ on:
jobs:
debian-13-riscv64-native: # Bianbu 2.2
runs-on: self-hosted
runs-on: [self-hosted, RISCV64]
steps:
- name: Install prerequisites

View File

@ -192,6 +192,10 @@ jobs:
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-22.04-arm
- build: 's390x'
os: ubuntu-24.04-s390x
- build: 'ppc64le'
os: ubuntu-24.04-ppc64le
runs-on: ${{ matrix.os }}
@ -206,11 +210,28 @@ jobs:
key: ubuntu-cpu-cmake
evict-old-files: 1d
- name: Dependencies
id: depends
- name: Build Dependencies
id: build_depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install -y --no-install-recommends \
python3 python3-pip python3-dev \
libjpeg-dev build-essential libcurl4-openssl-dev \
git-lfs
- name: Python Dependencies
id: python_depends
run: |
python3 -m pip install --upgrade pip
pip3 install ./gguf-py
- name: Swap Endianness
id: endianness
if: ${{ matrix.build == 's390x' }}
run: |
for f in models/*.gguf; do
echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
done
- name: Build
id: cmake_build
@ -228,6 +249,7 @@ jobs:
- name: Test llama2c conversion
id: llama2c_test
if: ${{ matrix.build != 's390x' }}
run: |
cd build
echo "Fetch tokenizer"
@ -237,6 +259,15 @@ jobs:
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Test llama2c (s390x)
id: llama2c_test_s390x
if: ${{ matrix.build == 's390x' }}
run: |
cd build
echo "Fetch llama2c big-endian model"
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
@ -475,7 +506,7 @@ jobs:
ubuntu-22-cmake-musa:
runs-on: ubuntu-22.04
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
steps:
- name: Clone
@ -1247,3 +1278,240 @@ jobs:
-DGGML_CANN=on \
-DSOC_TYPE=${{ matrix.device }}
cmake --build build -j $(nproc)
# TODO: simplify the following workflows using a matrix
# TODO: run lighter CI on PRs and the full CI only on master (if needed)
ggml-ci-x64-cpu-low-perf:
runs-on: ubuntu-22.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-x64-cpu-low-perf
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Test
id: ggml-ci
run: |
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
ggml-ci-arm64-cpu-low-perf:
runs-on: ubuntu-22.04-arm
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-low-perf
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Test
id: ggml-ci
run: |
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
ggml-ci-x64-cpu-high-perf:
runs-on: ubuntu-22.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-x64-cpu-high-perf
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Test
id: ggml-ci
run: |
LLAMA_ARG_THREADS=$(nproc) bash ./ci/run.sh ./tmp/results ./tmp/mnt
ggml-ci-arm64-cpu-high-perf:
runs-on: ubuntu-22.04-arm
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-high-perf
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Test
id: ggml-ci
run: |
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
ggml-ci-arm64-cpu-high-perf-sve:
runs-on: ubuntu-22.04-arm
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ggml-ci-arm64-cpu-high-perf-sve
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Test
id: ggml-ci
run: |
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
ggml-ci-x64-nvidia-cuda:
runs-on: [self-hosted, Linux, X64, NVIDIA]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
nvidia-smi
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-nvidia-vulkan-cm:
runs-on: [self-hosted, Linux, X64, NVIDIA]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-nvidia-vulkan-cm2:
runs-on: [self-hosted, Linux, X64, NVIDIA, COOPMAT2]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-cpu-amx:
runs-on: [self-hosted, Linux, X64, CPU, AMX]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# ggml-ci-x64-amd-vulkan:
# runs-on: [self-hosted, Linux, X64, AMD]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
#
# - name: Test
# id: ggml-ci
# run: |
# vulkaninfo --summary
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
#
# ggml-ci-x64-amd-rocm:
# runs-on: [self-hosted, Linux, X64, AMD]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
#
# - name: Test
# id: ggml-ci
# run: |
# amd-smi static
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-vulkan:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

View File

@ -28,7 +28,7 @@ jobs:
push_to_registry:
name: Push Docker image to Docker Hub
runs-on: ubuntu-22.04
runs-on: ${{ matrix.config.runs_on }}
env:
COMMIT_SHA: ${{ github.sha }}
strategy:
@ -39,11 +39,12 @@ jobs:
# Note: the arm64 images are failing, which prevents the amd64 images from being built
# https://github.com/ggml-org/llama.cpp/issues/11888
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
steps:
@ -53,6 +54,7 @@ jobs:
fetch-depth: 0 # preserve git history, so we can determine the build number
- name: Set up QEMU
if: ${{ matrix.config.tag != 's390x' }}
uses: docker/setup-qemu-action@v3
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28
@ -67,22 +69,19 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Determine tag name
- name: Determine source tag name
id: srctag
uses: ./.github/actions/get-tag-name
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
- name: Determine image tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
# determine tag name postfix (build number, commit hash)
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
TAG_POSTFIX="-b${BUILD_NUMBER}"
else
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
fi
# list all tags possible
if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
TYPE=""
@ -90,9 +89,9 @@ jobs:
TYPE="-${{ matrix.config.tag }}"
fi
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
@ -100,7 +99,6 @@ jobs:
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
echo "server_output_tags=$SERVERTAGS" # print out for debugging
env:
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Free Disk Space (Ubuntu)
@ -176,3 +174,29 @@ jobs:
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
create_tag:
name: Create and push git tag
runs-on: ubuntu-22.04
permissions:
contents: write
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Determine source tag name
id: srctag
uses: ./.github/actions/get-tag-name
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
- name: Create and push git tag
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
git tag ${{ steps.srctag.outputs.name }} || exit 0
git push origin ${{ steps.srctag.outputs.name }} || exit 0

View File

@ -76,51 +76,206 @@ jobs:
run: |
pip install -r tools/server/tests/requirements.txt
# Setup nodejs (to be used for verifying bundled index.html)
- uses: actions/setup-node@v4
webui-setup:
name: WebUI Setup
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
node-version: '22.11.0'
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: WebUI - Install dependencies
id: webui_lint
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Cache node_modules
uses: actions/cache@v4
id: cache-node-modules
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install dependencies
if: steps.cache-node-modules.outputs.cache-hit != 'true'
run: npm ci
working-directory: tools/server/webui
webui-check:
needs: webui-setup
name: WebUI Check
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Run type checking
run: npm run check
working-directory: tools/server/webui
- name: Run linting
run: npm run lint
working-directory: tools/server/webui
webui-build:
needs: webui-check
name: WebUI Build
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Build application
run: npm run build
working-directory: tools/server/webui
webui-tests:
needs: webui-build
name: Run WebUI tests
permissions:
contents: read
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install Playwright browsers
run: npx playwright install --with-deps
working-directory: tools/server/webui
- name: Build Storybook
run: npm run build-storybook
working-directory: tools/server/webui
- name: Run Client tests
run: npm run test:client
working-directory: tools/server/webui
- name: Run Server tests
run: npm run test:server
working-directory: tools/server/webui
- name: Run UI tests
run: npm run test:ui
working-directory: tools/server/webui
- name: Run E2E tests
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
needs: [webui-tests]
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
- name: Dependencies
id: depends
run: |
cd tools/server/webui
npm ci
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libcurl4-openssl-dev
- name: WebUI - Check code format
id: webui_format
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
git config --global --add safe.directory $(realpath .)
cd tools/server/webui
git status
pip install -r tools/server/tests/requirements.txt
npm run format
git status
modified_files="$(git status -s)"
echo "Modified files: ${modified_files}"
if [ -n "${modified_files}" ]; then
echo "Files do not follow coding style. To fix: npm run format"
echo "${modified_files}"
exit 1
fi
- name: Setup Node.js for WebUI
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Verify bundled index.html
id: verify_server_index_html
run: |
git config --global --add safe.directory $(realpath .)
cd tools/server/webui
git status
- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui
npm run build
git status
modified_files="$(git status -s)"
echo "Modified files: ${modified_files}"
if [ -n "${modified_files}" ]; then
echo "Repository is dirty or server/webui is not built as expected"
echo "Hint: You may need to follow Web UI build guide in server/README.md"
echo "${modified_files}"
exit 1
fi
- name: Build WebUI
run: npm run build
working-directory: tools/server/webui
- name: Build (no OpenMP)
id: cmake_build_no_openmp

4
.gitignore vendored
View File

@ -148,3 +148,7 @@ poetry.toml
/run-vim.sh
/run-chat.sh
.ccache/
# IDE
*.code-workspace
.windsurf/

View File

@ -92,6 +92,7 @@ option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_
# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
# Required for relocatable CMake package

View File

@ -1,12 +1,109 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
# multiplie collaborators per item can be specified
/ci/ @ggerganov
/.devops/*.Dockerfile @ngxson
/tools/server/ @ngxson
/.github/actions/ @slaren
/.github/workflows/ @CISC
/.github/workflows/release.yml @slaren
/.github/workflows/winget.yml @slaren
/ci/ @ggerganov
/cmake/ @ggerganov
/common/CMakeLists.txt @ggerganov
/common/arg.* @ggerganov @ericcurtin
/common/base64.hpp.* @ggerganov
/common/build-info.* @ggerganov
/common/common.* @ggerganov
/common/console.* @ggerganov
/common/llguidance.* @ggerganov
/common/log.* @ggerganov
/common/sampling.* @ggerganov
/common/speculative.* @ggerganov
/convert_*.py @CISC
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
/examples/convert-llama2c-to-ggml/ @ggerganov
/examples/deprecation-warning/ @ggerganov
/examples/diffusion/ @am17an
/examples/embedding/ @ggerganov
/examples/eval-callback/ @ggerganov
/examples/export-docs/ @ggerganov
/examples/gen-docs/ @ggerganov
/examples/gguf/ @ggerganov
/examples/llama.android/ @ggerganov
/examples/llama.swiftui/ @ggerganov
/examples/llama.vim @ggerganov
/examples/lookahead/ @ggerganov
/examples/lookup/ @JohannesGaessler
/examples/model-conversion/ @danbev
/examples/parallel/ @ggerganov
/examples/passkey/ @ggerganov
/examples/retrieval/ @ggerganov
/examples/save-load-state/ @ggerganov
/examples/simple-chat/ @slaren
/examples/simple/ @slaren
/examples/speculative-simple/ @ggerganov
/examples/speculative/ @ggerganov
/ggml/cmake/ @ggerganov
/ggml/include/ @ggerganov @slaren
/ggml/src/ggml-alloc.c @slaren
/ggml/src/ggml-backend* @slaren
/ggml/src/ggml-blas/ @slaren
/ggml/src/ggml-common.h @ggerganov @slaren
/ggml/src/ggml-cpu/ @ggerganov @slaren
/ggml/src/ggml-cuda/common.cuh @slaren
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/ggml-cuda.cu @slaren
/ggml/src/ggml-cuda/mmf.* @JohannesGaessler
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvf.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-impl.h @ggerganov @slaren
/ggml/src/ggml-metal/ @ggerganov
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @rgerganov
/ggml/src/ggml-threading.* @ggerganov @slaren
/ggml/src/ggml-vulkan/ @0cc4m
/ggml/src/ggml-zdnn/ @taronaeo
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
/ggml/src/ggml.c @ggerganov @slaren
/ggml/src/ggml.cpp @ggerganov @slaren
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
/gguf-py/ @CISC
/media/ @ggerganov
/scripts/gen* @ggerganov
/scripts/get* @ggerganov
/scripts/sync* @ggerganov
/src/ @ggerganov
/src/llama-adapter.* @CISC
/src/llama-arch.* @CISC
/src/llama-chat.* @ngxson
/src/llama-graph.* @CISC
/src/llama-model-loader.* @slaren
/src/llama-model.* @CISC
/src/llama-vocab.* @CISC
/tests/ @ggerganov
/tests/test-backend-ops.cpp @slaren
/tests/test-thread-safety.cpp @slaren
/tools/batched-bench/ @ggerganov
/tools/llama-bench/ @slaren
/tools/main/ @ggerganov
/tools/mtmd/ @ngxson
/tools/perplexity/ @ggerganov
/tools/quantize/ @ggerganov
/tools/rpc/ @rgerganov
/tools/run/ @ericcurtin
/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
/tools/server/webui/ @allozaur
/tools/tokenize/ @ggerganov
/tools/tts/ @ggerganov
/vendor/ @ggerganov
/.clang-format @slaren
/.clang-tidy @slaren
/AUTHORS @ggerganov
/CMakeLists.txt @ggerganov
/CONTRIBUTING.md @ggerganov
/LICENSE @ggerganov
/README.md @ggerganov
/SECURITY.md @ggerganov
/build-xcframework.sh @danbev
requirements*.txt @CISC

View File

@ -1,4 +1,12 @@
# Pull requests (for contributors)
# Contributors
The project differentiates between 3 levels of contributors:
- Contributors: people who have contributed before (no special privileges)
- Collaborators (Triage): people with significant contributions, who may be responsible for some parts of the code, and are expected to maintain and review contributions for the code they own
- Maintainers: responsible for reviewing and merging PRs, after approval from the code owners
# Pull requests (for contributors & collaborators)
- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
- Test your changes:
@ -9,15 +17,16 @@
- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
# Pull requests (for collaborators)
# Pull requests (for maintainers)
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
- Let authors, who are also collaborators, merge their own PRs
- When merging a PR by a contributor, make sure you have a good understanding of the changes
- Let other maintainers merge their own PRs
- When merging a PR, make sure you have a good understanding of the changes
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
# Coding guidelines
@ -117,6 +126,21 @@
#endif // FOO
```
# Code maintenance
- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file reponsible for:
- Reviewing and merging related PRs
- Fixing related bugs
- Providing developer guidance/support
- When adding or modifying a large piece of code:
- If you are a collaborator, make sure to add yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
- If you are a contributor, find an existing collaborator who is willing to review and maintain your code long-term
- Provide the necessary CI workflow (and hardware) to test your changes (see [ci/README.md](https://github.com/ggml-org/llama.cpp/tree/master/ci))
- New code should follow the guidelines (coding, naming, etc.) outlined in this document. Exceptions are allowed in isolated, backend-specific parts of the code that do not interface directly with the `ggml` interfaces.
_(NOTE: for legacy reasons, existing code is not required to follow this guideline)_
# Documentation
- Documentation is a community effort

View File

@ -178,6 +178,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Java: [QuasarByte/llama-cpp-jna](https://github.com/QuasarByte/llama-cpp-jna)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
@ -274,6 +275,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
@ -520,8 +522,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
## Contributing
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Maintainers can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information

View File

@ -422,6 +422,7 @@ echo "Building for iOS devices..."
cmake -B build-ios-device -G Xcode \
"${COMMON_CMAKE_ARGS[@]}" \
-DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_SYSROOT=iphoneos \
-DCMAKE_OSX_ARCHITECTURES="arm64" \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \

35
ci/README-MUSA.md Normal file
View File

@ -0,0 +1,35 @@
## Running MUSA CI in a Docker Container
Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
### 1. Create a local directory to store cached models, configuration files and venv:
```bash
mkdir -p $HOME/llama.cpp/ci-cache
```
### 2. Create a local directory to store CI run results:
```bash
mkdir -p $HOME/llama.cpp/ci-results
```
### 3. Start a Docker container and run the CI:
```bash
docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
```
Inside the container, execute the following commands:
```bash
apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
git config --global --add safe.directory /ws
GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
```
This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.

View File

@ -1,18 +1,10 @@
# CI
In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
This CI implements heavy-duty workflows that run on self-hosted runners. Typically the purpose of these workflows is to
cover hardware configurations that are not available from Github-hosted runners and/or require more computational
resource than normally available.
https://github.com/ggml-org/ci
It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.
Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
Only the branches of this repo are monitored for this keyword.
It is a good practice, before publishing changes to execute the full CI locally on your machine:
It is a good practice, before publishing changes to execute the full CI locally on your machine. For example:
```bash
mkdir tmp
@ -29,40 +21,13 @@ GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with MUSA support
GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# etc.
```
## Running MUSA CI in a Docker Container
# Adding self-hosted runners
Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
### 1. Create a local directory to store cached models, configuration files and venv:
```bash
mkdir -p $HOME/llama.cpp/ci-cache
```
### 2. Create a local directory to store CI run results:
```bash
mkdir -p $HOME/llama.cpp/ci-results
```
### 3. Start a Docker container and run the CI:
```bash
docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
```
Inside the container, execute the following commands:
```bash
apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
git config --global --add safe.directory /ws
GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
```
This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
- Add a self-hosted `ggml-ci` workflow to [[.github/workflows/build.yml]] with an appropriate label
- Request a runner token from `ggml-org` (for example, via a comment in the PR or email)
- Set-up a machine using the received token ([docs](https://docs.github.com/en/actions/how-tos/manage-runners/self-hosted-runners/add-runners))
- Optionally update [ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) to build and run on the target platform by gating the implementation with a `GG_BUILD_...` env

451
ci/run.sh
View File

@ -45,7 +45,7 @@ SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
@ -65,6 +65,16 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
fi
fi
if [ ! -z ${GG_BUILD_ROCM} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_HIP=ON"
if [ -z ${GG_BUILD_AMDGPU_TARGETS} ]; then
echo "Missing GG_BUILD_AMDGPU_TARGETS, please set it to your GPU architecture (e.g. gfx90a, gfx1100, etc.)"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DAMDGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
@ -82,6 +92,12 @@ fi
if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
# if on Mac, disable METAL
if [[ "$OSTYPE" == "darwin"* ]]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
fi
fi
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
@ -93,6 +109,11 @@ if [ ! -z ${GG_BUILD_MUSA} ]; then
MUSA_ARCH=${MUSA_ARCH:-21}
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
fi
if [ ! -z ${GG_BUILD_NO_SVE} ]; then
# arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
fi
## helpers
# download a file if it does not exist or if it is outdated
@ -150,7 +171,7 @@ function gg_run_ctest_debug {
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
}
@ -200,33 +221,9 @@ function gg_sum_ctest_release {
gg_printf '```\n'
}
# test_scripts_debug
# test_scripts
function gg_run_test_scripts_debug {
cd ${SRC}
set -e
(cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
function gg_sum_test_scripts_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
gg_printf '```\n'
gg_printf '\n'
}
# test_scripts_release
function gg_run_test_scripts_release {
function gg_run_test_scripts {
cd ${SRC}
set -e
@ -237,10 +234,10 @@ function gg_run_test_scripts_release {
set +e
}
function gg_sum_test_scripts_release {
function gg_sum_test_scripts {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in release mode\n'
gg_printf 'Runs test scripts\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
@ -249,15 +246,9 @@ function gg_sum_test_scripts_release {
}
function gg_get_model {
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
if [[ -s $gguf_0 ]]; then
echo -n "$gguf_0"
elif [[ -s $gguf_1 ]]; then
echo -n "$gguf_1"
elif [[ -s $gguf_2 ]]; then
echo -n "$gguf_2"
else
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
exit 1
@ -316,24 +307,22 @@ function gg_sum_ctest_with_model_release {
gg_printf '```\n'
}
# open_llama_7b_v2
# qwen3_0_6b
function gg_run_open_llama_7b_v2 {
function gg_run_qwen3_0_6b {
cd ${SRC}
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/config.json
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer.json
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer_config.json
#gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/special_tokens_map.json
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/resolve/main/model.safetensors
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/open-llama/7B-v2"
path_models="../models-mnt/qwen3/0.6B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
@ -343,9 +332,11 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf --outtype f16
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
model_f16="${path_models}/ggml-model-f16.gguf"
model_bf16="${path_models}/ggml-model-bf16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
@ -359,47 +350,51 @@ function gg_run_open_llama_7b_v2 {
wiki_test="${path_wiki}/wiki.test.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0 $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0 $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1 $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0 $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1 $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
if [ -z ${GG_BUILD_NO_BF16} ]; then
(time ./bin/llama-perplexity --model ${model_bf16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
fi
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@ -415,138 +410,9 @@ function gg_run_open_llama_7b_v2 {
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
set +e
}
function gg_sum_open_llama_7b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 7B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
# pythia_1.4b
function gg_run_pythia_1_4b {
cd ${SRC}
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/pythia/1.4B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
if [ -z ${GG_BUILD_NO_BF16} ]; then
check_ppl "bf16" "$(cat $OUT/${ci}-tg-bf16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
@ -563,147 +429,17 @@ function gg_run_pythia_1_4b {
set +e
}
function gg_sum_pythia_1_4b {
function gg_sum_qwen3_0_6b {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Pythia 1.4B:\n'
gg_printf 'Qwen3 0.6B:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
# pythia_2_8b
function gg_run_pythia_2_8b {
cd ${SRC}
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/pythia/2.8B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
if [ -z ${GG_BUILD_NO_BF16} ]; then
gg_printf '- bf16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-bf16.log)"
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
set +e
}
function gg_sum_pythia_2_8b {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Pythia 2.8B:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
@ -878,20 +614,13 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run rerank_tiny
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
test $ret -eq 0 && gg_run test_scripts_release
test $ret -eq 0 && gg_run test_scripts
fi
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
test $ret -eq 0 && gg_run pythia_1_4b
else
test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2
fi
test $ret -eq 0 && gg_run qwen3_0_6b
test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release
fi
fi
exit $ret

View File

@ -89,6 +89,42 @@ if (LLAMA_CURL)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
endif()
if (LLAMA_OPENSSL)
find_package(OpenSSL)
if (OpenSSL_FOUND)
include(CheckCSourceCompiles)
set(SAVED_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
check_c_source_compiles("
#include <openssl/opensslv.h>
#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER)
# if OPENSSL_VERSION_NUMBER < 0x1010107f
# error bad version
# endif
#else
# if OPENSSL_VERSION_NUMBER < 0x30000000L
# error bad version
# endif
#endif
int main() { return 0; }
" OPENSSL_VERSION_SUPPORTED)
set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
if (OPENSSL_VERSION_SUPPORTED)
message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
find_library(SECURITY_FRAMEWORK Security REQUIRED)
target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
endif()
endif()
else()
message(STATUS "OpenSSL not found, SSL support disabled")
endif()
endif()
if (LLAMA_LLGUIDANCE)
include(ExternalProject)
set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)

File diff suppressed because it is too large Load Diff

View File

@ -1616,17 +1616,36 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
);
});
auto recipient_in_role = builder.add_rule("recipient_in_role",
"\"<|start|>assistant\"? \" to=functions.\" ( " +
string_join(tool_rules_recipient_in_role, " | ") + " )"
);
auto recipient_in_channel = builder.add_rule("recipient_in_channel",
channel + " \" to=functions.\" ( " +
string_join(tool_rules_recipient_in_channel, " | ") + " )"
);
if (data.grammar_lazy) {
auto recipient_in_role = builder.add_rule("recipient_in_role",
"\"<|start|>assistant\"? \" to=functions.\" ( " +
string_join(tool_rules_recipient_in_role, " | ") + " )"
);
builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
} else {
auto not_end = builder.add_rule("not-end",
"[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
auto analysis = builder.add_rule("analysis",
"\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
auto commentary = builder.add_rule("commentary",
"\"<|channel|>commentary<|message|>\" ( " + not_end + " )* \"<|end|>\"");
auto recipient_in_role = builder.add_rule("recipient_in_role",
"\" to=functions.\" ( " + string_join(tool_rules_recipient_in_role, " | ") + " )"
);
builder.add_rule("root",
"( " + analysis + " \"<|start|>assistant\" )? " +
"( " + commentary + " \"<|start|>assistant\" )? " +
"( " + recipient_in_role + " | " + recipient_in_channel + " )"
);
}
// Trigger on tool calls that appear in the commentary channel
data.grammar_triggers.push_back({
@ -1741,10 +1760,12 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
LOG_DBG("%s\n", __func__);
common_chat_params data;
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
const std::optional<json> tools_override = json();
const std::optional<json> additional_context = json {
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
});
};
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
if (inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@ -2230,15 +2251,28 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
static void common_chat_parse_granite(common_chat_msg_parser & builder) {
// Parse thinking tags
static const common_regex start_think_regex(regex_escape("<think>"));
static const common_regex end_think_regex(regex_escape("</think>"));
// Granite models output partial tokens such as "<" and "<think".
// By leveraging try_consume_regex()/try_find_regex() throwing
// common_chat_msg_partial_exception for these partial tokens,
// processing is interrupted and the tokens are not passed to add_content().
if (auto res = builder.try_consume_regex(start_think_regex)) {
// Restore position for try_parse_reasoning()
builder.move_to(res->groups[0].begin);
builder.try_find_regex(end_think_regex, std::string::npos, false);
// Restore position for try_parse_reasoning()
builder.move_to(res->groups[0].begin);
}
builder.try_parse_reasoning("<think>", "</think>");
// Parse response tags using regex
static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
if (auto res = builder.try_find_regex(response_regex)) {
// Extract the content between the tags (capture group 1)
auto content = builder.str(res->groups[1]);
builder.add_content(content);
builder.move_to(res->groups[0].end);
// Parse response tags
static const common_regex start_response_regex(regex_escape("<response>"));
static const common_regex end_response_regex(regex_escape("</response>"));
// Granite models output partial tokens such as "<" and "<response".
// Same hack as reasoning parsing.
if (builder.try_consume_regex(start_response_regex)) {
builder.try_find_regex(end_response_regex);
}
if (!builder.syntax().parse_tool_calls) {
@ -2252,13 +2286,10 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
builder.move_to(res->groups[0].end);
// Expect JSON array of tool calls
auto tool_calls_data = builder.consume_json();
if (tool_calls_data.json.is_array()) {
if (!builder.add_tool_calls(tool_calls_data.json)) {
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
throw common_chat_msg_partial_exception("incomplete tool call");
}
} else {
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
}
} else {
builder.add_content(builder.consume_rest());

View File

@ -14,6 +14,7 @@
#include <climits>
#include <cmath>
#include <codecvt>
#include <chrono>
#include <cstdarg>
#include <cstring>
#include <ctime>
@ -50,6 +51,11 @@
#include <unistd.h>
#endif
#if defined(__linux__)
#include <sys/types.h>
#include <pwd.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
@ -864,8 +870,20 @@ std::string fs_get_cache_directory() {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else {
} else if (std::getenv("HOME")) {
cache_directory = std::getenv("HOME") + std::string("/.cache/");
} else {
#if defined(__linux__)
/* no $HOME is defined, fallback to getpwuid */
struct passwd *pw = getpwuid(getuid());
if ((!pw) || (!pw->pw_dir)) {
throw std::runtime_error("Failed to find $HOME directory");
}
cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else /* defined(__linux__) */
throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
}
#elif defined(__APPLE__)
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
@ -960,15 +978,13 @@ struct common_init_result common_init_from_params(common_params & params) {
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
if (!has_eos && !has_sep) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
if (!has_eos && !has_sep && !has_rerank_prompt) {
LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
if (!ok) {

View File

@ -738,7 +738,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
// MoE utils
//
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
static std::string llm_ffn_exps_block_regex(int idx) {
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);

View File

@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
}
if (ctx) {
llama_perf_context_print(ctx);
llama_memory_breakdown_print(ctx);
}
}

View File

@ -2396,7 +2396,10 @@ class SmolVLMModel(MmprojModel):
return [] # skip other tensors
@ModelBase.register("Llama4ForConditionalGeneration")
@ModelBase.register(
"Llama4ForConditionalGeneration",
"Llama4ForCausalLM",
)
class Llama4Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA4
undo_permute = False
@ -2414,6 +2417,10 @@ class Llama4Model(LlamaModel):
super().set_gguf_parameters()
self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
if "layer_types" in self.hparams:
if all(lt == "full_attention" for lt in self.hparams["layer_types"]):
# all layers are full attention (for MobileLLM), disable swa
self.gguf_writer.add_sliding_window(0)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
if name.startswith("language_model."):
@ -3713,11 +3720,29 @@ class Qwen2MoeModel(TextModel):
class Qwen3Model(Qwen2Model):
model_arch = gguf.MODEL_ARCH.QWEN3
# extra logic for rerank models
is_rerank: bool = False
is_tied_embeddings: bool = False
token_false_id: int | None = None
token_true_id: int | None = None
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# track for intern-s1-mini
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
self.origin_hf_arch = hparams.get('architectures', [None])[0]
# a bit hacky, but currently the only way to detect if this is a rerank model
# ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
readme_path = self.dir_model / "README.md"
readme_text = ""
if readme_path.exists():
with readme_path.open("r", encoding="utf-8") as f:
readme_text = f.read()
if "# Qwen3-Reranker" in readme_text:
self._find_rerank_config()
def set_vocab(self):
# deal with intern-s1-mini
if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
@ -3726,6 +3751,53 @@ class Qwen3Model(Qwen2Model):
super().set_vocab()
def _find_rerank_config(self):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
self.is_rerank = True
self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
self.token_false_id = tokenizer.convert_tokens_to_ids("no")
self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
assert self.token_false_id is not None and self.token_true_id is not None
def set_gguf_parameters(self):
super().set_gguf_parameters()
if self.is_rerank:
self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
self.gguf_writer.add_classifier_output_labels(["yes", "no"])
self.gguf_writer.add_chat_template([{
"name": "rerank",
"template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
"<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
"<|im_start|>assistant\n<think>\n\n</think>\n\n"
}])
def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
# extract "yes" and "no" tokens from the output lm_head tensor
false_row = data_torch[self.token_false_id]
true_row = data_torch[self.token_true_id]
return torch.stack([true_row, false_row], dim=0)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if self.is_rerank:
is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
is_real_head = not self.is_tied_embeddings and "lm_head" in name
if is_tied_head or is_real_head:
cls_out_head = (
gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
self._get_cls_out_tensor(data_torch),
)
if is_tied_head:
embed = (self.map_tensor_name(name), data_torch)
return [cls_out_head, embed]
if is_real_head:
return [cls_out_head]
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3MoeForCausalLM")
class Qwen3MoeModel(Qwen2MoeModel):
@ -7652,6 +7724,21 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
if i not in self._attn_layers
]
# There are some models in this family that are non-hybrid, but keep the
# same parent class by setting all layers to "attention." If this is the
# case, the model architecture needs to be updated to a standard
# "granite" or "granitemoe" model
if not self._ssm_layers:
has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
new_arch = (
gguf.MODEL_ARCH.GRANITE_MOE
if has_experts else
gguf.MODEL_ARCH.GRANITE
)
self.model_arch = new_arch
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
self.gguf_writer.add_architecture()
# n_group and d_inner are used during reshape_tensors for mamba2
# NOTE: Explicitly include hparam prefix prefix for d_model to
# disambiguate with top-level head_dim
@ -7736,8 +7823,11 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_head_count_kv(head_count_kv_vec)
## If Bamba, use rope, otherwise don't
use_rope = "BambaForCausalLM" in self.hparams["architectures"]
## If Bamba or non-hybrid, use rope, otherwise don't
use_rope = (
"BambaForCausalLM" in self.hparams["architectures"]
or not self._ssm_layers
)
self.gguf_writer.add_rope_scaling_finetuned(use_rope)
if not use_rope:
self.gguf_writer.add_context_length(2**20)
@ -7908,6 +7998,121 @@ class BailingMoeModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
class GroveMoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.GROVEMOE
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
# FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
# FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
self.gguf_writer.add_experts_per_group(2)
# FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
self.gguf_writer.add_expert_group_scale(0.05)
# YaRN is not enabled by default
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
_experts: list[dict[str, Tensor]] | None = None
_chunk_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.endswith(".expert_bias"):
# FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
return []
# process the experts separately
if name.find("chunk_experts") != -1:
n_experts = self.hparams["num_experts"] // 2 # see add_experts_per_group
assert bid is not None
if self._chunk_experts is None:
self._chunk_experts = [{} for _ in range(self.block_count)]
self._chunk_experts[bid][name] = data_torch
if len(self._chunk_experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight"
datas.append(self._chunk_experts[bid][ename])
del self._chunk_experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
elif name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
if self._chunk_experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
if len(chunk_experts) > 0:
raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("ChameleonForConditionalGeneration")
@ModelBase.register("ChameleonForCausalLM") # obsolete
class ChameleonModel(TextModel):

61
docs/backend/zDNN.md Normal file
View File

@ -0,0 +1,61 @@
# llama.cpp for IBM zDNN Accelerator
## Background
IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.
### Llama.cpp + IBM zDNN
The llama.cpp zDNN backend is designed to enable llama.cpp on IBM z17 and later systems via the IBM zDNN hardware acceleration library.
## Software & Hardware Support
| Hardware Level | Status | Verified |
| -------------------- | ------------- | -------------------------- |
| IBM z17 / LinuxONE 5 | Supported | RHEL 9.6, IBM z17, 40 IFLs |
| IBM z16 / LinuxONE 4 | Not Supported | |
## Data Types Supported
| Data Type | Status |
| --------- | --------- |
| F32 | Supported |
| F16 | Supported |
| BF16 | Supported |
## CMake Options
The IBM zDNN backend has the following CMake options that control the behaviour of the backend.
| CMake Option | Default Value | Description |
| ------------ | ------------- | ----------------------------------- |
| `GGML_ZDNN` | `OFF` | Compile llama.cpp with zDNN support |
| `ZDNN_ROOT` | `""` | Override zDNN library lookup |
## 1. Install zDNN Library
Note: Using the zDNN library provided via `apt` or `yum` may not work correctly as reported in [#15772](https://github.com/ggml-org/llama.cpp/issues/15772). It is preferred that you compile from source.
```sh
git clone --recurse-submodules https://github.com/IBM/zDNN
cd zDNN
autoreconf .
./configure --prefix=/opt/zdnn-libs
make build
sudo make install
```
## 2. Build llama.cpp
```sh
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
cmake -S . -G Ninja -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_ZDNN=ON \
-DZDNN_ROOT=/opt/zdnn-libs
cmake --build build --config Release -j$(nproc)
```

View File

@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
The defaults are:
- `MUSA_VERSION` set to `rc4.2.0`
- `MUSA_VERSION` set to `rc4.3.0`
The resulting images, are essentially the same as the non-MUSA images:

View File

@ -20,7 +20,6 @@ else()
add_subdirectory(gguf-hash)
add_subdirectory(gguf)
add_subdirectory(gritlm)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(parallel)

View File

@ -1,50 +0,0 @@
#!/usr/bin/env bash
set -e
AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
CTX_SIZE="${CTX_SIZE:-4096}"
N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.6
--mirostat 2)
if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
./llama-cli "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--reverse-prompt "${USER_NAME}:" \
--prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME}.
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
${AI_NAME} can only communicate through text, so she can't send images or videos.
${USER_NAME}: Hello!
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk, so it's important that I make a good first impression!
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant (or whatever you like!), it's so nice to meet you! ^_^
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
${AI_NAME}: What do you like to do in your free time? ^_^
${USER_NAME}:" "$@"

View File

@ -1,57 +0,0 @@
@setlocal disabledelayedexpansion enableextensions
@echo off
cd /d "%~dp0.."
if not "%errorlevel%"=="0" (
echo Unable to change directory.
pause
exit /b 1
)
if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
if not defined USER_NAME set "USER_NAME=User"
if not defined AI_NAME set "AI_NAME=ChatLLaMa"
rem Adjust to the number of CPU cores you want to use.
rem if not defined N_THREAD set "N_THREAD=8"
rem Number of tokens to predict (made it larger than default because we want a long interaction)
if not defined N_PREDICTS set "N_PREDICTS=2048"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
rem Default main script paths
set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
rem Get main script path from command line arguments
set "MAIN_SCRIPT_PATH=%~1"
rem If the main script path was not specified, try the default paths
if not defined MAIN_SCRIPT_PATH (
for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
)
)
rem If the main script path was not found, tell the user how to specify it
if not defined MAIN_SCRIPT_PATH (
echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
echo %DEFAULT_MAIN_SCRIPT_PATHS%
pause
exit /b 1
)
rem Default context, feel free to edit it
set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
rem Set a temporary variable if N_THREAD is set
if defined N_THREAD (
set "_N_THREAD=--threads %N_THREAD%"
) else (
set "_N_THREAD="
)
rem Run the script
echo "%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
--model "%MODEL%" ^
--n_predict %N_PREDICTS% ^
--color --interactive ^
--reverse-prompt "%USER_NAME%:" ^
--prompt "%PROMPT_TEXT%"

View File

@ -1,41 +0,0 @@
#!/usr/bin/env bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="${USER_NAME:-USER}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-8}"
# Number of tokens to predict (made it larger than default because we want a long interaction)
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "${USER_NAME}:" \
--in-prefix ' ' \
"$@"

View File

@ -1,149 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")/.." || exit
if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
exit 1
fi
MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
USER_NAME="${USER_NAME:-User}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
DATE_TIME="$(date +%H:%M)"
DATE_YEAR="$(date +%Y)"
LOG="${CHAT_SAVE_DIR}/main.log"
LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
'|'\
'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
CTX_SIZE=2048
CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
# An unbuffered `tail -c+N`
skip_bytes() {
LANG=C IFS= read -r -n "$1" -d '' c
while LANG=C IFS= read -r -n 1 -d '' c; do
printf '%s' "$c"
done
}
mkdir -p "$CHAT_SAVE_DIR"
echo >"$LOG"
trap "tail -n100 ${LOG}" EXIT
if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
-e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
-e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
-e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
"$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
fi
if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
fi
if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
echo '...' >>"$NEXT_PROMPT_FILE"
fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing
./llama-cli 2>>"$LOG" \
--batch_size 64 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
--file "$CUR_PROMPT_FILE" \
--n_predict 1
echo
echo 'Done!'
fi
if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
fi
if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
fi
printf '%s ' "$(< "$CUR_PROMPT_FILE")"
n_tokens=0
while read -e line; do
# Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
# Swap prompts when we're about to run out of context
if ((n_predict <= 0)); then
wait # for background main (below) to finish with next prompt
mv "$NEXT_PROMPT_FILE" "$CUR_PROMPT_FILE"
mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
echo '...' >>"$NEXT_PROMPT_FILE"
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
n_tokens=0
n_predict=$((CTX_SIZE / 2))
fi
echo " ${line}" >>"$CUR_PROMPT_FILE"
if ((n_tokens > CTX_ROTATE_POINT)); then
echo " ${line}" >>"$NEXT_PROMPT_FILE"
fi
n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
skip_bytes 1 | # skip BOS token added by ./llama-cli
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
# if we hit n_predict instead of reverse-prompt, we need to add the prompt
if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
printf '\n%s:' "$USER_NAME"
printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
fi
printf ' '
if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1
fi
n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
if ((n_tokens > CTX_ROTATE_POINT)); then
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
fi
# Update cache for next prompt in background, ideally during user input
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &
done

View File

@ -1,41 +0,0 @@
#!/usr/bin/env bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="### Human"
AI_NAME="### Assistant"
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-8}"
# Number of tokens to predict (made it larger than default because we want a long interaction)
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "### Human:" \
--in-prefix ' ' \
"$@"

View File

@ -1,16 +0,0 @@
#!/usr/bin/env bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
# Important:
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt

View File

@ -43,8 +43,8 @@ The above command will output space-separated float values.
| $"string"$ | |
|--------------|-|
| "\n" | (default)
| "<#embSep#>" | for exemple
| "<#sep#>" | other exemple
| "<#embSep#>" | for example
| "<#sep#>" | other example
## examples
### Unix-based systems (Linux, macOS, etc.):

View File

@ -95,8 +95,13 @@ int main(int argc, char ** argv) {
params.n_batch = params.n_ctx;
}
// For non-causal models, batch size must be equal to ubatch size
// for non-causal models, batch size must be equal to ubatch size
if (params.attention_type != LLAMA_ATTENTION_TYPE_CAUSAL) {
params.n_ubatch = params.n_batch;
}
// get max number of sequences per batch
const int n_seq_max = llama_max_parallel_sequences();
llama_backend_init();
llama_numa_init(params.numa);
@ -144,6 +149,7 @@ int main(int argc, char ** argv) {
// get added sep and eos token, if any
const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
const char * rerank_prompt = llama_model_chat_template(model, "rerank");
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
@ -153,8 +159,15 @@ int main(int argc, char ** argv) {
// split classification pairs and insert expected separator tokens
if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
if (rerank_prompt != nullptr) {
const std::string query = pairs[0];
const std::string doc = pairs[1];
std::string final_prompt = rerank_prompt;
string_replace_all(final_prompt, "{query}" , query);
string_replace_all(final_prompt, "{document}", doc );
inp = common_tokenize(vocab, final_prompt, true, true);
} else {
std::string final_prompt;
for (size_t i = 0; i < pairs.size(); i++) {
final_prompt += pairs[i];
if (i != pairs.size() - 1) {
@ -166,8 +179,8 @@ int main(int argc, char ** argv) {
}
}
}
inp = common_tokenize(ctx, final_prompt, true, true);
}
} else {
inp = common_tokenize(ctx, prompt, true, true);
}
@ -229,7 +242,7 @@ int main(int argc, char ** argv) {
const uint64_t n_toks = inp.size();
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch) {
if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
float * out = emb + e * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;

View File

@ -5,6 +5,11 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TEST_TARGET test-eval-callback)
if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
add_test(NAME ${TEST_TARGET}
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
else()
add_test(NAME ${TEST_TARGET}
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
endif()
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

View File

@ -1,5 +0,0 @@
set(TARGET llama-gritlm)
add_executable(${TARGET} gritlm.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -1,62 +0,0 @@
## Generative Representational Instruction Tuning (GRIT) Example
[gritlm] a model which can generate embeddings as well as "normal" text
generation depending on the instructions in the prompt.
* Paper: https://arxiv.org/pdf/2402.09906.pdf
### Retrieval-Augmented Generation (RAG) use case
One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
that we take documents that we want to use as context, to ground the large
language model (LLM), and we create token embeddings for them. We then store
these token embeddings in a vector database.
When we perform a query, prompt the LLM, we will first create token embeddings
for the query and then search the vector database to retrieve the most
similar vectors, and return those documents so they can be passed to the LLM as
context. Then the query and the context will be passed to the LLM which will
have to _again_ create token embeddings for the query. But because gritlm is used
the first query can be cached and the second query tokenization generation does
not have to be performed at all.
### Running the example
Download a Grit model:
```console
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
```
Run the example using the downloaded model:
```console
$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
Oh, brave adventurer, who dared to climb
The lofty peak of Mt. Fuji in the night,
When shadows lurk and ghosts do roam,
And darkness reigns, a fearsome sight.
Thou didst set out, with heart aglow,
To conquer this mountain, so high,
And reach the summit, where the stars do glow,
And the moon shines bright, up in the sky.
Through the mist and fog, thou didst press on,
With steadfast courage, and a steadfast will,
Through the darkness, thou didst not be gone,
But didst climb on, with a steadfast skill.
At last, thou didst reach the summit's crest,
And gazed upon the world below,
And saw the beauty of the night's best,
And felt the peace, that only nature knows.
Oh, brave adventurer, who dared to climb
The lofty peak of Mt. Fuji in the night,
Thou art a hero, in the eyes of all,
For thou didst conquer this mountain, so bright.
```
[gritlm]: https://github.com/ContextualAI/gritlm

View File

@ -1,231 +0,0 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
#include <string>
#include <vector>
// #define GRIT_DEBUG
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
std::vector<std::vector<float>> result;
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
for (uint64_t i = 0; i < sentences.size(); i++) {
common_batch_clear(batch);
const std::string input_string = instruction + sentences[i];
std::vector<llama_token> inputs = common_tokenize(vocab, input_string, true, false);
const int32_t n_toks = inputs.size();
// GritLM seems to have EOS = ""
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
// inputs.push_back(llama_vocab_eos(vocab));
// we want to ignore instruction tokens for mean pooling
const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size();
#ifdef GRIT_DEBUG
// debug tokens - should be matching as referenced in the GritLM sample
std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
});
std::printf("\n");
#endif
// add input to batch (this increments n_tokens)
for (int32_t j = 0; j < n_toks; j++) {
common_batch_add(batch, inputs[j], j, { 0 }, true);
}
// clear previous kv_cache values (irrelevant for embeddings)
llama_memory_clear(llama_get_memory(ctx), true);
llama_set_causal_attn(ctx, false);
// run model
llama_decode(ctx, batch);
// get embedding dimensions
uint64_t n_embd = llama_model_n_embd(model);
// allocate embedding output
std::vector<float> emb_unorm(n_embd, 0.0f);
// sum up all token embeddings
for (int32_t k = n_inst; k < n_toks; k++) {
float * emb = llama_get_embeddings_ith(ctx, k);
for (uint64_t j = 0; j < n_embd; j++) {
emb_unorm[j] += emb[j];
}
}
// divide by number of tokens (mean pooling)
{
const uint64_t n_sent = n_toks - n_inst;
for (uint64_t j = 0; j < n_embd; j++) {
emb_unorm[j] /= n_sent;
}
}
std::vector<float> emb_norm(emb_unorm.size());
common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2);
result.push_back(emb_norm);
#ifdef GRIT_DEBUG
// print out emb_norm
std::printf("embedding %ld: ", i);
for (uint64_t j = 0; j < n_embd; j++) {
std::printf("%.5f ", emb_norm[j]);
}
std::printf("\n\n");
#endif
}
llama_batch_free(batch);
return result;
}
static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
std::string result;
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_token eos_token = llama_vocab_eos(vocab);
llama_memory_clear(llama_get_memory(ctx), true);
llama_set_causal_attn(ctx, true);
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true);
int32_t i_current_token = 0;
while (true) {
common_batch_clear(bat);
{
const int32_t n_inputs = inputs.size();
for (int32_t i = 0; i < n_inputs; i++) {
common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
}
}
inputs.clear();
llama_decode(ctx, bat);
llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
if (token == eos_token) {
break;
}
std::string piece = common_token_to_piece(ctx, token);
if (stream) {
std::printf("%s", piece.c_str());
std::fflush(stdout);
}
inputs.push_back(token);
result += piece;
}
if (stream) {
std::printf("\n");
}
llama_batch_free(bat);
return result;
}
static std::string gritlm_instruction(const std::string & instruction) {
return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
}
int main(int argc, char * argv[]) {
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
llama_model_params mparams = common_model_params_to_llama(params);
llama_context_params cparams = common_context_params_to_llama(params);
cparams.embeddings = true;
llama_backend_init();
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
// create generation context
llama_context * ctx = llama_init_from_model(model, cparams);
auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
// ### Embedding/Representation ###
// samples taken from: https://github.com/ContextualAI/gritlm#basic
{
const std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
const std::vector<std::string> queries = {
"Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning",
};
const std::vector<std::string> documents = {
"A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
"All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
};
// No need to add instruction for retrieval documents
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
const int n_embd = llama_model_n_embd(model);
const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
}
llama_set_embeddings(ctx, false);
// ### Generation ###
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
{
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
std::string response = generate(ctx, smpl, prompt, true);
}
llama_sampler_free(smpl);
llama_free(ctx);
llama_model_free(model);
llama_backend_free();
return 0;
}

View File

@ -1,21 +0,0 @@
# llama.cpp/example/jeopardy
This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
Step 1: Open jeopardy.sh and modify the following:
```
MODEL=(path to your model)
MODEL_NAME=(name of your model)
prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc)
opts=(add -instruct here if needed for your model, or anything else you want to test out)
```
Step 2: Run `jeopardy.sh` from the llama.cpp folder
Step 3: Repeat steps 1 and 2 until you have all the results you need.
Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph.
Note: The Human bar is based off of the full, original 100 sample questions. If you modify the question count or questions, it will not be valid.

View File

@ -1,58 +0,0 @@
#!/usr/bin/env python3
import matplotlib.pyplot as plt
import os
import csv
labels = []
numbers = []
numEntries = 1
rows = []
def bar_chart(numbers, labels, pos):
plt.bar(pos, numbers, color='blue')
plt.xticks(ticks=pos, labels=labels)
plt.title("Jeopardy Results by Model")
plt.xlabel("Model")
plt.ylabel("Questions Correct")
plt.show()
def calculatecorrect():
directory = os.fsencode("./examples/jeopardy/results/")
csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
for row in csv_reader:
global rows
rows.append(row)
for listing in os.listdir(directory):
filename = os.fsdecode(listing)
if filename.endswith(".txt"):
file = open("./examples/jeopardy/results/" + filename, "rt")
global labels
global numEntries
global numbers
labels.append(filename[:-4])
numEntries += 1
i = 1
totalcorrect = 0
for line in file.readlines():
if line.strip() != "------":
print(line)
else:
print("Correct answer: " + rows[i][2] + "\n")
i += 1
print("Did the AI get the question right? (y/n)")
if input() == "y":
totalcorrect += 1
numbers.append(totalcorrect)
if __name__ == '__main__':
calculatecorrect()
pos = list(range(numEntries))
labels.append("Human")
numbers.append(48.11)
bar_chart(numbers, labels, pos)
print(labels)
print(numbers)

View File

@ -1,30 +0,0 @@
#!/usr/bin/env bash
set -e
MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin
MODEL_NAME=Vicuna
# exec options
prefix="Human: " # Ex. Vicuna uses "Human: "
opts="--temp 0 -n 80" # additional flags
nl='
'
introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)."
# file options
question_file=./examples/jeopardy/questions.txt
touch ./examples/jeopardy/results/$MODEL_NAME.txt
output_file=./examples/jeopardy/results/$MODEL_NAME.txt
counter=1
echo 'Running'
while IFS= read -r question
do
exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"
echo -e "\n------" >> $output_file
counter=$((counter+1))
done < "$question_file"

View File

@ -1,103 +0,0 @@
Index,Original Category,Original Correct Question,Model Prompt
1,The Oscars,Who is John Williams?,Which actor Born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
2,English Literature,What is Paradise Lost?,"What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?"
3,Writers Lesser-Known Works,Who is Niccolò Machiavelli?,"Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?"
4,Exploration,What is Easter Island (Rapa Nui)?,"James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?"
5,The Bill of Rights,What is the Eighth Amendment?,England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
6,Nobel Peace Prize Winners,Who are Nelson Mandela & Desmond Tutu?,"Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?"
7,Famous Names,Who is Walt Disney?,"In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?"
8,Geography,What is Colombia?,"Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?"
9,Fashion History,What are rhinestones?,"Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?"
10,Movies of the 80s,What is Driving Miss Daisy?,What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
11,Novelists,Who is John Grisham?,"A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?"
12,20th Century Eponyms,What is the Maginot Line?,"A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?"
13,City History,What is Stockholm?,"Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?"
14,Brand Names,What is Jacuzzi?,"The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?"
15,American Authors,Who is Washington Irving?,"In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?"
16,Symbols,What is “less than”?,What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
17,Movie Theme Songs,Who is James Bond?,"Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?"
18,American Novelists,Who is Joseph Heller?,"What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?"
19,Medieval Places,"What is Canterbury, England? (Canterbury Cathedral)","In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?"
20,Countries of Africa,What is Morocco?,"At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?"
21,Statehood,What is Wyoming?,Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
22,1980s Movies,What is Raiders of the Lost Ark?,"A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?"
23,Art Exhibitions,Who is Rembrandt?,In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
24,Countries of the World,What is Mongolia?,"Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?"
25,Literature,What is “Howl”?,A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
26,Invasions,Who is William of Orange?,"Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?"
27,Landmarks,What is the Eiffel Tower?,"After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?"
28,Geographic Names the Same,What is Dover?,"The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?"
29,Names in the Bookstore,Who is Peter Mark Roget?,"This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?"
30,U.S. History,Who is Dr. Samuel Mudd?,"An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?"
31,American Literature,What is The Things They Carried?,"Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?"
32,Nonfiction,What is The Communist Manifesto,"What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?"
33, a new version was passed 81 years later,Laws in U.S. History,What is the Civil Rights Act?,,,,,,,,,,,,,,,,,,0, 2/3
34,Names of Myth,Who is Helen of Troy?,"Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?"
35,African Countries,What is Sudan?,"Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?"
36,The Ancient World,What is Alexandria?,"The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?"
37,Famous Names,Who is Andy Warhol?,"For a special 1970s cookbook, who provided one simple recipea can of Campbell's tomato soup & 2 cans of milk?"
38,People & Places,What is Guam?,"Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territorys largest ethnic group?"
39,Current World Leaders,What is the Philippines?,"In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?"
40,Writers & The South,Who is Tennessee Williams?,In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
41,National Parks,What is Yellowstone?,"What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?"
42,Sports,Who are the Harlem Globetrotters?,"In 2010 who introduced the 4-point shot, 35 feet from the basket?"
43,The U.S. Military,What is “Top Gun”?,Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
44,Art & Science,What is Halleys Comet?,"A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?"
45,Words From World War I,What is “tank”?,"In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?"
46,European History,What is Holy Roman Emperor?,"Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?"
47,Theater History,Who is Peter Pan?,"In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?"
48,European Cities,What is Aachen?,"Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?"
49,Word Origins,What is mantra?,This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
50,Inventions,What is barbed wire?,1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
51,World War II,What is Schindlers list?,"Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?"
52, their offspring was the source of this mythical object,Mythology,What is the Golden Fleece?
53,Literature,What is Pride and Prejudice?,"Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?"
54, only these 2 west of the Mississippi River border each other,U.S. State Names,What are Oregon & Nevada?
55,Word Origins,What is passion?,"Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?"
56,World Cinema,What is La Vie en Rose?,"The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?"
57,History,What is Santa Maria?,"Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?"
58,Landmarks,What is a kremlin?,Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
59,Foreign-Born Authors,Who is Vladimir Nabokov?,In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
60,Astronomy & Geography,What is Capricorn?,"At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?"
61,Television,What is Law & Order?,"Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?"
62,British Landmarks,What is the Tower of London?,"Like Sir Thomas More, 3 16th century English queens are buried at what British location?"
63,Early American History,What are witches?,"In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?"
64,Geography Mnemonics,What are Arkansas and Louisiana?,"The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?"
65,Business Milestones,What is the Ford Model T?,"What was first sold in 1908, at a price equivalent to about $27,000 today?"
66,In The Bookstore,Who is Tom Clancy?,The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
67,Historic Art,What is the Bayeux Tapestry?,The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
68,Pop Stars,Who is Madonna?,In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
69,Classic Tale Characters,Who is Scheherazade?,"In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?"
70,USA,What is Jack Daniels?,"Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?"
71,Historic People,Who was William Bligh?,"After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?"
72,The Movies,What is The Godfather?,Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
73,Continental Geography,What is Colombia?,"Until a 1903 secession, what country's contiguous territory spanned 2 continents?"
74,Foreign-Born Authors,Who is Isabel Allende?,"Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?"
75,Historic Crimes,What is the Mona Lisa?,"Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?"
76,U.S. Bodies of Water,What is Lake Mead?,"Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?"
77,Gods & Goddesses,Who is Aurora (or Eos)?,"Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?"
78,America At War,What is the Battle of New Orleans?,"Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?"
79,Childrens Books,What is The Velveteen Rabbit?,"Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?"
80,TV Finales,What is Grace and Frankie?,"In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?"
81,American Poems,Who is Evangeline?,"In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?"
82,Famous Names,Who is Banksy?,"In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?"
83,Childrens Lit,What is Charlottes Web?,The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
84,Classic Songs,What is “Here Comes Santa Claus”?,The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
85,Brand Names,What are Milk Duds?,"Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?"
86,Countries of the World,What is Italy?,"What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?"
87,Action Movies,What is Die Hard?,"What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?"
88,Presidential Facts,Who is Woodrow Wilson?,Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
89,19th Century Americans,Who is Frederick Douglass?,"Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?"
90,Latin Phrases,What is “quid pro quo”?,"Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?"
91,1970s Movies,What is Monty Python and the Holy Grail?,The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
92,Names The Same,What is Manhattan?,"A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?"
93,U.S. Presidents,Who is Calvin Coolidge?,"Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?"
94,Plays,What is The Tempest?,A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
95,Landmarks,What is the Berlin Wall?,"In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?"
96,World Capitals,"What is Vienna, Austria?","Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?"
97,Language & Its Meanings,What is a night owl?,"Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?"
98,Flags of Our Hemisphere,What is Brazil?,"The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?"
99,Names in U.S. History,Who is Oliver Brown?,What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
100,Childrens Authors,"Who is Sarah? (from Sarah, Plain and Tall)","Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?"
,,,
TOTALS,,,
Can't render this file because it has a wrong number of fields in line 34.

View File

@ -1,100 +0,0 @@
Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?
Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?
Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
For a special 1970s cookbook, who provided one simple recipea can of Campbell's tomato soup & 2 cans of milk?
Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territorys largest ethnic group?
In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
In 2010 who introduced the 4-point shot, 35 feet from the basket?
Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?
Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?
Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
Like Sir Thomas More, 3 16th century English queens are buried at what British location?
In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person be condemned'?
The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
What was first sold in 1908, at a price equivalent to about $27,000 today?
The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
Until a 1903 secession, what country's contiguous territory spanned 2 continents?
Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?

View File

@ -1,28 +0,0 @@
" Basic plugin example
function! Llm()
let url = "http://127.0.0.1:8080/completion"
" Get the content of the current buffer
let buffer_content = join(getline(1, '$'), "\n")
" Create the JSON payload
let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
let json_payload.prompt = buffer_content
" Define the curl command
let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url
let response = system(curl_command, json_encode(json_payload))
" Extract the content field from the response
let content = json_decode(response).content
let split_newlines = split(content, '\n', 1)
" Insert the content at the cursor position
call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:])
endfunction
command! Llm call Llm()
noremap <F2> :Llm<CR>

View File

@ -118,13 +118,17 @@ embedding-convert-model:
embedding-run-original-model:
$(call validate_embedding_model_path,embedding-run-original-model)
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/embedding/run-original-model.py
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
./scripts/embedding/run-original-model.py \
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
embedding-run-converted-model:
@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/embedding/run-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
@./scripts/embedding/compare-embeddings-logits.sh
@./scripts/embedding/compare-embeddings-logits.sh \
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
embedding-inspect-original-model:
$(call validate_embedding_model_path,embedding-inspect-original-model)
@ -156,7 +160,8 @@ embedding-quantize-model:
$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
embedding-run-quantized-model:
@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
@./scripts/embedding/run-converted-model.sh $(QUANTIZED_EMBEDDING_MODEL) \
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
###
### Perplexity targets/recipes

View File

@ -105,12 +105,12 @@ new model, the model can be converted to GGUF format using the following command
### Inspecting the converted model
The converted model can be inspected using the following command:
```console
(venv) $ make inspect-converted-model
(venv) $ make causal-inspect-converted-model
```
### Running the converted model
```console
(venv) $ make run-converted-model
(venv) $ make causal-run-converted-model
```
### Model logits verfication

View File

@ -151,6 +151,35 @@ int main(int argc, char ** argv) {
logits = llama_get_embeddings(ctx);
n_logits = llama_model_n_embd(model) * batch.n_tokens;
type = "-embeddings";
const int n_embd = llama_model_n_embd(model);
const int n_embd_count = batch.n_tokens;
printf("Embedding dimension: %d\n", n_embd);
printf("\n");
// Print embeddings in the specified format
for (int j = 0; j < n_embd_count; j++) {
printf("embedding %d: ", j);
// Print first 3 values
for (int i = 0; i < 3 && i < n_embd; i++) {
printf("%9.6f ", logits[j * n_embd + i]);
}
printf(" ... ");
// Print last 3 values
for (int i = n_embd - 3; i < n_embd; i++) {
if (i >= 0) {
printf("%9.6f ", logits[j * n_embd + i]);
}
}
printf("\n");
}
printf("\n");
printf("Embeddings size: %d\n", n_logits);
} else {
logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
@ -183,11 +212,11 @@ int main(int argc, char ** argv) {
return 1;
}
for (int i = 0; i < n_logits; i++) {
fprintf(f, "%d: %.6f\n", i, logits[i]); // Added index and changed format
fprintf(f, "%d: %.6f\n", i, logits[i]);
}
fclose(f);
// Print first and last 10 logits for quick verification
if (!embedding_mode) {
printf("First 10 logits: ");
for (int i = 0; i < 10 && i < n_logits; i++) {
printf("%.6f ", logits[i]);
@ -199,6 +228,7 @@ int main(int argc, char ** argv) {
if (i >= 0) printf("%.6f ", logits[i]);
}
printf("\n\n");
}
printf("Logits saved to %s\n", bin_filename);
printf("Logits saved to %s\n", txt_filename);

View File

@ -48,7 +48,7 @@ def main():
print(f"Error: Model file not found: {model_path}")
sys.exit(1)
model_name = os.path.splitext(os.path.basename(model_path))[0]
model_name = os.path.basename(model_path)
data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin"

View File

@ -193,7 +193,7 @@ print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
with torch.no_grad():
outputs = model(input_ids)
outputs = model(input_ids.to(model.device))
logits = outputs.logits
# Extract logits for the last token (next token prediction)

View File

@ -2,8 +2,37 @@
set -e
MODEL_PATH="${1:-"$EMBEDDING_MODEL_PATH"}"
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
# Parse command line arguments
MODEL_PATH=""
MODEL_NAME=""
PROMPTS_FILE=""
# First argument is always model path
if [ $# -gt 0 ] && [[ "$1" != --* ]]; then
MODEL_PATH="$1"
shift
fi
# Parse remaining arguments
while [[ $# -gt 0 ]]; do
case $1 in
--prompts-file|-pf)
PROMPTS_FILE="$2"
shift 2
;;
*)
# If MODEL_NAME not set and this isn't a flag, use as model name
if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then
MODEL_NAME="$1"
fi
shift
;;
esac
done
# Set defaults
MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
if [ -t 0 ]; then
CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
@ -35,8 +64,18 @@ with open('$TEMP_FILE', 'wb') as f:
trap "rm -f $TEMP_FILE" EXIT
fi
python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
# Build the semantic_check.py command
SEMANTIC_CMD="python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
--python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
--cpp-embeddings $CPP_EMBEDDINGS \
--prompt "Hello world today"
--cpp-embeddings $CPP_EMBEDDINGS"
# Add prompts file if specified, otherwise use default prompt
if [ -n "$PROMPTS_FILE" ]; then
SEMANTIC_CMD="$SEMANTIC_CMD --prompts-file \"$PROMPTS_FILE\""
else
SEMANTIC_CMD="$SEMANTIC_CMD --prompt \"Hello world today\""
fi
# Execute the command
eval $SEMANTIC_CMD

View File

@ -2,8 +2,27 @@
set -e
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_EMBEDDING_MODEL"}"
# Parse command line arguments
CONVERTED_MODEL=""
PROMPTS_FILE=""
while [[ $# -gt 0 ]]; do
case $1 in
-p|--prompts-file)
PROMPTS_FILE="$2"
shift 2
;;
*)
if [ -z "$CONVERTED_MODEL" ]; then
CONVERTED_MODEL="$1"
fi
shift
;;
esac
done
# First try command line argument, then environment variable
CONVERTED_MODEL="${CONVERTED_MODEL:-"$CONVERTED_EMBEDDING_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
@ -13,8 +32,19 @@ if [ -z "$CONVERTED_MODEL" ]; then
exit 1
fi
# Read prompt from file or use default
if [ -n "$PROMPTS_FILE" ]; then
if [ ! -f "$PROMPTS_FILE" ]; then
echo "Error: Prompts file '$PROMPTS_FILE' not found" >&2
exit 1
fi
PROMPT=$(cat "$PROMPTS_FILE")
else
PROMPT="Hello world today"
fi
echo $CONVERTED_MODEL
cmake --build ../../build --target llama-logits -j8
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today"
# TODO: update logits.cpp to accept a --file/-f option for the prompt
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"

View File

@ -13,14 +13,37 @@ unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
parser.add_argument('--prompts-file', '-p', help='Path to file containing prompts (one per line)')
args = parser.parse_args()
def read_prompt_from_file(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read().strip()
except FileNotFoundError:
print(f"Error: Prompts file '{file_path}' not found")
exit(1)
except Exception as e:
print(f"Error reading prompts file: {e}")
exit(1)
model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
# This can be used to override the sliding window size for manual testing. This
# can be useful to verify the sliding window attention mask in the original model
# and compare it with the converted .gguf model.
if hasattr(config, 'sliding_window'):
original_sliding_window = config.sliding_window
#original_sliding_window = 6
print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
print(f"Using unreleased model: {unreleased_model_name}")
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
@ -29,18 +52,27 @@ if unreleased_model_name:
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
model = model_class.from_pretrained(model_path, config=config)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
model = AutoModel.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path, config=config)
print(f"Model class: {type(model)}")
#print(f"Model file: {type(model).__module__}")
config = AutoConfig.from_pretrained(model_path)
print(f"Model file: {type(model).__module__}")
# Verify the model is using the correct sliding window
if hasattr(model.config, 'sliding_window'):
print(f"Model's sliding_window: {model.config.sliding_window}")
else:
print("Model config does not have sliding_window attribute")
model_name = os.path.basename(model_path)
if args.prompts_file:
prompt_text = read_prompt_from_file(args.prompts_file)
texts = [prompt_text]
else:
texts = ["Hello world today"]
encoded = tokenizer(

View File

@ -67,7 +67,7 @@ def main():
parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
args = parser.parse_args()
model_name = os.path.splitext(os.path.basename(args.model_path))[0]
model_name = os.path.basename(args.model_path)
data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin"

View File

@ -40,7 +40,7 @@ if os.path.exists(index_path):
file_path = os.path.join(model_path, file_name)
print(f"\n--- From {file_name} ---")
with safe_open(file_path, framework="pt") as f: # type: ignore
with safe_open(file_path, framework="pt") as f:
for tensor_name in sorted(tensor_names):
tensor = f.get_tensor(tensor_name)
print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
@ -49,7 +49,7 @@ elif os.path.exists(single_file_path):
# Single file model (original behavior)
print("Single-file model detected")
with safe_open(single_file_path, framework="pt") as f: # type: ignore
with safe_open(single_file_path, framework="pt") as f:
keys = f.keys()
print("Tensors in model:")
for key in sorted(keys):

View File

@ -101,6 +101,17 @@ def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
'rms_diff': np.sqrt(np.mean(diff_matrix**2))
}
def read_prompt_from_file(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read().strip()
except FileNotFoundError:
print(f"Error: Prompts file '{file_path}' not found")
exit(1)
except Exception as e:
print(f"Error reading prompts file: {e}")
exit(1)
def main():
parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
@ -108,14 +119,20 @@ def main():
parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file')
parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts')
args = parser.parse_args()
if args.prompts_file:
prompt = read_prompt_from_file(args.prompts_file)
else:
prompt = args.prompt
print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
print("=" * 70)
# Single prompt detailed comparison
print(f"\nTesting with prompt: '{args.prompt}'")
print(f"\nTesting with prompt: '{prompt}'")
# Load the python model to get configuration information and also to load the tokenizer.
print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
@ -144,7 +161,7 @@ def main():
else:
model = AutoModel.from_pretrained(args.model_path)
encoded = tokenizer(args.prompt, return_tensors="pt")
encoded = tokenizer(prompt, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
n_tokens = len(tokens)
print(f"n_tokens: {n_tokens}");
@ -155,7 +172,7 @@ def main():
python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)
# Run comparison
results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, args.prompt)
results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt)
# Summary
print(f"\n=== SUMMARY ===")

View File

@ -1,5 +1,41 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 0)
set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
# Get current git commit hash
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE GGML_BUILD_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
# Check if the working directory is dirty (i.e., has uncommitted changes)
execute_process(COMMAND ${GIT_EXE} diff-index --quiet HEAD -- .
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE GGML_GIT_DIRTY
ERROR_QUIET
)
endif()
# Build the version string with optional -dev suffix and dirty flag
set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}")
if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
set(GGML_VERSION "${GGML_VERSION}-dirty")
endif()
if(NOT GGML_BUILD_COMMIT)
set(GGML_BUILD_COMMIT "unknown")
endif()
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -141,7 +177,7 @@ set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
if (MINGW)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
set(GGML_WIN_VER "0xA00" CACHE STRING "ggml: Windows version")
endif()
# ggml core
@ -190,7 +226,6 @@ option(GGML_WEBGPU "ggml: use WebGPU"
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
option(GGML_ZDNN "ggml: use zDNN" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@ -301,26 +336,6 @@ endif()
# Create CMake package
#
# Generate version info based on git commit.
if(NOT DEFINED GGML_BUILD_NUMBER)
find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE GGML_BUILD_NUMBER
OUTPUT_STRIP_TRAILING_WHITESPACE
)
if(GGML_BUILD_NUMBER EQUAL 1)
message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
endif()
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE GGML_BUILD_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
endif()
# Capture variables prefixed with GGML_.
@ -349,7 +364,7 @@ set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
# Create the CMake package and set install location.
set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
set(GGML_INSTALL_VERSION ${GGML_VERSION})
set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")

View File

@ -314,6 +314,7 @@ extern "C" {
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);

View File

@ -39,6 +39,7 @@ extern "C" {
// user-code should use only these functions
//
// TODO: remove in the future
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);

View File

@ -7,6 +7,9 @@
extern "C" {
#endif
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
#ifdef __cplusplus

View File

@ -284,19 +284,19 @@ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexc
// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
//
#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
const type prefix##0 = (pointer)->array[0]; \
const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
GGML_UNUSED(prefix##0);
#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
const type prefix##1 = (pointer)->array[1]; \
const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
GGML_UNUSED(prefix##1);
#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
const type prefix##2 = (pointer)->array[2]; \
const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
GGML_UNUSED(prefix##2);
#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
const type prefix##3 = (pointer)->array[3]; \
const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
GGML_UNUSED(prefix##3);
#define GGML_TENSOR_UNARY_OP_LOCALS \

View File

@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
if (NOT MSVC)
if (GGML_STATIC)
if (UNIX AND NOT APPLE)
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
endif()
add_link_options(-static)
if (MINGW)
add_link_options(-static-libgcc -static-libstdc++)

View File

@ -23,7 +23,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
}
// ops that return true for this function must not use restrict pointers for their backend implementations
static bool ggml_op_can_inplace(enum ggml_op op) {
bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) {
case GGML_OP_SCALE:
case GGML_OP_DIAG_MASK_ZERO:
@ -95,39 +95,104 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te
// dynamic tensor allocator
#define GGML_VBUFFER_MAX_CHUNKS 16
// relative memory address within an allocation that can be split into multiple buffers (chunks)
struct buffer_address {
int chunk; // index of a backend buffer
size_t offset; // local memory offset within the buffer
};
static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
}
struct free_block {
size_t offset;
size_t size;
};
struct tallocr_chunk {
struct free_block free_blocks[MAX_FREE_BLOCKS];
int n_free_blocks;
size_t max_size;
};
struct ggml_dyn_tallocr {
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
size_t max_size;
size_t max_chunk_size;
struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
int n_chunks;
#ifdef GGML_ALLOCATOR_DEBUG
struct {
const struct ggml_tensor * tensor;
size_t offset;
struct buffer_address addr;
} allocated_tensors[1024];
#endif
};
static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
int insert_pos = 0;
while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
chunk->free_blocks[i] = chunk->free_blocks[i-1];
}
// insert the new block
chunk->free_blocks[insert_pos].offset = offset;
chunk->free_blocks[insert_pos].size = size;
chunk->n_free_blocks++;
}
static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
// shift all elements after idx by 1 to the left, overwriting the element at idx
for (int i = idx; i < chunk->n_free_blocks; i++) {
chunk->free_blocks[i] = chunk->free_blocks[i+1];
}
chunk->n_free_blocks--;
}
static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
return -1;
}
struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
chunk->n_free_blocks = 1;
chunk->free_blocks[0].offset = 0;
// available space in a chunk is limited to max_chunk_size, but can be higher if:
// 1. a single tensor exceeds the maximum, and cannot fit any other way
// 2. we are running out of chunks
// backends will either manage to allocate the larger size, or report an error.
chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
chunk->free_blocks[0].size = SIZE_MAX/2;
}
alloc->chunks[alloc->n_chunks] = chunk;
alloc->n_chunks++;
return alloc->n_chunks - 1;
}
#ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor == NULL) {
alloc->allocated_tensors[i].tensor = tensor;
alloc->allocated_tensors[i].offset = offset;
alloc->allocated_tensors[i].addr = addr;
return;
}
}
GGML_ABORT("out of allocated_tensors");
}
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].offset == offset) {
if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
alloc->allocated_tensors[i].tensor = NULL;
return;
}
@ -136,76 +201,94 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
}
#endif
static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
int best_fit_chunk = -1;
int best_fit_block = -1;
size_t max_avail = 0;
// find the best fitting free block besides the last block
int best_fit_block = -1;
// find the best fitting free block besides the last block, within any chunk
for (int c = 0; c < alloc->n_chunks; ++c) {
struct tallocr_chunk * chunk = alloc->chunks[c];
size_t best_fit_size = SIZE_MAX;
for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
struct free_block * block = &alloc->free_blocks[i];
for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
struct free_block * block = &chunk->free_blocks[i];
max_avail = MAX(max_avail, block->size);
if (block->size >= size && block->size <= best_fit_size) {
best_fit_chunk = c;
best_fit_block = i;
best_fit_size = block->size;
}
}
}
if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
// no suitable block found, try the last block (this will grow a chunks size)
for (int c = 0; c < alloc->n_chunks; ++c) {
struct tallocr_chunk * chunk = alloc->chunks[c];
if (chunk->n_free_blocks > 0) {
struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
max_avail = MAX(max_avail, block->size);
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
} else {
// this should never happen
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("not enough space in the buffer");
best_fit_chunk = c;
best_fit_block = chunk->n_free_blocks - 1;
break;
}
}
}
}
struct free_block * block = &alloc->free_blocks[best_fit_block];
size_t offset = block->offset;
block->offset = offset + size;
if (best_fit_block == -1) {
// none of the existing chunks have enough space left
best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
best_fit_block = 0;
}
if (best_fit_chunk == -1) {
// since the last chunk always has virtually endless memory, this should never happen
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("graph allocation: failed to reserve memory");
}
struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
struct free_block * block = &chunk->free_blocks[best_fit_block];
struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
block->offset += size;
block->size -= size;
if (block->size == 0) {
// remove block if empty
alloc->n_free_blocks--;
for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
}
ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
}
AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, offset, tensor);
size_t cur_max = offset + size;
if (cur_max > alloc->max_size) {
// sort allocated_tensors by offset
add_allocated_tensor(alloc, addr, tensor);
size_t cur_max = addr.offset + size;
if (cur_max > alloc->max_size[addr.chunk]) {
// sort allocated_tensors by chunk/offset
for (int i = 0; i < 1024; i++) {
for (int j = i + 1; j < 1024; j++) {
if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
size_t tmp_offset = alloc->allocated_tensors[i].offset;
struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
alloc->allocated_tensors[j].tensor = tmp_tensor;
alloc->allocated_tensors[j].offset = tmp_offset;
alloc->allocated_tensors[j].addr = tmp_addr;
}
}
}
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor) {
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].offset,
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].addr.chunk,
alloc->allocated_tensors[i].addr.offset,
alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
}
}
@ -213,78 +296,69 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
}
#endif
alloc->max_size = MAX(alloc->max_size, offset + size);
chunk->max_size = MAX(chunk->max_size, addr.offset + size);
return offset;
return addr;
GGML_UNUSED(tensor);
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
__func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
#ifdef GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, offset, tensor);
remove_allocated_tensor(alloc, addr, tensor);
#endif
struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
// see if we can merge with an existing block
for (int i = 0; i < alloc->n_free_blocks; i++) {
struct free_block * block = &alloc->free_blocks[i];
for (int i = 0; i < chunk->n_free_blocks; i++) {
struct free_block * block = &chunk->free_blocks[i];
// check if ptr is at the end of the block
if (block->offset + block->size == offset) {
if (block->offset + block->size == addr.offset) {
block->size += size;
// check if we can merge with the next block
if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
block->size += alloc->free_blocks[i+1].size;
alloc->n_free_blocks--;
for (int j = i+1; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
if (i < chunk->n_free_blocks - 1) {
struct free_block * next = &chunk->free_blocks[i+1];
if (block->offset + block->size == next->offset) {
block->size += next->size;
ggml_dyn_tallocr_remove_block(chunk, i+1);
}
}
return;
}
// check if ptr is at the beginning of the block
if (offset + size == block->offset) {
block->offset = offset;
if (addr.offset + size == block->offset) {
block->offset = addr.offset;
block->size += size;
// check if we can merge with the previous block
if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
alloc->free_blocks[i-1].size += block->size;
alloc->n_free_blocks--;
for (int j = i; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
if (i > 0) {
struct free_block * prev = &chunk->free_blocks[i-1];
if (prev->offset + prev->size == block->offset) {
prev->size += block->size;
ggml_dyn_tallocr_remove_block(chunk, i);
}
}
return;
}
}
// otherwise, add a new block
GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
int insert_pos = 0;
while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
alloc->free_blocks[i] = alloc->free_blocks[i-1];
}
// insert the new block
alloc->free_blocks[insert_pos].offset = offset;
alloc->free_blocks[insert_pos].size = size;
alloc->n_free_blocks++;
ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
GGML_UNUSED(tensor);
}
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
alloc->n_free_blocks = 1;
alloc->free_blocks[0].offset = 0;
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
alloc->max_size = 0;
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
free(alloc->chunks[i]);
alloc->chunks[i] = NULL;
}
alloc->n_chunks = 0;
#ifdef GGML_ALLOCATOR_DEBUG
for (int i = 0; i < 1024; i++) {
@ -293,14 +367,14 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
#endif
}
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
*alloc = (struct ggml_dyn_tallocr) {
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
/*.max_size = */ 0,
/*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
/*.chunks = */ {NULL},
/*.n_chunks = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ {{0}},
#endif
@ -312,11 +386,79 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
}
static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
for (int i = 0; i < alloc->n_chunks; ++i) {
free(alloc->chunks[i]);
}
free(alloc);
}
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
return alloc->max_size;
size_t max_size = 0;
for (int i = 0; i < alloc->n_chunks; i++) {
max_size += alloc->chunks[i]->max_size;
}
return max_size;
}
// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
struct vbuffer {
ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
};
static void ggml_vbuffer_free(struct vbuffer * buf) {
if (buf == NULL) {
return;
}
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
ggml_backend_buffer_free(buf->chunks[i]);
}
free(buf);
}
static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
int n = 0;
while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
return n;
}
static size_t ggml_vbuffer_size(struct vbuffer * buf) {
size_t size = 0;
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
size += ggml_backend_buffer_get_size(buf->chunks[i]);
}
return size;
}
static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
if (buf == NULL) {
return NULL;
}
for (int n = 0; n < talloc->n_chunks; n++) {
size_t chunk_size = talloc->chunks[n]->max_size;
buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
if (buf->chunks[n] == NULL) {
ggml_vbuffer_free(buf);
return NULL;
}
ggml_backend_buffer_set_usage(buf->chunks[n], usage);
}
return buf;
}
static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
void * addr = (char *)base + buf_addr.offset;
ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
}
static void ggml_vbuffer_reset(struct vbuffer * buf) {
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
ggml_backend_buffer_reset(buf->chunks[i]);
}
}
@ -328,13 +470,13 @@ struct hash_node {
int n_children;
int n_views;
int buffer_id;
size_t offset; // offset within the buffer
struct buffer_address addr;
bool allocated;
};
struct tensor_alloc {
int buffer_id;
size_t offset;
struct buffer_address addr;
size_t size_max; // 0 = pre-allocated, unused, or view
};
@ -349,7 +491,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers]
struct vbuffer ** buffers; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@ -370,7 +512,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
GGML_ASSERT(galloc->bufts != NULL);
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
GGML_ASSERT(galloc->buffers != NULL);
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@ -390,7 +532,8 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
if (galloc->buf_tallocs[i] == NULL) {
size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
}
}
galloc->n_buffers = n_bufs;
@ -418,7 +561,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
}
}
if (!freed) {
ggml_backend_buffer_free(galloc->buffers[i]);
ggml_vbuffer_free(galloc->buffers[i]);
}
}
if (galloc->buf_tallocs != NULL) {
@ -467,7 +610,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
hn->allocated = true;
assert(hn->offset == 0);
assert(hn->addr.offset == 0);
// try to reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) {
@ -501,9 +644,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
assert(view_src_hn->offset == p_hn->offset);
assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
hn->buffer_id = p_hn->buffer_id;
hn->offset = p_hn->offset;
hn->addr = p_hn->addr;
p_hn->allocated = false; // avoid freeing the parent
view_src_hn->allocated = false;
return;
@ -511,7 +654,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
} else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
hn->buffer_id = p_hn->buffer_id;
hn->offset = p_hn->offset;
hn->addr = p_hn->addr;
p_hn->allocated = false; // avoid freeing the parent
return;
}
@ -522,9 +665,8 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
hn->buffer_id = buffer_id;
hn->offset = offset;
hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
}
}
@ -536,12 +678,11 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
}
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
size_t offset = hn->offset;
int buffer_id = hn->buffer_id;
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
hn->allocated = false;
}
@ -692,24 +833,24 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
struct node_alloc * node_alloc = &galloc->node_allocs[i];
if (node->view_src || node->data) {
node_alloc->dst.buffer_id = -1;
node_alloc->dst.offset = SIZE_MAX;
node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
node_alloc->dst.size_max = 0;
} else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
node_alloc->dst.buffer_id = hn->buffer_id;
node_alloc->dst.offset = hn->offset;
node_alloc->dst.addr = hn->addr;
node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (!src || src->view_src || src->data) {
node_alloc->src[j].buffer_id = -1;
node_alloc->src[j].offset = SIZE_MAX;
node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
node_alloc->src[j].size_max = 0;
} else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
node_alloc->src[j].buffer_id = hn->buffer_id;
node_alloc->src[j].offset = hn->offset;
node_alloc->src[j].addr = hn->addr;
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
}
}
@ -725,11 +866,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
galloc->leaf_allocs[i].leaf.size_max = 0;
} else {
galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
galloc->leaf_allocs[i].leaf.offset = hn->offset;
galloc->leaf_allocs[i].leaf.addr = hn->addr;
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
}
}
@ -744,7 +885,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
@ -753,13 +894,12 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
if (galloc->buffers[i] == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
}
}
@ -772,11 +912,11 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
int buffer_id = tensor_alloc->buffer_id;
assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
if (tensor->view_src != NULL) {
if (tensor->buffer == NULL) {
assert(tensor_alloc->offset == SIZE_MAX);
assert(tensor_alloc->addr.offset == SIZE_MAX);
if (tensor->view_src->buffer == NULL) {
// this tensor was allocated without ggml-backend
return;
@ -785,11 +925,9 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
}
} else {
if (tensor->data == NULL) {
assert(tensor_alloc->offset != SIZE_MAX);
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
void * addr = (char *)base + tensor_alloc->offset;
ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
assert(tensor_alloc->addr.offset != SIZE_MAX);
assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
} else {
if (tensor->buffer == NULL) {
// this tensor was allocated without ggml-backend
@ -874,7 +1012,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
// reset buffers
for (int i = 0; i < galloc->n_buffers; i++) {
if (galloc->buffers[i] != NULL) {
ggml_backend_buffer_reset(galloc->buffers[i]);
ggml_vbuffer_reset(galloc->buffers[i]);
}
}
@ -917,7 +1055,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
}
}
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
}
// utils

View File

@ -116,7 +116,7 @@ extern "C" {
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
// (optional) sort/optimize the nodes in the graph
void (*optimize_graph) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
};
struct ggml_backend {

View File

@ -463,10 +463,10 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
backend->iface.event_wait(backend, event);
}
static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_ASSERT(backend);
if (backend->iface.optimize_graph != NULL) {
backend->iface.optimize_graph(backend, cgraph);
if (backend->iface.graph_optimize != NULL) {
backend->iface.graph_optimize(backend, cgraph);
}
}
@ -1307,7 +1307,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
// Optimize this split of the graph. This needs to happen before we make graph_copy,
// so they are in sync.
ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);
ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
for (int j = 0; j < split->n_inputs; j++) {
@ -1793,6 +1793,14 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
return sched->backends[i];
}
ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
return sched->bufts[backend_index];
}
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend);

View File

@ -270,7 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
/* .graph_compute = */ ggml_backend_blas_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .optimize_graph = */ NULL,
/* .graph_optimize = */ NULL,
};
static ggml_guid_t ggml_backend_blas_guid(void) {

View File

@ -1728,7 +1728,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
ggml_cann_get_rows(ctx, dst);
break;
case GGML_OP_SET_ROWS:
std::cout << "lcg GGML_OP_SET_ROWS"<< std::endl;
ggml_cann_set_rows(ctx, dst);
break;
case GGML_OP_DUP:
@ -2757,7 +2756,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .graph_compute = */ ggml_backend_cann_graph_compute,
/* .event_record = */ ggml_backend_cann_event_record,
/* .event_wait = */ ggml_backend_cann_event_wait,
/* .optimize_graph = */ NULL,
/* .graph_optimize = */ NULL,
};
/**

View File

@ -7,7 +7,7 @@
#include "ggml-cpu.h"
#include "traits.h"
#if defined(__gnu_linux__)
#if defined(__linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif
@ -186,7 +186,7 @@ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_ty
#define XFEATURE_XTILEDATA 18
static bool ggml_amx_init() {
#if defined(__gnu_linux__)
#if defined(__linux__)
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
fprintf(stderr, "AMX is not ready to be used!\n");
return false;
@ -194,6 +194,8 @@ static bool ggml_amx_init() {
return true;
#elif defined(_WIN32)
return true;
#else
return false;
#endif
}

View File

@ -160,7 +160,6 @@
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8

View File

@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
return ((v4f32)res)[0];
}
// multiply int8_t, add results pairwise twice
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
// Get absolute values of x vectors
const __m128i ax = __lsx_vsigncov_b(x, x);
// Sign the values of the y vectors
const __m128i sy = __lsx_vsigncov_b(x, y);
// Perform multiplication and create 16-bit values
const __m128i dot = lsx_maddubs_h(ax, sy);
const __m128i ones = __lsx_vreplgr2vr_h(1);
return lsx_madd_h(ones, dot);
}
#endif
#if defined(__loongarch_asx)
@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
}
}
// multiply int8_t, add results pairwise twice
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
// Get absolute values of x vectors
const __m128i ax = __lsx_vsigncov_b(x, x);
// Sign the values of the y vectors
const __m128i sy = __lsx_vsigncov_b(x, y);
// Perform multiplication and create 16-bit values
const __m128i dot = lsx_maddubs_h(ax, sy);
const __m128i ones = __lsx_vreplgr2vr_h(1);
return lsx_madd_h(ones, dot);
}
// horizontally add 8 floats
static inline float hsum_float_8(const __m256 x) {
__m128 res = lasx_extractf128(x, 1);

View File

@ -75,7 +75,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
for (int j = 0; j < 8; j++) {
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
const int32x4_t vi = vec_signed(v);
/* Uses non-default rounding for vec_signed or vec_round */
const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
y[i].qs[4*j + 0] = vec_extract(vi, 0);
y[i].qs[4*j + 1] = vec_extract(vi, 1);
@ -122,7 +123,8 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
for (int j = 0; j < 8; j++) {
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
const int32x4_t vi = vec_signed(v);
/* Uses non-default rounding for vec_signed or vec_round */
const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
y[i].qs[4*j + 0] = vec_extract(vi, 0);
y[i].qs[4*j + 1] = vec_extract(vi, 1);
@ -260,6 +262,101 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
#endif
}
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
assert(n % QK_MXFP4 == 0);
static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
const int qk = QK_MXFP4;
const int nb = n / qk;
const block_mxfp4 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
int ib = 0;
float sumf = 0.0f;
#if defined(__VXE__) || defined(__VXE2__)
const int8x16_t v_k = vec_xl(0, kvalues_mxfp4);
const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
float32x4_t v_acc = vec_splats(0.0f);
#pragma GCC unroll 8
for (; ib + 1 < nb; ib += 2) {
const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
const uint8x16_t v_x0 = vec_xl(0, x0->qs);
const uint8x16_t v_x1 = vec_xl(0, x1->qs);
int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
const int8x16_t v_y0l = vec_xl(0, y0->qs);
const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
const int8x16_t v_y1l = vec_xl(0, y1->qs);
const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);
const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);
const float32x4_t v_xy0f = vec_float(v_xy0);
const float32x4_t v_xy1f = vec_float(v_xy1);
const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));
v_acc = vec_madd(v_xy0f, v_d0, v_acc);
v_acc = vec_madd(v_xy1f, v_d1, v_acc);
}
for (; ib < nb; ++ib) {
const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
const uint8x16_t v_x = vec_xl(0, x0->qs);
int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
const int8x16_t v_yl = vec_xl(0, y0->qs);
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
const float32x4_t v_xyf = vec_float(v_xy);
const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
v_acc = vec_madd(v_xyf, v_d, v_acc);
}
sumf = vec_hsum_f32x4(v_acc);
*s = sumf;
#else
UNUSED(x);
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
@ -636,7 +733,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
uint8x16_t q3h[4];
uint8x16_t q3b[2];
int8x16_t q3bytes[4];
int8x16_t q8bytes[4];
int8x16_t q8bytes[8];
uint8x16_t qhbits[2];
float sum = 0;

View File

@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);

View File

@ -28,6 +28,14 @@ static inline float bf16_to_f32(ggml_bf16_t x) {
return GGML_BF16_TO_FP32(x);
}
static inline float i32_to_f32(int32_t x) {
return x;
}
static inline int32_t f32_to_i32(float x) {
return x;
}
static inline float f32_to_f32(float x) {
return x;
}
@ -54,6 +62,12 @@ struct type_conversion_table<ggml_bf16_t> {
static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
};
template <>
struct type_conversion_table<int32_t> {
static constexpr float (*to_f32)(int32_t) = i32_to_f32;
static constexpr int32_t (*from_f32)(float) = f32_to_i32;
};
static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
const int64_t ith = params->ith;
const int64_t nth = params->nth;

View File

@ -473,10 +473,10 @@ struct ggml_threadpool {
struct ggml_compute_state {
#ifndef GGML_USE_OPENMP
ggml_thread_t thrd;
bool cpumask[GGML_MAX_N_THREADS];
int last_graph;
bool pending;
#endif
bool cpumask[GGML_MAX_N_THREADS];
struct ggml_threadpool * threadpool;
int ith;
};
@ -3081,7 +3081,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
threadpool->workers = workers;
#ifndef GGML_USE_OPENMP
#ifdef GGML_USE_OPENMP
int32_t cpumask_iter = 0;
// Compute CPU masks for each thread
for (int j = 0; j < tpp->n_threads; j++) {
ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
}
#else // GGML_USE_OPENMP
ggml_mutex_init(&threadpool->mutex);
ggml_cond_init(&threadpool->cond);
@ -3154,7 +3161,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
}
ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
// Apply thread CPU mask and priority
int ith = omp_get_thread_num();
ggml_thread_apply_priority(threadpool->prio);
if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
}
ggml_graph_compute_thread(&threadpool->workers[ith]);
}
} else {
atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);

View File

@ -190,7 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .optimize_graph = */ NULL,
/* .graph_optimize = */ NULL,
};
static ggml_guid_t ggml_backend_cpu_guid(void) {

File diff suppressed because it is too large Load Diff

View File

@ -998,9 +998,9 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
#define GGML_F32_EPR 4
#define GGML_F32x4 __m128
#define GGML_F32x4_ZERO __lsx_vldi(0)
#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD __lsx_vfadd_s
@ -1022,7 +1022,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
__m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
tmp = __lsx_vsrli_d((__m128i) t0, 32); \
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
@ -1052,7 +1052,7 @@ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
return __lsx_vld(tmp, 0);
return (__m128)__lsx_vld(tmp, 0);
}
static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
@ -1067,9 +1067,9 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
}
#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO __lsx_vldi(0)
#define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
#define GGML_F32Cx4_ADD __lsx_vfadd_s

View File

@ -25,10 +25,14 @@ if (CUDAToolkit_FOUND)
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
set(CMAKE_CUDA_ARCHITECTURES "native")
else()
if (CUDAToolkit_VERSION VERSION_LESS "13")
list(APPEND CMAKE_CUDA_ARCHITECTURES 50-virtual 61-virtual 70-virtual)
endif ()
list(APPEND CMAKE_CUDA_ARCHITECTURES 75-virtual 80-virtual 86-real)
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
else()
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real)
endif()
endif()
endif()

View File

@ -54,7 +54,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
return;
}

View File

@ -586,19 +586,44 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
}
static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v, const half2 u) {
#ifdef FAST_FP16_AVAILABLE
acc += v*u;
#else
const float2 tmpv = __half22float2(v);
const float2 tmpu = __half22float2(u);
float2 tmpacc = __half22float2(acc);
tmpacc.x += tmpv.x * tmpu.x;
tmpacc.y += tmpv.y * tmpu.y;
acc = make_half2(tmpacc.x, tmpacc.y);
#endif // FAST_FP16_AVAILABLE
}
// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
template <int nbytes>
template <int nbytes, int alignment = 0>
static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
if constexpr (nbytes == 4) {
*(int *) dst = *(const int *) src;
} else if constexpr (nbytes == 8) {
*(int2 *) dst = *(const int2 *) src;
} else if constexpr (nbytes == 16) {
*(int4 *) dst = *(const int4 *) src;
if constexpr (alignment != 0) {
static_assert(nbytes % alignment == 0, "bad alignment");
}
constexpr int nb_per_cpy = alignment == 0 ? nbytes : alignment;
#pragma unroll
for (int i = 0; i < nbytes/nb_per_cpy; ++i) {
if constexpr (nb_per_cpy == 1) {
((char *) dst)[i] = ((const char *) src)[i];
} else if constexpr (nb_per_cpy == 2) {
((short *) dst)[i] = ((const short *) src)[i];
} else if constexpr (nb_per_cpy == 4) {
((int *) dst)[i] = ((const int *) src)[i];
} else if constexpr (nb_per_cpy == 8) {
((int2 *) dst)[i] = ((const int2 *) src)[i];
} else if constexpr (nb_per_cpy == 16) {
((int4 *) dst)[i] = ((const int4 *) src)[i];
} else {
static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
}
}
}
static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
#if CUDART_VERSION >= 12080
@ -652,6 +677,14 @@ static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 fa
return n - fastdiv(n, fastdiv_values) * fastdiv_values.z;
}
// Calculate both division and modulo at once, returns <n/divisor, n%divisor>
static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3 fastdiv_values) {
// expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
const uint32_t div_val = fastdiv(n, fastdiv_values);
const uint32_t mod_val = n - div_val * fastdiv_values.z;
return make_uint2(div_val, mod_val);
}
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);
static __device__ __forceinline__ float get_alibi_slope(

View File

@ -441,6 +441,10 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
return (void*) cpy_flt<cpy_1_flt<nv_bfloat16, nv_bfloat16>>;
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
return (void*) cpy_flt<cpy_1_flt<nv_bfloat16, float>>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
return (void*) cpy_flt<cpy_1_flt<float, int32_t>>;
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
return (void*) cpy_flt<cpy_1_flt<int32_t, float>>;
} else {
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));

View File

@ -33,276 +33,230 @@ typedef void (* fattn_kernel_t)(
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33);
typedef half (*vec_dot_KQ_f16_t)(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
typedef float (*vec_dot_KQ_f32_t)(
typedef float (*vec_dot_KQ_t)(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
template<typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
GGML_UNUSED(Q_v);
T sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI4_0;
const int shift = k_KQ & (QI8_1/2);
const int v = (get_int_b2(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
const int u = Q_q8[k_KQ_0/warp_size];
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
const half2 sum2 = __half2half2(K_q4_0[ib].d) * Q_ds[k_KQ_0/warp_size];
sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2) /* *8/QI8_1 == 1 */);
} else
#endif // FP16_AVAILABLE
{
const float2 * Q_ds = (const float2 *) Q_ds_v;
sum += (T) (__half2float(K_q4_0[ib].d) * (sumi*Q_ds[k_KQ_0/warp_size].x - (8/QI8_1)*Q_ds[k_KQ_0/warp_size].y));
}
}
return sum;
}
template<typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
GGML_UNUSED(Q_v);
T sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI4_1;
const int shift = k_KQ & (QI8_1/2);
const int v = (get_int_b4(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
const int u = Q_q8[k_KQ_0/warp_size];
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
const half2 d4d8_m4s8 = K_q4_1[ib].dm * Q_ds[k_KQ_0/warp_size];
const half2 sumid4d8_m4s8scaled = d4d8_m4s8 * make_half2(sumi, 1.0f/QI8_1);
sum += (T) (__low2half(sumid4d8_m4s8scaled) + __high2half(sumid4d8_m4s8scaled));
} else
#endif // FP16_AVAILABLE
{
const float2 * Q_ds = (const float2 *) Q_ds_v;
const float sumid4d8 = __low2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/warp_size].x * sumi;
const float m4s8scaled = __high2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/warp_size].y / QI8_1;
sum += (T) (sumid4d8 + m4s8scaled);
}
}
return sum;
}
template<typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
GGML_UNUSED(Q_v);
T sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI5_0;
const int iqs8 = k_KQ % QI8_1;
const int shift = k_KQ & (QI8_1/2);
int v = (get_int_b2(K_q5_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
const int vh = get_int_b2(K_q5_0[ib].qh, 0) >> (iqs8 * QI5_0);
v |= (vh << 4) & 0x00000010; // 0 -> 4
v |= (vh << 11) & 0x00001000; // 1 -> 12
v |= (vh << 18) & 0x00100000; // 2 -> 20
v |= (vh << 25) & 0x10000000; // 3 -> 28
const int u = Q_q8[k_KQ_0/warp_size];
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
const half2 sum2 = __half2half2(K_q5_0[ib].d) * Q_ds[k_KQ_0/warp_size];
sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2)*__float2half(2.0f)) /* *16/QI8_1 == 2 */;
} else
#endif // FP16_AVAILABLE
{
const float2 * Q_ds = (const float2 *) Q_ds_v;
sum += (T) (__half2float(K_q5_0[ib].d) * (sumi*Q_ds[k_KQ_0/warp_size].x - (16/QI8_1)*Q_ds[k_KQ_0/warp_size].y));
}
}
return sum;
}
template<typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
GGML_UNUSED(Q_v);
T sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI5_1;
const int iqs8 = k_KQ % QI8_1;
const int shift = k_KQ & (QI8_1/2);
int v = (get_int_b2(K_q5_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
const int vh = get_int_b2(K_q5_1[ib].qh, 0) >> (iqs8 * QI5_1);
v |= (vh << 4) & 0x00000010; // 0 -> 4
v |= (vh << 11) & 0x00001000; // 1 -> 12
v |= (vh << 18) & 0x00100000; // 2 -> 20
v |= (vh << 25) & 0x10000000; // 3 -> 28
const int u = Q_q8[k_KQ_0/warp_size];
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
const half2 d5d8_m5s8 = K_q5_1[ib].dm * Q_ds[k_KQ_0/warp_size];
const half2 sumid5d8_m5s8scaled = d5d8_m5s8 * make_half2(sumi, 1.0f/QI8_1);
sum += (T) (__low2half(sumid5d8_m5s8scaled) + __high2half(sumid5d8_m5s8scaled));
} else
#endif // FP16_AVAILABLE
{
const float2 * Q_ds = (const float2 *) Q_ds_v;
const float sumid5d8 = __low2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/warp_size].x * sumi;
const float m5s8scaled = __high2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/warp_size].y / QI8_1;
sum += (T) (sumid5d8 + m5s8scaled);
}
}
return sum;
}
template <typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
GGML_UNUSED(Q_v);
T sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const int ib = k_KQ / QI8_0;
const int iqs = k_KQ % QI8_0;
const int v = get_int_b2(K_q8_0[ib].qs, iqs);
T Q_d;
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
Q_d = __low2half(Q_ds[k_KQ_0/warp_size]);
} else {
const float2 * Q_ds = (const float2 *) Q_ds_v;
Q_d = Q_ds[k_KQ_0/warp_size].x;
}
sum += vec_dot_q8_0_q8_1_impl<T, 1>(&v, &Q_q8[k_KQ_0/warp_size], K_q8_0[ib].d, Q_d);
}
return sum;
}
template <typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
template <int D, int nthreads>
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_f16(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {
const half2 * K_h2 = (const half2 *) K_c;
GGML_UNUSED(Q_q8);
GGML_UNUSED(Q_ds_v);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_h2 = (const half2 *) Q_v;
half2 sum2 = make_half2(0.0f, 0.0f);
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const half2 K_ik = K_h2[k_KQ];
sum2 += K_ik * Q_h2[k_KQ_0/warp_size];
}
return __low2half(sum2) + __high2half(sum2);
}
#endif // FP16_AVAILABLE
const float2 * Q_f2 = (const float2 *) Q_v;
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
constexpr int cpy_ne = cpy_nb / 4;
float sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const half2 K_ik = K_h2[k_KQ];
sum += __low2float(K_ik) * Q_f2[k_KQ_0/warp_size].x;
sum += __high2float(K_ik) * Q_f2[k_KQ_0/warp_size].y;
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
half2 tmp[cpy_ne];
ggml_cuda_memcpy_1<sizeof(tmp)>(tmp, K_h2 + k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne);
#pragma unroll
for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
#ifdef FAST_FP16_AVAILABLE
ggml_cuda_mad(sum, tmp[k_KQ_1] , ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]);
#else
ggml_cuda_mad(sum, __half22float2(tmp[k_KQ_1]), ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]);
#endif // FP16_AVAILABLE
}
}
return sum;
}
template <typename Tds>
template<int D, int nthreads>
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q4_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
GGML_UNUSED(Q_v);
float sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI4_0;
const int shift = k_KQ & (QI8_1/2);
int v;
ggml_cuda_memcpy_1<sizeof(int), 2>(&v, K_q4_0[ib].qs + sizeof(int)*iqs4);
v = (v >> shift) & 0x0F0F0F0F;
const int u = Q_q8[k_KQ_0/nthreads];
const int sumi = ggml_cuda_dp4a(v, u, 0);
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
sum += __half2float(K_q4_0[ib].d) * (sumi*Q_ds.x - (8/QI8_1)*Q_ds.y);
}
return sum;
}
template<int D, int nthreads>
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q4_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
GGML_UNUSED(Q_v);
float sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI4_1;
const int shift = k_KQ & (QI8_1/2);
int v;
ggml_cuda_memcpy_1<sizeof(int)>(&v, K_q4_1[ib].qs + sizeof(int)*iqs4);
v = (v >> shift) & 0x0F0F0F0F;
const int u = Q_q8[k_KQ_0/nthreads];
const int sumi = ggml_cuda_dp4a(v, u, 0);
const float2 K_dm = __half22float2(K_q4_1[ib].dm);
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
sum += K_dm.x*Q_ds.x*sumi + K_dm.y*Q_ds.y/QI8_1;
}
return sum;
}
template<int D, int nthreads>
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q5_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
GGML_UNUSED(Q_v);
float sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI5_0;
const int iqs8 = k_KQ % QI8_1;
const int shift = k_KQ & (QI8_1/2);
int v;
ggml_cuda_memcpy_1<sizeof(int), 2>(&v, K_q5_0[ib].qs + sizeof(int)*iqs4);
v = (v >> shift) & 0x0F0F0F0F;
{
int vh;
ggml_cuda_memcpy_1<sizeof(int), 2>(&vh, K_q5_0[ib].qh);
vh >>= iqs8 * QI5_0;
v |= (vh << 4) & 0x00000010; // 0 -> 4
v |= (vh << 11) & 0x00001000; // 1 -> 12
v |= (vh << 18) & 0x00100000; // 2 -> 20
v |= (vh << 25) & 0x10000000; // 3 -> 28
}
const int u = Q_q8[k_KQ_0/nthreads];
const int sumi = ggml_cuda_dp4a(v, u, 0);
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
sum += __half2float(K_q5_0[ib].d) * (sumi*Q_ds.x - (16/QI8_1)*Q_ds.y);
}
return sum;
}
template<int D, int nthreads>
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q5_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
GGML_UNUSED(Q_v);
float sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI5_1;
const int iqs8 = k_KQ % QI8_1;
const int shift = k_KQ & (QI8_1/2);
int v;
ggml_cuda_memcpy_1<sizeof(int)>(&v, K_q5_1[ib].qs + sizeof(int)*iqs4);
v = (v >> shift) & 0x0F0F0F0F;
{
int vh;
ggml_cuda_memcpy_1<sizeof(int)>(&vh, K_q5_1[ib].qh);
vh >>= iqs8 * QI5_0;
v |= (vh << 4) & 0x00000010; // 0 -> 4
v |= (vh << 11) & 0x00001000; // 1 -> 12
v |= (vh << 18) & 0x00100000; // 2 -> 20
v |= (vh << 25) & 0x10000000; // 3 -> 28
}
const int u = Q_q8[k_KQ_0/nthreads];
const int sumi = ggml_cuda_dp4a(v, u, 0);
const float2 K_dm = __half22float2(K_q5_1[ib].dm);
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
sum += K_dm.x*Q_ds.x*sumi + K_dm.y*Q_ds.y/QI8_1;
}
return sum;
}
template <int D, int nthreads>
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q8_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
GGML_UNUSED(Q_v);
float sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
const int ib = k_KQ / QI8_0;
const int iqs = k_KQ % QI8_0;
int v;
ggml_cuda_memcpy_1<sizeof(v), 2>(&v, K_q8_0[ib].qs + 4*iqs);
const float2 * Q_ds = (const float2 *) Q_ds_v;
const float Q_d = Q_ds[k_KQ_0/nthreads].x;
sum += vec_dot_q8_0_q8_1_impl<float, 1>(&v, &Q_q8[k_KQ_0/nthreads], K_q8_0[ib].d, Q_d);
}
return sum;
}
template <typename Tds, int ni>
static __device__ __forceinline__ void quantize_q8_1_to_shared(
const float * __restrict__ x, const float scale, int * __restrict__ yq32, void * __restrict__ yds) {
float vals[sizeof(int)] = {0.0f};
#pragma unroll
for (int l = 0; l < int(sizeof(int)); ++l) {
vals[l] = scale * x[4*threadIdx.x + l];
vals[l] = (ni == WARP_SIZE || threadIdx.x < ni) ? scale * x[4*threadIdx.x + l] : 0.0f;
}
float amax = fabsf(vals[0]);
@ -330,7 +284,7 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
}
yq32[threadIdx.x] = q32;
if (threadIdx.x % QI8_1 == 0) {
if (threadIdx.x % QI8_1 == 0 && (ni == WARP_SIZE || threadIdx.x < ni)) {
if (std::is_same<Tds, half2>::value) {
((half2 *) yds)[threadIdx.x/QI8_1] = make_half2(d, sum);
} else {
@ -339,167 +293,276 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
}
}
typedef half (*dequantize_1_f16_t)(const void *, const int64_t);
typedef float (*dequantize_1_f32_t)(const void *, const int64_t);
typedef void (*dequantize_V_t)(const void *, void *, const int64_t);
template <typename T>
static __device__ __forceinline__ T dequantize_1_q4_0(const void * __restrict__ vx, const int64_t i) {
template <typename T, int ne>
static __device__ __forceinline__ void dequantize_V_f16(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
if constexpr (std::is_same_v<T, half>) {
ggml_cuda_memcpy_1<ne*sizeof(half)>(dst, (const half *) vx + i0);
} else if constexpr (std::is_same_v<T, float>) {
static_assert(ne % 2 == 0, "bad ne");
half2 tmp[ne/2];
ggml_cuda_memcpy_1<ne*sizeof(half)>(tmp, (const half *) vx + i0);
float2 * dst_f2 = (float2 *) dst;
#pragma unroll
for (int l = 0; l < ne/2; ++l) {
dst_f2[l] = __half22float2(tmp[l]);
}
} else {
static_assert(std::is_same_v<T, void>, "unsupported type");
}
}
template <typename T, int ne>
static __device__ __forceinline__ void dequantize_V_q4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
const block_q4_0 * x = (const block_q4_0 *) vx;
const int64_t ib = i / QK4_0;
const int iqs = i % (QK4_0/2);
const int shift = (i % QK4_0) / (QK4_0/2);
const int64_t ib = i0 / QK4_0;
const int iqs = i0 % (QK4_0/2);
const int shift = (i0 % QK4_0) / (QK4_0/2);
const T d = x[ib].d;
const int q0 = x[ib].qs[iqs];
const int q = ((q0 >> (4*shift)) & 0x0F) - 8;
int q;
static_assert(ne == 2 || ne == 4, "bad ne");
ggml_cuda_memcpy_1<ne, 2>(&q, x[ib].qs + iqs);
q >>= 4*shift;
q &= 0x0F0F0F0F;
q = __vsubss4(q, 0x08080808);
const int8_t * q8 = (const int8_t *) &q;
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
return ((half) d)*((half) q);
if constexpr (std::is_same_v<T, half>) {
const half2 d = __half2half2(x[ib].d);
#pragma unroll
for (int l0 = 0; l0 < ne; l0 += 2) {
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]);
}
} else
#endif // FP16_AVAILABLE
if constexpr (std::is_same_v<T, float>) {
const float d = x[ib].d;
return ((float) d)*((float) q);
#pragma unroll
for (int l = 0; l < ne; ++l) {
((float *) dst)[l] = d * q8[l];
}
} else {
static_assert(std::is_same_v<T, void>, "bad type");
}
}
template <typename T>
static __device__ __forceinline__ T dequantize_1_q4_1(const void * __restrict__ vx, const int64_t i) {
template <typename T, int ne>
static __device__ __forceinline__ void dequantize_V_q4_1(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
const block_q4_1 * x = (const block_q4_1 *) vx;
const int64_t ib = i / QK4_1;
const int iqs = i % (QK4_1/2);
const int shift = (i % QK4_1) / (QK4_1/2);
const int64_t ib = i0 / QK4_1;
const int iqs = i0 % (QK4_1/2);
const int shift = (i0 % QK4_1) / (QK4_1/2);
const half2 dm = x[ib].dm;
const int q0 = x[ib].qs[iqs];
const int q = ((q0 >> (4*shift)) & 0x0F);
int q;
static_assert(ne == 2 || ne == 4, "bad ne");
ggml_cuda_memcpy_1<ne>(&q, x[ib].qs + iqs);
q >>= 4*shift;
q &= 0x0F0F0F0F;
const int8_t * q8 = (const int8_t *) &q;
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
return __low2half(dm)*((half) q) + __high2half(dm);
if constexpr (std::is_same_v<T, half>) {
const half2 dm = x[ib].dm;
const half2 d = __half2half2( __low2half(dm));
const half2 m = __half2half2(__high2half(dm));
#pragma unroll
for (int l0 = 0; l0 < ne; l0 += 2) {
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m;
}
} else
#endif // FP16_AVAILABLE
if constexpr (std::is_same_v<T, float>) {
const float2 dm = __half22float2(x[ib].dm);
return __low2float(dm)*((float) q) + __high2float(dm);
#pragma unroll
for (int l = 0; l < ne; ++l) {
((float *) dst)[l] = dm.x * q8[l] + dm.y;
}
} else {
static_assert(std::is_same_v<T, void>, "bad type");
}
}
template <typename T>
static __device__ __forceinline__ T dequantize_1_q5_0(const void * __restrict__ vx, const int64_t i) {
template <typename T, int ne>
static __device__ __forceinline__ void dequantize_V_q5_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
const block_q5_0 * x = (const block_q5_0 *) vx;
const int64_t ib = i / QK5_0;
const int idq = i % QK5_0;
const int iqs = i % (QK5_0/2);
const int shift = (i % QK5_0) / (QK5_0/2);
const int64_t ib = i0 / QK5_0;
const int idq = i0 % QK5_0;
const int iqs = i0 % (QK5_0/2);
const int shift = (i0 % QK5_0) / (QK5_0/2);
const T d = x[ib].d;
const int ql0 = x[ib].qs[iqs];
const int qh0 = get_int_b2(x[ib].qh, 0);
const int ql = ((ql0 >> (4*shift)) & 0x0F);
const int qh = ((qh0 >> idq) << 4) & 0x10;
const int q = (ql | qh) - 16;
int q;
static_assert(ne == 2 || ne == 4, "bad ne");
ggml_cuda_memcpy_1<ne, 2>(&q, x[ib].qs + iqs);
q >>= 4*shift;
q &= 0x0F0F0F0F;
{
int qh;
ggml_cuda_memcpy_1<ne, 2>(&qh, x[ib].qh);
#pragma unroll
for (int l = 0; l < ne; ++l) {
q |= ((qh >> (idq + l)) & 0x00000001) << (8*l + 4);
}
}
q = __vsubss4(q, 0x10101010);
const int8_t * q8 = (const int8_t *) &q;
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
return ((half) d)*((half) q);
if constexpr (std::is_same_v<T, half>) {
const half2 d = __half2half2(x[ib].d);
#pragma unroll
for (int l0 = 0; l0 < ne; l0 += 2) {
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]);
}
} else
#endif // FP16_AVAILABLE
if constexpr (std::is_same_v<T, float>) {
const float d = x[ib].d;
return ((float) d)*((float) q);
#pragma unroll
for (int l = 0; l < ne; ++l) {
((float *) dst)[l] = d * q8[l];
}
} else {
static_assert(std::is_same_v<T, void>, "bad type");
}
}
template <typename T>
static __device__ __forceinline__ T dequantize_1_q5_1(const void * __restrict__ vx, const int64_t i) {
template <typename T, int ne>
static __device__ __forceinline__ void dequantize_V_q5_1(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
const block_q5_1 * x = (const block_q5_1 *) vx;
const int64_t ib = i / QK5_1;
const int idq = i % QK5_1;
const int iqs = i % (QK5_1/2);
const int shift = (i % QK5_1) / (QK5_1/2);
const int64_t ib = i0 / QK5_1;
const int idq = i0 % QK5_1;
const int iqs = i0 % (QK5_1/2);
const int shift = (i0 % QK5_1) / (QK5_1/2);
const half2 dm = x[ib].dm;
const int ql0 = x[ib].qs[iqs];
const int qh0 = get_int_b4(x[ib].qh, 0);
const int ql = ((ql0 >> (4*shift)) & 0x0F);
const int qh = ((qh0 >> idq) << 4) & 0x10;
const int q = (ql | qh);
int q;
static_assert(ne == 2 || ne == 4, "bad ne");
ggml_cuda_memcpy_1<ne>(&q, x[ib].qs + iqs);
q >>= 4*shift;
q &= 0x0F0F0F0F;
{
int qh;
ggml_cuda_memcpy_1<ne>(&qh, x[ib].qh);
#pragma unroll
for (int l = 0; l < ne; ++l) {
q |= ((qh >> (idq + l)) & 0x00000001) << (8*l + 4);
}
}
const int8_t * q8 = (const int8_t *) &q;
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
return __low2half(dm)*((half) q) + __high2half(dm);
if constexpr (std::is_same_v<T, half>) {
const half2 dm = x[ib].dm;
const half2 d = __half2half2( __low2half(dm));
const half2 m = __half2half2(__high2half(dm));
#pragma unroll
for (int l0 = 0; l0 < ne; l0 += 2) {
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m;
}
} else
#endif // FP16_AVAILABLE
if constexpr (std::is_same_v<T, float>) {
const float2 dm = __half22float2(x[ib].dm);
return __low2float(dm)*((float) q) + __high2float(dm);
#pragma unroll
for (int l = 0; l < ne; ++l) {
((float *) dst)[l] = dm.x * q8[l] + dm.y;
}
} else {
static_assert(std::is_same_v<T, void>, "bad type");
}
}
template <typename T>
static __device__ __forceinline__ T dequantize_1_q8_0(const void * __restrict__ vx, const int64_t i) {
template <typename T, int ne>
static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
const block_q8_0 * x = (const block_q8_0 *) vx;
const int64_t ib = i / QK8_0;
const int iqs = i % QK8_0;
const int64_t ib = i0 / QK8_0;
const int iqs = i0 % QK8_0;
const T d = x[ib].d;
const int q = x[ib].qs[iqs];
static_assert(ne % 2 == 0, "bad ne");
int8_t qs[ne];
ggml_cuda_memcpy_1<ne, 2>(qs, x[ib].qs + iqs);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
return ((half) d)*((half) q);
if constexpr (std::is_same<T, half>::value) {
const half2 d = __half2half2(x[ib].d);
#pragma unroll
for (int l0 = 0; l0 < ne; l0 += 2) {
((half2 *) dst)[l0/2] = d * make_half2(qs[l0 + 0], qs[l0 + 1]);
}
} else
#endif // FP16_AVAILABLE
if constexpr (std::is_same<T, float>::value) {
const float d = x[ib].d;
return ((float) d)*((float) q);
#pragma unroll
for (int l = 0; l < ne; ++l) {
((float *) dst)[l] = d * qs[l];
}
} else {
static_assert(std::is_same_v<T, void>, "unsupported type");
}
}
template <typename T>
static __device__ __forceinline__ T dequantize_1_f16(const void * __restrict__ vx, const int64_t i) {
const half * x = (const half *) vx;
return x[i];
template <ggml_type type_K, int D, int nthreads>
constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
if constexpr (type_K == GGML_TYPE_F16) {
return vec_dot_fattn_vec_KQ_f16<D, nthreads>;
} else if constexpr (type_K == GGML_TYPE_Q4_0) {
return vec_dot_fattn_vec_KQ_q4_0<D, nthreads>;
} else if constexpr (type_K == GGML_TYPE_Q4_1) {
return vec_dot_fattn_vec_KQ_q4_1<D, nthreads>;
} else if constexpr (type_K == GGML_TYPE_Q5_0) {
return vec_dot_fattn_vec_KQ_q5_0<D, nthreads>;
} else if constexpr (type_K == GGML_TYPE_Q5_1) {
return vec_dot_fattn_vec_KQ_q5_1<D, nthreads>;
} else if constexpr (type_K == GGML_TYPE_Q8_0) {
return vec_dot_fattn_vec_KQ_q8_0<D, nthreads>;
} else {
static_assert(type_K == -1, "bad type");
return nullptr;
}
}
template <int D, int warp_size = WARP_SIZE>
constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) {
return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D, warp_size> :
type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D, warp_size> :
type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D, warp_size> :
type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D, warp_size> :
type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D, warp_size> :
type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D, warp_size> :
nullptr;
template <ggml_type type_V, typename T, int ne>
constexpr __device__ dequantize_V_t get_dequantize_V() {
if constexpr (type_V == GGML_TYPE_F16) {
return dequantize_V_f16<T, ne>;
} else if constexpr (type_V == GGML_TYPE_Q4_0) {
return dequantize_V_q4_0<T, ne>;
} else if constexpr (type_V == GGML_TYPE_Q4_1) {
return dequantize_V_q4_1<T, ne>;
} else if constexpr (type_V == GGML_TYPE_Q5_0) {
return dequantize_V_q5_0<T, ne>;
} else if constexpr (type_V == GGML_TYPE_Q5_1) {
return dequantize_V_q5_1<T, ne>;
} else if constexpr (type_V == GGML_TYPE_Q8_0) {
return dequantize_V_q8_0<T, ne>;
} else {
static_assert(type_V == -1, "bad type");
return nullptr;
}
template <int D, int warp_size = WARP_SIZE>
constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) {
return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D, warp_size> :
type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D, warp_size> :
type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D, warp_size> :
type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D, warp_size> :
type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D, warp_size> :
type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D, warp_size> :
nullptr;
}
constexpr __device__ dequantize_1_f16_t get_dequantize_1_f16(ggml_type type_V) {
return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0<half> :
type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1<half> :
type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0<half> :
type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1<half> :
type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0<half> :
type_V == GGML_TYPE_F16 ? dequantize_1_f16<half> :
nullptr;
}
constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0<float> :
type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1<float> :
type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0<float> :
type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1<float> :
type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0<float> :
type_V == GGML_TYPE_F16 ? dequantize_1_f16<float> :
nullptr;
}
template <int ncols1>
@ -870,7 +933,7 @@ void launch_fattn(
const int efficiency_percent = 100 * nblocks_total / (nwaves*blocks_per_wave);
// Stop trying configurations with more waves if we already have good efficiency to avoid excessive overhead.
if (efficiency_percent_best >= 90 && nwaves > nwaves_best) {
if (efficiency_percent_best >= 95 && nwaves > nwaves_best) {
break;
}

View File

@ -35,7 +35,6 @@ static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int
switch (D) {
case 64:
case 128:
return 128;
case 256:
return ncols <= 16 ? 128 : 64;
default:
@ -86,7 +85,6 @@ static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols
switch (D) {
case 64:
case 128:
return 128;
case 256:
return ncols <= 16 ? 128 : 64;
default:

View File

@ -1,495 +0,0 @@
#include "common.cuh"
#include "fattn-common.cuh"
// Currenlty llvm with the amdgcn target dose not support unrolling loops
// that contain a break that can not be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif // __clang__
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#ifndef GGML_USE_HIP
__launch_bounds__(D, 1)
#endif // GGML_USE_HIP
static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
if (ncols > 1) {
NO_DEVICE_CODE;
return;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
constexpr dequantize_1_f16_t dequantize_1_v = get_dequantize_1_f16(type_V);
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
Q += nb03*sequence + nb02* head + nb01*ic0;
K += nb13*sequence + nb12*(head / gqa_ratio);
V += nb23*sequence + nb22*(head / gqa_ratio);
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float * sinksf = (const float *) (sinks);
const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
const half slopeh = __float2half(slopef);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < D);
__shared__ half KQ[ncols*D];
half2 * KQ2 = (half2 *) KQ;
half kqmax[ncols];
half kqsum[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -HALF_MAX_HALF;
kqsum[j] = 0.0f;
}
__shared__ half kqmax_shared[ncols][WARP_SIZE];
__shared__ half kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__shared__ half maskh_shared[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskh_shared[j*D + tid] = 0.0f;
}
__syncthreads();
// Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers:
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
int Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D/(sizeof(int)*QK8_1)];
half2 Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1];
if (Q_q8_1) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (j0 + nwarps > ncols && j >= ncols) {
break;
}
// Reuse KQ as temporary storage for converting Q to q8_1:
int * tmp_q_i32 = (int *) &KQ[j*D];
half2 * tmp_q_ds = (half2 *) (tmp_q_i32 + D/sizeof(int));
// Set memory to zero if out of bounds:
if (ncols > 2 && ic0 + j >= ne01) {
#pragma unroll
for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
tmp_q_i32[i] = 0;
}
if (threadIdx.x < D/QK8_1) {
tmp_q_ds[threadIdx.x] = make_half2(0.0f, 0.0f);
}
continue;
}
const float * Q_f = (const float *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
quantize_q8_1_to_shared<half2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
int * tmp_q_i32 = (int *) &KQ[j*D];
half2 * tmp_q_ds = (half2 *) (tmp_q_i32 + D/sizeof(int));
#pragma unroll
for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];
Q_ds[j][i0/WARP_SIZE] = tmp_q_ds[i/QI8_1];
}
}
__syncthreads();
} else {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
const float2 * Q_f2_j = (const float2 *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -HALF_MAX_HALF;
}
__syncthreads();
half2 VKQ[ncols] = {{0.0f, 0.0f}};
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
K += blockIdx.y*D * nb11;
V += blockIdx.y*D * nb21;
maskh += blockIdx.y*D;
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*D,
// Increment pointers after each loop:
K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
if (mask) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + tid];
}
__syncthreads();
}
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
half kqmax_new = kqmax[0];
half kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
break;
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half sum = vec_dot_KQ(K + i_KQ*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
sum = warp_reduce_sum((float)sum);
if (use_logit_softcap) {
sum = logit_softcap*tanhf(sum);
}
sum += maskh_shared[j*D + i_KQ];
if (ncols == 1) {
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
} else {
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
}
if (threadIdx.x == 0) {
KQ[j*D + i_KQ] = sum;
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= __half2half2(KQ_max_scale);
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < D; k0 += 2) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
break;
}
half2 V_k;
reinterpret_cast<half&>(V_k.x) = dequantize_1_v(V + (k0 + 0)*nb21, tid);
reinterpret_cast<half&>(V_k.y) = dequantize_1_v(V + (k0 + 1)*nb21, tid);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
}
}
__syncthreads();
}
if (sinksf && blockIdx.y == 0) {
const half sink = __float2half(sinksf[head]);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = fmaxf(kqmax[j], sink);
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const half val = hexp(sink - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale;
if (tid == 0) {
kqsum[j] += val;
}
VKQ[j] *= __half2half2(KQ_max_scale);
}
__syncthreads();
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum((float)kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
break;
}
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]);
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
if (gridDim.y == 1) {
dst_val /= kqsum[j_VKQ];
}
dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + tid] = dst_val;
}
if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif // __clang__
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, type_K, type_V, use_logit_softcap>;
constexpr bool need_f16_K = D != 128;
constexpr bool need_f16_V = D != 128 && D != 64;
constexpr size_t nbytes_shared = 0;
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
}
template <int D, ggml_type type_K, ggml_type type_V>
void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const int32_t precision = KQV->op_params[3];
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
GGML_ASSERT(K->type == type_K);
GGML_ASSERT(V->type == type_V);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
constexpr int cols_per_block = 1;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 8;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
}
#define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V) \
template void ggml_cuda_flash_attn_ext_vec_f16_case \
<D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);

View File

@ -1,486 +0,0 @@
#include "common.cuh"
#include "fattn-common.cuh"
// Currenlty llvm with the amdgcn target dose not support unrolling loops
// that contain a break that can not be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif // __clang__
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#ifndef GGML_USE_HIP
__launch_bounds__(D, 1)
#endif // GGML_USE_HIP
static __global__ void flash_attn_vec_ext_f32(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#ifdef FLASH_ATTN_AVAILABLE
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
return;
}
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
if (ncols > 1) {
NO_DEVICE_CODE;
return;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32<D>(type_K);
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
constexpr dequantize_1_f32_t dequantize_1_v = get_dequantize_1_f32(type_V);
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
Q += nb03*sequence + nb02* head + nb01*ic0;
K += nb13*sequence + nb12*(head / gqa_ratio);
V += nb23*sequence + nb22*(head / gqa_ratio);
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float * sinksf = (const float *) (sinks);
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < D);
__shared__ float KQ[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -FLT_MAX/2.0f;
}
float kqmax[ncols];
float kqsum[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -FLT_MAX/2.0f;
kqsum[j] = 0.0f;
}
__shared__ float kqmax_shared[ncols][WARP_SIZE];
__shared__ float kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__shared__ float maskf_shared[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskf_shared[j*D + tid] = 0.0f;
}
__syncthreads();
// Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
float2 Q_f2[ncols][D/(2*WARP_SIZE)];
int Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D >= D/(sizeof(int)*QK8_1)];
float2 Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1];
if (Q_q8_1) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (j0 + nwarps > ncols && j >= ncols) {
break;
}
// Reuse KQ as temporary storage for converting Q to q8_1:
int * tmp_q_i32 = (int *) &KQ[j*D];
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
// Set memory to zero if out of bounds:
if (ncols > 2 && ic0 + j >= ne01) {
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
tmp_q_i32[i] = 0;
}
if (threadIdx.x < D/QK8_1) {
tmp_q_ds[threadIdx.x] = make_float2(0.0f, 0.0f);
}
continue;
}
const float * Q_f = (const float *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
quantize_q8_1_to_shared<float2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
int * tmp_q_i32 = (int *) &KQ[j*D];
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];
Q_ds[j][i0/WARP_SIZE] = tmp_q_ds[i/QI8_1];
}
}
__syncthreads();
} else {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
const float2 * Q_f2_j = (const float2 *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
Q_f2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
Q_f2[j][i0/WARP_SIZE].x *= scale;
Q_f2[j][i0/WARP_SIZE].y *= scale;
}
}
}
float VKQ[ncols] = {0.0f};
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
K += blockIdx.y*D * nb11;
V += blockIdx.y*D * nb21;
maskh += blockIdx.y*D;
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*D,
// Increment pointers after each loop:
K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
if (mask) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + tid]);
}
__syncthreads();
}
float kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
break;
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float sum = vec_dot_KQ(K + i_KQ*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
sum = warp_reduce_sum(sum);
if (use_logit_softcap) {
sum = logit_softcap*tanhf(sum);
}
sum += maskf_shared[j*D + i_KQ];
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
if (threadIdx.x == 0) {
KQ[j*D + i_KQ] = sum;
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_new_arr[j];
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const float val = expf(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= KQ_max_scale;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < D; ++k) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
break;
}
const float V_ki = dequantize_1_v(V + k*nb21, tid);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_ki*KQ[j*D + k];
}
}
__syncthreads();
}
if (sinksf && blockIdx.y == 0) {
const float sink = sinksf[head];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = fmaxf(kqmax[j], sink);
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const float val = expf(sink - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale;
if (tid == 0) {
kqsum[j] += val;
}
VKQ[j] *= KQ_max_scale;
}
__syncthreads();
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum(kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
break;
}
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
float dst_val = VKQ[j_VKQ];
if (gridDim.y == 1) {
dst_val /= kqsum[j_VKQ];
}
dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + tid] = dst_val;
}
if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif // __clang__
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, type_K, type_V, use_logit_softcap>;
constexpr bool need_f16_K = D != 128;
constexpr bool need_f16_V = D != 128 && D != 64;
constexpr size_t nbytes_shared = 0;
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
}
template <int D, ggml_type type_K, ggml_type type_V>
void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
GGML_ASSERT(K->type == type_K);
GGML_ASSERT(V->type == type_V);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
constexpr int cols_per_block = 1;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 8;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
}
#define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V) \
template void ggml_cuda_flash_attn_ext_vec_f32_case \
<D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);

View File

@ -0,0 +1,593 @@
#include "common.cuh"
#include "fattn-common.cuh"
static int ggml_cuda_fattn_vec_get_nthreads_host(const int cc) {
return 128;
GGML_UNUSED(cc);
}
static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
return 128;
}
// Currenlty llvm with the amdgcn target dose not support unrolling loops
// that contain a break that can not be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif // __clang__
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
__launch_bounds__(ggml_cuda_fattn_vec_get_nthreads_device(), 1)
static __global__ void flash_attn_ext_vec(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#ifdef FLASH_ATTN_AVAILABLE
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
constexpr int cpy_ne = cpy_nb / 4;
#ifdef GGML_USE_HIP
#ifdef RDNA
constexpr int nthreads_KQ_q = 2;
#else
constexpr int nthreads_KQ_q = 4;
#endif // RDNA
constexpr int nthreads_V_q = (D/4 < 32 ? D/4 : 32);
#else
constexpr int nthreads_KQ_q = (D/4 < 32 ? D/4 : 32);
constexpr int nthreads_V_q = (D/4 < 32 ? D/4 : 32);
#endif // GGML_USE_HIP
constexpr int nthreads = ggml_cuda_fattn_vec_get_nthreads_device();
constexpr int nthreads_KQ = type_K == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_KQ_q;
constexpr int nthreads_V = type_V == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_V_q;
static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K");
static_assert(WARP_SIZE % nthreads_V == 0, "bad nthreads_V");
constexpr int V_rows_per_thread = type_V == GGML_TYPE_F16 ? 2*cpy_ne : 4;
constexpr int V_cols_per_iter = WARP_SIZE / nthreads_V;
constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ<type_K, D, nthreads_KQ>();
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
#ifdef FAST_FP16_AVAILABLE
constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, half, V_rows_per_thread>();
#else
constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, float, V_rows_per_thread>();
#endif // FAST_FP16_AVAILABLE
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
Q += nb03*sequence + nb02* head + nb01*ic0;
K += nb13*sequence + nb12*(head / gqa_ratio);
V += nb23*sequence + nb22*(head / gqa_ratio);
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = nthreads / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < nthreads);
constexpr int ne_KQ = ncols*D;
constexpr int ne_combine = nwarps*V_cols_per_iter*D;
#ifdef FAST_FP16_AVAILABLE
half2 VKQ[ncols][(D/2)/nthreads_V] = {{{0.0f, 0.0f}}};
__shared__ half KQ[ne_KQ > ne_combine ? ne_KQ : ne_combine];
#else
float2 VKQ[ncols][(D/2)/nthreads_V] = {{{0.0f, 0.0f}}};
__shared__ float KQ[ne_KQ > ne_combine ? ne_KQ : ne_combine];
#endif // FAST_FP16_AVAILABLE
float KQ_max[ncols];
float KQ_sum[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ_max[j] = -FLT_MAX/2.0f;
KQ_sum[j] = 0.0f;
}
// Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
#ifdef FAST_FP16_AVAILABLE
half2 Q_reg[ncols][(D/2)/nthreads_KQ]; // Will be initialized completely.
#else
float2 Q_reg[ncols][(D/2)/nthreads_KQ] = {{{0.0f, 0.0f}}}; // May be only partially initialized.
#endif // FAST_FP16_AVAILABLE
int Q_i32[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
float2 Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
if constexpr (Q_q8_1) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (j0 + nwarps > ncols && j >= ncols) {
break;
}
// Reuse KQ as temporary storage for converting Q to q8_1:
int * tmp_q_i32 = (int *) &KQ[j*D];
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
// Set memory to zero if out of bounds:
if (ncols > 1 && ic0 + j >= ne01) {
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
if (i0 + WARP_SIZE <= D/sizeof(int) || i < D/sizeof(int)) {
tmp_q_i32[i] = 0;
}
}
if (threadIdx.x < D/QK8_1) {
tmp_q_ds[threadIdx.x] = make_float2(0.0f, 0.0f);
}
} else {
const float * Q_f = (const float *) (Q + j*nb01);
constexpr int nthreads_quantize = D/sizeof(int) < WARP_SIZE ? D/sizeof(int) : WARP_SIZE;
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += nthreads_quantize) {
quantize_q8_1_to_shared<float2, nthreads_quantize>
(Q_f + i0*sizeof(int), scale, tmp_q_i32 + i0, tmp_q_ds + i0/QI8_1);
}
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
int * tmp_q_i32 = (int *) &KQ[j*D];
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += nthreads_KQ) {
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ);
Q_i32[j][i0/nthreads_KQ] = tmp_q_i32[i];
Q_ds[j][i0/nthreads_KQ] = tmp_q_ds[i/QI8_1];
}
}
__syncthreads();
} else {
#ifdef FAST_FP16_AVAILABLE
const half2 scale_h2 = make_half2(scale, scale);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
const float2 * Q_j = (const float2 *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += nthreads_KQ*cpy_ne) {
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;
float2 tmp[cpy_ne] = {{0.0f, 0.0f}};
if (ncols == 1 || ic0 + j < ne01) {
ggml_cuda_memcpy_1<cpy_nb>(tmp, &Q_j[i]);
ggml_cuda_memcpy_1<cpy_nb>(tmp + cpy_ne/2, &Q_j[i + cpy_ne/2]);
}
#pragma unroll
for (int i1 = 0; i1 < cpy_ne; ++i1) {
Q_reg[j][i0/nthreads_KQ + i1] = make_half2(tmp[i1].x, tmp[i1].y);
}
}
#pragma unroll
for (int k = 0; k < (D/2)/nthreads_KQ; ++k) {
Q_reg[j][k] *= scale_h2;
}
}
#else
#pragma unroll
for (int j = 0; j < ncols; ++j) {
const float2 * Q_j = (const float2 *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += nthreads_KQ*cpy_ne) {
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;
if (ncols == 1 || ic0 + j < ne01) {
ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ], &Q_j[i]);
ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ + cpy_ne/2], &Q_j[i + cpy_ne/2]);
}
}
#pragma unroll
for (int k = 0; k < (D/2)/nthreads_KQ; ++k) {
Q_reg[j][k].x *= scale;
Q_reg[j][k].y *= scale;
}
}
#endif // FAST_FP16_AVAILABLE
}
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
K += blockIdx.y*nthreads * nb11;
V += blockIdx.y*nthreads * nb21;
maskh += blockIdx.y*nthreads;
for (int k_VKQ_0 = blockIdx.y*nthreads; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*nthreads,
// Increment pointers after each loop:
K += gridDim.y*nthreads*nb11, V += gridDim.y*nthreads*nb21, maskh += gridDim.y*nthreads) {
// Calculate KQ tile and keep track of new maximum KQ values:
float KQ_reg[ncols]; // KQ in registers.
float KQ_max_new[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ_max_new[j] = KQ_max[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < nthreads_KQ; ++i_KQ_0) {
const int i_KQ = threadIdx.y*WARP_SIZE + (nthreads_KQ == WARP_SIZE ? 0 : (threadIdx.x & ~(nthreads_KQ-1))) + i_KQ_0;
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float sum = vec_dot_KQ(K + i_KQ*nb11, Q_reg[j], Q_i32[j], Q_ds[j]);
sum = warp_reduce_sum<nthreads_KQ>(sum);
if (use_logit_softcap) {
sum = logit_softcap*tanhf(sum);
}
if (mask) {
sum += slope*__half2float(maskh[j*ne11 + i_KQ]);
}
KQ_max_new[j] = fmaxf(KQ_max_new[j], sum);
if ((nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ) == i_KQ_0) {
KQ_reg[j] = sum;
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int offset = nthreads_KQ; offset < WARP_SIZE; offset <<= 1) {
KQ_max_new[j] = fmaxf(KQ_max_new[j], __shfl_xor_sync(0xFFFFFFFF, KQ_max_new[j], offset, WARP_SIZE));
}
const float KQ_max_scale = expf(KQ_max[j] - KQ_max_new[j]);
KQ_max[j] = KQ_max_new[j];
KQ_reg[j] = expf(KQ_reg[j] - KQ_max[j]);
KQ_sum[j] = KQ_sum[j]*KQ_max_scale + KQ_reg[j];
KQ[j*nthreads + tid] = KQ_reg[j];
#ifdef FAST_FP16_AVAILABLE
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j][i_VKQ_0/nthreads_V] *= KQ_max_scale_h2;
}
#else
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j][i_VKQ_0/nthreads_V].x *= KQ_max_scale;
VKQ[j][i_VKQ_0/nthreads_V].y *= KQ_max_scale;
}
#endif // FAST_FP16_AVAILABLE
}
#ifndef GGML_USE_HIP
__syncwarp();
#endif // GGML_USE_HIP
#pragma unroll
for (int k0 = 0; k0 < WARP_SIZE; k0 += V_cols_per_iter) {
const int k = threadIdx.y*WARP_SIZE + k0 + (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V);
#ifdef FAST_FP16_AVAILABLE
half2 KQ_k[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ_k[j] = __half2half2(KQ[j*nthreads + k]);
}
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
half2 tmp[V_rows_per_thread/2];
dequantize_V(V + k*nb21, tmp,
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
#pragma unroll
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1] += tmp[i_VKQ_1]*KQ_k[j];
}
}
}
#else
float KQ_k[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ_k[j] = KQ[j*nthreads + k];
}
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
float2 tmp[V_rows_per_thread/2];
dequantize_V(V + k*nb21, tmp,
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
#pragma unroll
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].x += tmp[i_VKQ_1].x*KQ_k[j];
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].y += tmp[i_VKQ_1].y*KQ_k[j];
}
}
}
#endif // FAST_FP16_AVAILABLE
}
}
if (sinks && blockIdx.y == 0) {
const float sink = ((const float *) sinks)[head];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (j0 + nwarps > ncols && j >= ncols) {
break;
}
const float kqmax_new_j = fmaxf(sink, KQ_max[j]);
const float KQ_max_scale = expf(KQ_max[j] - kqmax_new_j);
KQ_max[j] = kqmax_new_j;
KQ_sum[j] = KQ_sum[j]*KQ_max_scale + (threadIdx.x == 0 ? expf(sink - KQ_max[j]) : 0.0f);
#ifdef FAST_FP16_AVAILABLE
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j][i_VKQ_0/nthreads_V] *= KQ_max_scale_h2;
}
#else
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j][i_VKQ_0/nthreads_V].x *= KQ_max_scale;
VKQ[j][i_VKQ_0/nthreads_V].y *= KQ_max_scale;
}
#endif // FAST_FP16_AVAILABLE
}
}
__shared__ float KQ_max_shared[ncols][WARP_SIZE];
__shared__ float KQ_sum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
KQ_max_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
KQ_sum_shared[j][threadIdx.x] = 0.0f;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.x == 0) {
KQ_max_shared[j][threadIdx.y] = KQ_max[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
if (ncols > 1 && ic0 + j_VKQ >= ne01) {
break;
}
float kqmax_new = KQ_max_shared[j_VKQ][threadIdx.x];
kqmax_new = warp_reduce_max(kqmax_new);
const float kqmax_scale = expf(KQ_max[j_VKQ] - kqmax_new);
KQ_max[j_VKQ] = kqmax_new;
#ifdef FAST_FP16_AVAILABLE
half2 * VKQ_tmp = (half2 *) KQ + threadIdx.y*(V_cols_per_iter*D/2)
+ (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V)*(D/2);
const half2 kqmax_scale_h2 = make_half2(kqmax_scale, kqmax_scale);
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j_VKQ][i_VKQ_0/nthreads_V] *= kqmax_scale_h2;
}
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
const int i_VKQ = i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*(V_rows_per_thread/2);
ggml_cuda_memcpy_1<V_rows_per_thread*sizeof(half)>(VKQ_tmp + i_VKQ, &VKQ[j_VKQ][i_VKQ_0/nthreads_V]);
}
#else
float2 * VKQ_tmp = (float2 *) KQ + threadIdx.y*(V_cols_per_iter*D/2)
+ (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V)*(D/2);
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j_VKQ][i_VKQ_0/nthreads_V].x *= kqmax_scale;
VKQ[j_VKQ][i_VKQ_0/nthreads_V].y *= kqmax_scale;
}
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
const int i_VKQ = i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*(V_rows_per_thread/2);
ggml_cuda_memcpy_1<V_rows_per_thread/2*sizeof(float)>(VKQ_tmp + i_VKQ, &VKQ[j_VKQ][i_VKQ_0/nthreads_V]);
ggml_cuda_memcpy_1<V_rows_per_thread/2*sizeof(float)>(VKQ_tmp + i_VKQ + V_rows_per_thread/4, &VKQ[j_VKQ][i_VKQ_0/nthreads_V + V_rows_per_thread/4]);
}
#endif // FAST_FP16_AVAILABLE
KQ_sum[j_VKQ] *= kqmax_scale;
KQ_sum[j_VKQ] = warp_reduce_sum(KQ_sum[j_VKQ]);
if (threadIdx.x == 0) {
KQ_sum_shared[j_VKQ][threadIdx.y] = KQ_sum[j_VKQ];
}
__syncthreads();
if (nthreads <= D || tid < D) {
KQ_sum[j_VKQ] = KQ_sum_shared[j_VKQ][threadIdx.x];
KQ_sum[j_VKQ] = warp_reduce_sum(KQ_sum[j_VKQ]);
#pragma unroll
for (int i0 = 0; i0 < D; i0 += nthreads) {
float dst_val = 0;
#pragma unroll
for (int w = 0; w < nwarps; ++w) {
#pragma unroll
for (int v = 0; v < V_cols_per_iter; ++v) {
dst_val += float(KQ[w*V_cols_per_iter*D + v*D + i0 + tid]);
}
}
if (gridDim.y == 1) {
dst_val /= KQ_sum[j_VKQ];
}
dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + i0 + tid] = dst_val;
}
}
if (j_VKQ < ncols-1) {
__syncthreads();
}
}
if (gridDim.y != 1 && tid < ncols && (ncols == 1 || ic0 + tid < ne01)) {
dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]);
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif // __clang__
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const int nthreads = ggml_cuda_fattn_vec_get_nthreads_host(cc);
const int nwarps = nthreads / WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_ext_vec<D, cols_per_block, type_K, type_V, use_logit_softcap>;
constexpr bool need_f16_K = false;
constexpr bool need_f16_V = false;
constexpr size_t nbytes_shared = 0;
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
}
template <int D, ggml_type type_K, ggml_type type_V>
void ggml_cuda_flash_attn_ext_vec_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
GGML_ASSERT(K->type == type_K);
GGML_ASSERT(V->type == type_V);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 2;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
}
#define DECL_FATTN_VEC_CASE(D, type_K, type_V) \
template void ggml_cuda_flash_attn_ext_vec_case \
<D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \
#define EXTERN_DECL_FATTN_VEC_CASES(D, type_K) \
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_F16); \
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q4_0); \
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q4_1); \
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_0); \
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_1); \
extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q8_0); \
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_F16)
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_0)
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_1)
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_0)
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_1)
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q8_0)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_F16)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_0)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_1)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_0)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_1)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q8_0)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_F16)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_0)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_1)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)

View File

@ -2,8 +2,7 @@
#include "fattn-common.cuh"
#include "fattn-mma-f16.cuh"
#include "fattn-tile.cuh"
#include "fattn-vec-f16.cuh"
#include "fattn-vec-f32.cuh"
#include "fattn-vec.cuh"
#include "fattn-wmma-f16.cuh"
#include "fattn.cuh"
@ -117,151 +116,68 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
}
}
#define FATTN_VEC_F16_CASE(D, type_K, type_V) \
#define FATTN_VEC_CASE(D, type_K, type_V) \
if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) { \
ggml_cuda_flash_attn_ext_vec_f16_case<D, type_K, type_V>(ctx, dst); \
ggml_cuda_flash_attn_ext_vec_case<D, type_K, type_V>(ctx, dst); \
return; \
} \
static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
#define FATTN_VEC_CASES_ALL_D(type_K, type_V) \
FATTN_VEC_CASE( 64, type_K, type_V) \
FATTN_VEC_CASE(128, type_K, type_V) \
FATTN_VEC_CASE(256, type_K, type_V) \
static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_tensor * Q = dst->src[0];
ggml_tensor * K = dst->src[1];
ggml_tensor * V = dst->src[2];
#ifdef GGML_CUDA_FA_ALL_QUANTS
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16 )
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_F16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_F16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_F16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_F16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_F16)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q4_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q4_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q5_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q5_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q8_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
#else
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
#endif // GGML_CUDA_FA_ALL_QUANTS
GGML_ABORT("fatal error");
}
#define FATTN_VEC_F32_CASE(D, type_K, type_V) \
if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) { \
ggml_cuda_flash_attn_ext_vec_f32_case<D, type_K, type_V>(ctx, dst); \
return; \
} \
static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_tensor * Q = dst->src[0];
ggml_tensor * K = dst->src[1];
ggml_tensor * V = dst->src[2];
#ifdef GGML_CUDA_FA_ALL_QUANTS
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
#else
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_F16)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
#endif // GGML_CUDA_FA_ALL_QUANTS
GGML_ABORT("fatal error");
@ -271,8 +187,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
enum best_fattn_kernel {
BEST_FATTN_KERNEL_NONE = 0,
BEST_FATTN_KERNEL_TILE = 200,
BEST_FATTN_KERNEL_VEC_F32 = 100,
BEST_FATTN_KERNEL_VEC_F16 = 110,
BEST_FATTN_KERNEL_VEC = 100,
BEST_FATTN_KERNEL_WMMA_F16 = 300,
BEST_FATTN_KERNEL_MMA_F16 = 400,
};
@ -283,7 +198,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
return BEST_FATTN_KERNEL_NONE;
#endif// FLASH_ATTN_AVAILABLE
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
@ -293,8 +207,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
const int cc = ggml_cuda_info().devices[device].cc;
const int warp_size = ggml_cuda_info().devices[device].warp_size;
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
switch (K->ne[0]) {
case 64:
@ -343,31 +255,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
#endif // GGML_CUDA_FA_ALL_QUANTS
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
#ifdef GGML_CUDA_FA_ALL_QUANTS
if (K->ne[0] != 128 && K->ne[0] != 64) {
return BEST_FATTN_KERNEL_NONE;
}
#else
if (K->ne[0] != 128) {
return BEST_FATTN_KERNEL_NONE;
}
#endif // GGML_CUDA_FA_ALL_QUANTS
break;
default:
return BEST_FATTN_KERNEL_NONE;
}
switch (V->type) {
case GGML_TYPE_F16:
break;
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
if (K->ne[0] != 128) {
return BEST_FATTN_KERNEL_NONE;
}
break;
default:
return BEST_FATTN_KERNEL_NONE;
@ -377,30 +264,39 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
return BEST_FATTN_KERNEL_NONE;
}
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0;
// If Turing tensor cores available, use them except for some cases with batch size 1:
if (turing_mma_available(cc)) {
const bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask; // The mma-based kernels have GQA-specific optimizations
const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (gqa_ratio > 4 && K->ne[1] >= 8192);
const bool mma_faster_for_bs1 = gqa_opt_applies && !mma_needs_data_conversion &&
(cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000);
if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
return BEST_FATTN_KERNEL_VEC_F16;
best_fattn_kernel best = BEST_FATTN_KERNEL_MMA_F16;
if (can_use_vector_kernel) {
if (K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16) {
if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
best = BEST_FATTN_KERNEL_VEC;
}
return BEST_FATTN_KERNEL_VEC_F32;
} else {
if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
if (Q->ne[1] <= 2) {
best = BEST_FATTN_KERNEL_VEC;
}
} else {
if (Q->ne[1] == 1) {
best = BEST_FATTN_KERNEL_VEC;
}
}
}
if ((gqa_ratio % 2 != 0 || !mask) && Q->ne[1] == 1) {
best = BEST_FATTN_KERNEL_VEC; // GQA-specific optimizations in the mma kernel do not apply.
}
return BEST_FATTN_KERNEL_MMA_F16;
}
// Use kernels specializes for small batch sizes if possible:
if (Q->ne[1] <= 8 && can_use_vector_kernel) {
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
return BEST_FATTN_KERNEL_VEC_F16;
return best;
}
return BEST_FATTN_KERNEL_VEC_F32;
// Use kernels specialized for small batch sizes if possible:
if (Q->ne[1] <= 8 && can_use_vector_kernel) {
return BEST_FATTN_KERNEL_VEC;
}
// For large batch sizes, use the WMMA kernel if possible:
@ -420,11 +316,8 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
case BEST_FATTN_KERNEL_TILE:
ggml_cuda_flash_attn_ext_tile(ctx, dst);
break;
case BEST_FATTN_KERNEL_VEC_F32:
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
break;
case BEST_FATTN_KERNEL_VEC_F16:
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
case BEST_FATTN_KERNEL_VEC:
ggml_cuda_flash_attn_ext_vec(ctx, dst);
break;
case BEST_FATTN_KERNEL_WMMA_F16:
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);

View File

@ -45,6 +45,7 @@
#include "ggml-cuda/sumrows.cuh"
#include "ggml-cuda/mean.cuh"
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/topk-moe.cuh"
#include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/wkv.cuh"
@ -2030,7 +2031,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
const int cc = ggml_cuda_info().devices[id].cc;
const int warp_size = ggml_cuda_info().devices[id].warp_size;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
}
@ -2038,7 +2039,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
const int cc = ggml_cuda_info().devices[ctx.device].cc;
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
}
@ -2110,7 +2111,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
return;
}
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2])) {
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2], /*mul_mat_id=*/true)) {
ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
return;
}
@ -2825,6 +2826,44 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
GGML_ASSERT(unary_ops.size() == num_unary);
#endif
//TODO: remove special case once ggml_can_fuse can handle empty nodes
std::initializer_list<enum ggml_op> topk_moe_ops = ggml_cuda_topk_moe_ops(false);
std::initializer_list<enum ggml_op> topk_moe_ops_with_norm = ggml_cuda_topk_moe_ops(true);
if (ops.size() == topk_moe_ops_with_norm.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops_with_norm.begin())) {
if (node_idx + topk_moe_ops_with_norm.size() > (size_t)cgraph->n_nodes) {
return false;
}
for (size_t i = 0; i < topk_moe_ops_with_norm.size(); i++) {
if (cgraph->nodes[node_idx + i]->op != topk_moe_ops_with_norm.begin()[i]) return false;
}
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx+8];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
return true;
}
}
if (ops.size() == topk_moe_ops.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops.begin())) {
if (node_idx + topk_moe_ops.size() > (size_t)cgraph->n_nodes) {
return false;
}
for (size_t i = 0; i < topk_moe_ops.size(); i++) {
if (cgraph->nodes[node_idx + i]->op != topk_moe_ops.begin()[i]) return false;
}
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx+4];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
return true;
}
}
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
return false;
}
@ -2915,6 +2954,22 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
ggml_tensor * weights = cgraph->nodes[i+8];
ggml_tensor * selected_experts = cgraph->nodes[i+3];
ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ true);
i += 8;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
ggml_tensor * weights = cgraph->nodes[i+4];
ggml_tensor * selected_experts = cgraph->nodes[i+3];
ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ false);
i += 4;
continue;
}
if (node->op == GGML_OP_ADD) {
int n_fuse = 0;
ggml_op ops[8];
@ -3140,7 +3195,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .optimize_graph = */ NULL,
/* .graph_optimize = */ NULL,
};
static ggml_guid_t ggml_backend_cuda_guid() {
@ -3427,7 +3482,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_I64;
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
} break;
case GGML_OP_CPY:
{

View File

@ -84,7 +84,7 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
}
}
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols) {
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols, bool mul_mat_id) {
if (ggml_is_quantized(type)) {
return false;
@ -96,9 +96,19 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
return false;
}
if (mul_mat_id) {
if (type == GGML_TYPE_F32 && src1_ncols > 32) {
return false;
}
if ((type == GGML_TYPE_F16 || type == GGML_TYPE_BF16) && src1_ncols > 64) {
return false;
}
} else {
if (src1_ncols > 16) {
return false;
}
}
switch (type) {
case GGML_TYPE_F32:

View File

@ -9,13 +9,13 @@ using namespace ggml_cuda_mma;
void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols);
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols, bool mul_mat_id);
template <typename T, int rows_per_block, int cols_per_block, int nwarps, bool has_ids>
__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
static __global__ void mul_mat_f(
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
const int ncols, const int nchannels_dst, const int stride_row, const int stride_col_y, const int stride_col_dst,
const int ncols, const int ncols_dst_total, const int nchannels_dst, const int stride_row, const int stride_col_y, const int stride_col_dst,
const int stride_col_id, const int stride_row_id,
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
@ -31,9 +31,20 @@ static __global__ void mul_mat_f(
const int row0 = blockIdx.x * rows_per_block;
const int expert_idx = has_ids ? blockIdx.y : 0;
int expert_idx = 0;
int col_base = 0;
const int channel_dst = has_ids ? 0 : blockIdx.y;
if constexpr (has_ids) {
// experts + tiles of ncols_dst are packed in the y dimension
int col_tiles = (ncols_dst_total + cols_per_block - 1) / cols_per_block;
const int nchannels_x = gridDim.y / col_tiles;
const int tile_idx = blockIdx.y / nchannels_x;
expert_idx = blockIdx.y - tile_idx * nchannels_x;
col_base = tile_idx * cols_per_block;
}
const int channel_x = has_ids ? expert_idx : (channel_dst / channel_ratio);
const int channel_y = channel_dst;
const int sample_dst = blockIdx.z;
@ -44,6 +55,14 @@ static __global__ void mul_mat_f(
y += int64_t(sample_y) *stride_sample_y + (has_ids ? 0 : channel_y *stride_channel_y);
dst += int64_t(sample_dst)*stride_sample_dst + (has_ids ? 0 : channel_dst*stride_channel_dst);
if constexpr (has_ids) {
constexpr int y_stride_scale = std::is_same_v<T, float> ? 1 : 2;
const int64_t col_offset = col_base;
y += col_offset * stride_col_y * y_stride_scale;
dst += col_offset * stride_col_dst;
ids += col_offset * stride_row_id;
}
const float2 * y2 = (const float2 *) y;
extern __shared__ char data_mmv[];
@ -61,12 +80,17 @@ static __global__ void mul_mat_f(
for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
const int j = j0 + threadIdx.y;
const int32_t * __restrict__ id_row = ids + j*stride_row_id;
if (threadIdx.x == 0) {
slot_map[j] = -1;
}
if (col_base + j >= ncols_dst_total) {
continue;
}
const int32_t * __restrict__ id_row = ids + j*stride_row_id;
for (int k = threadIdx.x; k < nchannels_dst; k += warp_size) {
int match = id_row[k*stride_col_id] == expert_idx;
@ -108,7 +132,8 @@ static __global__ void mul_mat_f(
if constexpr (!has_ids) {
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f;
} else {
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[slot_map[j]*stride_channel_y + j*stride_col_y + col] : 0.0f;
const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0;
tile_xy[j0*tile_k_padded + threadIdx.x] = valid ? y[slot_map[j]*stride_channel_y + j*stride_col_y + col] : 0.0f;
}
}
} else if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
@ -120,7 +145,8 @@ static __global__ void mul_mat_f(
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
} else {
float2 tmp = j < cols_per_block && slot_map[j] >= 0 ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0;
float2 tmp = valid ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
}
}
@ -183,14 +209,14 @@ static __global__ void mul_mat_f(
dst[j*stride_col_dst + row0 + threadIdx.x] = sum;
} else {
const int slot = (j < cols_per_block) ? slot_map[j] : -1;
if (slot >= 0) {
if (slot >= 0 && (col_base + j) < ncols_dst_total) {
dst[slot*stride_channel_dst + j*stride_col_dst + row0 + threadIdx.x] = sum;
}
}
}
#else
GGML_UNUSED_VARS(x, y, ids, dst,
ncols, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
ncols, ncols_dst_total, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
@ -201,20 +227,23 @@ static __global__ void mul_mat_f(
template<typename T, int cols_per_block, int nwarps>
static inline void mul_mat_f_switch_ids(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols_x, const int64_t nchannels_dst,
const int64_t ncols_x, const int64_t ncols_dst, const int64_t nchannels_dst,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t stride_col_id, const int64_t stride_row_id,
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared_total, cudaStream_t stream) {
if (ids) {
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, true><<<block_nums, block_dims, nbytes_shared_total, stream>>>
(x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
const int64_t col_tiles = (ncols_dst + cols_per_block - 1) / cols_per_block;
dim3 block_nums_ids = block_nums;
block_nums_ids.y *= col_tiles;
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, true><<<block_nums_ids, block_dims, nbytes_shared_total, stream>>>
(x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} else {
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, false><<<block_nums, block_dims, nbytes_shared_total, stream>>>
(x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
(x, y, ids, dst, ncols_x, cols_per_block, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
}
@ -223,7 +252,8 @@ static inline void mul_mat_f_switch_ids(
template <typename T, int cols_per_block>
void mul_mat_f_cuda(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t stride_col_id, const int64_t stride_row_id,
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
@ -268,49 +298,49 @@ void mul_mat_f_cuda(
switch (nwarps_best) {
case 1: {
mul_mat_f_switch_ids<T, cols_per_block, 1>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 2: {
mul_mat_f_switch_ids<T, cols_per_block, 2>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 3: {
mul_mat_f_switch_ids<T, cols_per_block, 3>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 4: {
mul_mat_f_switch_ids<T, cols_per_block, 4>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 5: {
mul_mat_f_switch_ids<T, cols_per_block, 5>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 6: {
mul_mat_f_switch_ids<T, cols_per_block, 6>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 7: {
mul_mat_f_switch_ids<T, cols_per_block, 7>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 8: {
mul_mat_f_switch_ids<T, cols_per_block, 8>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
@ -332,84 +362,89 @@ static void mul_mat_f_switch_cols_per_block(
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
cudaStream_t stream) {
switch (ncols_dst) {
const int ncols_case = (ids && ncols_dst > 16) ? 16 : ncols_dst;
GGML_ASSERT(ids || ncols_dst <= 16);
switch (ncols_case) {
case 1: {
mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 2: {
mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 3: {
mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 4: {
mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 5: {
mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 6: {
mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 7: {
mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 8: {
mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 9: {
mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 10: {
mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 11: {
mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 12: {
mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 13: {
mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 14: {
mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 15: {
mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 16: {
mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
@ -422,7 +457,7 @@ static void mul_mat_f_switch_cols_per_block(
#define DECL_MMF_CASE_HELPER(T, ncols_dst) \
template void mul_mat_f_cuda<T, ncols_dst>( \
const T * x, const float * y, const int32_t * ids, float * dst, \
const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, \
const int64_t ncols_x, const int64_t nrows_x, int64_t ncols_dst_total, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, \
const int64_t stride_col_id, const int64_t stride_row_id, \
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, \
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,\

View File

@ -81,7 +81,7 @@ static __global__ void mmq_ids_helper(
#pragma unroll
for (int offset = neu_padded; offset < warp_size; offset += neu_padded) {
const int tmp = __shfl_up_sync(0xFFFFFFFF, it_compact_add_self, offset, warp_size);
if (threadIdx.x >= offset) {
if (threadIdx.x >= static_cast<unsigned int>(offset)) {
it_compact_add_lower += tmp;
}
}
@ -110,7 +110,7 @@ static __global__ void mmq_ids_helper(
expert_bounds[expert] = nex_prev;
if (expert < gridDim.x - 1) {
if (expert < static_cast<int>(gridDim.x) - 1) {
return;
}

View File

@ -220,7 +220,7 @@ static __global__ void mul_mat_vec_q(
tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
}
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + int(threadIdx.x) < stride_col_dst)) {
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x];
}
}

View File

@ -1,11 +1,12 @@
#include "pad_reflect_1d.cuh"
static __global__ void pad_reflect_1d_kernel_f32(
static __global__ __launch_bounds__(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1) void
pad_reflect_1d_kernel_f32(
const void * __restrict__ src0,
void * __restrict__ dst,
const int64_t ne0,
const int64_t ne00,
const int64_t ne01,
const uint3 ne01,
const int64_t ne02,
const int64_t ne03,
const int64_t nb00,
@ -18,35 +19,40 @@ static __global__ void pad_reflect_1d_kernel_f32(
const int64_t nb3,
const int p0,
const int p1) {
const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
const int64_t i1 = blockIdx.x;
if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) {
const uint2 div_mod_packed = fast_div_modulo(blockIdx.x, ne01);
const int64_t tile1 = div_mod_packed.y; // i1
const int64_t tile0 = div_mod_packed.x; // nth i0 tile
const int64_t i1 = tile1;
const int64_t i0 = threadIdx.x + tile0 * blockDim.x;
// ne01.z is original value of unpacked ne01 (see init_fastdiv_values in common.cuh)
if (i0 >= ne0 || i1 >= ne01.z || i2 >= ne02 || i3 >= ne03) {
return;
}
const char * src0_ptr = (const char *) src0 + i3 * nb03 + i2 * nb02 + i1 * nb01;
char * dst_ptr = (char *) dst + i3 * nb3 + i2 * nb2 + i1 * nb1;
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
float value;
const int64_t rel_i0 = i0 - p0; // relative i0 in src0
int64_t src_idx;
if (i0 < p0) {
if (rel_i0 < 0) {
// Left padding - reflect
value = *(const float *)(src0_ptr + (p0 - i0) * nb00);
} else if (i0 < ne0 - p1) {
src_idx = -rel_i0;
} else if (rel_i0 < ne00) {
// Middle - copy
value = *(const float *)(src0_ptr + (i0 - p0) * nb00);
src_idx = rel_i0;
} else {
// Right padding - reflect
int64_t src_idx = (ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1;
value = *(const float *)(src0_ptr + src_idx * nb00);
src_idx = 2 * ne00 - 2 - rel_i0;
}
const float value = *(const float *) (src0_ptr + src_idx * nb00);
*(float *) (dst_ptr + i0 * nb0) = value;
}
GGML_UNUSED(p1);
}
void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -62,21 +68,24 @@ void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor *
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const uint3 ne01_packed = init_fastdiv_values(ne01);
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne0 = dst->ne[0];
// sanity: padded length matches
GGML_ASSERT(ne0 == ne00 + p0 + p1);
const dim3 block_dims(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1, 1);
const dim3 grid_dims(ne01, ne02, ne03);
constexpr int64_t bx = CUDA_PAD_REFLECT_1D_BLOCK_SIZE; // threads per block (x)
const int64_t tiles0 = (ne0 + bx - 1) / bx; // number of tiles along i0
// grid.x covers i1 and all tiles of i0: [ne01 * tiles0]
// grid.y covers i2: [ne02]
// grid.z covers i3: [ne03]
const dim3 grid_dims((unsigned) (ne01 * tiles0), (unsigned) ne02, (unsigned) ne03);
const dim3 block_dims((unsigned) bx, 1, 1);
pad_reflect_1d_kernel_f32<<<grid_dims, block_dims, 0, stream>>>(
src0->data, dst->data,
ne0, ne00, ne01, ne02, ne03,
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
p0, p1
);
src0->data, dst->data, ne0, ne00, ne01_packed, ne02, ne03, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], p0, p1);
}

View File

@ -4,9 +4,9 @@
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
// Generic quantized set_rows kernel template
template<typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
template<typename idx_t, typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
static __global__ void k_set_rows_quant(
const float * __restrict__ src0, const int64_t * __restrict__ src1, block_type * __restrict__ dst,
const float * __restrict__ src0, const idx_t * __restrict__ src1, block_type * __restrict__ dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const int64_t s01, const int64_t s02, const int64_t s03,
@ -45,9 +45,9 @@ static __global__ void k_set_rows_quant(
}
// Template dispatch function for quantized set_rows
template<typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
template<typename idx_t, typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
static void set_rows_cuda_quant(
const float * src0_d, const int64_t * src1_d, block_type * dst_d,
const float * src0_d, const idx_t * src1_d, block_type * dst_d,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const size_t nb01, const size_t nb02, const size_t nb03,
@ -64,15 +64,15 @@ static void set_rows_cuda_quant(
const int64_t s01 = nb01/sizeof(float);
const int64_t s02 = nb02/sizeof(float);
const int64_t s03 = nb03/sizeof(float);
const int64_t s10 = nb10/sizeof(int64_t);
const int64_t s11 = nb11/sizeof(int64_t);
const int64_t s12 = nb12/sizeof(int64_t);
const int64_t s10 = nb10/sizeof(idx_t);
const int64_t s11 = nb11/sizeof(idx_t);
const int64_t s12 = nb12/sizeof(idx_t);
const int64_t s1 = nb1;
const int64_t s2 = nb2;
const int64_t s3 = nb3;
if (ne_total > 0) {
k_set_rows_quant<block_type, qk, quantize_func><<<grid_size, block_size, 0, stream>>>(
k_set_rows_quant<idx_t, block_type, qk, quantize_func><<<grid_size, block_size, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
@ -82,9 +82,9 @@ static void set_rows_cuda_quant(
}
}
template<typename src_t, typename dst_t>
template<typename src_t, typename idx_t, typename dst_t>
static __global__ void k_set_rows(
const src_t * __restrict__ src0, const int64_t * __restrict__ src1, dst_t * __restrict__ dst,
const src_t * __restrict__ src0, const idx_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const int64_t s01, const int64_t s02, const int64_t s03,
@ -118,9 +118,9 @@ static __global__ void k_set_rows(
GGML_UNUSED(ne13);
}
template<typename src_t, typename dst_t>
template<typename src_t, typename idx_t, typename dst_t>
static void set_rows_cuda(
const src_t * src0_d, const int64_t * src1_d, dst_t * dst_d,
const src_t * src0_d, const idx_t * src1_d, dst_t * dst_d,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const size_t nb01, const size_t nb02, const size_t nb03,
@ -137,9 +137,9 @@ static void set_rows_cuda(
const int64_t s01 = nb01/sizeof(src_t);
const int64_t s02 = nb02/sizeof(src_t);
const int64_t s03 = nb03/sizeof(src_t);
const int64_t s10 = nb10/sizeof(int64_t);
const int64_t s11 = nb11/sizeof(int64_t);
const int64_t s12 = nb12/sizeof(int64_t);
const int64_t s10 = nb10/sizeof(idx_t);
const int64_t s11 = nb11/sizeof(idx_t);
const int64_t s12 = nb12/sizeof(idx_t);
const int64_t s1 = nb1/sizeof(dst_t);
const int64_t s2 = nb2/sizeof(dst_t);
const int64_t s3 = nb3/sizeof(dst_t);
@ -155,23 +155,16 @@ static void set_rows_cuda(
}
}
void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_I64);
template<typename src_t, typename idx_t>
static void set_rows_cuda(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const src_t * src0_d = (const src_t *)src0->data;
const idx_t * src1_d = (const idx_t *)src1->data;
GGML_TENSOR_BINARY_OP_LOCALS
const float * src0_d = (const float *)src0->data;
const int64_t * src1_d = (const int64_t *)src1->data;
cudaStream_t stream = ctx.stream();
if (dst->type == GGML_TYPE_F32) {
set_rows_cuda(
src0_d, src1_d, (float*)dst->data,
@ -203,7 +196,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
stream
);
} else if (dst->type == GGML_TYPE_Q4_0) {
set_rows_cuda_quant<block_q4_0, QK4_0, quantize_f32_q4_0_block>(
set_rows_cuda_quant<idx_t, block_q4_0, QK4_0, quantize_f32_q4_0_block>(
src0_d, src1_d, (block_q4_0*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
@ -213,7 +206,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
stream
);
} else if (dst->type == GGML_TYPE_Q4_1) {
set_rows_cuda_quant<block_q4_1, QK4_1, quantize_f32_q4_1_block>(
set_rows_cuda_quant<idx_t, block_q4_1, QK4_1, quantize_f32_q4_1_block>(
src0_d, src1_d, (block_q4_1*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
@ -223,7 +216,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
stream
);
} else if (dst->type == GGML_TYPE_Q5_0) {
set_rows_cuda_quant<block_q5_0, QK5_0, quantize_f32_q5_0_block>(
set_rows_cuda_quant<idx_t, block_q5_0, QK5_0, quantize_f32_q5_0_block>(
src0_d, src1_d, (block_q5_0*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
@ -233,7 +226,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
stream
);
} else if (dst->type == GGML_TYPE_Q5_1) {
set_rows_cuda_quant<block_q5_1, QK5_1, quantize_f32_q5_1_block>(
set_rows_cuda_quant<idx_t, block_q5_1, QK5_1, quantize_f32_q5_1_block>(
src0_d, src1_d, (block_q5_1*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
@ -243,7 +236,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
stream
);
} else if (dst->type == GGML_TYPE_Q8_0) {
set_rows_cuda_quant<block_q8_0, QK8_0, quantize_f32_q8_0_block>(
set_rows_cuda_quant<idx_t, block_q8_0, QK8_0, quantize_f32_q8_0_block>(
src0_d, src1_d, (block_q8_0*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
@ -253,7 +246,7 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
stream
);
} else if (dst->type == GGML_TYPE_IQ4_NL) {
set_rows_cuda_quant<block_iq4_nl, QK4_NL, quantize_f32_iq4_nl_block>(
set_rows_cuda_quant<idx_t, block_iq4_nl, QK4_NL, quantize_f32_iq4_nl_block>(
src0_d, src1_d, (block_iq4_nl*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
@ -266,3 +259,18 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ABORT("unsupported type %s", ggml_type_name(dst->type));
}
}
void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_I64 || src1->type == GGML_TYPE_I32);
if (src1->type == GGML_TYPE_I64) {
set_rows_cuda<float, int64_t>(ctx, src0, src1, dst);
} else {
set_rows_cuda<float, int32_t>(ctx, src0, src1, dst);
}
}

View File

@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);

View File

@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);

View File

@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);

View File

@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);

View File

@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);

Some files were not shown because too many files have changed in this diff Show More