Merge remote-tracking branch 'origin' into convert-split

Christian Zhou-Zheng 2024-05-22 20:23:13 -04:00
commit 2dd784108b
133 changed files with 67328 additions and 42384 deletions


@@ -214,7 +214,6 @@ effectiveStdenv.mkDerivation (
(cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
- (cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic)
]
@@ -227,20 +226,20 @@ effectiveStdenv.mkDerivation (
)
]
++ optionals useRocm [
- (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
- (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
- # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
- # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
- # and select the line that matches the current nixpkgs version of rocBLAS.
- # Should likely use `rocmPackages.clr.gpuTargets`.
- "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
+ (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+ (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
]
++ optionals useMetalKit [
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
(cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
];
+ # Environment variables needed for ROCm
+ env = optionals useRocm {
+ ROCM_PATH = "${rocmPackages.clr}";
+ HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+ };
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet.
postInstall = ''

.github/labeler.yml (new file, 73 lines)

@@ -0,0 +1,73 @@
# https://github.com/actions/labeler
SYCL:
- changed-files:
- any-glob-to-any-file:
- ggml-sycl.h
- ggml-sycl.cpp
- README-sycl.md
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:
- ggml-cuda/**
Vulkan:
- changed-files:
- any-glob-to-any-file:
- ggml_vk_generate_shaders.py
- ggml-vulkan*
documentation:
- changed-files:
- any-glob-to-any-file:
- docs/**
- media/**
testing:
- changed-files:
- any-glob-to-any-file:
- tests/**
build:
- changed-files:
- any-glob-to-any-file:
- cmake/**
- CMakeLists.txt
- CMakePresets.json
- codecov.yml
examples:
- changed-files:
- any-glob-to-any-file: examples/**
devops:
- changed-files:
- any-glob-to-any-file:
- .devops/**
- .github/**
- ci/**
python:
- changed-files:
- any-glob-to-any-file:
- "**/*.py"
- requirements/**
- gguf-py/**
- .flake8
script:
- changed-files:
- any-glob-to-any-file:
- scripts/**
android:
- changed-files:
- any-glob-to-any-file:
- examples/llama.android/**
server:
- changed-files:
- any-glob-to-any-file:
- examples/server/**
ggml:
- changed-files:
- any-glob-to-any-file:
- ggml-*.c
- ggml-*.h
- ggml-cuda/**
nix:
- changed-files:
- any-glob-to-any-file:
- "**/*.nix"
- .github/workflows/nix-*.yml
- .devops/nix/nixpkgs-instances.nix


@@ -271,49 +271,15 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
name: llama-bin-ubuntu-x64.zip
- # ubuntu-latest-cmake-sanitizer:
- # runs-on: ubuntu-latest
- #
- # continue-on-error: true
- #
- # strategy:
- # matrix:
- # sanitizer: [ADDRESS, THREAD, UNDEFINED]
- # build_type: [Debug, Release]
- #
- # steps:
- # - name: Clone
- # id: checkout
- # uses: actions/checkout@v4
- #
- # - name: Dependencies
- # id: depends
- # run: |
- # sudo apt-get update
- # sudo apt-get install build-essential
- #
- # - name: Build
- # id: cmake_build
- # run: |
- # mkdir build
- # cd build
- # cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- # cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- #
- # - name: Test
- # id: cmake_test
- # run: |
- # cd build
- # ctest -L main --verbose --timeout 900
- ubuntu-latest-cmake-mpi:
+ ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
continue-on-error: true
strategy:
matrix:
- mpi_library: [mpich, libopenmpi-dev]
+ sanitizer: [ADDRESS, THREAD, UNDEFINED]
+ build_type: [Debug, Release]
steps:
- name: Clone
@@ -324,14 +290,44 @@ jobs:
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential ${{ matrix.mpi_library }}
+ sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
- cmake -DLLAMA_MPI=ON ..
+ cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+ cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose --timeout 900
+ ubuntu-latest-cmake-rpc:
+ runs-on: ubuntu-latest
+ continue-on-error: true
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+ - name: Dependencies
+ id: depends
+ run: |
+ sudo apt-get update
+ sudo apt-get install build-essential
+ - name: Build
+ id: cmake_build
+ run: |
+ mkdir build
+ cd build
+ cmake -DLLAMA_RPC=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -362,6 +358,33 @@ jobs:
cmake -DLLAMA_VULKAN=ON ..
cmake --build . --config Release -j $(nproc)
+ ubuntu-22-cmake-hip:
+ runs-on: ubuntu-22.04
+ container: rocm/dev-ubuntu-22.04:6.0.2
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+ - name: Dependencies
+ id: depends
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
+ - name: Build with native CMake HIP support
+ id: cmake_build
+ run: |
+ cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
+ cmake --build build --config Release -j $(nproc)
+ - name: Build with legacy HIP support
+ id: cmake_build_legacy_hip
+ run: |
+ cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
+ cmake --build build2 --config Release -j $(nproc)
ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04
@@ -663,24 +686,28 @@ jobs:
strategy:
matrix:
include:
- - build: 'noavx'
+ - build: 'rpc-x64'
+ defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
+ - build: 'noavx-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- - build: 'avx2'
+ - build: 'avx2-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'avx'
+ - build: 'avx-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- - build: 'avx512'
+ - build: 'avx512-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'clblast'
+ - build: 'clblast-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- - build: 'openblas'
+ - build: 'openblas-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- - build: 'kompute'
+ - build: 'kompute-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'vulkan'
+ - build: 'vulkan-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'arm64'
- defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+ - build: 'llvm-arm64'
+ defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+ - build: 'msvc-arm64'
+ defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
steps:
- name: Clone
@@ -691,13 +718,13 @@ jobs:
- name: Clone Kompute submodule
id: clone_kompute
- if: ${{ matrix.build == 'kompute' }}
+ if: ${{ matrix.build == 'kompute-x64' }}
run: |
git submodule update --init kompute
- name: Download OpenCL SDK
id: get_opencl
- if: ${{ matrix.build == 'clblast' }}
+ if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl
@@ -705,7 +732,7 @@ jobs:
- name: Download CLBlast
id: get_clblast
- if: ${{ matrix.build == 'clblast' }}
+ if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
@@ -718,7 +745,7 @@ jobs:
- name: Download OpenBLAS
id: get_openblas
- if: ${{ matrix.build == 'openblas' }}
+ if: ${{ matrix.build == 'openblas-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
@@ -731,38 +758,41 @@ jobs:
- name: Install Vulkan SDK
id: get_vulkan
- if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
+ if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+ - name: Install Ninja
+ id: install_ninja
+ run: |
+ choco install ninja
- name: Build
id: cmake_build
run: |
- mkdir build
- cd build
- cmake .. ${{ matrix.defines }}
- cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+ cmake -S . -B build ${{ matrix.defines }}
+ cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add clblast.dll
id: add_clblast_dll
- if: ${{ matrix.build == 'clblast' }}
+ if: ${{ matrix.build == 'clblast-x64' }}
run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll
id: add_libopenblas_dll
- if: ${{ matrix.build == 'openblas' }}
+ if: ${{ matrix.build == 'openblas-x64' }}
run: |
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
- name: Check AVX512F support
id: check_avx512f
- if: ${{ matrix.build == 'avx512' }}
+ if: ${{ matrix.build == 'avx512-x64' }}
continue-on-error: true
run: |
cd build
@@ -776,14 +806,14 @@ jobs:
- name: Test
id: cmake_test
# not all machines have native AVX-512
- if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
+ if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
- name: Test (Intel SDE)
id: cmake_test_sde
- if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+ if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
# for some weird reason windows tar doesn't like sde tar.xz
@@ -811,14 +841,14 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
- 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+ 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
- path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
- name: llama-bin-win-${{ matrix.build }}-x64.zip
+ path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
+ name: llama-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda:
runs-on: windows-latest
@@ -898,9 +928,9 @@ jobs:
shell: bash
env:
- WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
+ WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
+ ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout
@@ -932,6 +962,17 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
+ echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+ cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
+ cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+ cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+ echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload artifacts
@@ -941,6 +982,37 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
+ windows-latest-cmake-hip:
+ runs-on: windows-latest
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+ - name: Install
+ id: depends
+ run: |
+ $ErrorActionPreference = "Stop"
+ write-host "Downloading AMD HIP SDK Installer"
+ Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+ write-host "Installing AMD HIP SDK"
+ Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+ write-host "Completed AMD HIP SDK installation"
+ - name: Verify ROCm
+ id: verify
+ run: |
+ & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+ - name: Build
+ id: cmake_build
+ run: |
+ $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+ $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+ cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
+ cmake --build build --config Release
ios-xcode-build:
runs-on: macos-latest

.github/workflows/labeler.yml (new file, 17 lines)

@@ -0,0 +1,17 @@
name: "Pull Request Labeler"
on:
- pull_request_target
jobs:
labeler:
permissions:
contents: read
pull-requests: write
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
repository: "ggerganov/llama.cpp"
- uses: actions/labeler@v5
with:
configuration-path: '.github/labeler.yml'


@@ -32,10 +32,8 @@ jobs:
strategy:
matrix:
- # TODO: temporary disabled due to linux kernel issues
- #sanitizer: [ADDRESS, THREAD, UNDEFINED]
- sanitizer: [UNDEFINED]
- build_type: [Debug]
+ sanitizer: [ADDRESS, THREAD, UNDEFINED]
+ build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
@@ -102,10 +100,8 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests
id: server_integration_tests id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: |
cd examples/server/tests
PORT=8888 ./tests.sh


@@ -1,29 +0,0 @@
name: Zig CI
on:
pull_request:
push:
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
build:
strategy:
fail-fast: false
matrix:
runs-on: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.runs-on }}
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
- uses: goto-bus-stop/setup-zig@v2
with:
version: 0.11.0
- name: Build Summary
run: zig build --summary all -freference-trace


@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
include(CheckIncludeFileCXX)
@@ -77,6 +77,7 @@ option(LLAMA_AVX2 "llama: enable AVX2"
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
+ option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)
option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
@@ -122,7 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"llama: metal minimum macOS version")
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
- option(LLAMA_MPI "llama: use MPI" OFF)
+ option(LLAMA_RPC "llama: use RPC" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
@@ -133,6 +134,8 @@ set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeli
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
+ option(LLAMA_LASX "llama: enable lasx" ON)
+ option(LLAMA_LSX "llama: enable lsx" ON)
# add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF)
@@ -296,7 +299,7 @@ if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
- if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
+ if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
set(BLA_SIZEOF_INTEGER 8)
endif()
@@ -465,33 +468,15 @@ if (LLAMA_CUDA)
endif()
endif()
- if (LLAMA_MPI)
- cmake_minimum_required(VERSION 3.10)
- find_package(MPI)
- if (MPI_C_FOUND)
- message(STATUS "MPI found")
- set(GGML_HEADERS_MPI ggml-mpi.h)
- set(GGML_SOURCES_MPI ggml-mpi.c)
- add_compile_definitions(GGML_USE_MPI)
- add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
- if (NOT MSVC)
- add_compile_options(-Wno-cast-qual)
- endif()
- set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
- set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
- # Even if you're only using the C header, C++ programs may bring in MPI
- # C++ functions, so more linkage is needed
- if (MPI_CXX_FOUND)
- set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
- endif()
- else()
- message(WARNING "MPI not found")
- endif()
+ if (LLAMA_RPC)
+ add_compile_definitions(GGML_USE_RPC)
+ if (WIN32)
+ set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
+ endif()
+ set(GGML_HEADERS_RPC ggml-rpc.h)
+ set(GGML_SOURCES_RPC ggml-rpc.cpp)
endif()
if (LLAMA_CLBLAST)
@@ -520,6 +505,12 @@ if (LLAMA_VULKAN)
add_compile_definitions(GGML_USE_VULKAN)
+ # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
+ # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
+ if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
+ endif()
if (LLAMA_VULKAN_CHECK_RESULTS)
add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
endif()
@@ -543,16 +534,37 @@ if (LLAMA_VULKAN)
endif()
if (LLAMA_HIPBLAS)
- list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
- if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
- message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
- endif()
- if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
- message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
- endif()
+ if ($ENV{ROCM_PATH})
+ set(ROCM_PATH $ENV{ROCM_PATH})
+ else()
+ set(ROCM_PATH /opt/rocm)
+ endif()
+ list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
+ # CMake on Windows doesn't support the HIP language yet
+ if(WIN32)
+ set(CXX_IS_HIPCC TRUE)
+ else()
+ string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
+ endif()
+ if(CXX_IS_HIPCC)
+ if(LINUX)
+ if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+ message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+ endif()
+ message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
+ " Prefer setting the HIP compiler directly. See README for details.")
+ endif()
+ else()
+ # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
+ if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+ set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
+ endif()
+ cmake_minimum_required(VERSION 3.21)
+ enable_language(HIP)
+ endif()
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
@@ -586,13 +598,18 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
- set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+ if (CXX_IS_HIPCC)
+ set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+ set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device)
+ else()
+ set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
+ endif()
if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
- set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
+ set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
endif()
if (LLAMA_SYCL)
@@ -995,6 +1012,11 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
endif ()
+ check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+ if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+ add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+ endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -1047,6 +1069,10 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
+ if (LLAMA_AVX512_BF16)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+ endif()
elseif (LLAMA_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (LLAMA_AVX)
@@ -1078,6 +1104,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (LLAMA_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
+ if (LLAMA_AVX512_BF16)
+ list(APPEND ARCH_FLAGS -mavx512bf16)
+ endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
@@ -1087,6 +1116,17 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif()
+ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+ message(STATUS "loongarch64 detected")
+ list(APPEND ARCH_FLAGS -march=loongarch64)
+ if (LLAMA_LASX)
+ list(APPEND ARCH_FLAGS -mlasx)
+ endif()
+ if (LLAMA_LSX)
+ list(APPEND ARCH_FLAGS -mlsx)
+ endif()
else()
message(STATUS "Unknown architecture")
endif()
@@ -1175,7 +1215,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
- ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+ ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
@@ -1262,7 +1302,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
- "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
+ "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)
@@ -1281,17 +1321,6 @@ install(
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
- install(
- FILES convert-lora-to-ggml.py
- PERMISSIONS
- OWNER_READ
- OWNER_WRITE
- OWNER_EXECUTE
- GROUP_READ
- GROUP_EXECUTE
- WORLD_READ
- WORLD_EXECUTE
- DESTINATION ${CMAKE_INSTALL_BINDIR})
if (LLAMA_METAL)
install(
FILES ggml-metal.metal

CMakePresets.json (new file, 45 lines)

@@ -0,0 +1,45 @@
{
"version": 4,
"configurePresets": [
{
"name": "base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build-${presetName}",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
{
"name": "arm64-windows-msvc", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
}
},
{
"name": "arm64-windows-llvm", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
}
},
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
]
}
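The file above only defines configure presets, so they are driven through CMake's `--preset` flag. A minimal usage sketch, assuming CMake new enough for presets schema version 4 (3.23+), Ninja on PATH, and the toolchain files referenced by the preset being present in the repository:

```bash
# configure using one of the presets defined in CMakePresets.json
cmake --preset arm64-windows-llvm-release
# build from the preset's binaryDir, which is "${sourceDir}/build-${presetName}"
cmake --build build-arm64-windows-llvm-release
```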


@@ -379,6 +379,11 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
CUDA_POWER_ARCH = 1
endif
+ ifneq ($(filter loongarch64%,$(UNAME_M)),)
+ MK_CFLAGS += -mlasx
+ MK_CXXFLAGS += -mlasx
+ endif
else
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
@@ -399,13 +404,6 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE
- ifdef LLAMA_MPI
- MK_CPPFLAGS += -DGGML_USE_MPI
- MK_CFLAGS += -Wno-cast-qual
- MK_CXXFLAGS += -Wno-cast-qual
- OBJS += ggml-mpi.o
- endif # LLAMA_MPI
ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -560,10 +558,10 @@ endif # LLAMA_VULKAN
ifdef LLAMA_HIPBLAS
ifeq ($(wildcard /opt/rocm),)
ROCM_PATH ?= /usr
- GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+ AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
else
ROCM_PATH ?= /opt/rocm
- GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+ AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
endif
HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
LLAMA_CUDA_DMMV_X ?= 32
@@ -575,7 +573,7 @@ ifdef LLAMA_HIP_UMA
endif # LLAMA_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
- HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
+ HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
@@ -629,11 +627,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
endif
endif # LLAMA_METAL
- ifdef LLAMA_MPI
- ggml-mpi.o: ggml-mpi.c ggml-mpi.h
- $(CC) $(CFLAGS) -c $< -o $@
- endif # LLAMA_MPI
ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@


@@ -107,7 +107,6 @@ Typically finetunes of the base models below are supported as well.
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
@@ -140,6 +139,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
+ - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
**HTTP server**
@@ -300,7 +300,7 @@ cd llama.cpp
### Build
- In order to build llama.cpp you have three different options.
+ In order to build llama.cpp you have four different options.
- Using `make`:
- On Linux or MacOS:
@@ -381,45 +381,6 @@ To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or th
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.
- ### MPI Build
- MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
- First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
- Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
- - Using `make`:
- ```bash
- make CC=mpicc CXX=mpicxx LLAMA_MPI=1
- ```
- - Using `CMake`:
- ```bash
- cmake -S . -B build -DLLAMA_MPI=ON
- ```
- Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
- Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
- Here is an example hostfile:
- ```
- 192.168.0.1:2
- malvolio.local:1
- ```
- The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
- Finally, you're ready to run a computation using `mpirun`:
- ```bash
- mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
- ```
### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
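As a concrete illustration of one of these options, here is a minimal sketch of an OpenBLAS-backed CPU build on Linux. The `LLAMA_BLAS` and `LLAMA_BLAS_VENDOR` flags are the same ones used in the Windows build matrix earlier in this diff; the package name providing the OpenBLAS development files is an assumption for Debian/Ubuntu systems:

```bash
# install OpenBLAS headers and libraries (assumed Debian/Ubuntu package name)
sudo apt-get install libopenblas-dev
# configure and build llama.cpp with the OpenBLAS backend
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release -j $(nproc)
```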
@@ -527,13 +488,28 @@ Building the program with BLAS support may lead to some performance improvements
```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
- CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
- cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+ HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
+ cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
- On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
+ On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+ Note that if you get the following error:
+ ```
+ clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
+ ```
+ Try searching for a directory under `HIP_PATH` that contains the file
+ `oclc_abi_version_400.bc`. Then, add the following to the start of the
+ command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
+ like:
+ ```bash
+ HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
+ HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
+ cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+ && cmake --build build -- -j 16
+ ```
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
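For completeness, a sketch of the equivalent `CMake` invocation with UMA enabled, combining the `-DLLAMA_HIP_UMA=ON` flag with the gfx1030 example above (the same `hipconfig`-based toolchain setup is assumed):

```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build -DLLAMA_HIPBLAS=ON -DLLAMA_HIP_UMA=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -- -j 16
```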
@@ -542,10 +518,8 @@ Building the program with BLAS support may lead to some performance improvements
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
- mkdir build
- cd build
- cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
- cmake --build .
+ cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+ cmake --build build
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
@@ -711,6 +685,9 @@ Building the program with BLAS support may lead to some performance improvements
### Prepare and Quantize
+ > [!NOTE]
+ > You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
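Since the note above points LLaMA 3 users at `convert-hf-to-gguf.py`, here is a minimal sketch of that conversion step; the model directory path is a placeholder, and the script's optional flags are omitted since they vary between versions:

```bash
# convert a downloaded Hugging Face model directory to GGUF (path is hypothetical)
python3 convert-hf-to-gguf.py ./models/Meta-Llama-3-8B
```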

build.zig (deleted, 172 lines)

@@ -1,172 +0,0 @@
// Compatible with Zig Version 0.11.0
const std = @import("std");
const ArrayList = std.ArrayList;
const Compile = std.Build.Step.Compile;
const ConfigHeader = std.Build.Step.ConfigHeader;
const Mode = std.builtin.Mode;
const CrossTarget = std.zig.CrossTarget;
const Maker = struct {
builder: *std.build.Builder,
target: CrossTarget,
optimize: Mode,
enable_lto: bool,
include_dirs: ArrayList([]const u8),
cflags: ArrayList([]const u8),
cxxflags: ArrayList([]const u8),
objs: ArrayList(*Compile),
fn addInclude(m: *Maker, dir: []const u8) !void {
try m.include_dirs.append(dir);
}
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
}
fn addCFlag(m: *Maker, flag: []const u8) !void {
try m.cflags.append(flag);
}
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
try m.cxxflags.append(flag);
}
fn addFlag(m: *Maker, flag: []const u8) !void {
try m.addCFlag(flag);
try m.addCxxFlag(flag);
}
fn init(builder: *std.build.Builder) !Maker {
const target = builder.standardTargetOptions(.{});
const zig_version = @import("builtin").zig_version_string;
const commit_hash = try std.ChildProcess.exec(
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
);
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
\\int LLAMA_BUILD_NUMBER = {};
\\char const *LLAMA_COMMIT = "{s}";
\\char const *LLAMA_COMPILER = "Zig {s}";
\\char const *LLAMA_BUILD_TARGET = "{s}";
\\
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
var m = Maker{
.builder = builder,
.target = target,
.optimize = builder.standardOptimizeOption(.{}),
.enable_lto = false,
.include_dirs = ArrayList([]const u8).init(builder.allocator),
.cflags = ArrayList([]const u8).init(builder.allocator),
.cxxflags = ArrayList([]const u8).init(builder.allocator),
.objs = ArrayList(*Compile).init(builder.allocator),
};
try m.addCFlag("-std=c11");
try m.addCxxFlag("-std=c++11");
try m.addProjectInclude(&.{});
try m.addProjectInclude(&.{"common"});
return m;
}
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
if (o.target.getAbi() != .msvc)
o.defineCMacro("_GNU_SOURCE", null);
if (std.mem.endsWith(u8, src, ".c")) {
o.addCSourceFiles(&.{src}, m.cflags.items);
o.linkLibC();
} else {
o.addCSourceFiles(&.{src}, m.cxxflags.items);
if (o.target.getAbi() == .msvc) {
o.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
o.linkLibCpp();
}
}
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
o.want_lto = m.enable_lto;
return o;
}
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
e.addCSourceFiles(&.{src}, m.cxxflags.items);
for (deps) |d| e.addObject(d);
for (m.objs.items) |o| e.addObject(o);
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
// https://github.com/ziglang/zig/issues/15448
if (e.target.getAbi() == .msvc) {
e.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
e.linkLibCpp();
}
m.builder.installArtifact(e);
e.want_lto = m.enable_lto;
return e;
}
};
pub fn build(b: *std.build.Builder) !void {
var make = try Maker.init(b);
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
const ggml = make.obj("ggml", "ggml.c");
const sgemm = make.obj("sgemm", "sgemm.cpp");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
const unicode = make.obj("unicode", "unicode.cpp");
const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
const llama = make.obj("llama", "llama.cpp");
const buildinfo = make.obj("common", "common/build-info.cpp");
const common = make.obj("common", "common/common.cpp");
const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
for (server_assets) |asset| {
const input_path = b.fmt("examples/server/public/{s}", .{asset});
const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
// Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
defer b.allocator.free(input);
var buf = std.ArrayList(u8).init(b.allocator);
defer buf.deinit();
for (input) |byte| {
try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
}
var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
defer b.allocator.free(name);
std.mem.replaceScalar(u8, name, '.', '_');
try std.fs.cwd().writeFile(output_path, b.fmt(
"unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
.{ name, buf.items, name, input.len },
));
std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
}
}


@@ -365,47 +365,6 @@ function gg_run_open_llama_3b_v2 {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
- # lora
- function compare_ppl {
- qnt="$1"
- ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
- ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
- if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
- printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
- return 20
- fi
- printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
- return 0
- }
- path_lora="../models-mnt/open-llama/3B-v2/lora"
- path_shakespeare="../models-mnt/shakespeare"
- shakespeare="${path_shakespeare}/shakespeare.txt"
- lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
- gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
- gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
- gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
- python3 ../convert-lora-to-ggml.py ${path_lora}
- # f16
- (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
- (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
- compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
- # q8_0
- (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
- (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
- compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
- # q8_0 + f16 lora-base
- (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
- compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
}
@@ -416,7 +375,6 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
- gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -429,11 +387,6 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
- gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
- gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
} }
# open_llama_7b_v2 # open_llama_7b_v2
@ -549,48 +502,6 @@ function gg_run_open_llama_7b_v2 {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/7B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# currently not supported by the CUDA backend
# q8_0
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e set +e
} }
@ -601,7 +512,6 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@ -614,11 +524,6 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
} }
# bge-small # bge-small

View File

@ -0,0 +1,16 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

View File

@ -0,0 +1,6 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

File diff suppressed because it is too large

View File

@ -27,7 +27,7 @@
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do { \ #define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0) } while(0)
@ -35,14 +35,18 @@
// build info // build info
extern int LLAMA_BUILD_NUMBER; extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT; extern char const * LLAMA_COMMIT;
extern char const *LLAMA_COMPILER; extern char const * LLAMA_COMPILER;
extern char const *LLAMA_BUILD_TARGET; extern char const * LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info; struct llama_control_vector_load_info;
int get_math_cpu_count(); //
int32_t get_num_physical_cores(); // CPU utils
//
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
// //
// CLI argument parsing // CLI argument parsing
@ -51,7 +55,7 @@ int32_t get_num_physical_cores();
struct gpt_params { struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = get_math_cpu_count(); int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1; int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1; int32_t n_threads_batch_draft = -1;
@ -82,6 +86,7 @@ struct gpt_params {
float yarn_beta_slow = 1.0f; // YaRN high correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold float defrag_thold = -1.0f; // KV cache defragmentation threshold
std::string rpc_servers = ""; // comma separated list of RPC servers
ggml_backend_sched_eval_callback cb_eval = nullptr; ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr; void * cb_eval_user_data = nullptr;
@ -140,6 +145,7 @@ struct gpt_params {
bool random_prompt = false; // do not randomize prompt if none provided bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode bool interactive = false; // interactive mode
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_all = false; // save user input and generations to prompt cache
@ -177,33 +183,34 @@ struct gpt_params {
void gpt_params_handle_model_default(gpt_params & params); void gpt_params_handle_model_default(gpt_params & params);
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); std::string gpt_params_get_system_info(const gpt_params & params);
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
std::string get_system_info(const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
void process_escapes(std::string& input);
bool validate_file_name(const std::string & filename);
// //
// String utils // String utils
// //
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator); std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str); std::string string_strip(const std::string & str);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type); std::string string_get_sortable_timestamp();
std::string string_random_prompt(std::mt19937 & rng);
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
//
// Filesystem utils
//
bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
// //
// Model utils // Model utils
@ -274,29 +281,15 @@ std::string llama_detokenize_bpe(
// defaults to true when model type is SPM, otherwise false. // defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model); bool llama_should_add_bos_token(const llama_model * model);
//
// YAML utils
//
bool create_directory_with_parents(const std::string & path);
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
// //
// KV cache utils // KV cache utils
// //
// Dump the KV cache view with the number of sequences per cell. // Dump the KV cache view with the number of sequences per cell.
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output). // Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
// //
// Embedding utils // Embedding utils
@ -330,6 +323,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
// //
// Split utils // Split utils
// //
static const char * const LLM_KV_SPLIT_NO = "split.no"; static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count"; static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
//
// YAML utils
//
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
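The common.h hunks above are mostly a rename pass that groups helpers by prefix (cpu_*, string_*, fs_*, yaml_*, llama_kv_cache_*). Below is a hedged sketch of what call sites look like after the rename; the signatures come from the declarations in this diff, while the surrounding main() and its values are illustrative.

// Hedged sketch of post-rename call sites; assumes llama.cpp's common/common.h.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>
#include "common.h"

int main() {
    int32_t n_threads = cpu_get_num_math();            // was get_math_cpu_count()

    std::string ts = string_get_sortable_timestamp();  // was get_sortable_timestamp()
    std::string txt = "hello\\nworld";
    string_process_escapes(txt);                       // was process_escapes()

    if (!fs_validate_filename(ts + ".yml")) {          // was validate_file_name()
        return 1;
    }
    fs_create_directory_with_parents("out");           // was create_directory_with_parents()

    std::vector<float> losses = {0.25f, 0.125f};
    yaml_dump_vector_float(stdout, "losses", losses);  // was dump_vector_float_yaml()

    std::printf("threads: %d\n", n_threads);
    return 0;
}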

View File

@ -26,7 +26,7 @@ namespace grammar_parser {
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size()); uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id)); auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
return result.first->second; return result.first->second;
} }
@ -142,6 +142,9 @@ namespace grammar_parser {
pos++; pos++;
last_sym_start = out_elements.size(); last_sym_start = out_elements.size();
while (*pos != '"') { while (*pos != '"') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos); auto char_pair = parse_char(pos);
pos = char_pair.second; pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@ -156,6 +159,9 @@ namespace grammar_parser {
} }
last_sym_start = out_elements.size(); last_sym_start = out_elements.size();
while (*pos != ']') { while (*pos != ']') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos); auto char_pair = parse_char(pos);
pos = char_pair.second; pos = char_pair.second;
enum llama_gretype type = last_sym_start < out_elements.size() enum llama_gretype type = last_sym_start < out_elements.size()
@ -164,6 +170,9 @@ namespace grammar_parser {
out_elements.push_back({type, char_pair.first}); out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') { if (pos[0] == '-' && pos[1] != ']') {
if (!pos[1]) {
throw std::runtime_error("unexpected end of input");
}
auto endchar_pair = parse_char(pos + 1); auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second; pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
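The added checks above make an unterminated literal or character range raise std::runtime_error("unexpected end of input") instead of scanning past the end of the grammar string. A hedged sketch of how that surfaces to a caller follows; it assumes grammar_parser::parse() from common/grammar-parser.h, which in this codebase catches parser exceptions, prints them, and returns a state with no rules on failure.

// Hedged sketch, not project code: feeding an unterminated literal to the parser
// and detecting the failure from the empty rule set it returns.
#include <cstdio>
#include "grammar-parser.h"

int main() {
    const char * broken = "root ::= \"unterminated";   // closing quote is missing
    grammar_parser::parse_state state = grammar_parser::parse(broken);
    if (state.rules.empty()) {
        std::fprintf(stderr, "grammar rejected (unexpected end of input)\n");
        return 1;
    }
    return 0;
}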

View File

@ -272,7 +272,7 @@ private:
if (literal.empty()) { if (literal.empty()) {
return false; return false;
} }
ret.push_back(std::make_pair(literal, true)); ret.emplace_back(literal, true);
literal.clear(); literal.clear();
return true; return true;
}; };
@ -298,7 +298,7 @@ private:
while (i < length) { while (i < length) {
char c = sub_pattern[i]; char c = sub_pattern[i];
if (c == '.') { if (c == '.') {
seq.push_back(std::make_pair(get_dot(), false)); seq.emplace_back(get_dot(), false);
i++; i++;
} else if (c == '(') { } else if (c == '(') {
i++; i++;
@ -307,7 +307,7 @@ private:
_warnings.push_back("Unsupported pattern syntax"); _warnings.push_back("Unsupported pattern syntax");
} }
} }
seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false)); seq.emplace_back("(" + to_rule(transform()) + ")", false);
} else if (c == ')') { } else if (c == ')') {
i++; i++;
if (start > 0 && sub_pattern[start - 1] != '(') { if (start > 0 && sub_pattern[start - 1] != '(') {
@ -331,9 +331,9 @@ private:
} }
square_brackets += ']'; square_brackets += ']';
i++; i++;
seq.push_back(std::make_pair(square_brackets, false)); seq.emplace_back(square_brackets, false);
} else if (c == '|') { } else if (c == '|') {
seq.push_back(std::make_pair("|", false)); seq.emplace_back("|", false);
i++; i++;
} else if (c == '*' || c == '+' || c == '?') { } else if (c == '*' || c == '+' || c == '?') {
seq.back() = std::make_pair(to_rule(seq.back()) + c, false); seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@ -417,7 +417,7 @@ private:
} }
} }
if (!literal.empty()) { if (!literal.empty()) {
seq.push_back(std::make_pair(literal, true)); seq.emplace_back(literal, true);
} }
} }
} }
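The push_back(std::make_pair(...)) to emplace_back(...) changes above are behavior-preserving: emplace_back forwards its arguments and constructs the pair directly inside the vector instead of building a temporary first. A standalone illustration (not project code):

// Both lines append an equal element; emplace_back skips the temporary pair.
#include <string>
#include <utility>
#include <vector>

int main() {
    std::vector<std::pair<std::string, bool>> seq;
    std::string literal = "abc";

    seq.push_back(std::make_pair(literal, true));  // temporary pair, then move
    seq.emplace_back(literal, true);               // constructed in place

    return seq.size() == 2 ? 0 : 1;
}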

View File

@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else #else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] " #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif #endif
#else #else
#define LOG_FLF_FMT "%s" #define LOG_FLF_FMT "%s"
@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else #else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] " #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif #endif
#else #else
#define LOG_TEE_FLF_FMT "%s" #define LOG_TEE_FLF_FMT "%s"
@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// Main LOG macro. // Main LOG macro.
// behaves like printf, and supports arguments the exact same way. // behaves like printf, and supports arguments the exact same way.
// //
#ifndef _MSC_VER #if !defined(_MSC_VER) || defined(__clang__)
#define LOG(...) LOG_IMPL(__VA_ARGS__, "") #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else #else
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "") #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// Secondary target can be changed just like LOG_TARGET // Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET // by defining LOG_TEE_TARGET
// //
#ifndef _MSC_VER #if !defined(_MSC_VER) || defined(__clang__)
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else #else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "") #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif #endif
// LOG macro variants with auto endline. // LOG macro variants with auto endline.
#ifndef _MSC_VER #if !defined(_MSC_VER) || defined(__clang__)
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else #else
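Two independent fixes in log.h: the cast makes the argument type agree with the %5ld conversion specifier (__LINE__ expands to an int), and the #if !defined(_MSC_VER) || defined(__clang__) guards let clang-cl take the GNU-style variadic-macro branch. A minimal sketch of the first point, using plain printf rather than the project's macros:

// Not the project's macros: just showing format/argument type agreement.
#include <cstdio>

int main() {
    std::printf("[%24s:%5ld][%24s] log line\n", __FILE__, (long) __LINE__, __func__);
    return 0;
}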

View File

@ -35,7 +35,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
result->prev.resize(params.n_prev); result->prev.resize(params.n_prev);
result->n_considered = 0; result->n_valid = 0;
llama_sampling_set_rng_seed(result, params.seed); llama_sampling_set_rng_seed(result, params.seed);
@ -66,7 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
std::fill(ctx->prev.begin(), ctx->prev.end(), 0); std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
ctx->cur.clear(); ctx->cur.clear();
ctx->n_considered = 0; ctx->n_valid = 0;
} }
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) { void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
std::string result = "CFG -> Penalties "; std::string result = "CFG -> Penalties ";
if (params.mirostat == 0) { if (params.mirostat == 0) {
for (auto sampler_type : params.samplers_sequence) { for (auto sampler_type : params.samplers_sequence) {
const auto sampler_type_name = sampler_type_to_name_string(sampler_type); const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
if (!sampler_type_name.empty()) { if (!sampler_type_name.empty()) {
result += "-> " + sampler_type_name + " "; result += "-> " + sampler_type_name + " ";
} }
@ -137,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
return result; return result;
} }
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
switch (sampler_type) {
case llama_sampler_type::TOP_K: return "top_k";
case llama_sampler_type::TFS_Z: return "tfs_z";
case llama_sampler_type::TYPICAL_P: return "typical_p";
case llama_sampler_type::TOP_P: return "top_p";
case llama_sampler_type::MIN_P: return "min_p";
case llama_sampler_type::TEMPERATURE: return "temperature";
default : return "";
}
}
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
{"top_k", llama_sampler_type::TOP_K},
{"top_p", llama_sampler_type::TOP_P},
{"typical_p", llama_sampler_type::TYPICAL_P},
{"min_p", llama_sampler_type::MIN_P},
{"tfs_z", llama_sampler_type::TFS_Z},
{"temperature", llama_sampler_type::TEMPERATURE}
};
// since sampler names are written in multiple ways
// make it ready for both system names and input names
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
{"top-k", llama_sampler_type::TOP_K},
{"top-p", llama_sampler_type::TOP_P},
{"nucleus", llama_sampler_type::TOP_P},
{"typical-p", llama_sampler_type::TYPICAL_P},
{"typical", llama_sampler_type::TYPICAL_P},
{"min-p", llama_sampler_type::MIN_P},
{"tfs-z", llama_sampler_type::TFS_Z},
{"tfs", llama_sampler_type::TFS_Z},
{"temp", llama_sampler_type::TEMPERATURE}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names.size());
for (const auto & name : names)
{
auto sampler_item = sampler_canonical_name_map.find(name);
if (sampler_item != sampler_canonical_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
else
{
if (allow_alt_names)
{
sampler_item = sampler_alt_name_map.find(name);
if (sampler_item != sampler_alt_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
}
}
}
return sampler_types;
}
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
std::unordered_map<char, llama_sampler_type> sampler_name_map {
{'k', llama_sampler_type::TOP_K},
{'p', llama_sampler_type::TOP_P},
{'y', llama_sampler_type::TYPICAL_P},
{'m', llama_sampler_type::MIN_P},
{'f', llama_sampler_type::TFS_Z},
{'t', llama_sampler_type::TEMPERATURE}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names_string.size());
for (const auto & c : names_string) {
const auto sampler_item = sampler_name_map.find(c);
if (sampler_item != sampler_name_map.end()) {
sampler_types.push_back(sampler_item->second);
}
}
return sampler_types;
}
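The three helpers above (renamed from the sampler_types_from_* / sampler_type_to_name_string family previously in common.h) map user-supplied sampler names onto llama_sampler_type values: canonical names, common alternates when allow_alt_names is true, and single-character shorthand. A hedged usage sketch, with example argument strings:

// Hedged sketch; assumes common/sampling.h from this commit for the declarations.
#include <cstdio>
#include <string>
#include <vector>
#include "sampling.h"

int main() {
    // canonical and alternate spellings ("nucleus" -> top_p, "temp" -> temperature)
    std::vector<std::string> names = { "top_k", "nucleus", "temp" };
    std::vector<llama_sampler_type> by_name = llama_sampling_types_from_names(names, /* allow_alt_names= */ true);

    // single-character shorthand: k = top_k, p = top_p, t = temperature
    std::vector<llama_sampler_type> by_char = llama_sampling_types_from_chars("kpt");

    for (const auto type : by_name) {
        std::printf("%s\n", llama_sampling_type_to_str(type).c_str());
    }
    return (by_name.size() == 3 && by_char.size() == 3) ? 0 : 1;
}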
// no reasons to expose this function in header // no reasons to expose this function in header
static void sampler_queue( static void sampler_queue(
struct llama_context * ctx_main, struct llama_context * ctx_main,
@ -179,7 +260,7 @@ static llama_token llama_sampling_sample_impl(
struct llama_context * ctx_main, struct llama_context * ctx_main,
struct llama_context * ctx_cfg, struct llama_context * ctx_cfg,
const int idx, const int idx,
bool is_resampling) { // Add a parameter to indicate if we are resampling bool is_resampling) {
const llama_sampling_params & params = ctx_sampling->params; const llama_sampling_params & params = ctx_sampling->params;
const float temp = params.temp; const float temp = params.temp;
@ -188,8 +269,8 @@ static llama_token llama_sampling_sample_impl(
const float mirostat_eta = params.mirostat_eta; const float mirostat_eta = params.mirostat_eta;
std::vector<float> original_logits; std::vector<float> original_logits;
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits); auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
if (!is_resampling) { if (ctx_sampling->grammar != NULL && !is_resampling) {
GGML_ASSERT(!original_logits.empty()); GGML_ASSERT(!original_logits.empty());
} }
llama_token id = 0; llama_token id = 0;
@ -252,11 +333,11 @@ static llama_token llama_sampling_sample_impl(
// Restore logits from the copy // Restore logits from the copy
std::copy(original_logits.begin(), original_logits.end(), logits); std::copy(original_logits.begin(), original_logits.end(), logits);
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
} }
} }
ctx_sampling->n_considered = cur_p.size; ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
return id; return id;
} }
@ -285,7 +366,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
// Get a pointer to the logits // Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx); float * logits = llama_get_logits_ith(ctx_main, idx);
if (apply_grammar && original_logits != NULL) { if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL);
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this. // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))}; *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
} }
@ -342,7 +424,7 @@ llama_token llama_sampling_sample(
struct llama_context * ctx_cfg, struct llama_context * ctx_cfg,
const int idx) { const int idx) {
// Call the implementation function with is_resampling set to false by default // Call the implementation function with is_resampling set to false by default
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
} }
llama_token_data_array llama_sampling_prepare( llama_token_data_array llama_sampling_prepare(

View File

@ -81,7 +81,7 @@ struct llama_sampling_context {
// TODO: replace with ring-buffer // TODO: replace with ring-buffer
std::vector<llama_token> prev; std::vector<llama_token> prev;
std::vector<llama_token_data> cur; std::vector<llama_token_data> cur;
size_t n_considered; size_t n_valid; // Number of correct top tokens with correct probabilities.
std::mt19937 rng; std::mt19937 rng;
}; };
@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
// Print sampling order into a string // Print sampling order into a string
std::string llama_sampling_order_print(const llama_sampling_params & params); std::string llama_sampling_order_print(const llama_sampling_params & params);
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
// this is a common sampling function used across the examples for convenience // this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function // it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call // Note: When using multiple sequences, it is the caller's responsibility to call

View File

@ -1380,7 +1380,7 @@ bool consume_common_train_arg(
void finish_processing_train_args(struct train_params_common * params) { void finish_processing_train_args(struct train_params_common * params) {
if (params->escape) { if (params->escape) {
process_escapes(params->sample_start); string_process_escapes(params->sample_start);
} }
} }

View File

@ -20,11 +20,13 @@
# - Update llama.cpp with the new pre-tokenizer if necessary # - Update llama.cpp with the new pre-tokenizer if necessary
# #
# TODO: generate tokenizer tests for llama.cpp # TODO: generate tokenizer tests for llama.cpp
# TODO: automate the update of convert-hf-to-gguf.py
# #
import logging import logging
import os import os
import pathlib
import re
import requests import requests
import sys import sys
import json import json
@ -35,6 +37,7 @@ from transformers import AutoTokenizer
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update") logger = logging.getLogger("convert-hf-to-gguf-update")
sess = requests.Session()
class TOKENIZER_TYPE(IntEnum): class TOKENIZER_TYPE(IntEnum):
@ -69,70 +72,55 @@ models = [
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", }, {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", }, {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
] ]
# make directory "models/tokenizers" if it doesn't exist
if not os.path.exists("models/tokenizers"):
os.makedirs("models/tokenizers")
def download_file_with_auth(url, token, save_path): def download_file_with_auth(url, token, save_path):
headers = {"Authorization": f"Bearer {token}"} headers = {"Authorization": f"Bearer {token}"}
response = requests.get(url, headers=headers) response = sess.get(url, headers=headers)
if response.status_code == 200: response.raise_for_status()
with open(save_path, 'wb') as f: os.makedirs(os.path.dirname(save_path), exist_ok=True)
f.write(response.content) with open(save_path, 'wb') as f:
logger.info(f"File {save_path} downloaded successfully") f.write(response.content)
else: logger.info(f"File {save_path} downloaded successfully")
logger.info(f"Failed to download file. Status code: {response.status_code}")
# download the tokenizer models def download_model(model):
for model in models:
name = model["name"] name = model["name"]
repo = model["repo"] repo = model["repo"]
tokt = model["tokt"] tokt = model["tokt"]
if not os.path.exists(f"models/tokenizers/{name}"): os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
os.makedirs(f"models/tokenizers/{name}")
else:
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
continue
logger.info(f"Downloading {name} to models/tokenizers/{name}")
url = f"{repo}/raw/main/config.json"
save_path = f"models/tokenizers/{name}/config.json"
download_file_with_auth(url, token, save_path)
url = f"{repo}/raw/main/tokenizer.json"
save_path = f"models/tokenizers/{name}/tokenizer.json"
download_file_with_auth(url, token, save_path)
# if downloaded file is less than 1KB, we likely need to download an LFS instead
if os.path.getsize(save_path) < 1024:
# remove the file
os.remove(save_path)
url = f"{repo}/resolve/main/tokenizer.json"
save_path = f"models/tokenizers/{name}/tokenizer.json"
download_file_with_auth(url, token, save_path)
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
if tokt == TOKENIZER_TYPE.SPM: if tokt == TOKENIZER_TYPE.SPM:
url = f"{repo}/resolve/main/tokenizer.model" files.append("tokenizer.model")
save_path = f"models/tokenizers/{name}/tokenizer.model"
download_file_with_auth(url, token, save_path) for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):
logger.info(f"{name}: File {save_path} already exists - skipping")
continue
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
url = f"{repo}/raw/main/tokenizer_config.json"
save_path = f"models/tokenizers/{name}/tokenizer_config.json"
download_file_with_auth(url, token, save_path)
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
# TODO: auto-update convert-hf-to-gguf.py with the generated function
src_ifs = "" src_ifs = ""
for model in models: for model in models:
@ -142,8 +130,17 @@ for model in models:
if tokt == TOKENIZER_TYPE.SPM: if tokt == TOKENIZER_TYPE.SPM:
continue continue
# Skip if the tokenizer folder does not exist or there are other download issues previously
if not os.path.exists(f"models/tokenizers/{name}"):
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
continue
# create the tokenizer # create the tokenizer
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(chktxt) chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest() chkhsh = sha256(str(chktok).encode()).hexdigest()
@ -161,6 +158,8 @@ for model in models:
logger.info("normalizer: " + json.dumps(normalizer, indent=4)) logger.info("normalizer: " + json.dumps(normalizer, indent=4))
pre_tokenizer = cfg["pre_tokenizer"] pre_tokenizer = cfg["pre_tokenizer"]
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
if "ignore_merges" in cfg["model"]:
logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
logger.info("") logger.info("")
@ -210,11 +209,18 @@ src_func = f"""
return res return res
""" """
print(src_func) # noqa: NP100 convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
convert_py = convert_py_pth.read_text()
convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
lambda m: m.group(1) + src_func + m.group(3),
convert_py,
flags=re.DOTALL | re.MULTILINE,
)
logger.info("\n") convert_py_pth.write_text(convert_py)
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
logger.info("\n") logger.info("+++ convert-hf-to-gguf.py was updated")
# generate tests for each tokenizer model # generate tests for each tokenizer model
@ -282,8 +288,17 @@ for model in models:
name = model["name"] name = model["name"]
tokt = model["tokt"] tokt = model["tokt"]
# Skip if the tokenizer folder does not exist or there are other download issues previously
if not os.path.exists(f"models/tokenizers/{name}"):
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
continue
# create the tokenizer # create the tokenizer
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
continue # Skip this model and continue with the next one in the loop
with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f: with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
for text in tests: for text in tests:

460
convert-hf-to-gguf.py Executable file → Normal file
View File

@ -12,8 +12,9 @@ import sys
from enum import IntEnum from enum import IntEnum
from pathlib import Path from pathlib import Path
from hashlib import sha256 from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast, overload from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
import math
import numpy as np import numpy as np
import torch import torch
@ -50,7 +51,6 @@ class Model:
dir_model: Path dir_model: Path
ftype: int ftype: int
fname_out: Path
is_big_endian: bool is_big_endian: bool
endianess: gguf.GGUFEndian endianess: gguf.GGUFEndian
use_temp_file: bool use_temp_file: bool
@ -62,17 +62,17 @@ class Model:
block_count: int block_count: int
tensor_map: gguf.TensorNameMap tensor_map: gguf.TensorNameMap
tensor_names: set[str] | None tensor_names: set[str] | None
fname_out: Path
# subclasses should define this! # subclasses should define this!
model_arch: gguf.MODEL_ARCH model_arch: gguf.MODEL_ARCH
def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
split_arguments: gguf.SplitArguments): split_arguments: gguf.SplitArguments):
if self.__class__ == Model: if type(self) is Model:
raise TypeError(f"{self.__class__.__name__!r} should not be directly instantiated") raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model self.dir_model = dir_model
self.ftype = ftype self.ftype = ftype
self.fname_out = fname_out
self.is_big_endian = is_big_endian self.is_big_endian = is_big_endian
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file self.use_temp_file = use_temp_file
@ -89,6 +89,19 @@ class Model:
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
self.tensor_names = None self.tensor_names = None
if self.ftype == gguf.LlamaFileType.GUESSED:
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
_, first_tensor = next(self.get_tensors())
if first_tensor.dtype == torch.float16:
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_F16
else:
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
ftype_up: str = self.ftype.name.partition("_")[2].upper()
ftype_lw: str = ftype_up.lower()
# allow templating the file name with the output ftype, useful with the "auto" ftype
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
@classmethod @classmethod
def __init_subclass__(cls): def __init_subclass__(cls):
@ -148,14 +161,27 @@ class Model:
raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
name: str = gguf.TENSOR_NAMES[key]
if key not in gguf.MODEL_TENSORS[self.model_arch]: if key not in gguf.MODEL_TENSORS[self.model_arch]:
raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
name: str = gguf.TENSOR_NAMES[key]
if "{bid}" in name: if "{bid}" in name:
assert bid is not None assert bid is not None
name = name.format(bid=bid) name = name.format(bid=bid)
return name + suffix return name + suffix
def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
if key not in gguf.MODEL_TENSORS[self.model_arch]:
return False
key_name: str = gguf.TENSOR_NAMES[key]
if "{bid}" in key_name:
if bid is None:
return False
key_name = key_name.format(bid=bid)
else:
if bid is not None:
return False
return name == (key_name + suffix)
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
if new_name is None: if new_name is None:
@ -245,35 +271,64 @@ class Model:
data: np.ndarray = data # type hint data: np.ndarray = data # type hint
n_dims = len(data.shape) n_dims = len(data.shape)
data_dtype = data.dtype data_dtype = data.dtype
data_qtype: gguf.GGMLQuantizationType | None = None
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# when both are True, f32 should win # when both are True, f32 should win
extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims) extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims) extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
# Most of the codebase that takes in 1D tensors or norms only handles F32 tensors # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
extra_f32 = extra_f32 or n_dims == 1 or new_name.endswith("_norm.weight") # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
extra_f32 = any(cond for cond in (
extra_f32,
n_dims == 1,
new_name.endswith("_norm.weight"),
))
# Some tensor types are always in float32
extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
gguf.MODEL_TENSOR.FFN_GATE_INP,
gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES,
))
# if f16 desired, convert any float32 2-dim weight tensors to float16 # if f16 desired, convert any float32 2-dim weight tensors to float16
extra_f16 = extra_f16 or (name.endswith(".weight") and n_dims >= 2) extra_f16 = any(cond for cond in (
extra_f16,
(name.endswith(".weight") and n_dims >= 2),
))
# when both extra_f32 and extra_f16 are False, convert to float32 by default if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
if self.ftype == 1 and data_dtype == np.float16 and (extra_f32 or not extra_f16): if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
data = data.astype(np.float32) data = gguf.quantize_bf16(data)
assert data.dtype == np.int16
data_qtype = gguf.GGMLQuantizationType.BF16
if self.ftype == 1 and data_dtype == np.float32 and extra_f16 and not extra_f32: elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
data = data.astype(np.float16) data = gguf.quantize_q8_0(data)
assert data.dtype == np.uint8
data_qtype = gguf.GGMLQuantizationType.Q8_0
else: # default to float16 for quantized tensors
if data_dtype != np.float16:
data = data.astype(np.float16)
data_qtype = gguf.GGMLQuantizationType.F16
if data_qtype is None: # by default, convert to float32
if data_dtype != np.float32:
data = data.astype(np.float32)
data_qtype = gguf.GGMLQuantizationType.F32
block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
# reverse shape to make it similar to the internal ggml dimension order # reverse shape to make it similar to the internal ggml dimension order
shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}" shape_str = f"""{{{', '.join(str(n) for n in reversed(
(*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
)}}}"""
# n_dims is implicit in the shape # n_dims is implicit in the shape
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data.dtype}, shape = {shape_str}") logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
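Because quantized tensors are handed to the writer as raw uint8 data, the logged shape rescales the last dimension from bytes back to logical elements with shape[-1] * itemsize // type_size * block_size. A standalone worked example of that arithmetic for Q8_0 (32 int8 weights plus one f16 scale per 34-byte block), written out in C++ rather than taken from the script:

// Worked example only: byte count <-> logical element count for a Q8_0 row.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t block_size = 32;                              // elements per Q8_0 block
    const int64_t type_size  = 34;                              // bytes per Q8_0 block
    const int64_t raw_bytes  = 4096 / block_size * type_size;   // 4096-element row -> 4352 bytes
    const int64_t n_elements = raw_bytes / type_size * block_size;
    std::printf("%lld bytes -> %lld elements\n", (long long) raw_bytes, (long long) n_elements);  // 4352 -> 4096
    return 0;
}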
def write(self): def write(self):
self.write_tensors() self.write_tensors()
@ -351,6 +406,7 @@ class Model:
# NOTE: this function is generated by convert-hf-to-gguf-update.py # NOTE: this function is generated by convert-hf-to-gguf-update.py
# do not modify it manually! # do not modify it manually!
# ref: https://github.com/ggerganov/llama.cpp/pull/6920 # ref: https://github.com/ggerganov/llama.cpp/pull/6920
# Marker: Start get_vocab_base_pre
def get_vocab_base_pre(self, tokenizer) -> str: def get_vocab_base_pre(self, tokenizer) -> str:
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
# is specific for the BPE pre-tokenizer used by the model # is specific for the BPE pre-tokenizer used by the model
@ -394,6 +450,9 @@ class Model:
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
# ref: https://huggingface.co/openai-community/gpt2 # ref: https://huggingface.co/openai-community/gpt2
res = "gpt-2" res = "gpt-2"
if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
# ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
res = "stablelm2"
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact" res = "refact"
@ -407,8 +466,17 @@ class Model:
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
res = "olmo" res = "olmo"
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
# ref: https://huggingface.co/databricks/dbrx-instruct # ref: https://huggingface.co/databricks/dbrx-base
res = "dbrx" res = "dbrx"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
res = "jina-v2-en"
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
res = "jina-v2-es"
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-v2-de"
if res is None: if res is None:
logger.warning("\n") logger.warning("\n")
@ -429,6 +497,7 @@ class Model:
logger.debug(f"chkhsh: {chkhsh}") logger.debug(f"chkhsh: {chkhsh}")
return res return res
# Marker: End get_vocab_base_pre
def _set_vocab_gpt2(self) -> None: def _set_vocab_gpt2(self) -> None:
tokens, toktypes, tokpre = self.get_vocab_base() tokens, toktypes, tokpre = self.get_vocab_base()
@ -466,7 +535,7 @@ class Model:
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
added_vocab = tokenizer.special_tokens added_vocab = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
for i in range(vocab_size): for i in range(vocab_size):
if i not in reverse_vocab: if i not in reverse_vocab:
@ -511,6 +580,10 @@ class Model:
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
for token_id in range(tokenizer.vocab_size()): for token_id in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(token_id) piece = tokenizer.IdToPiece(token_id)
text = piece.encode("utf-8") text = piece.encode("utf-8")
@ -526,21 +599,23 @@ class Model:
elif tokenizer.IsByte(token_id): elif tokenizer.IsByte(token_id):
toktype = SentencePieceTokenTypes.BYTE toktype = SentencePieceTokenTypes.BYTE
tokens.append(text) tokens[token_id] = text
scores.append(score) scores[token_id] = score
toktypes.append(toktype) toktypes[token_id] = toktype
added_tokens_file = self.dir_model / 'added_tokens.json' added_tokens_file = self.dir_model / 'added_tokens.json'
if added_tokens_file.is_file(): if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f: with open(added_tokens_file, "r", encoding="utf-8") as f:
added_tokens_json = json.load(f) added_tokens_json = json.load(f)
for key in added_tokens_json: for key in added_tokens_json:
key = key.encode("utf-8") token_id = added_tokens_json[key]
if key not in tokens: if (token_id >= vocab_size):
tokens.append(key) logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
scores.append(-1000.0) continue
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
tokens[token_id] = key.encode("utf-8")
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if vocab_size > len(tokens): if vocab_size > len(tokens):
pad_count = vocab_size - len(tokens) pad_count = vocab_size - len(tokens)
@ -550,8 +625,6 @@ class Model:
scores.append(-1000.0) scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED) toktypes.append(SentencePieceTokenTypes.UNUSED)
assert len(tokens) == vocab_size
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
@ -786,6 +859,7 @@ class BaichuanModel(Model):
self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_file_type(self.ftype)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear": if self.hparams["rope_scaling"].get("type") == "linear":
@ -908,6 +982,7 @@ class XverseModel(Model):
self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_file_type(self.ftype)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear": if self.hparams["rope_scaling"].get("type") == "linear":
@ -1016,6 +1091,18 @@ class StarCoderModel(Model):
class RefactModel(Model): class RefactModel(Model):
model_arch = gguf.MODEL_ARCH.REFACT model_arch = gguf.MODEL_ARCH.REFACT
def set_vocab(self):
super().set_vocab()
# TODO: how to determine special FIM tokens automatically?
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
special_vocab._set_special_token("prefix", 1)
special_vocab._set_special_token("suffix", 3)
special_vocab._set_special_token("middle", 2)
special_vocab._set_special_token("fsep", 4) # is this correct?
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self): def set_gguf_parameters(self):
hidden_dim = self.hparams["n_embd"] hidden_dim = self.hparams["n_embd"]
inner_dim = 4 * hidden_dim inner_dim = 4 * hidden_dim
@ -1065,45 +1152,6 @@ class RefactModel(Model):
return tensors return tensors
@Model.register("PersimmonForCausalLM")
class PersimmonModel(Model):
model_arch = gguf.MODEL_ARCH.PERSIMMON
def set_gguf_parameters(self):
block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
head_count = self.hparams["num_attention_heads"]
head_count_kv = head_count
hidden_size = self.hparams["hidden_size"]
self.gguf_writer.add_name('persimmon-8b-chat')
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
# NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
# than the head size?
# ref: https://github.com/ggerganov/llama.cpp/pull/4889
# self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
def set_vocab(self):
self._set_vocab_sentencepiece()
# self.gguf_writer.add_bos_token_id(71013)
# self.gguf_writer.add_eos_token_id(71013)
def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
del name, new_name, bid, n_dims # unused
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
return True
@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") @Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
class StableLMModel(Model): class StableLMModel(Model):
model_arch = gguf.MODEL_ARCH.STABLELM model_arch = gguf.MODEL_ARCH.STABLELM
@ -1130,6 +1178,7 @@ class StableLMModel(Model):
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
self.gguf_writer.add_file_type(self.ftype)
_q_norms: list[dict[str, Tensor]] | None = None _q_norms: list[dict[str, Tensor]] | None = None
_k_norms: list[dict[str, Tensor]] | None = None _k_norms: list[dict[str, Tensor]] | None = None
@ -1506,6 +1555,7 @@ class QwenModel(Model):
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)
@Model.register("Qwen2ForCausalLM") @Model.register("Qwen2ForCausalLM")
@ -1694,6 +1744,38 @@ class Phi3MiniModel(Model):
scores[token_id] = -1000.0 scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
for token_id, foken_data in added_tokens_decoder.items():
token_id = int(token_id)
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert tokens[token_id] == token
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
tokenizer_file = self.dir_model / 'tokenizer.json'
if tokenizer_file.is_file():
with open(tokenizer_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
added_tokens = tokenizer_json.get("added_tokens", [])
for foken_data in added_tokens:
token_id = int(foken_data["id"])
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert tokens[token_id] == token
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
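For reference, a sketch of the JSON shapes these two loops consume: `added_tokens_decoder` from `tokenizer_config.json` and `added_tokens` from `tokenizer.json`. The ids and contents below are hypothetical, not taken from any particular Phi-3 tokenizer:

```python
# Hypothetical file contents, for illustration only.
tokenizer_config_json = {
    "added_tokens_decoder": {
        "32000": {"content": "<|endoftext|>", "special": True},
        "32010": {"content": "<|user|>",      "special": True},
    }
}
tokenizer_json = {
    "added_tokens": [
        {"id": 32000, "content": "<|endoftext|>", "special": True},
    ]
}

# Same iteration pattern as above: string keys become integer ids.
for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
    token_id = int(token_id)
    print(token_id, token_data["content"], token_data.get("special", False))
```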
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
@ -1706,23 +1788,59 @@ class Phi3MiniModel(Model):
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
rot_pct = 1.0
n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"]) n_head = self.find_hparam(["num_attention_heads", "n_head"])
n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
rms_eps = self.find_hparam(["rms_norm_eps"]) rms_eps = self.find_hparam(["rms_norm_eps"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
rope_dims = n_embd // n_head
self.gguf_writer.add_name("Phi3") self.gguf_writer.add_name("Phi3")
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) self.gguf_writer.add_context_length(max_pos_embds)
self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_embedding_length(n_embd)
self.gguf_writer.add_feed_forward_length(8192) self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_head_count_kv(n_head) self.gguf_writer.add_head_count_kv(n_head_kv)
self.gguf_writer.add_layer_norm_rms_eps(rms_eps) self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) self.gguf_writer.add_rope_dimension_count(rope_dims)
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
if (rope_scaling is None):
return
scale = max_pos_embds / orig_max_pos_embds
rope_scaling_type = rope_scaling.get('type', '').lower()
if len(rope_scaling_type) == 0:
raise KeyError('Missing the required key rope_scaling.type')
if rope_scaling_type == 'su':
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
elif rope_scaling_type == 'yarn':
attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
else:
raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
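To make the two attention-factor formulas concrete, here is a small worked example with assumed context lengths (131072 extended from 4096, i.e. a 128k-style configuration chosen for illustration, not read from any config):

```python
import math

# Hypothetical long-context configuration.
max_pos_embds      = 131072
orig_max_pos_embds = 4096
scale = max_pos_embds / orig_max_pos_embds   # 32.0

# 'su' scaling
attn_factor_su = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
# 'yarn' scaling
attn_factor_yarn = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0

print(f"su:   {attn_factor_su:.4f}")    # ~1.1902
print(f"yarn: {attn_factor_yarn:.4f}")  # ~1.3466
```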
@Model.register("PlamoForCausalLM") @Model.register("PlamoForCausalLM")
class PlamoModel(Model): class PlamoModel(Model):
@ -1743,6 +1861,7 @@ class PlamoModel(Model):
self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
self.gguf_writer.add_file_type(self.ftype)
def shuffle_attn_q_weight(self, data_torch): def shuffle_attn_q_weight(self, data_torch):
assert data_torch.size() == (5120, 5120) assert data_torch.size() == (5120, 5120)
@ -1922,6 +2041,7 @@ in chat mode so that the conversation can end normally.")
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
self.gguf_writer.add_file_type(self.ftype)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
num_heads = self.hparams["num_attention_heads"] num_heads = self.hparams["num_attention_heads"]
@ -2026,12 +2146,6 @@ class BertModel(Model):
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
del new_name, bid, n_dims # unused
# not used with get_rows, must be F32
return name == "embeddings.token_type_embeddings.weight"
@Model.register("NomicBertModel") @Model.register("NomicBertModel")
class NomicBertModel(BertModel): class NomicBertModel(BertModel):
@ -2280,96 +2394,71 @@ class OlmoModel(Model):
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("JinaBertModel", "JinaBertForMaskedLM")
class JinaBertV2Model(BertModel):
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.intermediate_size = self.hparams["intermediate_size"]
def get_tensors(self):
for name, data in super().get_tensors():
if 'gated_layers' in name:
d1 = data[:self.intermediate_size, :]
name1 = name.replace('gated_layers', 'gated_layers_w')
d2 = data[self.intermediate_size:, :]
name2 = name.replace('gated_layers', 'gated_layers_v')
yield name1, d1
yield name2, d2
continue
yield name, data
def set_vocab(self, *args, **kwargs):
tokenizer_class = 'BertTokenizer'
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_class = json.load(f)['tokenizer_class']
if tokenizer_class == 'BertTokenizer':
super().set_vocab()
elif tokenizer_class == 'RobertaTokenizer':
self._set_vocab_gpt2()
self.gguf_writer.add_token_type_count(2)
else:
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(True)
###### CONVERSION LOGIC ###### ###### CONVERSION LOGIC ######
# tree of lazy tensors # tree of lazy tensors
class LazyTorchTensor: class LazyTorchTensor(gguf.LazyBase):
_meta: Tensor _tensor_type = torch.Tensor
_data: Tensor | None # to keep the type-checker happy
_args: tuple dtype: torch.dtype
_func: Callable[[tuple], Tensor] | None shape: torch.Size
def __init__(self, *, meta: Tensor, data: Tensor | None = None, args: tuple = (), func: Callable[[tuple], Tensor] | None = None):
self._meta = meta
self._data = data
self._args = args
self._func = func
@staticmethod
def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
# TODO: dict and set
if isinstance(o, (list, tuple)):
L = []
for item in o:
L.append(LazyTorchTensor._recurse_apply(item, fn))
if isinstance(o, tuple):
L = tuple(L)
return L
elif isinstance(o, LazyTorchTensor):
return fn(o)
else:
return o
def _wrap_fn(self, fn: Callable, use_self: bool = False) -> Callable[[Any], LazyTorchTensor]:
def wrapped_fn(*args, **kwargs):
if kwargs is None:
kwargs = {}
args = ((self,) if use_self else ()) + args
meta_args = LazyTorchTensor._recurse_apply(args, lambda t: t._meta)
return LazyTorchTensor(meta=fn(*meta_args, **kwargs), args=args, func=lambda a: fn(*a, **kwargs))
return wrapped_fn
def __getattr__(self, __name: str) -> Any:
meta_attr = getattr(self._meta, __name)
if callable(meta_attr):
return self._wrap_fn(getattr(torch.Tensor, __name), use_self=True)
elif isinstance(meta_attr, torch.Tensor):
# for things like self.T
return self._wrap_fn(lambda s: getattr(s, __name))(self)
else:
return meta_attr
# only used when converting a torch.Tensor to a np.ndarray
_dtype_map: dict[torch.dtype, type] = { _dtype_map: dict[torch.dtype, type] = {
torch.float16: np.float16, torch.float16: np.float16,
torch.float32: np.float32, torch.float32: np.float32,
} }
def numpy(self) -> gguf.LazyTensor: def numpy(self) -> gguf.LazyNumpyTensor:
dtype = self._dtype_map[self.dtype] dtype = self._dtype_map[self.dtype]
return gguf.LazyTensor(lambda: LazyTorchTensor.to_eager(self).numpy(), dtype=dtype, shape=self.shape) return gguf.LazyNumpyTensor(
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
lazy=self._lazy,
args=(self,),
func=(lambda s: s[0].numpy())
)
@overload @classmethod
@staticmethod def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
def to_eager(t: Tensor | LazyTorchTensor) -> Tensor: ... return torch.empty(size=shape, dtype=dtype, device="meta")
@overload
@staticmethod
def to_eager(t: tuple) -> tuple: ...
@staticmethod
def to_eager(t: Any) -> Any:
def simple_to_eager(_t: LazyTorchTensor) -> Tensor:
# wake up the lazy tensor
if _t._data is None and _t._func is not None:
# recurse into its arguments
_t._args = LazyTorchTensor.to_eager(_t._args)
_t._data = _t._func(_t._args)
if _t._data is not None:
return _t._data
else:
raise ValueError(f"Could not compute lazy tensor {_t!r} with args {_t._args!r}")
# recurse into lists and/or tuples, keeping their structure
return LazyTorchTensor._recurse_apply(t, simple_to_eager)
@staticmethod
def from_eager(t: Tensor) -> Tensor:
if (t.__class__ == LazyTorchTensor):
return t
return LazyTorchTensor(meta=t.detach().to("meta"), data=t) # type: ignore
@classmethod @classmethod
def __torch_function__(cls, func, types, args=(), kwargs=None): def __torch_function__(cls, func, types, args=(), kwargs=None):
@ -2380,28 +2469,8 @@ class LazyTorchTensor:
if func is torch.Tensor.numpy: if func is torch.Tensor.numpy:
return args[0].numpy() return args[0].numpy()
if func is torch.equal:
eager_args = LazyTorchTensor.to_eager(args)
return func(*eager_args, **kwargs)
return LazyTorchTensor._wrap_fn(args[0], func)(*args, **kwargs) return LazyTorchTensor._wrap_fn(func)(*args, **kwargs)
# special methods bypass __getattr__, so they need to be added manually
# ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
# NOTE: LazyTorchTensor can't be a subclass of Tensor (and then be used
# as self._meta is currently used), because then the following
# operations would by default not be wrapped, and so not propagated
# when the tensor is made eager.
# It's better to get non-silent errors for not-yet-supported operators.
# TODO: add more when needed to avoid clutter, or find a more concise way
def __neg__(self, *args): # mamba
return self._wrap_fn(torch.Tensor.__neg__)(self, *args)
def __add__(self, *args): # gemma
return self._wrap_fn(torch.Tensor.__add__)(self, *args)
def __getitem__(self, *args): # bloom falcon refact internlm2
return self._wrap_fn(torch.Tensor.__getitem__)(self, *args)
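The hand-rolled wrapper above is replaced by subclassing `gguf.LazyBase`, which centralizes the wrapping machinery. The underlying idea stays the same: track shape and dtype on a free "meta" tensor and defer the real computation until it is needed. A toy illustration of that idea (not the gguf-py `LazyBase` implementation):

```python
import torch

# Toy lazy tensor: shape/dtype bookkeeping happens on a zero-cost "meta" tensor,
# while the real computation is captured as a closure and only run on demand.
class ToyLazy:
    def __init__(self, meta, func):
        self._meta = meta          # carries shape and dtype only, no storage
        self._func = func          # how to materialize the real tensor

    @property
    def shape(self):
        return self._meta.shape

    def to_eager(self):
        return self._func()

lazy = ToyLazy(torch.empty((2, 3), device="meta", dtype=torch.float16),
               lambda: torch.ones((2, 3), dtype=torch.float16))
print(lazy.shape)              # torch.Size([2, 3]) -- no data allocated yet
print(lazy.to_eager().sum())   # tensor(6., dtype=torch.float16)
```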
def parse_args() -> argparse.Namespace: def parse_args() -> argparse.Namespace:
@ -2417,11 +2486,11 @@ def parse_args() -> argparse.Namespace:
) )
parser.add_argument( parser.add_argument(
"--outfile", type=Path, "--outfile", type=Path,
help="path to write to; default: based on input", help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
) )
parser.add_argument( parser.add_argument(
"--outtype", type=str, choices=["f32", "f16"], default="f16", "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16", help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
) )
parser.add_argument( parser.add_argument(
"--bigendian", action="store_true", "--bigendian", action="store_true",
@ -2504,15 +2573,18 @@ def main() -> None:
split_arguments = gguf.SplitArguments(args) if args.split else gguf.SplitArguments() split_arguments = gguf.SplitArguments(args) if args.split else gguf.SplitArguments()
ftype_map = { ftype_map = {
"f32": gguf.GGMLQuantizationType.F32, "f32": gguf.LlamaFileType.ALL_F32,
"f16": gguf.GGMLQuantizationType.F16, "f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
"auto": gguf.LlamaFileType.GUESSED,
} }
if args.outfile is not None: if args.outfile is not None:
fname_out = args.outfile fname_out = args.outfile
else: else:
# output in the same directory as the model by default # output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' fname_out = dir_model / 'ggml-model-{ftype}.gguf'
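The new default output name keeps a literal `{ftype}` placeholder that is only filled in once the actual file type is known (which matters for `--outtype auto`). A hypothetical sketch of that substitution, not the converter's own resolution code:

```python
from pathlib import Path

# Assumed example: the placeholder survives until the file type is decided,
# then is formatted into the final name.
fname_out = Path("/models/phi-3") / "ggml-model-{ftype}.gguf"
resolved  = Path(str(fname_out).format(ftype="F16"))   # assumed resolution step
print(resolved)   # /models/phi-3/ggml-model-F16.gguf
```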
logger.info(f"Loading model: {dir_model.name}") logger.info(f"Loading model: {dir_model.name}")
@ -2529,14 +2601,16 @@ def main() -> None:
logger.info("Set model tokenizer") logger.info("Set model tokenizer")
model_instance.set_vocab() model_instance.set_vocab()
model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
if args.vocab_only: if args.vocab_only:
logger.info(f"Exporting model vocab to '{fname_out}'") logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
model_instance.write_vocab() model_instance.write_vocab()
else: else:
logger.info(f"Exporting model to '{fname_out}'") logger.info(f"Exporting model to '{model_instance.fname_out}'")
model_instance.write() model_instance.write()
logger.info(f"Model successfully exported to '{fname_out}'") logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -1,150 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any, BinaryIO, Sequence
import numpy as np
import torch
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("lora-to-gguf")
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", params["r"]))
# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
# but some models ship a float value instead
# let's convert to int, but fail if lossless conversion is not possible
assert (
int(params["lora_alpha"]) == params["lora_alpha"]
), "cannot convert float to int losslessly"
fout.write(struct.pack("i", int(params["lora_alpha"])))
def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
sname = name.encode("utf-8")
fout.write(
struct.pack(
"iii",
len(shape),
len(sname),
NUMPY_TYPE_TO_FTYPE[data_type.name],
)
)
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
fout.seek((fout.tell() + 31) & -32)
if __name__ == '__main__':
if len(sys.argv) < 2:
logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'")
logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
sys.exit(1)
input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
if os.path.exists(input_model):
model = torch.load(input_model, map_location="cpu")
else:
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
model = load_file(input_model, device="cpu")
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
logger.error(f"Error: unsupported architecture {arch_name}")
sys.exit(1)
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
with open(input_json, "r") as f:
params = json.load(f)
if params["peft_type"] != "LORA":
logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1)
if params["fan_in_fan_out"] is True:
logger.error("Error: param fan_in_fan_out is not supported")
sys.exit(1)
if params["bias"] is not None and params["bias"] != "none":
logger.error("Error: param bias is not supported")
sys.exit(1)
# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
logger.error("Error: param modules_to_save is not supported")
sys.exit(1)
with open(output_path, "wb") as fout:
fout.truncate()
write_file_header(fout, params)
for k, v in model.items():
orig_k = k
if k.endswith(".default.weight"):
k = k.replace(".default.weight", ".weight")
if k in ["llama_proj.weight", "llama_proj.bias"]:
continue
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
v = v.float()
t = v.detach().numpy()
prefix = "base_model.model."
if k.startswith(prefix):
k = k[len(prefix) :]
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
if k.endswith(lora_suffixes):
suffix = k[-len(lora_suffixes[0]):]
k = k[: -len(lora_suffixes[0])]
else:
logger.error(f"Error: unrecognized tensor name {orig_k}")
sys.exit(1)
tname = name_map.get_name(k)
if tname is None:
logger.error(f"Error: could not map tensor name {orig_k}")
logger.error(" Note: the arch parameter must be specified if the model is not llama")
sys.exit(1)
if suffix == ".lora_A.weight":
tname += ".weight.loraA"
elif suffix == ".lora_B.weight":
tname += ".weight.loraB"
else:
assert False
logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)
logger.info(f"Converted {input_json} and {input_model} to {output_path}")

View File

@ -1,143 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import argparse
import os
import sys
from pathlib import Path
from pprint import pprint
import torch
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
logger = logging.getLogger("persimmon-to-gguf")
def _flatten_dict(dct, tensors, prefix=None):
assert isinstance(dct, dict)
for key in dct.keys():
new_prefix = prefix + '.' + key if prefix is not None else key
if isinstance(dct[key], torch.Tensor):
tensors[new_prefix] = dct[key]
elif isinstance(dct[key], dict):
_flatten_dict(dct[key], tensors, new_prefix)
else:
raise ValueError(type(dct[key]))
return None
def _get_sentencepiece_tokenizer_info(dir_model: Path):
tokenizer_path = dir_model / 'adept_vocab.model'
logger.info('getting sentencepiece tokenizer from', tokenizer_path)
tokenizer = SentencePieceProcessor(str(tokenizer_path))
logger.info('adding tokens')
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
pass
return tokens, scores, toktypes
def main():
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
sys.path.append(str(args.adept_inference_dir))
persimmon_model = torch.load(args.ckpt_path)
hparams = persimmon_model['args']
pprint(hparams)
tensors: dict[str, torch.Tensor] = {}
_flatten_dict(persimmon_model['model'], tensors, None)
arch = gguf.MODEL_ARCH.PERSIMMON
gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
block_count = hparams.num_layers
head_count = hparams.num_attention_heads
head_count_kv = head_count
ctx_length = hparams.seq_length
hidden_size = hparams.hidden_size
gguf_writer.add_name('persimmon-8b-chat')
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hidden_size)
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
# ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
gguf_writer.add_tokenizer_model('llama')
gguf_writer.add_tokenizer_pre('default')
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
gguf_writer.add_bos_token_id(71013)
gguf_writer.add_eos_token_id(71013)
tensor_map = gguf.get_tensor_name_map(arch, block_count)
logger.info(tensor_map)
for name in tensors.keys():
data_torch = tensors[name]
if name.endswith(".self_attention.rotary_emb.inv_freq"):
continue
old_dtype = data_torch.dtype
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
data = data_torch.to(torch.float32).squeeze().numpy()
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
raise ValueError(f"Can not map tensor '{name}'")
n_dims = len(data.shape)
logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}")
gguf_writer.add_tensor(new_name, data)
logger.info("gguf: write header")
gguf_writer.write_header_to_file()
logger.info("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
logger.info("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
logger.info(f"gguf: model successfully exported to '{args.outfile}'")
if __name__ == '__main__':
main()

181
convert.py Executable file → Normal file
View File

@ -25,6 +25,7 @@ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
# TEMPORARY IMPORT - TODO REMOVE
import importlib import importlib
gguf = importlib.import_module("gguf-py.gguf") gguf = importlib.import_module("gguf-py.gguf")
@ -346,10 +347,47 @@ class Params:
return params return params
@dataclass
class Metadata:
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
url: Optional[str] = None
description: Optional[str] = None
licence: Optional[str] = None
source_url: Optional[str] = None
source_hf_repo: Optional[str] = None
@staticmethod
def load(metadata_path: Path) -> Metadata:
if metadata_path is None or not metadata_path.exists():
return Metadata()
with open(metadata_path, 'r') as file:
data = json.load(file)
# Create a new Metadata instance
metadata = Metadata()
# Assigning values to Metadata attributes if they exist in the JSON file
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata.name = data.get("general.name")
metadata.author = data.get("general.author")
metadata.version = data.get("general.version")
metadata.url = data.get("general.url")
metadata.description = data.get("general.description")
metadata.license = data.get("general.license")
metadata.source_url = data.get("general.source.url")
metadata.source_hf_repo = data.get("general.source.huggingface.repository")
return metadata
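For context, a hypothetical `--metadata` JSON file that this loader would consume; the key names mirror the `general.*` keys read above, and every value is invented for illustration:

```python
import json
from pathlib import Path

example = {
    "general.name": "TinyLLaMA Test",
    "general.author": "Example Org",
    "general.version": "v0.1",
    "general.description": "A small model used to exercise the converter.",
    "general.license": "apache-2.0",
    "general.source.huggingface.repository": "example-org/tinyllama-test",
}

path = Path("metadata.json")           # hypothetical path passed via --metadata
path.write_text(json.dumps(example, indent=2))
data = json.loads(path.read_text())
print(data.get("general.name"), data.get("general.author"))
```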
# #
# vocab # vocab
# #
@runtime_checkable @runtime_checkable
class BaseVocab(Protocol): class BaseVocab(Protocol):
tokenizer_model: ClassVar[str] tokenizer_model: ClassVar[str]
@ -1068,21 +1106,42 @@ class OutputFile:
def __init__(self, fname_out: Path, split_arguments: gguf.SplitArguments, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): def __init__(self, fname_out: Path, split_arguments: gguf.SplitArguments, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], split_arguments, endianess=endianess) self.gguf = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], split_arguments, endianess=endianess)
def add_meta_arch(self, params: Params) -> None: def add_meta_model(self, params: Params, metadata: Metadata) -> None:
# Metadata About The Model And Its Provenence
name = "LLaMA" name = "LLaMA"
if metadata is not None and metadata.name is not None:
# TODO: better logic to determine model name name = metadata.name
if params.n_ctx == 4096:
name = "LLaMA v2"
elif params.path_model is not None: elif params.path_model is not None:
name = str(params.path_model.parent).split('/')[-1] name = params.path_model.name
elif params.n_ctx == 4096:
# Heuristic detection of LLaMA v2 model
name = "LLaMA v2"
self.gguf.add_name (name) self.gguf.add_name(name)
self.gguf.add_vocab_size (params.n_vocab)
self.gguf.add_context_length (params.n_ctx) if metadata is not None:
self.gguf.add_embedding_length (params.n_embd) if metadata.author is not None:
self.gguf.add_block_count (params.n_layer) self.gguf.add_author(metadata.author)
self.gguf.add_feed_forward_length (params.n_ff) if metadata.version is not None:
self.gguf.add_version(metadata.version)
if metadata.url is not None:
self.gguf.add_url(metadata.url)
if metadata.description is not None:
self.gguf.add_description(metadata.description)
if metadata.licence is not None:
self.gguf.add_licence(metadata.licence)
if metadata.source_url is not None:
self.gguf.add_source_url(metadata.source_url)
if metadata.source_hf_repo is not None:
self.gguf.add_source_hf_repo(metadata.source_hf_repo)
def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself
self.gguf.add_vocab_size(params.n_vocab)
self.gguf.add_context_length(params.n_ctx)
self.gguf.add_embedding_length(params.n_embd)
self.gguf.add_block_count(params.n_layer)
self.gguf.add_feed_forward_length(params.n_ff)
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
self.gguf.add_head_count (params.n_head) self.gguf.add_head_count (params.n_head)
self.gguf.add_head_count_kv (params.n_head_kv) self.gguf.add_head_count_kv (params.n_head_kv)
@ -1179,13 +1238,14 @@ class OutputFile:
@staticmethod @staticmethod
def write_vocab_only( def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab) check_vocab_size(params, vocab, pad_vocab=pad_vocab)
of = OutputFile(fname_out, gguf.SplitArguments(), endianess=endianess) of = OutputFile(fname_out, gguf.SplitArguments(), endianess=endianess)
# meta data # meta data
of.add_meta_model(params, metadata)
of.add_meta_arch(params) of.add_meta_arch(params)
of.add_meta_vocab(vocab) of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab) of.add_meta_special_vocab(svocab)
@ -1210,12 +1270,14 @@ class OutputFile:
@staticmethod @staticmethod
def write_all( def write_all(
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
split_arguments: gguf.SplitArguments, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE split_arguments: gguf.SplitArguments, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False, metadata: Metadata = None,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=args.pad_vocab) check_vocab_size(params, vocab, pad_vocab=pad_vocab)
of = OutputFile(fname_out, split_arguments, endianess=endianess) of = OutputFile(fname_out, split_arguments, endianess=endianess)
# meta data # meta data
of.add_meta_model(params, metadata)
of.add_meta_arch(params) of.add_meta_arch(params)
if isinstance(vocab, Vocab): if isinstance(vocab, Vocab):
of.add_meta_vocab(vocab) of.add_meta_vocab(vocab)
@ -1247,6 +1309,37 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
raise ValueError(f"Unexpected combination of types: {name_to_type}") raise ValueError(f"Unexpected combination of types: {name_to_type}")
def model_parameter_count(model: LazyModel) -> int:
total_model_parameters = 0
for i, (name, lazy_tensor) in enumerate(model.items()):
sum_weights_in_tensor = 1
for dim in lazy_tensor.shape:
sum_weights_in_tensor *= dim
total_model_parameters += sum_weights_in_tensor
return total_model_parameters
def model_parameter_count_rounded_notation(model_params_count: int) -> str:
if model_params_count > 1e12 :
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9 :
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6 :
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
return f"{round(scaled_model_params)}{scale_suffix}"
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
for (name, tensor) in model.items()} for (name, tensor) in model.items()}
@ -1426,13 +1519,35 @@ class VocabFactory:
return vocab, special_vocab return vocab, special_vocab
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
namestr = { quantization = {
GGMLFileType.AllF32: "f32", GGMLFileType.AllF32: "F32",
GGMLFileType.MostlyF16: "f16", GGMLFileType.MostlyF16: "F16",
GGMLFileType.MostlyQ8_0:"q8_0", GGMLFileType.MostlyQ8_0: "Q8_0",
}[file_type] }[file_type]
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
parameters = model_parameter_count_rounded_notation(model_params_count)
expert_count = ""
if params.n_experts is not None:
expert_count = f"{params.n_experts}x"
version = ""
if metadata is not None and metadata.version is not None:
version = f"-{metadata.version}"
name = "ggml-model"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = params.path_model.name
return f"{name}{version}-{expert_count}{parameters}-{quantization}"
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
ret = model_paths[0].parent / f"{default_filename}.gguf"
if ret in model_paths: if ret in model_paths:
logger.error( logger.error(
f"Error: Default output path ({ret}) would overwrite the input. " f"Error: Default output path ({ret}) would overwrite the input. "
@ -1475,17 +1590,30 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--dry-run", action="store_true", help="only print out a split plan and exit, without writing any new files") parser.add_argument("--dry-run", action="store_true", help="only print out a split plan and exit, without writing any new files")
parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default: metadata only)") parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default: metadata only)")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity") parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
args = parser.parse_args(args_in) args = parser.parse_args(args_in)
if args.verbose: if args.verbose:
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
elif args.dump_single or args.dump: elif args.dump_single or args.dump or args.get_outfile:
# Avoid printing anything besides the dump output # Avoid printing anything besides the dump output
logging.basicConfig(level=logging.WARNING) logging.basicConfig(level=logging.WARNING)
else: else:
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
metadata = Metadata.load(args.metadata)
if args.get_outfile:
model_plus = load_some_model(args.model)
params = Params.load(model_plus)
model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = model_parameter_count(model_plus.model)
ftype = pick_output_type(model, args.outtype)
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
return
if args.no_vocab and args.vocab_only: if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab") raise ValueError("--vocab-only does not make sense with --no-vocab")
@ -1507,6 +1635,9 @@ def main(args_in: list[str] | None = None) -> None:
else: else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
model_params_count = model_parameter_count(model_plus.model)
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
if args.dump: if args.dump:
do_dump_model(model_plus) do_dump_model(model_plus)
return return
@ -1560,7 +1691,7 @@ def main(args_in: list[str] | None = None) -> None:
f_norm_eps = 1e-5, f_norm_eps = 1e-5,
) )
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess=endianess, pad_vocab=args.pad_vocab) endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
logger.info(f"Wrote {outfile}") logger.info(f"Wrote {outfile}")
return return
@ -1573,14 +1704,14 @@ def main(args_in: list[str] | None = None) -> None:
model = convert_model_names(model, params, args.skip_unknown) model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype) ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype) model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype) outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
params.ftype = ftype params.ftype = ftype
logger.info(f"Writing {outfile}, format {ftype}") logger.info(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, split_arguments, OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, split_arguments,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
if not args.dry_run: if not args.dry_run:
logger.info(f"Wrote {outfile}") logger.info(f"Wrote {outfile}")

104
docs/debugging-tests.md Normal file
View File

@ -0,0 +1,104 @@
# Debugging Tests Tips
## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
There is a script called debug-test.sh in the scripts folder that takes a test-name REGEX and an optional test number.
It has this form:
`debug-test.sh [OPTION]... <test_regex> <test_number>`
Running it will output an interactive list from which you can select a test; the script will then build the test and run it in the debugger for you.
To just execute a test and get back a PASS or FAIL message run:
```bash
./scripts/debug-test.sh test-tokenizer
```
To test in GDB use the `-g` flag to enable gdb test mode.
```bash
./scripts/debug-test.sh -g test-tokenizer
# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
>>> b main
```
If you already know your test number, you can run it directly to speed up the testing loop:
```bash
./scripts/debug-test.sh test 23
```
For further reference use `debug-test.sh -h` to print help.
&nbsp;
### How does the script work?
If you want to use the concepts from the script on their own, the important steps are briefly outlined below.
#### Step 1: Reset and set up the folder context
From the base of this repository, let's create `build-ci-debug` as our build context.
```bash
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
```
#### Step 2: Set up the build environment and compile test binaries
Set up and trigger a build in debug mode. You may adapt the arguments as needed; the ones below are sane defaults.
```bash
cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
make -j
```
#### Step 3: Find all available tests that match the REGEX
The output of this command will give you the command & arguments needed to run GDB.
* `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
* `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
* `-V` : Verbose Mode
```bash
ctest -R "test-tokenizer" -V -N
```
This may return output similar to the following (only the key lines to pay attention to are shown):
```bash
...
1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
1: Working Directory: .
Labels: main
Test #1: test-tokenizer-0-llama-spm
...
4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
4: Working Directory: .
Labels: main
Test #4: test-tokenizer-0-falcon
...
```
#### Step 4: Identify Test Command for Debugging
From test #1 above we can identify two pieces of relevant information:
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
#### Step 5: Run GDB on test command
Based on the ctest 'Test command' report above, we can start a gdb session with the command below:
```bash
gdb --args ${Test Binary} ${Test GGUF Model}
```
Example:
```bash
gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
```

View File

@ -49,4 +49,7 @@ else()
add_subdirectory(server) add_subdirectory(server)
endif() endif()
add_subdirectory(export-lora) add_subdirectory(export-lora)
if (LLAMA_RPC)
add_subdirectory(rpc)
endif()
endif() endif()

View File

@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is"; params.prompt = "Hello my name is";
} }
process_escapes(params.prompt); string_process_escapes(params.prompt);
// init LLM // init LLM

View File

@ -49,6 +49,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
} }
float * out = output + batch.seq_id[i][0] * n_embd; float * out = output + batch.seq_id[i][0] * n_embd;
//TODO: I would also add a parameter here to enable normalization or not.
/*fprintf(stdout, "unnormalized_embedding:");
for (int hh = 0; hh < n_embd; hh++) {
fprintf(stdout, "%9.6f ", embd[hh]);
}
fprintf(stdout, "\n");*/
llama_embd_normalize(embd, out, n_embd); llama_embd_normalize(embd, out, n_embd);
} }
} }
@ -74,7 +80,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
llama_backend_init(); llama_backend_init();
@ -101,7 +107,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
// split the prompt into lines // split the prompt into lines
@ -123,10 +129,12 @@ int main(int argc, char ** argv) {
inputs.push_back(inp); inputs.push_back(inp);
} }
// add SEP if not present // check if the last token is SEP
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) { for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_sep(model)) { if (inp.empty() || inp.back() != llama_token_sep(model)) {
inp.push_back(llama_token_sep(model)); fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
} }
} }
@ -203,6 +211,7 @@ int main(int argc, char ** argv) {
// clean up // clean up
llama_print_timings(ctx); llama_print_timings(ctx);
llama_batch_free(batch);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
llama_backend_free(); llama_backend_free();

View File

@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v; float v;
if (type == GGML_TYPE_F16) { if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i); v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) { } else if (type == GGML_TYPE_F32) {
v = *(float *) data + i; v = *(float *) &data[i];
} else if (type == GGML_TYPE_I32) { } else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) data + i; v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) { } else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) data + i; v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) { } else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) data + i; v = (float) *(int8_t *) &data[i];
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
llama_backend_init(); llama_backend_init();
@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
bool OK = run(ctx, params); bool OK = run(ctx, params);

View File

@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// not capturing these, to silcence warnings // not capturing these, to silcence warnings
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_custom(ctx, return ggml_rope_ext(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, 0, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
); );
}; };

View File

@ -598,7 +598,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
sparams.dataset = params.prompt_file; sparams.dataset = params.prompt_file;
@ -667,7 +667,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);

View File

@ -50,9 +50,9 @@ static void write_logfile(
return; return;
} }
const std::string timestamp = get_sortable_timestamp(); const std::string timestamp = string_get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str()); __func__, params.logdir.c_str());
@ -70,7 +70,7 @@ static void write_logfile(
fprintf(logfile, "binary: infill\n"); fprintf(logfile, "binary: infill\n");
char model_desc[128]; char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc)); llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n"); fprintf(logfile, "\n");
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
@ -78,8 +78,8 @@ static void write_logfile(
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
fprintf(logfile, "\n"); fprintf(logfile, "\n");
dump_string_yaml_multiline(logfile, "output", output.c_str()); yaml_dump_string_multiline(logfile, "output", output.c_str());
dump_vector_int_yaml(logfile, "output_tokens", output_tokens); yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx); llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("%s\n", get_system_info(params).c_str()); LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
} }
const bool add_bos = llama_should_add_bos_token(model); const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1); GGML_ASSERT(llama_add_eos_token(model) != 1);
@ -621,8 +621,8 @@ int main(int argc, char ** argv) {
if (params.escape) { if (params.escape) {
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
process_escapes(params.input_prefix); string_process_escapes(params.input_prefix);
process_escapes(params.input_suffix); string_process_escapes(params.input_suffix);
} }
suff_rm_leading_spc = params.escape; suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {

View File

@ -26,16 +26,21 @@ options:
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf) -m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512) -p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128) -n, --n-gen <n> (default: 128)
-b, --batch-size <n> (default: 512) -pg <pp,tg> (default: 512,128)
-ctk <t>, --cache-type-k <t> (default: f16) -b, --batch-size <n> (default: 2048)
-ctv <t>, --cache-type-v <t> (default: f16) -ub, --ubatch-size <n> (default: 512)
-t, --threads <n> (default: 112) -ctk, --cache-type-k <t> (default: f16)
-ctv, --cache-type-v <t> (default: f16)
-t, --threads <n> (default: 16)
-ngl, --n-gpu-layers <n> (default: 99) -ngl, --n-gpu-layers <n> (default: 99)
-sm, --split-mode <none|layer|row> (default: layer) -sm, --split-mode <none|layer|row> (default: layer)
-mg, --main-gpu <i> (default: 0) -mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0) -nkvo, --no-kv-offload <0|1> (default: 0)
-fa, --flash-attn <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1) -mmp, --mmap <0|1> (default: 1)
-ts, --tensor_split <ts0/ts1/..> (default: 0) --numa <distribute|isolate|numactl> (default: disabled)
-embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5) -r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md) -o, --output <csv|json|md|sql> (default: md)
-v, --verbose (default: 0) -v, --verbose (default: 0)
@ -43,10 +48,11 @@ options:
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
``` ```
llama-bench can perform two types of tests: llama-bench can perform three types of tests:
- Prompt processing (pp): processing a prompt in batches (`-p`) - Prompt processing (pp): processing a prompt in batches (`-p`)
- Text generation (tg): generating a sequence of tokens (`-n`) - Text generation (tg): generating a sequence of tokens (`-n`)
- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)
With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`). With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
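As an illustration of combining these test types, a single hypothetical invocation (using only the default model path shown above) could sweep two batch sizes across a pp, a tg, and a combined pp+tg test:

```bash
# sketch: one prompt-processing test, one generation test, and one combined pp+tg test,
# each repeated for two batch sizes, with results printed as a markdown table
./llama-bench -m models/7B/ggml-model-q4_0.gguf -p 512 -n 128 -pg 512,128 -b 1024,2048 -o md
```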

View File

@ -161,10 +161,17 @@ static const char * split_mode_str(llama_split_mode mode) {
} }
} }
static std::string pair_str(const std::pair<int, int> & p) {
static char buf[32];
snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
return buf;
}
struct cmd_params { struct cmd_params {
std::vector<std::string> model; std::vector<std::string> model;
std::vector<int> n_prompt; std::vector<int> n_prompt;
std::vector<int> n_gen; std::vector<int> n_gen;
std::vector<std::pair<int, int>> n_pg;
std::vector<int> n_batch; std::vector<int> n_batch;
std::vector<int> n_ubatch; std::vector<int> n_ubatch;
std::vector<ggml_type> type_k; std::vector<ggml_type> type_k;
@ -188,11 +195,12 @@ static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.gguf"}, /* model */ {"models/7B/ggml-model-q4_0.gguf"},
/* n_prompt */ {512}, /* n_prompt */ {512},
/* n_gen */ {128}, /* n_gen */ {128},
/* n_pg */ {},
/* n_batch */ {2048}, /* n_batch */ {2048},
/* n_ubatch */ {512}, /* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16}, /* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_math_cpu_count()}, /* n_threads */ {cpu_get_num_math()},
/* n_gpu_layers */ {99}, /* n_gpu_layers */ {99},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0}, /* main_gpu */ {0},
@ -215,10 +223,11 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub N, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@ -304,6 +313,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = split<int>(argv[i], split_delim); auto p = split<int>(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
} else if (arg == "-pg") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], ',');
if (p.size() != 2) {
invalid_param = true;
break;
}
params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
} else if (arg == "-b" || arg == "--batch-size") { } else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -493,6 +513,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.model.empty()) { params.model = cmd_params_defaults.model; } if (params.model.empty()) { params.model = cmd_params_defaults.model; }
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; } if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
@ -632,6 +653,31 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
}; };
instances.push_back(instance); instances.push_back(instance);
} }
for (const auto & n_pg : params.n_pg) {
if (n_pg.first == 0 && n_pg.second == 0) {
continue;
}
cmd_params_instance instance = {
/* .model = */ m,
/* .n_prompt = */ n_pg.first,
/* .n_gen = */ n_pg.second,
/* .n_batch = */ nb,
/* .n_ubatch = */ nub,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
};
instances.push_back(instance);
}
} }
return instances; return instances;
@ -965,6 +1011,9 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") { if (field == "n_gpu_layers") {
return 3; return 3;
} }
if (field == "test") {
return 13;
}
int width = std::max((int)field.length(), 10); int width = std::max((int)field.length(), 10);
@ -1091,12 +1140,11 @@ struct markdown_printer : public printer {
value = test::get_backend(); value = test::get_backend();
} else if (field == "test") { } else if (field == "test") {
if (t.n_prompt > 0 && t.n_gen == 0) { if (t.n_prompt > 0 && t.n_gen == 0) {
snprintf(buf, sizeof(buf), "pp %d", t.n_prompt); snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
} else if (t.n_gen > 0 && t.n_prompt == 0) { } else if (t.n_gen > 0 && t.n_prompt == 0) {
snprintf(buf, sizeof(buf), "tg %d", t.n_gen); snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
} else { } else {
assert(false); snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
exit(1);
} }
value = buf; value = buf;
} else if (field == "t/s") { } else if (field == "t/s") {
@ -1297,6 +1345,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
uint64_t t_start = get_time_ns(); uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
} }

View File

@ -12,15 +12,20 @@ cmake_minimum_required(VERSION 3.22.1)
# build script scope). # build script scope).
project("llama-android") project("llama-android")
include(FetchContent) ## Fetch latest llama.cpp from GitHub
FetchContent_Declare( #include(FetchContent)
llama #FetchContent_Declare(
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp # llama
GIT_TAG master # GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
) # GIT_TAG master
#)
#
## Also provides "common"
#FetchContent_MakeAvailable(llama)
# Also provides "common" # llama.cpp CI uses the code from the current branch
FetchContent_MakeAvailable(llama) # ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
add_subdirectory(../../../../../../ build-llama)
# Creates and names a library, sets it as either STATIC # Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code. # or SHARED, and provides the relative paths to its source code.

View File

@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
#define TN_POS_EMBD "%s.position_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd" #define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" #define TN_PATCH_EMBD "v.patch_embd.weight"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s" #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@ -425,6 +426,7 @@ struct clip_vision_model {
// embeddings // embeddings
struct ggml_tensor * class_embedding; struct ggml_tensor * class_embedding;
struct ggml_tensor * patch_embeddings; struct ggml_tensor * patch_embeddings;
struct ggml_tensor * patch_bias;
struct ggml_tensor * position_embeddings; struct ggml_tensor * position_embeddings;
struct ggml_tensor * pre_ln_w; struct ggml_tensor * pre_ln_w;
@ -501,6 +503,11 @@ struct clip_ctx {
bool use_gelu = false; bool use_gelu = false;
int32_t ftype = 1; int32_t ftype = 1;
bool has_class_embedding = true;
bool has_pre_norm = true;
bool has_post_norm = false;
bool has_patch_bias = false;
struct gguf_context * ctx_gguf; struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data; struct ggml_context * ctx_data;
@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
const int num_positions = num_patches + 1; const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
const int hidden_size = hparams.hidden_size; const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head; const int d_head = hidden_size / n_head;
@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
if (ctx->has_patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
}
// concat class_embeddings and patch_embeddings // concat class_embeddings and patch_embeddings
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); struct ggml_tensor * embeddings = inp;
ggml_set_name(embeddings, "embeddings"); if (ctx->has_class_embedding) {
ggml_set_input(embeddings); embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions"); ggml_set_name(positions, "positions");
@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
// pre-layernorm // pre-layernorm
{ if (ctx->has_pre_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln"); ggml_set_name(embeddings, "pre_ln");
@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = cur; embeddings = cur;
} }
// post-layernorm
if (ctx->has_post_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "post_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
}
// llava projector // llava projector
{ {
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@ -1148,12 +1170,39 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
try {
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
new_clip->has_class_embedding = true;
} catch (const std::exception& e) {
new_clip->has_class_embedding = false;
}
try {
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
new_clip->has_pre_norm = true;
} catch (std::exception & e) {
new_clip->has_pre_norm = false;
}
try {
vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
new_clip->has_post_norm = true;
} catch (std::exception & e) {
new_clip->has_post_norm = false;
}
try {
vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
new_clip->has_patch_bias = true;
} catch (std::exception & e) {
new_clip->has_patch_bias = false;
}
try { try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
} catch(const std::exception& e) { } catch(const std::exception& e) {
LOG_TEE("%s: failed to load vision model tensors\n", __func__); LOG_TEE("%s: failed to load vision model tensors\n", __func__);
} }
@ -1797,7 +1846,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int image_size = hparams.image_size; const int image_size = hparams.image_size;
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_positions = num_patches + 1; const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
{ {
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@ -1825,12 +1874,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
{ {
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); if (ctx->has_class_embedding) {
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
void* zero_mem = malloc(ggml_nbytes(embeddings)); void* zero_mem = malloc(ggml_nbytes(embeddings));
memset(zero_mem, 0, ggml_nbytes(embeddings)); memset(zero_mem, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
free(zero_mem); free(zero_mem);
}
} }
{ {

View File

@ -189,6 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
LOG_TEE("\n"); LOG_TEE("\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
if (!ctx_sampling) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
std::string response = ""; std::string response = "";
for (int i = 0; i < max_tgt_len; i++) { for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
@ -285,7 +290,7 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS #endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
gpt_print_usage(argc, argv, params); gpt_params_print_usage(argc, argv, params);
show_additional_info(argc, argv); show_additional_info(argc, argv);
return 1; return 1;
} }
@ -295,14 +300,10 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
for (auto & image : params.image) { if (prompt_contains_image(params.prompt)) {
auto ctx_llava = llava_init_context(&params, model); auto ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image); auto image_embed = load_image(ctx_llava, &params, "");
if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
return 1;
}
// process the prompt // process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt); process_prompt(ctx_llava, image_embed, &params, params.prompt);
@ -311,7 +312,26 @@ int main(int argc, char ** argv) {
llava_image_embed_free(image_embed); llava_image_embed_free(image_embed);
ctx_llava->model = NULL; ctx_llava->model = NULL;
llava_free(ctx_llava); llava_free(ctx_llava);
} else {
for (auto & image : params.image) {
auto ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
return 1;
}
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
} }
llama_free_model(model); llama_free_model(model);
return 0; return 0;

View File

@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
struct { struct {
struct ggml_tensor * newline;
struct ggml_context * ctx; struct ggml_context * ctx;
} model; } model;
@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
model.ctx = ggml_init(params); model.ctx = ggml_init(params);
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
if (newline_tmp->buffer == NULL) {
LOG_TEE("newline_tmp tensor buffer is NULL\n");
}
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
} else {
model.newline->data = newline_tmp->data;
if (model.newline->data == NULL) {
LOG_TEE("newline_tmp tensor data is NULL\n");
}
}
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base // fill it with the image embeddings, ignoring the base

View File

@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
// debug // debug
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40); llama_kv_cache_dump_view_seqs(kvc_view, 40);
} }
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/ // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/

View File

@ -121,7 +121,7 @@ int main(int argc, char ** argv){
// debug // debug
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40); llama_kv_cache_dump_view_seqs(kvc_view, 40);
} }
// print current draft sequence // print current draft sequence

View File

@ -325,3 +325,5 @@ These options provide extra functionality and customization when running the LLa
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
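For illustration, a hypothetical download-and-run invocation could look like the following sketch (the repository and file names are placeholders, not tested recommendations):

```bash
# sketch: fetch a GGUF file from a Hugging Face repo, store it at the path given by -m, then run
./main -hfr ggml-org/models -hff phi-2/ggml-model-q4_0.gguf -m models/phi-2-q4_0.gguf -p "Hello"
```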

View File

@ -60,9 +60,9 @@ static void write_logfile(
return; return;
} }
const std::string timestamp = get_sortable_timestamp(); const std::string timestamp = string_get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str()); __func__, params.logdir.c_str());
@ -80,7 +80,7 @@ static void write_logfile(
fprintf(logfile, "binary: main\n"); fprintf(logfile, "binary: main\n");
char model_desc[128]; char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc)); llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n"); fprintf(logfile, "\n");
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
@ -88,8 +88,8 @@ static void write_logfile(
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
fprintf(logfile, "\n"); fprintf(logfile, "\n");
dump_string_yaml_multiline(logfile, "output", output.c_str()); yaml_dump_string_multiline(logfile, "output", output.c_str());
dump_vector_int_yaml(logfile, "output_tokens", output_tokens); yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx); llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
LOG("%s: llama backend init\n", __func__); LOG("%s: llama backend init\n", __func__);
@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("%s\n", get_system_info(params).c_str()); LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
} }
std::string path_session = params.path_prompt_cache; std::string path_session = params.path_prompt_cache;
@ -523,6 +523,10 @@ int main(int argc, char ** argv) {
} }
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
if (!ctx_sampling) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
while ((n_remain != 0 && !is_antiprompt) || params.interactive) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict // predict
@ -703,7 +707,7 @@ int main(int argc, char ** argv) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
llama_sampling_accept(ctx_sampling, ctx, id, true); llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
@ -724,7 +728,7 @@ int main(int argc, char ** argv) {
// push the prompt in the sampling context in order to apply repetition penalties later // push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules // for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
++n_consumed; ++n_consumed;
if ((int) embd.size() >= params.n_batch) { if ((int) embd.size() >= params.n_batch) {
@ -875,11 +879,11 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end()); embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
} }
if (params.escape) { if (params.escape) {
process_escapes(buffer); string_process_escapes(buffer);
} }
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

View File

@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
while (true) { while (true) {
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40); llama_kv_cache_dump_view_seqs(kvc_view, 40);
} }
llama_batch_clear(batch); llama_batch_clear(batch);

View File

@ -7,6 +7,8 @@ Also note that finetunes typically result in a higher perplexity value even thou
Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16. Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`). The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
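A minimal perplexity run over that test set might look like the following sketch (assuming the data was fetched with the script above; the model path and the extracted data path are placeholders):

```bash
# sketch: compute perplexity of a model over the Wikitext-2 test split
./perplexity -m models/llama-3-8b-f16.gguf -f wikitext-2-raw/wiki.test.raw
```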
When numbers are listed all command line arguments and compilation options are left at their defaults unless noted otherwise.
llama.cpp numbers are **not** directly comparable to those of other projects because the exact values depend strongly on the implementation details.
By default only the mean perplexity value and the corresponding uncertainty is calculated. By default only the mean perplexity value and the corresponding uncertainty is calculated.
The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per token and then applying error propagation. The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per token and then applying error propagation.
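As a sketch of what that propagation amounts to (the standard first-order estimate; the exact estimator used in the implementation may differ): with $N$ evaluated tokens, mean negative log-likelihood $\hat{\mu}$, and empirical standard deviation $\sigma$ of the per-token log-probabilities,

$$
\mathrm{PPL} = \exp(\hat{\mu}), \qquad \hat{\mu} = -\frac{1}{N}\sum_{i=1}^{N}\log p(x_i \mid x_{<i}), \qquad \sigma_{\mathrm{PPL}} \approx \mathrm{PPL}\cdot\frac{\sigma}{\sqrt{N}}
$$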
@ -32,12 +34,21 @@ In addition to the KL divergence the following statistics are calculated with `-
## LLaMA 3 8b Scoreboard ## LLaMA 3 8b Scoreboard
Results are sorted by Kullback-Leibler divergence relative to FP16. | Revision | f364eb6f |
|:---------|:-------------------|
| Backend | CUDA |
| CPU | AMD Epyc 7742 |
| GPU | 1x NVIDIA RTX 4090 |
Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16.
The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat). The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
Note: the FP16 logits used for the calculation of all metrics other than perplexity are stored in a binary file between runs.
In order to save space this file does **not** contain the exact same FP32 logits but instead casts them to 16 bit unsigned integers (with some scaling).
So the "f16" results are to be understood as the difference resulting only from this downcast.
| Quantization | imatrix | Model size [GiB] | PPL | ΔPPL | KLD | Mean Δp | RMS Δp | | Quantization | imatrix | Model size [GiB] | PPL | ΔPPL | KLD | Mean Δp | RMS Δp |
|--------------|---------|------------------|------------------------|------------------------|-----------------------|-------------------|------------------| |--------------|---------|------------------|------------------------|------------------------|-----------------------|-------------------|------------------|
| f16 | None | 14.97 | 6.233160 ± 0.037828 | - | - | - | - | | f16 | None | 14.97 | 6.233160 ± 0.037828 | 0.001524 ± 0.000755 | 0.000551 ± 0.000002 | 0.001 ± 0.002 % | 0.787 ± 0.004 % |
| q8_0 | None | 7.96 | 6.234284 ± 0.037878 | 0.002650 ± 0.001006 | 0.001355 ± 0.000006 | -0.019 ± 0.003 % | 1.198 ± 0.007 % | | q8_0 | None | 7.96 | 6.234284 ± 0.037878 | 0.002650 ± 0.001006 | 0.001355 ± 0.000006 | -0.019 ± 0.003 % | 1.198 ± 0.007 % |
| q6_K | None | 6.14 | 6.253382 ± 0.038078 | 0.021748 ± 0.001852 | 0.005452 ± 0.000035 | -0.007 ± 0.006 % | 2.295 ± 0.019 % | | q6_K | None | 6.14 | 6.253382 ± 0.038078 | 0.021748 ± 0.001852 | 0.005452 ± 0.000035 | -0.007 ± 0.006 % | 2.295 ± 0.019 % |
| q5_K_M | None | 5.33 | 6.288607 ± 0.038338 | 0.056974 ± 0.002598 | 0.010762 ± 0.000079 | -0.114 ± 0.008 % | 3.160 ± 0.031 % | | q5_K_M | None | 5.33 | 6.288607 ± 0.038338 | 0.056974 ± 0.002598 | 0.010762 ± 0.000079 | -0.114 ± 0.008 % | 3.160 ± 0.031 % |
@ -89,6 +100,12 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence
## LLaMA 2 vs. LLaMA 3 Quantization comparison ## LLaMA 2 vs. LLaMA 3 Quantization comparison
| Revision | f364eb6f |
|:---------|:-------------------|
| Backend | CUDA |
| CPU | AMD Epyc 7742 |
| GPU | 1x NVIDIA RTX 4090 |
| Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 | | Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 |
|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------| |-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
| Mean PPL | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 | | Mean PPL | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
@ -107,6 +124,50 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence
| RMS Δp | 9.762 ± 0.053 % | 21.421 ± 0.079 % | 3.252 ± 0.024 % | 5.519 ± 0.050 % | 1.339 ± 0.010 % | 2.295 ± 0.019 % | 0.618 ± 0.011 % | 1.198 ± 0.007 % | | RMS Δp | 9.762 ± 0.053 % | 21.421 ± 0.079 % | 3.252 ± 0.024 % | 5.519 ± 0.050 % | 1.339 ± 0.010 % | 2.295 ± 0.019 % | 0.618 ± 0.011 % | 1.198 ± 0.007 % |
| Same top p | 85.584 ± 0.086 % | 71.138 ± 0.119 % | 94.665 ± 0.055 % | 91.901 ± 0.072 % | 97.520 ± 0.038 % | 96.031 ± 0.051 % | 98.846 ± 0.026 % | 97.674 ± 0.040 % | | Same top p | 85.584 ± 0.086 % | 71.138 ± 0.119 % | 94.665 ± 0.055 % | 91.901 ± 0.072 % | 97.520 ± 0.038 % | 96.031 ± 0.051 % | 98.846 ± 0.026 % | 97.674 ± 0.040 % |
## LLaMA 3 BF16 vs. FP16 comparison
| Revision | 83330d8c |
|:---------|:--------------|
| Backend | CPU |
| CPU | AMD Epyc 7742 |
| GPU | N/A |
Results were calculated with LLaMA 3 8b BF16 as `--kl-divergence-base` and LLaMA 3 8b FP16 as the `--model` for comparison.
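A sketch of that two-step workflow (model and file names are hypothetical):

```bash
# 1) record the BF16 logits to use as the comparison base
./perplexity -m llama-3-8b-bf16.gguf -f wiki.test.raw --kl-divergence-base logits-bf16.dat
# 2) evaluate the FP16 model against the stored base and report the KL divergence statistics
./perplexity -m llama-3-8b-f16.gguf -f wiki.test.raw --kl-divergence-base logits-bf16.dat --kl-divergence
```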
| Metric | Value |
|--------------------------------|--------------------------|
| Mean PPL(Q) | 6.227711 ± 0.037833 |
| Mean PPL(base) | 6.225194 ± 0.037771 |
| Cor(ln(PPL(Q)), ln(PPL(base))) | 99.990% |
| Mean ln(PPL(Q)/PPL(base)) | 0.000404 ± 0.000086 |
| Mean PPL(Q)/PPL(base) | 1.000404 ± 0.000086 |
| Mean PPL(Q)-PPL(base) | 0.002517 ± 0.000536 |
| Mean KLD | 0.00002515 ± 0.00000020 |
| Maximum KLD | 0.012206 |
| 99.9% KLD | 0.000799 |
| 99.0% KLD | 0.000222 |
| Median KLD | 0.000013 |
| 10.0% KLD | -0.000002 |
| 5.0% KLD | -0.000008 |
| 1.0% KLD | -0.000023 |
| Minimum KLD | -0.000059 |
| Mean Δp | -0.0000745 ± 0.0003952 % |
| Maximum Δp | 4.186% |
| 99.9% Δp | 1.049% |
| 99.0% Δp | 0.439% |
| 95.0% Δp | 0.207% |
| 90.0% Δp | 0.125% |
| 75.0% Δp | 0.029% |
| Median Δp | 0.000% |
| 25.0% Δp | -0.030% |
| 10.0% Δp | -0.126% |
| 5.0% Δp | -0.207% |
| 1.0% Δp | -0.434% |
| 0.1% Δp | -1.016% |
| Minimum Δp | -4.672% |
| RMS Δp | 0.150 ± 0.001 % |
| Same top p | 99.739 ± 0.013 % |
## Old Numbers ## Old Numbers

View File

@ -44,9 +44,9 @@ static void write_logfile(
return; return;
} }
const std::string timestamp = get_sortable_timestamp(); const std::string timestamp = string_get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str()); __func__, params.logdir.c_str());
@ -64,7 +64,7 @@ static void write_logfile(
fprintf(logfile, "binary: main\n"); fprintf(logfile, "binary: main\n");
char model_desc[128]; char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc)); llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc); yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
fprintf(logfile, "\n"); fprintf(logfile, "\n");
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
@ -72,9 +72,9 @@ static void write_logfile(
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
fprintf(logfile, "\n"); fprintf(logfile, "\n");
dump_vector_float_yaml(logfile, "logits", results.logits); yaml_dump_vector_float(logfile, "logits", results.logits);
fprintf(logfile, "ppl_value: %f\n", results.ppl_value); fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
dump_vector_float_yaml(logfile, "probs", results.probs); yaml_dump_vector_float(logfile, "probs", results.probs);
llama_dump_timing_info_yaml(logfile, ctx); llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
@ -1425,7 +1425,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// Use all tasks // Use all tasks
tasks.resize(n_task); tasks.resize(n_task);
printf("%s: reading tasks", __func__); printf("%s: reading tasks", __func__);
int n_dot = n_task/100; int n_dot = std::max((int) n_task/100, 1);
int i = 0; int i = 0;
for (auto& task : tasks) { for (auto& task : tasks) {
++i; ++i;
@ -1675,7 +1675,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
llama_batch_free(batch); llama_batch_free(batch);
if (n_done < 100) return; if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
float p = 1.f*n_correct/n_done; float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1)); float sigma = sqrt(p*(1-p)/(n_done-1));
@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
llama_backend_init(); llama_backend_init();
@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
struct results_perplexity results; struct results_perplexity results;

View File

@ -1,6 +1,8 @@
# quantize # quantize
TODO You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
Note: It is synced from llama.cpp `main` every 6 hours.
## Llama 2 7B ## Llama 2 7B

View File

@ -259,7 +259,7 @@ int main(int argc, char ** argv) {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) { } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) { if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
@ -284,7 +284,7 @@ int main(int argc, char ** argv) {
} else { } else {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--keep-split")) { } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
params.keep_split = true; params.keep_split = true;
} else { } else {
usage(argv[0]); usage(argv[0]);

View File

@ -41,8 +41,8 @@ $SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/g
echo PASS echo PASS
echo echo
# 3. Requant model with '--keep_split' # 3. Requant model with '--keep-split'
$QUANTIZE --allow-requantize --keep_split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K $QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
echo PASS echo PASS
echo echo
@ -51,7 +51,7 @@ $MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt
echo PASS echo PASS
echo echo
# 4. Requant mode without '--keep_split' # 4. Requant mode without '--keep-split'
$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K $QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
echo PASS echo PASS
echo echo

View File

@ -11,7 +11,7 @@ struct retrieval_params {
}; };
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) { static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
gpt_print_usage(argc, argv, gpt_params); gpt_params_print_usage(argc, argv, gpt_params);
printf("retrieval options:\n"); printf("retrieval options:\n");
printf(" --context-file FNAME file containing context to embed.\n"); printf(" --context-file FNAME file containing context to embed.\n");
printf(" specify multiple files by providing --context-file option multiple times.\n"); printf(" specify multiple files by providing --context-file option multiple times.\n");
@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
// max batch size // max batch size

View File

@ -0,0 +1,2 @@
add_executable(rpc-server rpc-server.cpp)
target_link_libraries(rpc-server PRIVATE ggml llama)

74
examples/rpc/README.md Normal file
View File

@ -0,0 +1,74 @@
## Overview
The `rpc-server` allows running `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:
```mermaid
flowchart TD
rpcb---|TCP|srva
rpcb---|TCP|srvb
rpcb-.-|TCP|srvn
subgraph hostn[Host N]
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
end
subgraph hostb[Host B]
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
end
subgraph hosta[Host A]
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
end
subgraph host[Main Host]
ggml[llama.cpp]---rpcb[RPC backend]
end
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
```
Each host can run a different backend, e.g. one with CUDA and another with Metal.
You can also run multiple `rpc-server` instances on the same host, each with a different backend.
## Usage
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
For example, to build the CUDA backend with RPC support:
```bash
mkdir build-rpc-cuda
cd build-rpc-cuda
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
cmake --build . --config Release
```
Then, start the `rpc-server` with the backend:
```bash
$ bin/rpc-server -p 50052
create_backend: using CUDA backend
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
Starting RPC server on 0.0.0.0:50052
```
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
```
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
```bash
mkdir build-rpc
cd build-rpc
cmake .. -DLLAMA_RPC=ON
cmake --build . --config Release
```
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```

134
examples/rpc/rpc-server.cpp Normal file
View File

@ -0,0 +1,134 @@
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include "ggml-rpc.h"
#ifdef _WIN32
# include <windows.h>
#else
# include <unistd.h>
#endif
#include <string>
#include <stdio.h>
struct rpc_server_params {
std::string host = "0.0.0.0";
int port = 50052;
size_t backend_mem = 0;
};
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
fprintf(stderr, "\n");
}
static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-H" || arg == "--host") {
if (++i >= argc) {
return false;
}
params.host = argv[i];
} else if (arg == "-p" || arg == "--port") {
if (++i >= argc) {
return false;
}
params.port = std::stoi(argv[i]);
if (params.port <= 0 || params.port > 65535) {
return false;
}
} else if (arg == "-m" || arg == "--mem") {
if (++i >= argc) {
return false;
}
params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
static ggml_backend_t create_backend() {
ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#elif GGML_USE_METAL
fprintf(stderr, "%s: using Metal backend\n", __func__);
backend = ggml_backend_metal_init();
if (!backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
}
#endif
// if there are no GPU backends, fall back to the CPU backend
if (!backend) {
fprintf(stderr, "%s: using CPU backend\n", __func__);
backend = ggml_backend_cpu_init();
}
return backend;
}
static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
#ifdef GGML_USE_CUDA
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
#else
#ifdef _WIN32
MEMORYSTATUSEX status;
status.dwLength = sizeof(status);
GlobalMemoryStatusEx(&status);
*total_mem = status.ullTotalPhys;
*free_mem = status.ullAvailPhys;
#else
long pages = sysconf(_SC_PHYS_PAGES);
long page_size = sysconf(_SC_PAGE_SIZE);
*total_mem = pages * page_size;
*free_mem = *total_mem;
#endif
#endif
}
int main(int argc, char * argv[]) {
rpc_server_params params;
if (!rpc_server_params_parse(argc, argv, params)) {
fprintf(stderr, "Invalid parameters\n");
return 1;
}
ggml_backend_t backend = create_backend();
if (!backend) {
fprintf(stderr, "Failed to create backend\n");
return 1;
}
std::string endpoint = params.host + ":" + std::to_string(params.port);
size_t free_mem, total_mem;
if (params.backend_mem > 0) {
free_mem = params.backend_mem;
total_mem = params.backend_mem;
} else {
get_backend_memory(&free_mem, &total_mem);
}
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
ggml_backend_free(backend);
return 0;
}

View File

@ -17,8 +17,9 @@ The project is under active development, and we are [looking for feedback and co
**Command line options:** **Command line options:**
- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend. - `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU. - `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores).
- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)` - `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
@ -36,9 +37,7 @@ The project is under active development, and we are [looking for feedback and co
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems - `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
- `--numa distribute`: Spread execution evenly over all nodes - `--numa distribute`: Spread execution evenly over all nodes
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system - `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa`: Attempt optimizations that may help on some NUMA systems. - `--numa`: Attempt optimizations that may help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
@ -48,8 +47,8 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
- `--path`: Path from which to serve static files. Default: disabled - `--path`: Path from which to serve static files. Default: disabled
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. - `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s. - `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embedding`: Enable embedding extraction. Default: disabled - `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1` - `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error.
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
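For orientation, a typical invocation combining several of the options above might look like `./server -m models/7B/ggml-model.gguf -t 8 -np 4 -cb --embeddings --api-key sk-example --port 8080` (the model path, key and port are placeholders; adjust them to your setup).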

View File

@ -0,0 +1,52 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>SimpleChat (LlamaCPP, ...) </title>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="message" content="Save Nature Save Earth" />
<meta name="description" content="SimpleChat: trigger LLM web service endpoints /chat/completions and /completions, single/multi chat sessions" />
<meta name="author" content="by Humans for All" />
<meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
<script src="simplechat.js" defer></script>
<link rel="stylesheet" href="simplechat.css" />
</head>
<body>
<div class="samecolumn" id="fullbody">
<div class="sameline">
<p class="heading flex-grow" > <b> SimpleChat </b> </p>
<div class="sameline">
<label for="api-ep">Mode:</label>
<select name="api-ep" id="api-ep">
<option value="chat" selected>Chat</option>
<option value="completion">Completion</option>
</select>
</div>
</div>
<div id="sessions-div" class="sameline"></div>
<hr>
<div class="sameline">
<label for="system-in">System</label>
<input type="text" name="system" id="system-in" class="flex-grow"/>
</div>
<hr>
<div id="chat-div">
<p> Enter the system prompt above, before entering/submitting any user query.</p>
<p> Enter your text to the ai assistant below.</p>
<p> Use shift+enter for inserting enter.</p>
<p> Refresh the page to start over fresh.</p>
</div>
<hr>
<div class="sameline">
<textarea id="user-in" class="flex-grow" rows="3"></textarea>
<button id="user-btn">submit</button>
</div>
</div>
</body>
</html>

View File

@ -0,0 +1,81 @@
# SimpleChat
by Humans for All.
## overview
This simple web frontend allows triggering/testing the server's /completions or /chat/completions endpoints
in a simple way, with minimal code from a common code base. In turn it additionally tries to allow single or
multiple independent back-and-forth chat sessions with the ai llm model at a basic level, each with its
own system prompt.
The UI follows a responsive web design so that the layout can adapt to the available display space in a
reasonably usable manner.
NOTE: Given that the idea is basic minimal testing, it does not bother with the model context length or with
culling old messages from the chat.
NOTE: It does not set any parameters other than temperature for now. However, anyone who wants to can update
the js file as needed.
## usage
One can run this web frontend directly using the server itself, or serve it separately using something like
python's http module, which may be handy if anyone is thinking of adding a built-in web frontend to configure
the server over http(s) or so.
### running using examples/server
bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
### running using python3's server module
first run examples/server
* bin/server -m path/model.gguf
next run this web front end in examples/server/public_simplechat
* cd ../examples/server/public_simplechat
* python3 -m http.server PORT
### using the front end
Open this simple web front end from your local browser
* http://127.0.0.1:PORT/index.html
Once inside
* Select between chat and completion mode. By default it is set to chat mode.
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
* if chat.add_system_begin is used
* you can't change the system prompt after it has been submitted once along with a user query.
* you can't set a system prompt after you have submitted any user query
* if chat.add_system_anytime is used
* one can change the system prompt at any time during the chat, by changing the contents of the system prompt field.
* in turn the updated/changed system prompt will be inserted into the chat session.
* this allows for the subsequent user chatting to be driven by the new system prompt set above.
* Enter your query and either press enter or click on the submit button.
If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
* Wait for the logic to communicate with the server and get the response (see the request sketch after this list).
* the user is not allowed to enter any fresh query during this time.
* the user input box will be disabled and a working message will be shown in it.
* just refresh the page to reset the chat history and/or system prompt and start afresh.
* Using NewChat one can start independent chat sessions.
* two independent chat sessions are set up by default.
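For reference, a minimal sketch of the request the submit step above produces in chat mode; the message contents
are illustrative, and temperature is the only sampling parameter this frontend sets:

```javascript
// Illustrative chat-mode request body built by SimpleChat and posted to the server.
let body = {
    messages: [
        { role: "system", content: "You are a helpful assistant." },
        { role: "user",   content: "Hello!" },
    ],
    temperature: 0.7, // the only sampling parameter this frontend sets
};
fetch("http://127.0.0.1:8080/chat/completions", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
});
```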
## Devel note
Sometimes the browser may be stubborn about caching the files, so your updates to the html/css/js
may not be visible. Also remember that just refreshing/reloading the page in the browser, or for that
matter clearing site data, does not bypass the site cache in all cases. Worst case you may
have to change the port, or disable caching fully in the browser's dev tools.
The concept of multiple chat sessions with different servers, as well as saving and restoring them
across browser usage sessions, could be woven around the SimpleChat/MultiChatUI classes and
their instances relatively easily; however, given the current goal of keeping this simple, it has
not been added for now.
By switching between chat.add_system_begin and chat.add_system_anytime, one can control whether
the system prompt can be changed at any time during the conversation or only at the beginning, as sketched below.
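A minimal sketch of that switch, assuming the variables in scope inside `MultiChatUI.handle_user_submit`; the
`gbUseSystemBegin` flag is hypothetical and not part of the current code:

```javascript
// Hypothetical flag, not part of simplechat.js: choose the system-prompt policy.
const gbUseSystemBegin = false;

// Inside MultiChatUI.handle_user_submit, before the user query is added,
// where `chat`, `chatId` and `this.elInSystem` are already in scope:
if (gbUseSystemBegin) {
    chat.add_system_begin(this.elInSystem.value, chatId);   // honoured only before the first user query
} else {
    chat.add_system_anytime(this.elInSystem.value, chatId); // current behaviour: honoured at any time
}
```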

View File

@ -0,0 +1,61 @@
/**
* the styling of the simplechat web frontend
* by Humans for All
*/
#fullbody {
height: 98vh;
}
.heading {
background-color: lightgray;
}
.session-selected {
background-color: lightblue;
}
.role-system {
background-color: lightblue;
}
.role-user {
background-color: lightgray;
}
.flex-grow {
flex-grow: 1;
}
.float-right {
float: right;
}
#chat-div {
overflow: scroll;
flex-grow: 1;
flex-shrink: 1;
min-height: 40vh;
}
button {
min-width: 8vw;
}
.sameline {
display: flex;
flex-direction: row;
}
.samecolumn {
display: flex;
flex-direction: column;
}
* {
margin: 0.6vmin;
}
@media print {
#fullbody {
height: auto;
}
}

View File

@ -0,0 +1,478 @@
// @ts-check
// A simple completions and chat/completions test related web front end logic
// by Humans for All
class Roles {
static System = "system";
static User = "user";
static Assistant = "assistant";
}
class ApiEP {
static Chat = "chat";
static Completion = "completion";
}
let gUsageMsg = `
<p> Enter the system prompt above, before entering/submitting any user query.</p>
<p> Enter your text to the ai assistant below.</p>
<p> Use shift+enter for inserting enter.</p>
<p> Refresh the page to start over fresh.</p>
`;
class SimpleChat {
constructor() {
/**
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
* @type {{role: string, content: string}[]}
*/
this.xchat = [];
this.iLastSys = -1;
}
/**
* Add an entry into xchat
* @param {string} role
* @param {string|undefined|null} content
*/
add(role, content) {
if ((content == undefined) || (content == null) || (content == "")) {
return false;
}
this.xchat.push( {role: role, content: content} );
if (role == Roles.System) {
this.iLastSys = this.xchat.length - 1;
}
return true;
}
/**
* Show the contents in the specified div
* @param {HTMLDivElement} div
* @param {boolean} bClear
*/
show(div, bClear=true) {
if (bClear) {
div.replaceChildren();
}
let last = undefined;
for(const x of this.xchat) {
let entry = document.createElement("p");
entry.className = `role-${x.role}`;
entry.innerText = `${x.role}: ${x.content}`;
div.appendChild(entry);
last = entry;
}
if (last !== undefined) {
last.scrollIntoView(false);
} else {
if (bClear) {
div.innerHTML = gUsageMsg;
}
}
}
/**
* Add the fields needed for the LLM web service completions request to the given object
* and convert the object into a JSON string.
* @param {Object} obj
*/
request_jsonstr(obj) {
obj["temperature"] = 0.7;
return JSON.stringify(obj);
}
/**
* Return a string form of json object suitable for chat/completions
*/
request_messages_jsonstr() {
let req = {
messages: this.xchat,
}
return this.request_jsonstr(req);
}
/**
* Return a string form of json object suitable for /completions
*/
request_prompt_jsonstr() {
let prompt = "";
for(const chat of this.xchat) {
prompt += `${chat.role}: ${chat.content}\n`;
}
let req = {
prompt: prompt,
}
return this.request_jsonstr(req);
}
/**
* Allow setting of the system prompt, but only at the beginning.
* @param {string} sysPrompt
* @param {string} msgTag
*/
add_system_begin(sysPrompt, msgTag) {
if (this.xchat.length == 0) {
if (sysPrompt.length > 0) {
return this.add(Roles.System, sysPrompt);
}
} else {
if (sysPrompt.length > 0) {
if (this.xchat[0].role !== Roles.System) {
console.error(`ERRR:SimpleChat:SC:${msgTag}:You need to specify system prompt before any user query, ignoring...`);
} else {
if (this.xchat[0].content !== sysPrompt) {
console.error(`ERRR:SimpleChat:SC:${msgTag}:You cant change system prompt, mid way through, ignoring...`);
}
}
}
}
return false;
}
/**
* Allow setting of system prompt, at any time.
* @param {string} sysPrompt
* @param {string} msgTag
*/
add_system_anytime(sysPrompt, msgTag) {
if (sysPrompt.length <= 0) {
return false;
}
if (this.iLastSys < 0) {
return this.add(Roles.System, sysPrompt);
}
let lastSys = this.xchat[this.iLastSys].content;
if (lastSys !== sysPrompt) {
return this.add(Roles.System, sysPrompt);
}
return false;
}
/**
* Retrieve the latest system prompt.
*/
get_system_latest() {
if (this.iLastSys == -1) {
return "";
}
let sysPrompt = this.xchat[this.iLastSys].content;
return sysPrompt;
}
}
let gBaseURL = "http://127.0.0.1:8080";
let gChatURL = {
'chat': `${gBaseURL}/chat/completions`,
'completion': `${gBaseURL}/completions`,
}
const gbCompletionFreshChatAlways = true;
/**
* Set the class of each child, based on whether its id matches idSelected or not.
* @param {HTMLDivElement} elBase
* @param {string} idSelected
* @param {string} classSelected
* @param {string} classUnSelected
*/
function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") {
for(let child of elBase.children) {
if (child.id == idSelected) {
child.className = classSelected;
} else {
child.className = classUnSelected;
}
}
}
/**
* Create button and set it up.
* @param {string} id
* @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback
* @param {string | undefined} name
* @param {string | undefined} innerText
*/
function el_create_button(id, callback, name=undefined, innerText=undefined) {
if (!name) {
name = id;
}
if (!innerText) {
innerText = id;
}
let btn = document.createElement("button");
btn.id = id;
btn.name = name;
btn.innerText = innerText;
btn.addEventListener("click", callback);
return btn;
}
class MultiChatUI {
constructor() {
/** @type {Object<string, SimpleChat>} */
this.simpleChats = {};
/** @type {string} */
this.curChatId = "";
// the ui elements
this.elInSystem = /** @type{HTMLInputElement} */(document.getElementById("system-in"));
this.elDivChat = /** @type{HTMLDivElement} */(document.getElementById("chat-div"));
this.elBtnUser = /** @type{HTMLButtonElement} */(document.getElementById("user-btn"));
this.elInUser = /** @type{HTMLInputElement} */(document.getElementById("user-in"));
this.elSelectApiEP = /** @type{HTMLSelectElement} */(document.getElementById("api-ep"));
this.elDivSessions = /** @type{HTMLDivElement} */(document.getElementById("sessions-div"));
this.validate_element(this.elInSystem, "system-in");
this.validate_element(this.elDivChat, "chat-div");
this.validate_element(this.elInUser, "user-in");
this.validate_element(this.elSelectApiEP, "api-ep");
this.validate_element(this.elDivSessions, "sessions-div");
}
/**
* Check if the element was found; throw if it is missing.
* @param {HTMLElement | null} el
* @param {string} msgTag
*/
validate_element(el, msgTag) {
if (el == null) {
throw Error(`ERRR:SimpleChat:MCUI:${msgTag} element missing in html...`);
} else {
console.debug(`INFO:SimpleChat:MCUI:${msgTag} Id[${el.id}] Name[${el["name"]}]`);
}
}
/**
* Reset user input ui.
* * clear user input
* * enable user input
* * set focus to user input
*/
ui_reset_userinput() {
this.elInUser.value = "";
this.elInUser.disabled = false;
this.elInUser.focus();
}
/**
* Set up the needed UI callbacks, set curChatId to defaultChatId and
* optionally switch to the specified defaultChatId.
* @param {string} defaultChatId
* @param {boolean} bSwitchSession
*/
setup_ui(defaultChatId, bSwitchSession=false) {
this.curChatId = defaultChatId;
if (bSwitchSession) {
this.handle_session_switch(this.curChatId);
}
this.elBtnUser.addEventListener("click", (ev)=>{
if (this.elInUser.disabled) {
return;
}
this.handle_user_submit(this.curChatId, this.elSelectApiEP.value).catch((/** @type{Error} */reason)=>{
let msg = `ERRR:SimpleChat\nMCUI:HandleUserSubmit:${this.curChatId}\n${reason.name}:${reason.message}`;
console.debug(msg.replace("\n", ":"));
alert(msg);
this.ui_reset_userinput();
});
});
this.elInUser.addEventListener("keyup", (ev)=> {
// allow user to insert enter into their message using shift+enter.
// while just pressing enter key will lead to submitting.
if ((ev.key === "Enter") && (!ev.shiftKey)) {
this.elBtnUser.click();
ev.preventDefault();
}
});
this.elInSystem.addEventListener("keyup", (ev)=> {
// allow user to insert enter into the system prompt using shift+enter.
// while just pressing enter key will lead to setting the system prompt.
if ((ev.key === "Enter") && (!ev.shiftKey)) {
let chat = this.simpleChats[this.curChatId];
chat.add_system_anytime(this.elInSystem.value, this.curChatId);
chat.show(this.elDivChat);
ev.preventDefault();
}
});
}
/**
* Setup a new chat session and optionally switch to it.
* @param {string} chatId
* @param {boolean} bSwitchSession
*/
new_chat_session(chatId, bSwitchSession=false) {
this.simpleChats[chatId] = new SimpleChat();
if (bSwitchSession) {
this.handle_session_switch(chatId);
}
}
/**
* Handle user query submit request, wrt specified chat session.
* @param {string} chatId
* @param {string} apiEP
*/
async handle_user_submit(chatId, apiEP) {
let chat = this.simpleChats[chatId];
chat.add_system_anytime(this.elInSystem.value, chatId);
let content = this.elInUser.value;
if (!chat.add(Roles.User, content)) {
console.debug(`WARN:SimpleChat:MCUI:${chatId}:HandleUserSubmit:Ignoring empty user input...`);
return;
}
chat.show(this.elDivChat);
let theBody;
let theUrl = gChatURL[apiEP]
if (apiEP == ApiEP.Chat) {
theBody = chat.request_messages_jsonstr();
} else {
theBody = chat.request_prompt_jsonstr();
}
this.elInUser.value = "working...";
this.elInUser.disabled = true;
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:${theUrl}:ReqBody:${theBody}`);
let resp = await fetch(theUrl, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: theBody,
});
let respBody = await resp.json();
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
let assistantMsg;
if (apiEP == ApiEP.Chat) {
assistantMsg = respBody["choices"][0]["message"]["content"];
} else {
try {
assistantMsg = respBody["choices"][0]["text"];
} catch {
assistantMsg = respBody["content"];
}
}
chat.add(Roles.Assistant, assistantMsg);
if (chatId == this.curChatId) {
chat.show(this.elDivChat);
} else {
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
}
// Purposefully clear at the end rather than the beginning of this function,
// so that one can switch from chat to completion mode and have the prior
// multi-turn user-assistant chat data sent once, as a sequence, in
// completion mode.
if ((apiEP == ApiEP.Completion) && (gbCompletionFreshChatAlways)) {
chat.xchat.length = 0;
}
this.ui_reset_userinput();
}
/**
* Show buttons for NewChat and available chat sessions, in the passed elDiv.
* If elDiv is undefined/null, then use this.elDivSessions.
* Take care of highlighting the selected chat-session's btn.
* @param {HTMLDivElement | undefined} elDiv
*/
show_sessions(elDiv=undefined) {
if (!elDiv) {
elDiv = this.elDivSessions;
}
elDiv.replaceChildren();
// Btn for creating new chat session
let btnNew = el_create_button("New CHAT", (ev)=> {
if (this.elInUser.disabled) {
console.error(`ERRR:SimpleChat:MCUI:NewChat:Current session [${this.curChatId}] awaiting response, ignoring request...`);
alert("ERRR:SimpleChat\nMCUI:NewChat\nWait for response to pending query, before starting new chat session");
return;
}
let chatId = `Chat${Object.keys(this.simpleChats).length}`;
let chatIdGot = prompt("INFO:SimpleChat\nMCUI:NewChat\nEnter id for new chat session", chatId);
if (!chatIdGot) {
console.error("ERRR:SimpleChat:MCUI:NewChat:Skipping based on user request...");
return;
}
this.new_chat_session(chatIdGot, true);
this.create_session_btn(elDiv, chatIdGot);
el_children_config_class(elDiv, chatIdGot, "session-selected", "");
});
elDiv.appendChild(btnNew);
// Btns for existing chat sessions
let chatIds = Object.keys(this.simpleChats);
for(let cid of chatIds) {
let btn = this.create_session_btn(elDiv, cid);
if (cid == this.curChatId) {
btn.className = "session-selected";
}
}
}
/**
* Create and append a button for the given chat session id, in the passed elDiv.
* @param {HTMLDivElement} elDiv
* @param {string} cid
*/
create_session_btn(elDiv, cid) {
let btn = el_create_button(cid, (ev)=>{
let target = /** @type{HTMLButtonElement} */(ev.target);
console.debug(`DBUG:SimpleChat:MCUI:SessionClick:${target.id}`);
if (this.elInUser.disabled) {
console.error(`ERRR:SimpleChat:MCUI:SessionClick:${target.id}:Current session [${this.curChatId}] awaiting response, ignoring switch...`);
alert("ERRR:SimpleChat\nMCUI:SessionClick\nWait for response to pending query, before switching");
return;
}
this.handle_session_switch(target.id);
el_children_config_class(elDiv, target.id, "session-selected", "");
});
elDiv.appendChild(btn);
return btn;
}
/**
* Switch ui to the specified chatId and set curChatId to same.
* @param {string} chatId
*/
async handle_session_switch(chatId) {
let chat = this.simpleChats[chatId];
if (chat == undefined) {
console.error(`ERRR:SimpleChat:MCUI:HandleSessionSwitch:${chatId} missing...`);
return;
}
this.elInSystem.value = chat.get_system_latest();
this.elInUser.value = "";
chat.show(this.elDivChat);
this.elInUser.focus();
this.curChatId = chatId;
console.log(`INFO:SimpleChat:MCUI:HandleSessionSwitch:${chatId} entered...`);
}
}
let gMuitChat;
const gChatIds = [ "Default", "Other" ];
function startme() {
console.log("INFO:SimpleChat:StartMe:Starting...");
gMuitChat = new MultiChatUI();
for (let cid of gChatIds) {
gMuitChat.new_chat_session(cid);
}
gMuitChat.setup_ui(gChatIds[0]);
gMuitChat.show_sessions();
}
document.addEventListener("DOMContentLoaded", startme);

View File

@ -102,7 +102,6 @@ struct slot_params {
bool stream = true; bool stream = true;
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
uint32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict
@ -651,9 +650,6 @@ struct server_context {
std::string system_prompt; std::string system_prompt;
std::vector<llama_token> system_tokens; std::vector<llama_token> system_tokens;
std::string name_user; // this should be the antiprompt
std::string name_assistant;
// slots / clients // slots / clients
std::vector<server_slot> slots; std::vector<server_slot> slots;
json default_generation_settings_for_props; json default_generation_settings_for_props;
@ -673,6 +669,15 @@ struct server_context {
llama_free_model(model); llama_free_model(model);
model = nullptr; model = nullptr;
} }
// Clear any sampling context
for (server_slot & slot : slots) {
if (slot.ctx_sampling != nullptr) {
llama_sampling_free(slot.ctx_sampling);
}
}
llama_batch_free(batch);
} }
bool load_model(const gpt_params & params_) { bool load_model(const gpt_params & params_) {
@ -1014,7 +1019,7 @@ struct server_context {
sampler_names.emplace_back(sampler_name); sampler_names.emplace_back(sampler_name);
} }
} }
slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false); slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
} else { } else {
slot.sparams.samplers_sequence = default_sparams.samplers_sequence; slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
} }
@ -1098,15 +1103,11 @@ struct server_context {
system_need_update = false; system_need_update = false;
} }
void system_prompt_set(const json & sys_props) { bool system_prompt_set(const std::string & sys_prompt) {
system_prompt = sys_props.value("prompt", ""); system_prompt = sys_prompt;
name_user = sys_props.value("anti_prompt", "");
name_assistant = sys_props.value("assistant_name", "");
LOG_VERBOSE("system prompt process", { LOG_VERBOSE("system prompt process", {
{"system_prompt", system_prompt}, {"system_prompt", system_prompt},
{"name_user", name_user},
{"name_assistant", name_assistant},
}); });
// release all slots // release all slots
@ -1115,6 +1116,7 @@ struct server_context {
} }
system_need_update = true; system_need_update = true;
return true;
} }
bool process_token(completion_token_output & result, server_slot & slot) { bool process_token(completion_token_output & result, server_slot & slot) {
@ -1254,14 +1256,14 @@ struct server_context {
std::vector<std::string> samplers_sequence; std::vector<std::string> samplers_sequence;
samplers_sequence.reserve(slot.sparams.samplers_sequence.size()); samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
for (const auto & sampler_type : slot.sparams.samplers_sequence) { for (const auto & sampler_type : slot.sparams.samplers_sequence) {
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type)); samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
} }
return json { return json {
{"n_ctx", slot.n_ctx}, {"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict}, {"n_predict", slot.n_predict},
{"model", params.model_alias}, {"model", params.model_alias},
{"seed", slot.params.seed}, {"seed", slot.sparams.seed},
{"temperature", slot.sparams.temp}, {"temperature", slot.sparams.temp},
{"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_range", slot.sparams.dynatemp_range},
{"dynatemp_exponent", slot.sparams.dynatemp_exponent}, {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
@ -1534,7 +1536,8 @@ struct server_context {
} }
if (task.data.contains("system_prompt")) { if (task.data.contains("system_prompt")) {
system_prompt_set(task.data.at("system_prompt")); std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
system_prompt_set(sys_prompt);
for (server_slot & slot : slots) { for (server_slot & slot : slots) {
slot.n_past = 0; slot.n_past = 0;
@ -1978,8 +1981,7 @@ struct server_context {
slot.state = SLOT_STATE_PROCESSING; slot.state = SLOT_STATE_PROCESSING;
slot.command = SLOT_COMMAND_NONE; slot.command = SLOT_COMMAND_NONE;
slot.release(); slot.release();
slot.print_timings(); send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
send_final_response(slot);
continue; continue;
} }
} else { } else {
@ -2270,10 +2272,10 @@ struct server_context {
const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs); const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
if (n_probs > 0) { if (n_probs > 0) {
const size_t n_considered = slot.ctx_sampling->n_considered; const size_t n_valid = slot.ctx_sampling->n_valid;
// Make sure at least n_probs top tokens are at the front of the vector: // Make sure at least n_probs top tokens are at the front of the vector:
if (slot.sparams.temp == 0.0f && n_probs > n_considered) { if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
llama_sample_top_k(ctx, &cur_p, n_probs, 0); llama_sample_top_k(ctx, &cur_p, n_probs, 0);
} }
@ -2289,7 +2291,7 @@ struct server_context {
for (size_t i = 0; i < n_probs; ++i) { for (size_t i = 0; i < n_probs; ++i) {
result.probs.push_back({ result.probs.push_back({
cur_p.data[i].id, cur_p.data[i].id,
i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability. i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
}); });
} }
} }
@ -2383,6 +2385,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
printf(" --rpc SERVERS comma separated list of RPC servers\n");
printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n"); printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n"); printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
@ -2435,6 +2438,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
break; break;
} }
sparams.port = std::stoi(argv[i]); sparams.port = std::stoi(argv[i]);
} else if (arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rpc_servers = argv[i];
} else if (arg == "--host") { } else if (arg == "--host") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -2843,7 +2852,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
invalid_param = true; invalid_param = true;
break; break;
} }
if (!parse_kv_override(argv[i], params.kv_overrides)) { if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true; invalid_param = true;
break; break;
@ -2918,7 +2927,7 @@ int main(int argc, char ** argv) {
server_params_parse(argc, argv, sparams, params); server_params_parse(argc, argv, sparams, params);
if (!sparams.system_prompt.empty()) { if (!sparams.system_prompt.empty()) {
ctx_server.system_prompt_set(json::parse(sparams.system_prompt)); ctx_server.system_prompt_set(sparams.system_prompt);
} }
if (params.model_alias == "unknown") { if (params.model_alias == "unknown") {
@ -3301,7 +3310,7 @@ int main(int argc, char ** argv) {
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) { const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body); json request_data = json::parse(req.body);
std::string filename = request_data.at("filename"); std::string filename = request_data.at("filename");
if (!validate_file_name(filename)) { if (!fs_validate_filename(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return; return;
} }
@ -3331,7 +3340,7 @@ int main(int argc, char ** argv) {
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) { const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body); json request_data = json::parse(req.body);
std::string filename = request_data.at("filename"); std::string filename = request_data.at("filename");
if (!validate_file_name(filename)) { if (!fs_validate_filename(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return; return;
} }
@ -3407,8 +3416,7 @@ int main(int argc, char ** argv) {
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = { json data = {
{ "user_name", ctx_server.name_user.c_str() }, { "system_prompt", ctx_server.system_prompt.c_str() },
{ "assistant_name", ctx_server.name_assistant.c_str() },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel } { "total_slots", ctx_server.params.n_parallel }
}; };

View File

@ -13,6 +13,7 @@ Feature: Results
Scenario Outline: consistent results with same seed Scenario Outline: consistent results with same seed
Given <n_slots> slots Given <n_slots> slots
And 1.0 temperature
Then the server is starting Then the server is starting
Then the server is healthy Then the server is healthy
@ -26,10 +27,12 @@ Feature: Results
Examples: Examples:
| n_slots | | n_slots |
| 1 | | 1 |
| 2 | # FIXME: unified KV cache nondeterminism
# | 2 |
Scenario Outline: different results with different seed Scenario Outline: different results with different seed
Given <n_slots> slots Given <n_slots> slots
And 1.0 temperature
Then the server is starting Then the server is starting
Then the server is healthy Then the server is healthy
@ -70,12 +73,46 @@ Feature: Results
Then all predictions are equal Then all predictions are equal
Examples: Examples:
| n_parallel | temp | | n_parallel | temp |
| 1 | 0.0 | | 1 | 0.0 |
| 2 | 0.0 | | 1 | 1.0 |
| 4 | 0.0 | # FIXME: unified KV cache nondeterminism
| 1 | 1.0 |
# FIXME: These tests fail on master. The problem seems to be the unified KV cache.
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227 # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 . # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
# | 2 | 1.0 | # and https://github.com/ggerganov/llama.cpp/pull/7347 .
# | 4 | 1.0 | # | 2 | 0.0 |
# | 4 | 0.0 |
# | 2 | 1.0 |
# | 4 | 1.0 |
Scenario Outline: consistent token probs with same seed and prompt
Given <n_slots> slots
And <n_kv> KV cache size
And 1.0 temperature
And <n_predict> max tokens to predict
Then the server is starting
Then the server is healthy
Given 1 prompts "The meaning of life is" with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Given <n_parallel> prompts "The meaning of life is" with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Then all token probabilities are equal
Examples:
| n_slots | n_kv | n_predict | n_parallel |
| 4 | 1024 | 1 | 1 |
# FIXME: unified KV cache nondeterminism
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
# and https://github.com/ggerganov/llama.cpp/pull/7347 .
# | 4 | 1024 | 1 | 4 |
# | 4 | 1024 | 100 | 1 |
# This test still fails even with the above patches; the first token probabilities are already different.
# | 4 | 1024 | 100 | 4 |

View File

@ -23,6 +23,7 @@ from prometheus_client import parser
def step_server_config(context, server_fqdn, server_port): def step_server_config(context, server_fqdn, server_port):
context.server_fqdn = server_fqdn context.server_fqdn = server_fqdn
context.server_port = int(server_port) context.server_port = int(server_port)
context.n_threads = None
context.n_gpu_layer = None context.n_gpu_layer = None
if 'PORT' in os.environ: if 'PORT' in os.environ:
context.server_port = int(os.environ['PORT']) context.server_port = int(os.environ['PORT'])
@ -109,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
context.n_gpu_layer = ngl context.n_gpu_layer = ngl
@step('{n_threads:d} threads')
def step_n_threads(context, n_threads):
context.n_threads = n_threads
@step('{draft:d} as draft') @step('{draft:d} as draft')
def step_draft(context, draft): def step_draft(context, draft):
context.draft = draft context.draft = draft
@ -193,7 +199,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
case 'ready' | 'idle': case 'ready' | 'idle':
await wait_for_health_status(context, context.base_url, 200, 'ok', await wait_for_health_status(context, context.base_url, 200, 'ok',
timeout=10, timeout=30,
params={'fail_on_no_slot': 0, 'include_slots': 0}, params={'fail_on_no_slot': 0, 'include_slots': 0},
slots_idle=context.n_slots, slots_idle=context.n_slots,
slots_processing=0, slots_processing=0,
@ -274,13 +280,22 @@ async def step_predictions_equal(context):
@step('all predictions are different') @step('all predictions are different')
@async_run_until_complete @async_run_until_complete
async def step_predictions_equal(context): async def step_predictions_different(context):
n_completions = await gather_tasks_results(context) n_completions = await gather_tasks_results(context)
assert n_completions >= 2, "need at least 2 completions" assert n_completions >= 2, "need at least 2 completions"
assert_all_predictions_different(context.tasks_result) assert_all_predictions_different(context.tasks_result)
context.tasks_result = [] context.tasks_result = []
@step('all token probabilities are equal')
@async_run_until_complete
async def step_token_probabilities_equal(context):
n_completions = await gather_tasks_results(context)
assert n_completions >= 2, "need at least 2 completions"
assert_all_token_probabilities_equal(context.tasks_result)
context.tasks_result = []
@step('the completion is truncated') @step('the completion is truncated')
def step_assert_completion_truncated(context): def step_assert_completion_truncated(context):
step_assert_completion_truncated(context, '') step_assert_completion_truncated(context, '')
@ -868,7 +883,8 @@ async def request_completion(prompt,
"cache_prompt": cache_prompt, "cache_prompt": cache_prompt,
"id_slot": id_slot, "id_slot": id_slot,
"seed": seed if seed is not None else 42, "seed": seed if seed is not None else 42,
"temperature": temperature if temperature is not None else "0.8f", "temperature": temperature if temperature is not None else 0.8,
"n_probs": 2,
}, },
headers=headers, headers=headers,
timeout=3600) as response: timeout=3600) as response:
@ -887,6 +903,7 @@ async def oai_chat_completions(user_prompt,
base_path, base_path,
async_client, async_client,
debug=False, debug=False,
temperature=None,
model=None, model=None,
n_predict=None, n_predict=None,
enable_streaming=None, enable_streaming=None,
@ -913,7 +930,8 @@ async def oai_chat_completions(user_prompt,
"model": model, "model": model,
"max_tokens": n_predict, "max_tokens": n_predict,
"stream": enable_streaming, "stream": enable_streaming,
"seed": seed "temperature": temperature if temperature is not None else 0.0,
"seed": seed,
} }
if response_format is not None: if response_format is not None:
payload['response_format'] = response_format payload['response_format'] = response_format
@ -978,7 +996,8 @@ async def oai_chat_completions(user_prompt,
max_tokens=n_predict, max_tokens=n_predict,
stream=enable_streaming, stream=enable_streaming,
response_format=payload.get('response_format'), response_format=payload.get('response_format'),
seed=seed seed=seed,
temperature=payload['temperature']
) )
except openai.error.AuthenticationError as e: except openai.error.AuthenticationError as e:
if expect_api_error is not None and expect_api_error: if expect_api_error is not None and expect_api_error:
@ -1120,6 +1139,23 @@ def assert_all_predictions_different(completion_responses):
assert content_i != content_j, "contents not different" assert content_i != content_j, "contents not different"
def assert_all_token_probabilities_equal(completion_responses):
n_predict = len(completion_responses[0]['completion_probabilities'])
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
for pos in range(n_predict):
for i, response_i in enumerate(completion_responses):
probs_i = response_i['completion_probabilities'][pos]['probs']
print(f"pos {pos}, probs {i}: {probs_i}")
for pos in range(n_predict):
for i, response_i in enumerate(completion_responses):
probs_i = response_i['completion_probabilities'][pos]['probs']
for j, response_j in enumerate(completion_responses):
if i == j:
continue
probs_j = response_j['completion_probabilities'][pos]['probs']
assert probs_i == probs_j, "contents not equal"
async def gather_tasks_results(context): async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks) n_tasks = len(context.concurrent_tasks)
if context.debug: if context.debug:
@ -1258,6 +1294,8 @@ def start_server_background(context):
server_args.extend(['--batch-size', context.n_batch]) server_args.extend(['--batch-size', context.n_batch])
if context.n_ubatch: if context.n_ubatch:
server_args.extend(['--ubatch-size', context.n_ubatch]) server_args.extend(['--ubatch-size', context.n_ubatch])
if context.n_threads:
server_args.extend(['--threads', context.n_threads])
if context.n_gpu_layer: if context.n_gpu_layer:
server_args.extend(['--n-gpu-layers', context.n_gpu_layer]) server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
if context.draft is not None: if context.draft is not None:

View File

@ -371,7 +371,7 @@ static json oaicompat_completion_params_parse(
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
llama_params["stream"] = json_value(body, "stream", false); llama_params["stream"] = json_value(body, "stream", false);
llama_params["temperature"] = json_value(body, "temperature", 0.0); llama_params["temperature"] = json_value(body, "temperature", 1.0);
llama_params["top_p"] = json_value(body, "top_p", 1.0); llama_params["top_p"] = json_value(body, "top_p", 1.0);
// Apply chat template to the list of messages // Apply chat template to the list of messages

View File

@ -301,8 +301,8 @@ static struct ggml_tensor * llama_build_train_graphs(
// not capturing these, to silcence warnings // not capturing these, to silcence warnings
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_custom( return ggml_rope_ext(
ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
); );
}; };

View File

@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
static char * fmt_size(size_t size) { static char * fmt_size(size_t size) {
static char buffer[128]; static char buffer[128];
if (size >= 1024*1024) { if (size >= 1024*1024) {
sprintf(buffer, "%zuM", size/1024/1024); snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
} else { } else {
sprintf(buffer, "%zuK", size/1024); snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
} }
return buffer; return buffer;
} }
@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
tensor->buffer = buffer; tensor->buffer = buffer;
tensor->data = (char *)tensor->view_src->data + tensor->view_offs; tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
tensor->backend = tensor->view_src->backend;
ggml_backend_buffer_init_tensor(buffer, tensor); ggml_backend_buffer_init_tensor(buffer, tensor);
} }

View File

@ -4,7 +4,6 @@
#include "ggml-cuda/common.cuh" #include "ggml-cuda/common.cuh"
#include "ggml-cuda/acc.cuh" #include "ggml-cuda/acc.cuh"
#include "ggml-cuda/alibi.cuh"
#include "ggml-cuda/arange.cuh" #include "ggml-cuda/arange.cuh"
#include "ggml-cuda/argsort.cuh" #include "ggml-cuda/argsort.cuh"
#include "ggml-cuda/binbcast.cuh" #include "ggml-cuda/binbcast.cuh"
@ -44,19 +43,59 @@
#include <mutex> #include <mutex>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string> #include <string>
#include <vector> #include <vector>
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
GGML_UNUSED(level);
GGML_UNUSED(user_data);
fprintf(stderr, "%s", msg);
}
ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
void * ggml_cuda_log_user_data = NULL;
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
ggml_cuda_log_callback = log_callback;
ggml_cuda_log_user_data = user_data;
}
#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
GGML_ATTRIBUTE_FORMAT(2, 3)
static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
if (ggml_cuda_log_callback != NULL) {
va_list args;
va_start(args, format);
char buffer[128];
int len = vsnprintf(buffer, 128, format, args);
if (len < 128) {
ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
} else {
std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
va_end(args);
va_start(args, format);
vsnprintf(&buffer2[0], buffer2.size(), format, args);
ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
}
va_end(args);
}
}
[[noreturn]] [[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
int id = -1; // in case cudaGetDevice fails int id = -1; // in case cudaGetDevice fails
cudaGetDevice(&id); cudaGetDevice(&id);
fprintf(stderr, "CUDA error: %s\n", msg); GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
fprintf(stderr, " %s\n", stmt); GGML_CUDA_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ASSERT to get a stack trace // abort with GGML_ASSERT to get a stack trace
GGML_ASSERT(!"CUDA error"); GGML_ASSERT(!"CUDA error");
} }
@ -92,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
cudaError_t err = cudaGetDeviceCount(&info.device_count); cudaError_t err = cudaGetDeviceCount(&info.device_count);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err)); GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
return info; return info;
} }
@ -100,16 +139,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
int64_t total_vram = 0; int64_t total_vram = 0;
#if defined(GGML_CUDA_FORCE_MMQ) #if defined(GGML_CUDA_FORCE_MMQ)
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
#else #else
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
#endif #endif
#if defined(CUDA_USE_TENSOR_CORES) #if defined(CUDA_USE_TENSOR_CORES)
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__); GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
#else #else
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__); GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
#endif #endif
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count); GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
for (int id = 0; id < info.device_count; ++id) { for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0; int device_vmm = 0;
@ -130,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
cudaDeviceProp prop; cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
info.default_tensor_split[id] = total_vram; info.default_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem; total_vram += prop.totalGlobalMem;
@ -236,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
*actual_size = look_ahead_size; *actual_size = look_ahead_size;
pool_size += look_ahead_size; pool_size += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC #ifdef DEBUG_CUDA_MALLOC
fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz, GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
(uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024)); (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
#endif #endif
return ptr; return ptr;
} }
@ -251,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
return; return;
} }
} }
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
ggml_cuda_set_device(device); ggml_cuda_set_device(device);
CUDA_CHECK(cudaFree(ptr)); CUDA_CHECK(cudaFree(ptr));
pool_size -= size; pool_size -= size;
@ -500,7 +539,9 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
void * dev_ptr; void * dev_ptr;
cudaError_t err = cudaMalloc(&dev_ptr, size); cudaError_t err = cudaMalloc(&dev_ptr, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err)); // clear the error
cudaGetLastError();
GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
return nullptr; return nullptr;
} }
@ -1003,8 +1044,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
if (err != cudaSuccess) { if (err != cudaSuccess) {
// clear the error // clear the error
cudaGetLastError(); cudaGetLastError();
fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size/1024.0/1024.0, cudaGetErrorString(err)); size / 1024.0 / 1024.0, cudaGetErrorString(err));
return nullptr; return nullptr;
} }
@ -2205,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_RELU:
ggml_cuda_op_relu(ctx, dst); ggml_cuda_op_relu(ctx, dst);
break; break;
case GGML_UNARY_OP_SIGMOID:
ggml_cuda_op_sigmoid(ctx, dst);
break;
case GGML_UNARY_OP_HARDSIGMOID: case GGML_UNARY_OP_HARDSIGMOID:
ggml_cuda_op_hardsigmoid(ctx, dst); ggml_cuda_op_hardsigmoid(ctx, dst);
break; break;
@ -2244,7 +2288,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
break; break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]); GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
return false; return false;
} else { } else {
ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst); ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@ -2277,9 +2321,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ROPE: case GGML_OP_ROPE:
ggml_cuda_op_rope(ctx, dst); ggml_cuda_op_rope(ctx, dst);
break; break;
case GGML_OP_ALIBI:
ggml_cuda_op_alibi(ctx, dst);
break;
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
ggml_cuda_op_im2col(ctx, dst); ggml_cuda_op_im2col(ctx, dst);
break; break;
@ -2301,7 +2342,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst)); GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
CUDA_CHECK(err); CUDA_CHECK(err);
} }
@ -2477,7 +2518,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__); GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif #endif
} }
} }
@ -2524,14 +2565,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__); GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
#endif #endif
} }
if (node->op == GGML_OP_MUL_MAT_ID) { if (node->op == GGML_OP_MUL_MAT_ID) {
use_cuda_graph = false; // This node type is not supported by CUDA graph capture use_cuda_graph = false; // This node type is not supported by CUDA graph capture
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__); GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
#endif #endif
} }
@ -2540,7 +2581,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
// Changes in batch size or context size can cause changes to the grid size of some kernels. // Changes in batch size or context size can cause changes to the grid size of some kernels.
use_cuda_graph = false; use_cuda_graph = false;
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
#endif #endif
} }
@ -2559,7 +2600,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
} }
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (cuda_graph_update_required) { if (use_cuda_graph && cuda_graph_update_required) {
cuda_ctx->cuda_graph->number_consecutive_updates++; cuda_ctx->cuda_graph->number_consecutive_updates++;
} else { } else {
cuda_ctx->cuda_graph->number_consecutive_updates = 0; cuda_ctx->cuda_graph->number_consecutive_updates = 0;
@ -2568,7 +2609,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) { if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true; cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif #endif
} }
} }
@ -2606,7 +2647,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
if (!ok) { if (!ok) {
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
} }
GGML_ASSERT(ok); GGML_ASSERT(ok);
} }
@ -2625,7 +2666,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
use_cuda_graph = false; use_cuda_graph = false;
cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true; cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__); GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
#endif #endif
} else { } else {
graph_evaluated_or_captured = true; // CUDA graph has been captured graph_evaluated_or_captured = true; // CUDA graph has been captured
@ -2692,7 +2733,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
if (stat == cudaErrorGraphExecUpdateFailure) { if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "%s: CUDA graph update failed\n", __func__); GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
#endif #endif
// The pre-existing graph exec cannot be updated due to violated constraints // The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate // so instead clear error and re-instantiate
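
Note: the recovery path described by the comment above follows the usual CUDA pattern: swallow the sticky error left by the failed cudaGraphExecUpdate, destroy the stale executable graph, and instantiate a fresh one from the captured graph. A hedged sketch of that pattern ('instance', 'graph' and 'stream' stand in for the members of cuda_ctx->cuda_graph; not the verbatim code from the file):

// sketch only; CUDA_CHECK is the error-checking macro used elsewhere in this file
cudaGetLastError();                                                     // clear cudaErrorGraphExecUpdateFailure
CUDA_CHECK(cudaGraphExecDestroy(instance));                             // drop the stale executable graph
CUDA_CHECK(cudaGraphInstantiate(&instance, graph, nullptr, nullptr, 0));
CUDA_CHECK(cudaGraphLaunch(instance, stream));                          // launch the re-instantiated graph
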
@ -2714,12 +2755,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
} }
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
switch (op->op) { switch (op->op) {
case GGML_OP_UNARY: case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) { switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_HARDSIGMOID: case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH: case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_GELU_QUICK:
@ -2829,7 +2872,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_ALIBI:
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS: case GGML_OP_SUM_ROWS:
@ -2841,8 +2883,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
case GGML_OP_FLASH_ATTN_EXT:
return true; return true;
case GGML_OP_FLASH_ATTN_EXT:
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
#else
if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
return true;
}
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
default: default:
return false; return false;
} }
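
Note: restated as a stand-alone predicate, the new GGML_OP_FLASH_ATTN_EXT rule above is: head sizes 64 and 128 are always accepted (the tile and vec kernels added below cover them on every device, including AMD), while any other head size is only accepted on a CUDA device with compute capability >= CC_VOLTA, i.e. where the tensor-core path is available. A hypothetical helper expressing the same check:

static bool fattn_ext_supported(int64_t head_size, int cc, bool is_amd) {
    if (head_size == 64 || head_size == 128) {
        return true;                  // covered by the tile/vec kernels on all devices
    }
    return !is_amd && cc >= CC_VOLTA; // other head sizes need the wmma (tensor core) kernel
}
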
@ -2940,13 +2990,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
if (device < 0 || device >= ggml_backend_cuda_get_device_count()) { if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
fprintf(stderr, "%s: error: invalid device %d\n", __func__, device); GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
return nullptr; return nullptr;
} }
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device); ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
if (ctx == nullptr) { if (ctx == nullptr) {
fprintf(stderr, "%s: error: failed to allocate context\n", __func__); GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
return nullptr; return nullptr;
} }
@ -2990,8 +3040,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
// clear the error // clear the error
cudaGetLastError(); cudaGetLastError();
fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__, GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
size/1024.0/1024.0, cudaGetErrorString(err)); size / 1024.0 / 1024.0, cudaGetErrorString(err));
return false; return false;
} }
return true; return true;


@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer); GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
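
Note: the new ggml_backend_cuda_log_set_callback entry point redirects the GGML_CUDA_LOG_WARN/ERROR output introduced in the hunks above. A hedged usage sketch (the callback type is ggml_log_callback; the exact ggml_log_level values come from ggml.h and may differ between versions):

// hypothetical host-side usage; assumes ggml.h (for ggml_log_level) and ggml-cuda.h are included
#include <cstdio>

static void my_cuda_log(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
}

// during initialization:
//     ggml_backend_cuda_log_set_callback(my_cuda_log, /*user_data =*/ NULL);
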


@ -1,63 +0,0 @@
#include "alibi.cuh"
static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
const int n_heads_log2_floor, const float m0, const float m1) {
const int col = blockDim.x*blockIdx.x + threadIdx.x;
if (col >= ncols) {
return;
}
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int i = row*ncols + col;
const int k = row/k_rows;
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
dst[i] = col * m_k + x[i];
}
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
const int k_rows, const int n_heads_log2_floor, const float m0,
const float m1, cudaStream_t stream) {
const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
const dim3 block_nums(num_blocks_x, nrows, 1);
alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
}
void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t nrows = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
//GGML_ASSERT(ne01 + n_past == ne00);
GGML_ASSERT(n_head == ne02);
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
alibi_f32_cuda(src0_d, dst_d, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, stream);
}


@ -1,5 +0,0 @@
#include "common.cuh"
#define CUDA_ALIBI_BLOCK_SIZE 32
void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@ -315,12 +315,30 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#endif #endif
return c; return c;
} }
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
// __shfl_xor() for half2 was added in ROCm 5.6
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
typedef union half2_b32 {
half2 val;
int b32;
} half2_b32_t;
half2_b32_t tmp;
tmp.val = var;
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
return tmp.val;
}
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
#endif // defined(GGML_USE_HIPBLAS) #endif // defined(GGML_USE_HIPBLAS)
#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL #define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA #define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
static bool fast_fp16_available(const int cc) {
return cc >= CC_PASCAL && cc != 610;
}
static bool fp16_mma_available(const int cc) { static bool fp16_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA; return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
} }
@ -459,6 +477,17 @@ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v); typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
static __device__ __forceinline__ float get_alibi_slope(
const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
) {
if (max_bias <= 0.0f) {
return 1.0f;
}
const float base = h < n_head_log2 ? m0 : m1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
return powf(base, exph);
}
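
Note: get_alibi_slope reproduces, per attention head, the slope schedule that the removed alibi kernel computed from m0/m1; the host derives m0, m1 and n_head_log2 from max_bias (see launch_fattn in fattn-common.cuh below). A host-side sketch with a worked example under assumed values (hypothetical helper, not part of the diff):

// assuming n_head = 4 and max_bias = 8:
//   n_head_log2 = 4, m0 = 2^(-8/4) = 0.25, m1 = 2^(-4/4) = 0.5
//   head 0 -> 0.25^1 = 0.25,   head 1 -> 0.25^2 = 0.0625,
//   head 2 -> 0.25^3 ~ 0.0156, head 3 -> 0.25^4 ~ 0.0039
#include <cmath>
#include <cstdint>

static float alibi_slope_host(float max_bias, uint32_t h, uint32_t n_head) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -max_bias          / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
}
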
////////////////////// //////////////////////

162
ggml-cuda/fattn-common.cuh Normal file

@ -0,0 +1,162 @@
#include "common.cuh"
#include <cstdint>
#define FATTN_KQ_STRIDE 256
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
typedef void (* fattn_kernel_t)(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const int ne00,
const int ne01,
const int ne02,
const int ne03,
const int ne10,
const int ne11,
const int ne12,
const int ne13,
const int ne31,
const int nb31,
const int nb01,
const int nb02,
const int nb03,
const int nb11,
const int nb12,
const int nb13,
const int ne0,
const int ne1,
const int ne2,
const int ne3);
template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results(
const float * __restrict__ VKQ_parts,
const float2 * __restrict__ VKQ_meta,
float * __restrict__ dst) {
VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
VKQ_meta += parallel_blocks * gridDim.y*blockIdx.x;
dst += D * gridDim.y*blockIdx.x;
const int tid = threadIdx.x;
__builtin_assume(tid < D);
__shared__ float2 meta[parallel_blocks];
if (tid < 2*parallel_blocks) {
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
}
__syncthreads();
float kqmax = meta[0].x;
#pragma unroll
for (int l = 1; l < parallel_blocks; ++l) {
kqmax = max(kqmax, meta[l].x);
}
float VKQ_numerator = 0.0f;
float VKQ_denominator = 0.0f;
#pragma unroll
for (int l = 0; l < parallel_blocks; ++l) {
const float diff = meta[l].x - kqmax;
const float KQ_max_scale = expf(diff);
const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
*((uint32_t *) &KQ_max_scale) &= ftz_mask;
VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
VKQ_denominator += KQ_max_scale * meta[l].y;
}
dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
}
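
Note: each of the parallel_blocks partial results carries its own running softmax maximum m_l (meta[l].x) and denominator d_l (meta[l].y), plus an unnormalized numerator VKQ_l in VKQ_parts. In that notation the kernel above computes, per output element:

    m   = max_l m_l
    dst = ( Σ_l exp(m_l - m) · VKQ_l ) / ( Σ_l exp(m_l - m) · d_l )

Any rescale factor whose exponent m_l - m falls below SOFTMAX_FTZ_THRESHOLD is flushed to zero through the bit mask, so an underflowing exp() cannot introduce NaNs into the division.
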
template <int D, int parallel_blocks>
void launch_fattn(ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, int nwarps, int cols_per_block) {
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const ggml_tensor * mask = dst->src[3];
ggml_tensor * KQV = dst;
GGML_ASSERT(Q->type == GGML_TYPE_F32);
GGML_ASSERT(K->type == GGML_TYPE_F16);
GGML_ASSERT(V->type == GGML_TYPE_F16);
GGML_ASSERT(KQV->type == GGML_TYPE_F32);
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
ggml_cuda_pool & pool = ctx.pool();
cudaStream_t main_stream = ctx.stream();
ggml_cuda_pool_alloc<float> dst_tmp(pool);
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
if (parallel_blocks > 1) {
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
}
const dim3 block_dim(WARP_SIZE, nwarps, 1);
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
const int shmem = 0;
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
const uint32_t n_head = Q->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
fattn_kernel<<<blocks_num, block_dim, shmem, main_stream>>>(
(const char *) Q->data,
(const char *) K->data,
(const char *) V->data,
mask ? ((const char *) mask->data) : nullptr,
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale, max_bias, m0, m1, n_head_log2,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
Q->nb[1], Q->nb[2], Q->nb[3],
K->nb[1], K->nb[2], K->nb[3],
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
);
CUDA_CHECK(cudaGetLastError());
if ((parallel_blocks) == 1) {
return;
}
const dim3 block_dim_combine(D, 1, 1);
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
const int shmem_combine = 0;
flash_attn_combine_results<D, parallel_blocks>
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
CUDA_CHECK(cudaGetLastError());
}
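
Note: for intuition about the launch geometry above: when parallel_blocks > 1 the KV sequence is split across several blocks per Q column, so launch_fattn allocates parallel_blocks copies of the output plus one float2 of (max, sum) metadata per output row, and flash_attn_combine_results merges them afterwards. A worked example under assumed shapes (hypothetical numbers, not taken from the diff):

// D = 128, cols_per_block = 16, parallel_blocks = 4, Q->ne = {128, 32, 8, 1}
//   block_dim    = (WARP_SIZE, nwarps, 1)
//   blocks_num   = (4 * ceil(32/16), 8, 1) = (8, 8, 1)
//   dst_tmp      = 4 * 128*32*8 floats       (parallel_blocks copies of KQV)
//   dst_tmp_meta = 4 * 32*8     float2 values (one (kqmax, kqsum) pair per row and block)
//   combine pass = flash_attn_combine_results<<<(32, 8, 1), (128, 1, 1)>>>
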

316
ggml-cuda/fattn-tile-f16.cu Normal file

@ -0,0 +1,316 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-tile-f16.cuh"
#define FATTN_KQ_STRIDE_TILE_F16 64
template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const int ne00,
const int ne01,
const int ne02,
const int ne03,
const int ne10,
const int ne11,
const int ne12,
const int ne13,
const int ne31,
const int nb31,
const int nb01,
const int nb02,
const int nb03,
const int nb11,
const int nb12,
const int nb13,
const int ne0,
const int ne1,
const int ne2,
const int ne3) {
#if FP16_AVAILABLE
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic0;
const int stride_KV2 = nb11 / sizeof(half2);
const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
const half slopeh = __float2half(slopef);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
__shared__ half KQ[ncols*FATTN_KQ_STRIDE_TILE_F16];
half2 * KQ2 = (half2 *) KQ;
__shared__ half2 KV_tmp[FATTN_KQ_STRIDE_TILE_F16][D/2 + 1]; // Pad D to avoid memory bank conflicts.
half kqmax[ncols/nwarps];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
kqmax[j0/nwarps] = -HALF_MAX_HALF;
}
half2 kqsum[ncols/nwarps] = {{0.0f, 0.0f}};
half2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
// Convert Q to half2 and store in shared memory:
__shared__ half2 Q_h2[ncols][D/2];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
}
}
__syncthreads();
const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F16;
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F16) {
// Calculate KQ tile and keep track of new maximum KQ values:
half kqmax_new[ncols/nwarps];
#pragma unroll
for (int j = 0; j < ncols/nwarps; ++j) {
kqmax_new[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
const int k_KQ = k_KQ_0 + threadIdx.x;
KV_tmp[i_KQ][k_KQ] = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
}
}
__syncthreads();
half2 sum2[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE][ncols/nwarps] = {{{0.0f, 0.0f}}};
#pragma unroll
for (int k_KQ = 0; k_KQ < D/2; ++k_KQ) {
half2 K_k[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE];
half2 Q_k[ncols/nwarps];
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
const int i_KQ = i_KQ_0 + threadIdx.x;
K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
}
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
Q_k[j_KQ_0/nwarps] = Q_h2[j_KQ][k_KQ];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE]*Q_k[j_KQ_0/nwarps];
}
}
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
const int i_KQ = i_KQ_0 + threadIdx.x;
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
half sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F16 + i_KQ] = sum;
}
}
__syncthreads();
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]));
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
#pragma unroll
for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F16/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const half2 diff = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] - __half2half2(kqmax[j0/nwarps]);
const half2 val = h2exp(diff);
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + val;
KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] = val;
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
}
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += nwarps) {
const int k = k0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
KV_tmp[k][i] = V_h2[(k_VKQ_0 + k)*stride_KV2 + i];
}
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += 2) {
half2 V_k[(D/2)/WARP_SIZE][2];
half2 KQ_k[ncols/nwarps];
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
V_k[i0/WARP_SIZE][0] = KV_tmp[k0 + 0][i];
V_k[i0/WARP_SIZE][1] = KV_tmp[k0 + 1][i];
}
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
KQ_k[j0/nwarps] = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + k0/2];
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][0]* __low2half2(KQ_k[j0/nwarps]);
VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][1]*__high2half2(KQ_k[j0/nwarps]);
}
}
}
__syncthreads();
}
#pragma unroll
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
const int j_VKQ = j_VKQ_0 + threadIdx.y;
if (ic0 + j_VKQ >= ne01) {
return;
}
half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
kqsum_j = warp_reduce_sum(kqsum_j);
#pragma unroll
for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
const int i0 = i00 + 2*threadIdx.x;
half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
if (parallel_blocks == 1) {
dst_val /= __half2half2(kqsum_j);
}
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = __low2float(dst_val);
dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = __high2float(dst_val);
}
if (parallel_blocks != 1 && threadIdx.x == 0) {
dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
}
}
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
template <int cols_per_block, int parallel_blocks>
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
switch (Q->ne[0]) {
case 64: {
constexpr int D = 64;
constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} break;
case 128: {
constexpr int D = 128;
constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} break;
default: {
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}
void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const int32_t precision = KQV->op_params[2];
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
if (Q->ne[1] <= 16) {
constexpr int cols_per_block = 16;
constexpr int parallel_blocks = 4;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
return;
}
if (Q->ne[1] <= 32) {
constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 4;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
return;
}
constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 1;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
}
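
Note: the branches above (and the analogous ones in the f32 dispatcher further down) trade column parallelism against KV-sequence parallelism: small query batches get parallel_blocks = 4 so the KV dimension also contributes blocks, at the cost of the extra combine pass, while larger batches already fill the GPU with one block per 32 columns. In summary:

//   Q->ne[1] <= 16 : cols_per_block = 16, parallel_blocks = 4
//   Q->ne[1] <= 32 : cols_per_block = 32, parallel_blocks = 4
//   otherwise      : cols_per_block = 32, parallel_blocks = 1   (no combine pass needed)
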


@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

309
ggml-cuda/fattn-tile-f32.cu Normal file

@ -0,0 +1,309 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-tile-f32.cuh"
#define FATTN_KQ_STRIDE_TILE_F32 32
template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f32(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const int ne00,
const int ne01,
const int ne02,
const int ne03,
const int ne10,
const int ne11,
const int ne12,
const int ne13,
const int ne31,
const int nb31,
const int nb01,
const int nb02,
const int nb03,
const int nb11,
const int nb12,
const int nb13,
const int ne0,
const int ne1,
const int ne2,
const int ne3) {
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic0;
const int stride_KV2 = nb11 / sizeof(half2);
const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
__shared__ float KQ[ncols*FATTN_KQ_STRIDE_TILE_F32];
__shared__ float KV_tmp[FATTN_KQ_STRIDE_TILE_F32][D + 1]; // Pad D to avoid memory bank conflicts.
float2 * KV_tmp2 = (float2 *) KV_tmp;
float kqmax[ncols/nwarps];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
kqmax[j0/nwarps] = -FLT_MAX/2.0f;
}
float kqsum[ncols/nwarps] = {0.0f};
float2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
// Convert Q to float and store in shared memory:
__shared__ float Q_f[ncols][D];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) {
float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x] : make_float2(0.0f, 0.0f);
Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale;
Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale;
}
}
__syncthreads();
const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F32;
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F32) {
// Calculate KQ tile and keep track of new maximum KQ values:
float kqmax_new[ncols/nwarps];
#pragma unroll
for (int j = 0; j < ncols/nwarps; ++j) {
kqmax_new[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) {
const half2 tmp = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x];
KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] = __low2float(tmp);
KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp);
}
}
__syncthreads();
float sum[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE][ncols/nwarps] = {{0.0f}};
#pragma unroll
for (int k_KQ = 0; k_KQ < D; ++k_KQ) {
float K_k[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE];
float Q_k[ncols/nwarps];
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
const int i_KQ = i_KQ_0 + threadIdx.x;
K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
}
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
Q_k[j_KQ_0/nwarps] = Q_f[j_KQ][k_KQ];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE] * Q_k[j_KQ_0/nwarps];
}
}
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
const int i_KQ = i_KQ_0 + threadIdx.x;
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F32 + i_KQ] = sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps];
}
}
__syncthreads();
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
float kqsum_add = 0.0f;
#pragma unroll
for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F32; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float diff = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] - kqmax[j0/nwarps];
const float val = expf(diff);
kqsum_add += val;
KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] = val;
}
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
}
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F32; k0 += nwarps) {
const int k = k0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
KV_tmp2[k*(D/2) + i].x = __low2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
KV_tmp2[k*(D/2) + i].y = __high2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
}
}
__syncthreads();
#pragma unroll
for (int k = 0; k < FATTN_KQ_STRIDE_TILE_F32; ++k) {
float2 V_k[(D/2)/WARP_SIZE];
float KQ_k[ncols/nwarps];
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
V_k[i0/WARP_SIZE] = KV_tmp2[k*(D/2) + i];
}
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
KQ_k[j0/nwarps] = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + k];
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
VKQ[j0/nwarps][i0/WARP_SIZE].x += V_k[i0/WARP_SIZE].x*KQ_k[j0/nwarps];
VKQ[j0/nwarps][i0/WARP_SIZE].y += V_k[i0/WARP_SIZE].y*KQ_k[j0/nwarps];
}
}
}
__syncthreads();
}
#pragma unroll
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
const int j_VKQ = j_VKQ_0 + threadIdx.y;
if (ic0 + j_VKQ >= ne01) {
return;
}
float kqsum_j = kqsum[j_VKQ_0/nwarps];
kqsum_j = warp_reduce_sum(kqsum_j);
#pragma unroll
for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
const int i0 = i00 + 2*threadIdx.x;
float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
if (parallel_blocks == 1) {
dst_val.x /= kqsum_j;
dst_val.y /= kqsum_j;
}
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = dst_val.x;
dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = dst_val.y;
}
if (parallel_blocks != 1 && threadIdx.x == 0) {
dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
}
}
}
template <int cols_per_block, int parallel_blocks>
void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
switch (Q->ne[0]) {
case 64: {
constexpr int D = 64;
constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} break;
case 128: {
constexpr int D = 128;
constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} break;
default: {
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
if (Q->ne[1] <= 16) {
constexpr int cols_per_block = 16;
constexpr int parallel_blocks = 4;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
return;
}
if (Q->ne[1] <= 32) {
constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 4;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
return;
}
constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 1;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
}


@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

330
ggml-cuda/fattn-vec-f16.cu Normal file

@ -0,0 +1,330 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-vec-f16.cuh"
template<int D, int ncols, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const int ne00,
const int ne01,
const int ne02,
const int ne03,
const int ne10,
const int ne11,
const int ne12,
const int ne13,
const int ne31,
const int nb31,
const int nb01,
const int nb02,
const int nb03,
const int nb11,
const int nb12,
const int nb13,
const int ne0,
const int ne1,
const int ne2,
const int ne3) {
#if FP16_AVAILABLE
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic0;
const int stride_KV = nb11 / sizeof(half);
const int stride_KV2 = nb11 / sizeof(half2);
const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
const half slopeh = __float2half(slopef);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < D);
__shared__ half KQ[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -HALF_MAX_HALF;
}
half2 * KQ2 = (half2 *) KQ;
half kqmax[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -HALF_MAX_HALF;
}
half kqsum[ncols] = {0.0f};
__shared__ half kqmax_shared[ncols][WARP_SIZE];
__shared__ half kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__syncthreads();
// Convert Q to half2 and store in registers:
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
}
}
half2 VKQ[ncols] = {{0.0f, 0.0f}};
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
half kqmax_new = kqmax[0];
half kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
break;
}
half2 sum2[ncols] = {{0.0f, 0.0f}};
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum2[j] = warp_reduce_sum(sum2[j]);
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
if (ncols == 1) {
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
} else {
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
}
if (threadIdx.x == 0) {
KQ[j*D + i_KQ] = sum;
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= __half2half2(KQ_max_scale);
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < D; k0 += 2) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
break;
}
half2 V_k;
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
}
}
__syncthreads();
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum(kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
break;
}
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
if (parallel_blocks == 1) {
dst_val /= kqsum[j_VKQ];
}
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
}
if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
}
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_tensor * KQV = dst;
ggml_tensor * Q = dst->src[0];
const int32_t precision = KQV->op_params[2];
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64: {
constexpr int D = 64;
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} break;
case 128: {
constexpr int D = 128;
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} break;
case 256: {
constexpr int D = 256;
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} break;
default:
GGML_ASSERT(false);
break;
}
}
template <int cols_per_block, int parallel_blocks>
void launch_fattn_vec_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
switch (Q->ne[0]) {
case 64: {
constexpr int D = 64;
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} break;
case 128: {
constexpr int D = 128;
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} break;
default: {
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}
void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const int32_t precision = KQV->op_params[2];
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
if (Q->ne[1] == 1) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4;
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4;
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
return;
}
if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 4;
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
return;
}
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1;
launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
}


@ -0,0 +1,5 @@
#include "common.cuh"
void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

279
ggml-cuda/fattn-vec-f32.cu Normal file

@ -0,0 +1,279 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-vec-f32.cuh"
template<int D, int ncols, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f32(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const int ne00,
const int ne01,
const int ne02,
const int ne03,
const int ne10,
const int ne11,
const int ne12,
const int ne13,
const int ne31,
const int nb31,
const int nb01,
const int nb02,
const int nb03,
const int nb11,
const int nb12,
const int nb13,
const int ne0,
const int ne1,
const int ne2,
const int ne3) {
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic0;
const int stride_KV = nb11 / sizeof(half);
const int stride_KV2 = nb11 / sizeof(half2);
const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < D);
__shared__ float KQ[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -FLT_MAX/2.0f;
}
float kqmax[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -FLT_MAX/2.0f;
}
float kqsum[ncols] = {0.0f};
__shared__ float kqmax_shared[ncols][WARP_SIZE];
__shared__ float kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__syncthreads();
// Convert Q to float2 and store in registers:
float2 Q_h2[ncols][D/(2*WARP_SIZE)];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
Q_h2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
Q_h2[j][i0/WARP_SIZE].x *= scale;
Q_h2[j][i0/WARP_SIZE].y *= scale;
}
}
float VKQ[ncols] = {0.0f};
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
float kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
break;
}
float sum[ncols] = {0.0f};
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum[j] += __low2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].x;
sum[j] += __high2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].y;
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum[j] = warp_reduce_sum(sum[j]);
sum[j] += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum[j]);
if (threadIdx.x == 0) {
KQ[j*D + i_KQ] = sum[j];
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const float val = expf(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= KQ_max_scale;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < D; ++k) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
break;
}
const float V_ki = __half2float(V_h[(k_VKQ_0 + k)*stride_KV + tid]);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_ki*KQ[j*D + k];
}
}
__syncthreads();
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum(kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
break;
}
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
float dst_val = VKQ[j_VKQ];
if (parallel_blocks == 1) {
dst_val /= kqsum[j_VKQ];
}
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
}
if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
}
}
template <int cols_per_block, int parallel_blocks>
void launch_fattn_vec_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
switch (Q->ne[0]) {
case 64: {
constexpr int D = 64;
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} break;
case 128: {
constexpr int D = 128;
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} break;
default: {
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}
void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4;
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4;
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
return;
}
if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 4;
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
return;
}
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1;
launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
}


@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@ -1,4 +1,9 @@
#include "common.cuh" #include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-tile-f16.cuh"
#include "fattn-tile-f32.cuh"
#include "fattn-vec-f16.cuh"
#include "fattn-vec-f32.cuh"
#include "fattn.cuh" #include "fattn.cuh"
#include <cstdint> #include <cstdint>
@ -7,235 +12,6 @@
#include <mma.h> #include <mma.h>
#endif #endif
#define FATTN_KQ_STRIDE 256
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
template<int D, int ncols, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const int ne00,
const int ne01,
const int ne02,
const int ne03,
const int ne10,
const int ne11,
const int ne12,
const int ne13,
const int ne31,
const int nb31,
const int nb01,
const int nb02,
const int nb03,
const int nb11,
const int nb12,
const int nb13,
const int ne0,
const int ne1,
const int ne2,
const int ne3) {
#if FP16_AVAILABLE
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic0;
const int stride_KV = nb11 / sizeof(half);
const int stride_KV2 = nb11 / sizeof(half2);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < D);
__shared__ half KQ[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -HALF_MAX_HALF;
}
half2 * KQ2 = (half2 *) KQ;
half kqmax[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -HALF_MAX_HALF;
}
half kqsum[ncols] = {0.0f};
__shared__ half kqmax_shared[ncols][WARP_SIZE];
__shared__ half kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__syncthreads();
// Convert Q to half2 and store in registers:
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
}
}
half2 VKQ[ncols] = {{0.0f, 0.0f}};
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
half kqmax_new = kqmax[0];
half kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
break;
}
half2 sum2[ncols] = {{0.0f, 0.0f}};
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum2[j] = warp_reduce_sum(sum2[j]);
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
sum += mask ? maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
if (ncols == 1) {
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
} else {
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
}
if (threadIdx.x == 0) {
KQ[j*D + i_KQ] = sum;
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= __half2half2(KQ_max_scale);
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < D; k0 += 2) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
break;
}
half2 V_k;
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
}
}
__syncthreads();
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum(kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
if (parallel_blocks == 1) {
dst_val /= kqsum[j_VKQ];
}
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
}
if (parallel_blocks != 1 && tid != 0) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
}
}
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
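For reference, the rescaling in the main loop above is the standard streaming-softmax update. With m the running maximum (kqmax), S the running denominator (kqsum), O the running numerator (VKQ) and s_i the KQ scores of the current tile (notation mine, not from the diff):

\[
\begin{aligned}
m' &= \max\bigl(m,\ \max_i s_i\bigr) \\
S' &= e^{\,m-m'}\,S + \sum_i e^{\,s_i-m'} \\
O' &= e^{\,m-m'}\,O + \sum_i e^{\,s_i-m'}\,v_i
\end{aligned}
\]

The kernel writes O/S directly when parallel_blocks == 1; otherwise it stores the unnormalized O together with the (kqmax, kqsum) pair in dst_meta so that flash_attn_combine_results below can finish the normalization.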
// D == head size, VKQ_stride == num VKQ rows calculated in parallel: // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t> template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -249,6 +25,10 @@ static __global__ void flash_attn_ext_f16(
float * __restrict__ dst, float * __restrict__ dst,
float2 * __restrict__ dst_meta, float2 * __restrict__ dst_meta,
const float scale, const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const int ne00, const int ne00,
const int ne01, const int ne01,
const int ne02, const int ne02,
@ -305,6 +85,10 @@ static __global__ void flash_attn_ext_f16(
const int stride_Q = nb01 / sizeof(float); const int stride_Q = nb01 / sizeof(float);
const int stride_KV = nb11 / sizeof(half); const int stride_KV = nb11 / sizeof(half);
const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
const half slopeh = __float2half(slopef);
const half2 slope2 = make_half2(slopef, slopef);
frag_b Q_b[D/16][ncols/frag_n]; frag_b Q_b[D/16][ncols/frag_n];
// A single buffer for temporarily holding tiles of KQ and VKQ parts: // A single buffer for temporarily holding tiles of KQ and VKQ parts:
@ -421,7 +205,7 @@ static __global__ void flash_attn_ext_f16(
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) { for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
const int k = k0 + threadIdx.x; const int k = k0 + threadIdx.x;
KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f; KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]); KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]);
} }
KQ_max_new = warp_reduce_max(KQ_max_new); KQ_max_new = warp_reduce_max(KQ_max_new);
@ -464,7 +248,7 @@ static __global__ void flash_attn_ext_f16(
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) { for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
const int k = k0 + threadIdx.x; const int k = k0 + threadIdx.x;
KQ2_tmp[k0/WARP_SIZE] += mask ? mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f); KQ2_tmp[k0/WARP_SIZE] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]); KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
} }
KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new)))); KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
@ -621,54 +405,6 @@ static __global__ void flash_attn_ext_f16(
#endif // FP16_MMA_AVAILABLE #endif // FP16_MMA_AVAILABLE
} }
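The common thread in these hunks is that the ALiBi bias is folded into the additive mask term instead of being applied through a separate tensor, i.e. the pre-softmax logit for query j, key k and head h becomes (notation mine):

\[
s_{jk} = \mathrm{scale}\cdot q_j^{\top} k_k + \mathrm{slope}(h)\,m_{jk},
\qquad
\mathrm{slope}(h) = \mathrm{get\_alibi\_slope}(\mathrm{max\_bias},\,h,\,\mathrm{n\_head\_log2},\,m_0,\,m_1)
\]

with m the optional mask. When max_bias == 0 the slope reduces to 1, so a plain causal mask passes through unchanged; the separate positions tensor that the old soft_max path used for ALiBi (see the softmax.cu hunks further down) is removed.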
template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results(
const float * __restrict__ VKQ_parts,
const float2 * __restrict__ VKQ_meta,
float * __restrict__ dst) {
#if FP16_AVAILABLE
VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
VKQ_meta += parallel_blocks * gridDim.y*blockIdx.x;
dst += D * gridDim.y*blockIdx.x;
const int tid = threadIdx.x;
__builtin_assume(tid < D);
__shared__ float2 meta[parallel_blocks];
if (tid < 2*parallel_blocks) {
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
}
__syncthreads();
float kqmax = meta[0].x;
#pragma unroll
for (int l = 1; l < parallel_blocks; ++l) {
kqmax = max(kqmax, meta[l].x);
}
float VKQ_numerator = 0.0f;
float VKQ_denominator = 0.0f;
#pragma unroll
for (int l = 0; l < parallel_blocks; ++l) {
const float diff = meta[l].x - kqmax;
const float KQ_max_scale = expf(diff);
const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
*((uint32_t *) &KQ_max_scale) &= ftz_mask;
VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
VKQ_denominator += KQ_max_scale * meta[l].y;
}
dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
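In effect the kernel above recombines the per-block partial results as (notation mine):

\[
\mathrm{dst} = \frac{\sum_l e^{\,m_l-m}\,N_l}{\sum_l e^{\,m_l-m}\,D_l},
\qquad m = \max_l m_l
\]

where (m_l, D_l) are the (kqmax, kqsum) pairs read from VKQ_meta and N_l are the unnormalized partial outputs in VKQ_parts. Scale factors whose exponent m_l - m falls below SOFTMAX_FTZ_THRESHOLD are flushed to exactly zero via the ftz_mask bit trick.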
constexpr int get_max_power_of_2(int x) { constexpr int get_max_power_of_2(int x) {
return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1; return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1;
} }
@ -693,262 +429,94 @@ static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed."); static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed."); static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16( template <int D, int cols_per_block, int nwarps, typename KQ_acc_t>
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask, void launch_fattn_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_pool & pool, cudaStream_t main_stream const ggml_tensor * Q = dst->src[0];
) {
ggml_cuda_pool_alloc<float> dst_tmp(pool);
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
if (parallel_blocks > 1) { constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16;
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
}
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
const dim3 block_dim(WARP_SIZE, nwarps, 1);
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
const int shmem = 0;
float scale;
memcpy(&scale, KQV->op_params, sizeof(float));
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
<<<blocks_num, block_dim, shmem, main_stream>>> (
(const char *) Q->data,
(const char *) K->data,
(const char *) V->data,
mask ? ((const char *) mask->data) : nullptr,
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
Q->nb[1], Q->nb[2], Q->nb[3],
K->nb[1], K->nb[2], K->nb[3],
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
);
CUDA_CHECK(cudaGetLastError());
if (parallel_blocks == 1) {
return;
}
const dim3 block_dim_combine(D, 1, 1);
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
const int shmem_combine = 0;
flash_attn_combine_results<D, parallel_blocks>
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
CUDA_CHECK(cudaGetLastError());
}
template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename KQ_acc_t> void launch_fattn_f16_impl(
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
ggml_cuda_pool & pool, cudaStream_t main_stream
) {
ggml_cuda_pool_alloc<float> dst_tmp(pool);
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
if (parallel_blocks > 1) {
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
}
constexpr int frag_m = (cols_per_block) == 8 && (D) % 32 == 0 ? 32 : 16;
const dim3 block_dim(WARP_SIZE, nwarps, 1);
const dim3 blocks_num(parallel_blocks*(Q->ne[1] + cols_per_block - 1) / cols_per_block, Q->ne[2], Q->ne[3]);
const int shmem = 0;
float scale;
memcpy(&scale, KQV->op_params, sizeof(float));
flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>
<<<blocks_num, block_dim, shmem, main_stream>>> (
(const char *) Q->data,
(const char *) K->data,
(const char *) V->data,
mask ? ((const char *) mask->data) : nullptr,
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
Q->nb[1], Q->nb[2], Q->nb[3],
K->nb[1], K->nb[2], K->nb[3],
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
);
CUDA_CHECK(cudaGetLastError());
if ((parallel_blocks) == 1) {
return;
}
const dim3 block_dim_combine(D, 1, 1);
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
const int shmem_combine = 0;
flash_attn_combine_results<D, parallel_blocks>
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
CUDA_CHECK(cudaGetLastError());
}
template <int D, int cols_per_block, int nwarps, typename KQ_acc_t> void launch_fattn_f16(
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
const int nsm, ggml_cuda_pool & pool, cudaStream_t main_stream
) {
const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3]; const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
if (4*blocks_num_pb1 < 2*nsm) { if (4*blocks_num_pb1 < 2*nsm) {
launch_fattn_f16_impl<D, cols_per_block, nwarps, 4, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream); constexpr int parallel_blocks = 4;
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
return; return;
} }
if (2*blocks_num_pb1 < 2*nsm) { if (2*blocks_num_pb1 < 2*nsm) {
launch_fattn_f16_impl<D, cols_per_block, nwarps, 2, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream); constexpr int parallel_blocks = 2;
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
return; return;
} }
launch_fattn_f16_impl<D, cols_per_block, nwarps, 1, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream); constexpr int parallel_blocks = 1;
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
} }
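A worked example of the occupancy heuristic above (illustrative numbers; nsm is the number of streaming multiprocessors):

// nsm = 108 (e.g. an A100), cols_per_block = 16, 8 heads, batch size 1:
//   35 queries   -> blocks_num_pb1 = ceil(35/16)*8 = 24;   4*24 = 96 < 2*108 = 216  -> parallel_blocks = 4
//   1024 queries -> blocks_num_pb1 = 64*8          = 512;  4*512 and 2*512 >= 216   -> parallel_blocks = 1
// Small workloads are split along the KV dimension to occupy more SMs, at the cost of the extra
// combine pass; large workloads already saturate the GPU and skip it.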
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0]; const ggml_tensor * KQV = dst;
const ggml_tensor * K = dst->src[1]; const ggml_tensor * Q = dst->src[0];
const ggml_tensor * V = dst->src[2];
const ggml_tensor * mask = dst->src[3];
ggml_tensor * KQV = dst;
GGML_ASSERT(Q->type == GGML_TYPE_F32);
GGML_ASSERT(K->type == GGML_TYPE_F16);
GGML_ASSERT(V->type == GGML_TYPE_F16);
GGML_ASSERT(KQV->type == GGML_TYPE_F32);
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
ggml_cuda_set_device(ctx.device); ggml_cuda_set_device(ctx.device);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const int32_t precision = KQV->op_params[2];
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; // On AMD the tile kernels perform poorly, use the vec kernel instead:
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm; if (cc >= CC_OFFSET_AMD) {
if (precision == GGML_PREC_DEFAULT) {
ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
} else {
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
}
return;
}
const int32_t precision = KQV->op_params[1]; if (!fast_fp16_available(cc)) {
if (Q->ne[1] <= 8) {
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
} else {
ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
}
return;
}
if (!fp16_mma_available(cc)) { if (!fp16_mma_available(cc)) {
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 8) { if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8; ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
constexpr int parallel_blocks = 4; } else {
switch (Q->ne[0]) { ggml_cuda_flash_attn_ext_tile_f16(ctx, dst);
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
} }
return; return;
} }
if (precision != GGML_PREC_DEFAULT) { if (precision != GGML_PREC_DEFAULT) {
if (Q->ne[1] == 1 && (Q->ne[0] == 64 || Q->ne[0] == 128)) {
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
return;
}
if (Q->ne[1] <= 32 || Q->ne[0] > 128) { if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
constexpr int cols_per_block = 16; constexpr int cols_per_block = 16;
constexpr int nwarps = 4; constexpr int nwarps = 4;
switch (Q->ne[0]) { switch (Q->ne[0]) {
case 64: case 64:
launch_fattn_f16< 64, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 64, cols_per_block, nwarps, float>(ctx, dst);
break; break;
case 80: case 80:
launch_fattn_f16< 80, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 80, cols_per_block, nwarps, float>(ctx, dst);
break; break;
case 96: case 96:
launch_fattn_f16< 96, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 96, cols_per_block, nwarps, float>(ctx, dst);
break; break;
case 112: case 112:
launch_fattn_f16<112, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<112, cols_per_block, nwarps, float>(ctx, dst);
break; break;
case 128: case 128:
launch_fattn_f16<128, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<128, cols_per_block, nwarps, float>(ctx, dst);
break; break;
case 256: case 256:
launch_fattn_f16<256, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<256, cols_per_block, nwarps, float>(ctx, dst);
break; break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
@ -959,22 +527,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
constexpr int nwarps = 4; constexpr int nwarps = 4;
switch (Q->ne[0]) { switch (Q->ne[0]) {
case 64: case 64:
launch_fattn_f16< 64, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 64, cols_per_block, nwarps, float>(ctx, dst);
break; break;
case 80: case 80:
launch_fattn_f16< 80, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 80, cols_per_block, nwarps, float>(ctx, dst);
break; break;
case 96: case 96:
launch_fattn_f16< 96, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 96, cols_per_block, nwarps, float>(ctx, dst);
break; break;
case 112: case 112:
launch_fattn_f16<112, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<112, cols_per_block, nwarps, float>(ctx, dst);
break; break;
case 128: case 128:
launch_fattn_f16<128, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<128, cols_per_block, nwarps, float>(ctx, dst);
break; break;
// case 256: // case 256:
// launch_fattn_f16<256, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); // launch_fattn_f16<256, cols_per_block, nwarps, float>(ctx, dst);
// break; // break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
@ -985,22 +553,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
} }
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) { if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
constexpr int cols_per_block = 1; ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 256:
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return; return;
} }
@ -1009,16 +562,16 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
constexpr int nwarps = 4; constexpr int nwarps = 4;
switch (Q->ne[0]) { switch (Q->ne[0]) {
case 64: case 64:
launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 96: case 96:
launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 128: case 128:
launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 256: case 256:
launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
break; break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
@ -1032,22 +585,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
constexpr int nwarps = 4; constexpr int nwarps = 4;
switch (Q->ne[0]) { switch (Q->ne[0]) {
case 64: case 64:
launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 80: case 80:
launch_fattn_f16< 80, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 80, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 96: case 96:
launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 112: case 112:
launch_fattn_f16<112, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<112, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 128: case 128:
launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 256: case 256:
launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
break; break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
@ -1060,22 +613,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
constexpr int nwarps = 4; constexpr int nwarps = 4;
switch (Q->ne[0]) { switch (Q->ne[0]) {
case 64: case 64:
launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 64, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 80: case 80:
launch_fattn_f16< 80, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 80, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 96: case 96:
launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16< 96, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 112: case 112:
launch_fattn_f16<112, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<112, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 128: case 128:
launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<128, cols_per_block, nwarps, half>(ctx, dst);
break; break;
case 256: case 256:
launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); launch_fattn_f16<256, cols_per_block, nwarps, half>(ctx, dst);
break; break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);

File diff suppressed because it is too large

View File

@ -58,10 +58,10 @@ static __global__ void rope(
dst[i + 1] = x0*sin_theta + x1*cos_theta; dst[i + 1] = x0*sin_theta + x1*cos_theta;
} }
template<typename T, bool has_pos> template<typename T, bool has_pos, bool has_freq_facs>
static __global__ void rope_neox( static __global__ void rope_neox(
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, const float * freq_factors
) { ) {
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
@ -88,7 +88,9 @@ static __global__ void rope_neox(
float cur_rot = inv_ndims * ic - ib; float cur_rot = inv_ndims * ic - ib;
const int p = has_pos ? pos[i2] : 0; const int p = has_pos ? pos[i2] : 0;
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f); const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f)/freq_factor;
float cos_theta, sin_theta; float cos_theta, sin_theta;
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
@ -164,7 +166,7 @@ static void rope_cuda(
template<typename T> template<typename T>
static void rope_neox_cuda( static void rope_neox_cuda(
const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
) { ) {
GGML_ASSERT(ncols % 2 == 0); GGML_ASSERT(ncols % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
@ -175,15 +177,29 @@ static void rope_neox_cuda(
const float inv_ndims = -1.0f / n_dims; const float inv_ndims = -1.0f / n_dims;
if (pos == nullptr) { if (pos == nullptr) {
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>( if (freq_factors == nullptr) {
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>(
theta_scale, inv_ndims x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
); theta_scale, inv_ndims, freq_factors
);
} else {
rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, inv_ndims, freq_factors
);
}
} else { } else {
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>( if (freq_factors == nullptr) {
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>(
theta_scale, inv_ndims x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
); theta_scale, inv_ndims, freq_factors
);
} else {
rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, inv_ndims, freq_factors
);
}
} }
} }
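With frequency factors the base angle computed in rope_neox above becomes (notation mine):

\[
\theta_{\mathrm{base}} = \frac{p \cdot \mathrm{freq\_scale} \cdot \mathrm{theta\_scale}^{\,\mathrm{col}/2}}{\mathrm{freq\_factors}[i_c/2]}
\]

with the divisor defaulting to 1 in the !has_freq_facs instantiations. The four template combinations exist so that neither the pos nor the freq_factors null check has to be evaluated per element inside the kernel.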
@ -214,24 +230,27 @@ static void rope_cuda_f32(
static void rope_neox_cuda_f16( static void rope_neox_cuda_f16(
const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) { float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream); rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
} }
static void rope_neox_cuda_f32( static void rope_neox_cuda_f32(
const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
) { ) {
rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream); rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
} }
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * src2 = dst->src[2];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data; const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
@ -241,7 +260,6 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1]; const int64_t ne01 = src0->ne[1];
const int64_t ne2 = dst->ne[2];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
@ -259,16 +277,22 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
const float * freq_factors = nullptr;
const int32_t * pos = nullptr; const int32_t * pos = nullptr;
if ((mode & 1) == 0) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(src1->ne[0] == ne2);
pos = (const int32_t *) src1_d;
}
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
pos = (const int32_t *) src1_d;
if (is_neox) {
if (src2 != nullptr) {
freq_factors = (const float *) src2->data;
}
} else {
GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
}
rope_corr_dims corr_dims; rope_corr_dims corr_dims;
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
@ -280,12 +304,12 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
if (src0->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32) {
rope_neox_cuda_f32( rope_neox_cuda_f32(
(const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, (const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, stream attn_factor, corr_dims, freq_factors, stream
); );
} else if (src0->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_F16) {
rope_neox_cuda_f16( rope_neox_cuda_f16(
(const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, (const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, stream attn_factor, corr_dims, freq_factors, stream
); );
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);

View File

@ -1,3 +1,4 @@
#include "common.cuh"
#include "softmax.cuh" #include "softmax.cuh"
template <typename T> template <typename T>
@ -11,7 +12,7 @@ __device__ float __forceinline__ t2f32<half>(half val) {
} }
template <bool vals_smem, int ncols_template, int block_size_template, typename T> template <bool vals_smem, int ncols_template, int block_size_template, typename T>
static __global__ void soft_max_f32(const float * x, const T * mask, const T * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) { static __global__ void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
const int ncols = ncols_template == 0 ? ncols_par : ncols_template; const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
const int tid = threadIdx.x; const int tid = threadIdx.x;
@ -23,17 +24,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
const int warp_id = threadIdx.x / WARP_SIZE; const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE; const int lane_id = threadIdx.x % WARP_SIZE;
float slope = 0.0f; const float slope = get_alibi_slope(max_bias, rowx/nrows_y, n_head_log2, m0, m1);
// ALiBi
if (max_bias > 0.0f) {
const int h = rowx/nrows_y; // head index
const float base = h < n_head_log2 ? m0 : m1;
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = powf(base, exp);
}
extern __shared__ float data_soft_max_f32[]; extern __shared__ float data_soft_max_f32[];
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
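get_alibi_slope() is not part of this hunk; it presumably comes from common.cuh (hence the new include at the top of this file). A sketch consistent with the inline code it replaces — note that the neutral value is now 1 rather than 0, because the slope scales the mask instead of an additive pos term:

static __device__ __forceinline__ float get_alibi_slope(
        const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1) {
    if (max_bias <= 0.0f) {
        return 1.0f; // no ALiBi: pass the mask term through unscaled
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
    return powf(base, exph);
}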
@ -53,7 +44,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
const int64_t ix = (int64_t)rowx*ncols + col; const int64_t ix = (int64_t)rowx*ncols + col;
const int64_t iy = (int64_t)rowy*ncols + col; const int64_t iy = (int64_t)rowy*ncols + col;
const float val = x[ix]*scale + (mask ? t2f32(mask[iy]) : 0.0f) + (pos ? slope*t2f32(pos[col]) : 0.0f); const float val = x[ix]*scale + (mask ? slope*t2f32(mask[iy]) : 0.0f);
vals[col] = val; vals[col] = val;
max_val = max(max_val, val); max_val = max(max_val, val);
@ -125,7 +116,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
} }
template<typename T> template<typename T>
static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) { static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
int nth = WARP_SIZE; int nth = WARP_SIZE;
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
const dim3 block_dims(nth, 1, 1); const dim3 block_dims(nth, 1, 1);
@ -133,8 +124,8 @@ static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, fl
const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float); const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
const uint32_t n_head_kv = nrows_x/nrows_y; const uint32_t n_head = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
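For concreteness, with the typical max_bias = 8.0f and, say, n_head = 20 (illustrative values):

// n_head_log2 = 1 << floor(log2(20)) = 16
// m0 = 2^(-8/16) ~= 0.7071,  m1 = 2^(-4/16) ~= 0.8409
// heads 0..15  -> slope = m0^(h+1)
// heads 16..19 -> slope = m1^(2*(h-16)+1)
// i.e. the standard ALiBi slope schedule, including non-power-of-two head counts.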
@ -142,43 +133,42 @@ static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, fl
if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) { if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
switch (ncols_x) { switch (ncols_x) {
case 32: case 32:
soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 64: case 64:
soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 128: case 128:
soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 256: case 256:
soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 512: case 512:
soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 1024: case 1024:
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 2048: case 2048:
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 4096: case 4096:
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
default: default:
soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
} }
} else { } else {
const size_t shmem_low = WARP_SIZE*sizeof(float); const size_t shmem_low = WARP_SIZE*sizeof(float);
soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
} }
} }
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * src2 = dst->src[2];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *)src0->data;
const void * src1_d = src1 ? (const void *)src1->data : nullptr; const void * src1_d = src1 ? (const void *)src1->data : nullptr;
@ -190,7 +180,6 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t nrows_x = ggml_nrows(src0); const int64_t nrows_x = ggml_nrows(src0);
@ -202,26 +191,15 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
// positions tensor const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
void * src2_d = nullptr;
const bool use_src2 = src2 != nullptr;
if (use_src2) {
src2_d = (void *)src2->data;
}
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
if (use_f16) { if (use_f16) {
const half * src1_dd = (const half *)src1_d; const half * src1_dd = (const half *)src1_d;
const half * src2_dd = (const half *)src2_d;
soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
} else { } else {
const float * src1_dd = (const float *)src1_d; const float * src1_dd = (const float *)src1_d;
const float * src2_dd = (const float *)src2_d;
soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
} }
} }

View File

@ -48,6 +48,15 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
dst[i] = fmaxf(x[i], 0); dst[i] = fmaxf(x[i], 0);
} }
static __global__ void sigmoid_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = 1.0f / (1.0f + expf(-x[i]));
}
static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) { static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x; const int i = blockDim.x*blockIdx.x + threadIdx.x;
@ -108,6 +117,11 @@ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k); relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
} }
static void sigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SIGMOID_BLOCK_SIZE - 1) / CUDA_SIGMOID_BLOCK_SIZE;
sigmoid_f32<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE; const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k); hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@ -188,6 +202,18 @@ void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
} }
void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
sigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *)src0->data;

View File

@ -4,6 +4,7 @@
#define CUDA_SILU_BLOCK_SIZE 256 #define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256 #define CUDA_TANH_BLOCK_SIZE 256
#define CUDA_RELU_BLOCK_SIZE 256 #define CUDA_RELU_BLOCK_SIZE 256
#define CUDA_SIGMOID_BLOCK_SIZE 256
#define CUDA_HARDSIGMOID_BLOCK_SIZE 256 #define CUDA_HARDSIGMOID_BLOCK_SIZE 256
#define CUDA_HARDSWISH_BLOCK_SIZE 256 #define CUDA_HARDSWISH_BLOCK_SIZE 256
#define CUDA_SQR_BLOCK_SIZE 256 #define CUDA_SQR_BLOCK_SIZE 256
@ -18,6 +19,8 @@ void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@ -1,35 +1,36 @@
#include "upscale.cuh" #include "upscale.cuh"
static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) { static __global__ void upscale_f32(const float * x, float * dst,
// blockIdx.z: idx of ne02*ne03 const int nb00, const int nb01, const int nb02, const int nb03,
// blockIdx.y: idx of ne01*scale_factor aka ne1 const int ne10, const int ne11, const int ne12, const int ne13,
// blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE const float sf0, const float sf1, const float sf2, const float sf3) {
// ne00xne01: ne00 * ne01 int index = threadIdx.x + blockIdx.x * blockDim.x;
int ne0 = ne00 * scale_factor; if (index >= ne10 * ne11 * ne12 * ne13) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return; return;
} }
// operation
int i00 = nidx / scale_factor; int i10 = index % ne10;
int i01 = blockIdx.y / scale_factor; int i11 = (index / ne10) % ne11;
int offset_src = int i12 = (index / (ne10 * ne11)) % ne12;
i00 + int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
i01 * ne00 +
blockIdx.z * ne00xne01; int i00 = i10 / sf0;
int offset_dst = int i01 = i11 / sf1;
nidx + int i02 = i12 / sf2;
blockIdx.y * ne0 + int i03 = i13 / sf3;
blockIdx.z * ne0 * gridDim.y;
dst[offset_dst] = x[offset_src]; dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
} }
static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03, static void upscale_f32_cuda(const float * x, float * dst,
const int scale_factor, cudaStream_t stream) { const int nb00, const int nb01, const int nb02, const int nb03,
int ne0 = (ne00 * scale_factor); const int ne10, const int ne11, const int ne12, const int ne13,
int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; const float sf0, const float sf1, const float sf2, const float sf3,
dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03); cudaStream_t stream) {
upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor); int dst_size = ne10 * ne11 * ne12 * ne13;
int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
} }
void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -39,10 +40,12 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
const int scale_factor = dst->op_params[0]; const float sf0 = (float)dst->ne[0]/src0->ne[0];
const float sf1 = (float)dst->ne[1]/src0->ne[1];
const float sf2 = (float)dst->ne[2]/src0->ne[2];
const float sf3 = (float)dst->ne[3]/src0->ne[3];
upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, stream); upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
} }
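A small worked example of the new index math (illustrative shapes):

// src0->ne = {2, 3, 4, 1}, dst->ne = {4, 6, 4, 1}
//   sf0 = 4/2 = 2.0f, sf1 = 6/3 = 2.0f, sf2 = sf3 = 1.0f
//   dst element (i10, i11, i12) = (3, 5, 2) reads src element (3/2, 5/2, 2/1) = (1, 2, 2)
//   through the byte strides nb00..nb03, i.e. nearest-neighbour scaling per dimension.
// The old integer scale_factor path is the special case where every sf is the same whole number;
// the kernel itself no longer needs that, nor the removed ne[3] == 1 restriction.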

View File

@ -17,6 +17,18 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#if defined(_WIN32)
#define m512bh(p) p
#define m512i(p) p
#else
#define m512bh(p) (__m512bh)(p)
#define m512i(p) (__m512i)(p)
#endif
/** /**
* Converts brain16 to float32. * Converts brain16 to float32.
* *
@ -120,9 +132,16 @@ extern "C" {
#ifndef __F16C__ #ifndef __F16C__
#define __F16C__ #define __F16C__
#endif #endif
#endif
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#ifndef __SSE3__ #ifndef __SSE3__
#define __SSE3__ #define __SSE3__
#endif #endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#endif #endif
// 16-bit float // 16-bit float
@ -436,6 +455,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#include <riscv_vector.h> #include <riscv_vector.h>
#endif #endif
#if defined(__loongarch64)
#if defined(__loongarch_asx)
#include <lasxintrin.h>
#endif
#if defined(__loongarch_sx)
#include <lsxintrin.h>
#endif
#endif
#if defined(__loongarch_asx)
typedef union {
int32_t i;
float f;
} ft_union;
/* helpers that broadcast a scalar float across all lanes of an LSX/LASX vector by reinterpreting its bits as an int32 and using the integer replicate intrinsic */
static __m128 __lsx_vreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
}
static __m256 __lasx_xvreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
}
#endif
#ifdef __F16C__ #ifdef __F16C__
#ifdef _MSC_VER #ifdef _MSC_VER

View File

@ -1559,12 +1559,18 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
{ {
float scale; float scale;
memcpy(&scale, dst->op_params, sizeof(float)); float max_bias;
#pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support") memcpy(&scale, (float *)dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021") #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32); GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
GGML_ASSERT(src2 == nullptr);
#pragma message("TODO: add ALiBi support")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
GGML_ASSERT(max_bias == 0.0f);
ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale); ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
} break; } break;
@ -1671,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
} break; } break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
#pragma message("TODO: implement phi3 frequency factors support")
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
GGML_ASSERT(ne10 == ne02); GGML_ASSERT(ne10 == ne02);
GGML_ASSERT(src0t == dstt); GGML_ASSERT(src0t == dstt);
// const int n_past = ((int32_t *) dst->op_params)[0]; // const int n_past = ((int32_t *) dst->op_params)[0];

View File

@ -40,6 +40,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_CLAMP, GGML_METAL_KERNEL_TYPE_CLAMP,
GGML_METAL_KERNEL_TYPE_TANH, GGML_METAL_KERNEL_TYPE_TANH,
GGML_METAL_KERNEL_TYPE_RELU, GGML_METAL_KERNEL_TYPE_RELU,
GGML_METAL_KERNEL_TYPE_SIGMOID,
GGML_METAL_KERNEL_TYPE_GELU, GGML_METAL_KERNEL_TYPE_GELU,
GGML_METAL_KERNEL_TYPE_GELU_4, GGML_METAL_KERNEL_TYPE_GELU_4,
GGML_METAL_KERNEL_TYPE_GELU_QUICK, GGML_METAL_KERNEL_TYPE_GELU_QUICK,
@ -169,7 +170,6 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
GGML_METAL_KERNEL_TYPE_ROPE_F32, GGML_METAL_KERNEL_TYPE_ROPE_F32,
GGML_METAL_KERNEL_TYPE_ROPE_F16, GGML_METAL_KERNEL_TYPE_ROPE_F16,
GGML_METAL_KERNEL_TYPE_ALIBI_F32,
GGML_METAL_KERNEL_TYPE_IM2COL_F16, GGML_METAL_KERNEL_TYPE_IM2COL_F16,
GGML_METAL_KERNEL_TYPE_IM2COL_F32, GGML_METAL_KERNEL_TYPE_IM2COL_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
@ -494,6 +494,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID, sigmoid, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
@ -623,7 +624,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
@ -633,14 +633,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
@ -732,6 +732,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
switch (ggml_get_unary_op(op)) { switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_SILU:
@ -759,7 +760,6 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_OP_GROUP_NORM: case GGML_OP_GROUP_NORM:
return ctx->support_simdgroup_reduction; return ctx->support_simdgroup_reduction;
case GGML_OP_NORM: case GGML_OP_NORM:
case GGML_OP_ALIBI:
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
return true; return true;
@ -772,8 +772,9 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
case GGML_OP_FLASH_ATTN_EXT:
return true; return true;
case GGML_OP_FLASH_ATTN_EXT:
return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
return ctx->support_simdgroup_reduction && return ctx->support_simdgroup_reduction &&
@ -926,22 +927,32 @@ static enum ggml_status ggml_metal_graph_compute(
const int64_t ne10 = src1 ? src1->ne[0] : 0; const int64_t ne10 = src1 ? src1->ne[0] : 0;
const int64_t ne11 = src1 ? src1->ne[1] : 0; const int64_t ne11 = src1 ? src1->ne[1] : 0;
const int64_t ne12 = src1 ? src1->ne[2] : 0; const int64_t ne12 = src1 ? src1->ne[2] : 0;
const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); const int64_t ne13 = src1 ? src1->ne[3] : 0;
const uint64_t nb10 = src1 ? src1->nb[0] : 0; const uint64_t nb10 = src1 ? src1->nb[0] : 0;
const uint64_t nb11 = src1 ? src1->nb[1] : 0; const uint64_t nb11 = src1 ? src1->nb[1] : 0;
const uint64_t nb12 = src1 ? src1->nb[2] : 0; const uint64_t nb12 = src1 ? src1->nb[2] : 0;
const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); const uint64_t nb13 = src1 ? src1->nb[3] : 0;
const int64_t ne0 = dst ? dst->ne[0] : 0; const int64_t ne20 = src2 ? src2->ne[0] : 0;
const int64_t ne1 = dst ? dst->ne[1] : 0; const int64_t ne21 = src2 ? src2->ne[1] : 0;
const int64_t ne2 = dst ? dst->ne[2] : 0; const int64_t ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22);
const int64_t ne3 = dst ? dst->ne[3] : 0; const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
const uint64_t nb0 = dst ? dst->nb[0] : 0; const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
const uint64_t nb1 = dst ? dst->nb[1] : 0; const uint64_t nb21 = src2 ? src2->nb[1] : 0;
const uint64_t nb2 = dst ? dst->nb[2] : 0; const uint64_t nb22 = src2 ? src2->nb[2] : 0;
const uint64_t nb3 = dst ? dst->nb[3] : 0; const uint64_t nb23 = src2 ? src2->nb[3] : 0;
const int64_t ne0 = dst ? dst->ne[0] : 0;
const int64_t ne1 = dst ? dst->ne[1] : 0;
const int64_t ne2 = dst ? dst->ne[2] : 0;
const int64_t ne3 = dst ? dst->ne[3] : 0;
const uint64_t nb0 = dst ? dst->nb[0] : 0;
const uint64_t nb1 = dst ? dst->nb[1] : 0;
const uint64_t nb2 = dst ? dst->nb[2] : 0;
const uint64_t nb3 = dst ? dst->nb[3] : 0;
const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
@ -1194,24 +1205,24 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
{ {
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline; id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline;
float min; float min;
float max; float max;
memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float)); memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float)); memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&min length:sizeof(min) atIndex:2]; [encoder setBytes:&min length:sizeof(min) atIndex:2];
[encoder setBytes:&max length:sizeof(max) atIndex:3]; [encoder setBytes:&max length:sizeof(max) atIndex:3];
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_UNARY: case GGML_OP_UNARY:
switch (ggml_get_unary_op(gf->nodes[i])) { switch (ggml_get_unary_op(gf->nodes[i])) {
// we are not taking into account the strides, so for now require contiguous tensors // we are not taking into account the strides, so for now require contiguous tensors
@ -1239,6 +1250,18 @@ static enum ggml_status ggml_metal_graph_compute(
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_SIGMOID:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU:
@ -1357,16 +1380,15 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
{ {
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32);
int nth = 32; // SIMD width int nth = 32; // SIMD width
id<MTLComputePipelineState> pipeline = nil; id<MTLComputePipelineState> pipeline = nil;
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16); const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
if (ne00%4 == 0) { if (ne00%4 == 0) {
while (nth < ne00/4 && nth < 256) { while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
nth *= 2; nth *= 2;
} }
if (use_f16) { if (use_f16) {
@ -1375,7 +1397,7 @@ static enum ggml_status ggml_metal_graph_compute(
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
} }
} else { } else {
while (nth < ne00 && nth < 1024) { while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
nth *= 2; nth *= 2;
} }
if (use_f16) { if (use_f16) {
@ -1394,8 +1416,8 @@ static enum ggml_status ggml_metal_graph_compute(
const int64_t nrows_x = ggml_nrows(src0); const int64_t nrows_x = ggml_nrows(src0);
const int64_t nrows_y = src0->ne[1]; const int64_t nrows_y = src0->ne[1];
const uint32_t n_head_kv = nrows_x/nrows_y; const uint32_t n_head = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
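Note: a minimal sketch (not part of this commit) of how the per-head ALiBi slope follows from max_bias, consistent with the m0/m1/n_head_log2 values computed above and the per-head selection done in the kernels further down; the helper name is illustrative.

#include <math.h>
#include <stdint.h>

// Illustrative helper: ALiBi slope for attention head h.
static float alibi_slope(float max_bias, uint32_t n_head, uint32_t h) {
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? powf(m0, (float) (h + 1))
                           : powf(m1, (float) (2*(h - n_head_log2) + 1));
}
// max_bias == 0 gives m0 == m1 == 1, i.e. a slope of 1.0f for every head.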
@ -1407,20 +1429,15 @@ static enum ggml_status ggml_metal_graph_compute(
} else { } else {
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
} }
if (id_src2) { [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
} else { [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
} [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
[encoder setBuffer:id_dst offset:offs_dst atIndex:3]; [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4]; [encoder setBytes:&m0 length:sizeof(m0) atIndex:8];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:5]; [encoder setBytes:&m1 length:sizeof(m1) atIndex:9];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6]; [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
[encoder setBytes:&scale length:sizeof(scale) atIndex:7];
[encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:9];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:10];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11];
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
@ -1778,16 +1795,6 @@ static enum ggml_status ggml_metal_graph_compute(
const int n_as = src0->ne[2]; const int n_as = src0->ne[2];
// src2 = ids // src2 = ids
const int64_t ne20 = src2->ne[0];
const int64_t ne21 = src2->ne[1];
const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22);
const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23);
const uint64_t nb20 = src2->nb[0]; GGML_UNUSED(nb20);
const uint64_t nb21 = src2->nb[1];
const uint64_t nb22 = src2->nb[2]; GGML_UNUSED(nb22);
const uint64_t nb23 = src2->nb[3]; GGML_UNUSED(nb23);
const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t); const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
GGML_ASSERT(src2t == GGML_TYPE_I32); GGML_ASSERT(src2t == GGML_TYPE_I32);
@ -2225,49 +2232,6 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_ALIBI:
{
GGML_ASSERT((src0t == GGML_TYPE_F32));
const int nth = MIN(1024, ne00);
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
[encoder setBytes:&m1 length:sizeof( float) atIndex:19];
[encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
GGML_ASSERT(ne10 == ne02); GGML_ASSERT(ne10 == ne02);
@ -2280,7 +2244,13 @@ static enum ggml_status ggml_metal_graph_compute(
// skip 3, n_ctx, used in GLM RoPE, unimplemented in metal // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; float freq_base;
float freq_scale;
float ext_factor;
float attn_factor;
float beta_fast;
float beta_slow;
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
@ -2288,6 +2258,15 @@ static enum ggml_status ggml_metal_graph_compute(
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
GGML_ASSERT(!is_glm && "GLM RoPE not implemented in Metal");
if (!is_neox) {
GGML_ASSERT(id_src2 == nil && "TODO: freq_factors not implemented for !is_neox");
}
id<MTLComputePipelineState> pipeline = nil; id<MTLComputePipelineState> pipeline = nil;
switch (src0->type) { switch (src0->type) {
@ -2299,33 +2278,38 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; if (id_src2 != nil) {
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; } else {
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; }
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14];
[encoder setBytes:&n_past length:sizeof( int) atIndex:19]; [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:15];
[encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&mode length:sizeof( int) atIndex:21]; [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22]; [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19];
[encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; [encoder setBytes:&n_past length:sizeof( int) atIndex:20];
[encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; [encoder setBytes:&n_dims length:sizeof( int) atIndex:21];
[encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; [encoder setBytes:&mode length:sizeof( int) atIndex:22];
[encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:23];
[encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; [encoder setBytes:&freq_base length:sizeof( float) atIndex:24];
[encoder setBytes:&freq_scale length:sizeof( float) atIndex:25];
[encoder setBytes:&ext_factor length:sizeof( float) atIndex:26];
[encoder setBytes:&attn_factor length:sizeof( float) atIndex:27];
[encoder setBytes:&beta_fast length:sizeof( float) atIndex:28];
[encoder setBytes:&beta_slow length:sizeof( float) atIndex:29];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
@ -2389,7 +2373,10 @@ static enum ggml_status ggml_metal_graph_compute(
{ {
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
const int sf = dst->op_params[0]; const float sf0 = (float)ne0/src0->ne[0];
const float sf1 = (float)ne1/src0->ne[1];
const float sf2 = (float)ne2/src0->ne[2];
const float sf3 = (float)ne3/src0->ne[3];
const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline; const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
@ -2412,7 +2399,10 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
[encoder setBytes:&sf length:sizeof(sf) atIndex:18]; [encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
[encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
[encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
[encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
@ -2548,13 +2538,14 @@ static enum ggml_status ggml_metal_graph_compute(
} break; } break;
case GGML_OP_FLASH_ATTN_EXT: case GGML_OP_FLASH_ATTN_EXT:
{ {
GGML_ASSERT(ne00 % 4 == 0); GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ne11 % 32 == 0);
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
struct ggml_tensor * src3 = gf->nodes[i]->src[3]; GGML_ASSERT(ggml_are_same_shape (src1, src2));
GGML_ASSERT(ggml_are_same_shape(src1, src2)); struct ggml_tensor * src3 = gf->nodes[i]->src[3];
GGML_ASSERT(src3);
size_t offs_src3 = 0; size_t offs_src3 = 0;
@ -2565,7 +2556,7 @@ static enum ggml_status ggml_metal_graph_compute(
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big"); "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30); const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
const int64_t ne31 = src3 ? src3->ne[1] : 0; //const int64_t ne31 = src3 ? src3->ne[1] : 0;
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32); const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33); const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33);
@ -2577,7 +2568,16 @@ static enum ggml_status ggml_metal_graph_compute(
const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
float scale; float scale;
memcpy(&scale, dst->op_params, sizeof(float)); float max_bias;
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
const uint32_t n_head = src0->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
id<MTLComputePipelineState> pipeline = nil; id<MTLComputePipelineState> pipeline = nil;
@ -2614,34 +2614,38 @@ static enum ggml_status ggml_metal_graph_compute(
} }
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; if (id_src3) {
[encoder setBuffer:id_dst offset:offs_dst atIndex:4]; [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5]; } else {
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:3];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7]; }
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8]; [encoder setBuffer:id_dst offset:offs_dst atIndex:4];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9]; [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10]; [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11]; [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13]; [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14]; [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15]; [encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16]; [encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17]; [encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18]; [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19]; [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20]; [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&ne31 length:sizeof( int64_t) atIndex:21]; [encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:22]; [encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:23]; [encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:24]; [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:25]; [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26]; [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22];
[encoder setBytes:&scale length:sizeof( float) atIndex:27]; [encoder setBytes:&scale length:sizeof( float) atIndex:23];
[encoder setBytes:&max_bias length:sizeof( float) atIndex:24];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
if (!use_vec_kernel) { if (!use_vec_kernel) {
// half8x8 kernel // half8x8 kernel

View File

@ -229,6 +229,13 @@ kernel void kernel_relu(
dst[tpig] = max(0.0f, src0[tpig]); dst[tpig] = max(0.0f, src0[tpig]);
} }
kernel void kernel_sigmoid(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = 1.0f / (1.0f + exp(-src0[tpig]));
}
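For reference, a CPU sketch of what the new kernel_sigmoid computes element-wise (illustrative, not part of the diff):

#include <math.h>
#include <stdint.h>

// dst[i] = 1 / (1 + e^{-src[i]})
static void sigmoid_ref(const float * src, float * dst, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        dst[i] = 1.0f / (1.0f + expf(-src[i]));
    }
}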
kernel void kernel_tanh( kernel void kernel_tanh(
device const float * src0, device const float * src0,
device float * dst, device float * dst,
@ -356,7 +363,6 @@ template<typename T>
kernel void kernel_soft_max( kernel void kernel_soft_max(
device const char * src0, device const char * src0,
device const char * src1, device const char * src1,
device const char * src2,
device char * dst, device char * dst,
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
@ -378,10 +384,9 @@ kernel void kernel_soft_max(
device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00 : nullptr; device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00 : nullptr;
device const T * ppos = src2 != src0 ? (device const T *) src2 : nullptr;
device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
float slope = 0.0f; float slope = 1.0f;
// ALiBi // ALiBi
if (max_bias > 0.0f) { if (max_bias > 0.0f) {
@ -397,7 +402,7 @@ kernel void kernel_soft_max(
float lmax = -INFINITY; float lmax = -INFINITY;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) { for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)); lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
} }
// find the max value in the block // find the max value in the block
@ -422,7 +427,7 @@ kernel void kernel_soft_max(
// parallel sum // parallel sum
float lsum = 0.0f; float lsum = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) { for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val); const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val);
lsum += exp_psrc0; lsum += exp_psrc0;
pdst[i00] = exp_psrc0; pdst[i00] = exp_psrc0;
} }
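The ALiBi bias is now folded into soft_max by multiplying the mask with the per-head slope, replacing the separate ppos/GGML_OP_ALIBI path removed elsewhere in this diff; because the slope multiplies the mask instead of being added, its default changes from 0.0f to 1.0f so an unbiased mask passes through unchanged. A scalar sketch of the per-element input, with placeholder names:

// Illustrative scalar form of the fused soft_max input computed above:
//   s_i = x_i * scale + slope * mask_i     (slope == 1.0f when max_bias == 0)
static inline float soft_max_input(float x, float mask, float scale, float slope) {
    return x*scale + slope*mask; // followed by the usual max-subtracted exp and sum
}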
@ -461,7 +466,6 @@ template<typename T>
kernel void kernel_soft_max_4( kernel void kernel_soft_max_4(
device const char * src0, device const char * src0,
device const char * src1, device const char * src1,
device const char * src2,
device char * dst, device char * dst,
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
@ -483,10 +487,9 @@ kernel void kernel_soft_max_4(
device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00/4 : nullptr; device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00/4 : nullptr;
device const T * ppos = src2 != src0 ? (device const T *) src2 : nullptr;
device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
float slope = 0.0f; float slope = 1.0f;
if (max_bias > 0.0f) { if (max_bias > 0.0f) {
const int64_t h = i02; const int64_t h = i02;
@ -501,7 +504,7 @@ kernel void kernel_soft_max_4(
float4 lmax4 = -INFINITY; float4 lmax4 = -INFINITY;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f))); lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f)));
} }
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@ -527,7 +530,7 @@ kernel void kernel_soft_max_4(
// parallel sum // parallel sum
float4 lsum4 = 0.0f; float4 lsum4 = 0.0f;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f))) - max_val); const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val);
lsum4 += exp_psrc4; lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4; pdst4[i00] = exp_psrc4;
} }
@ -1595,60 +1598,6 @@ kernel void kernel_mul_mv_f16_f32_l4(
} }
} }
kernel void kernel_alibi_f32(
device const float * src0,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant float & m0,
constant float & m1,
constant int & n_heads_log2_floor,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig[2];
const int64_t i02 = tgpig[1];
const int64_t i01 = tgpig[0];
const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
const int64_t i3 = n / (ne2*ne1*ne0);
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
//const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
const int64_t k = i3*ne3 + i2;
float m_k;
if (k < n_heads_log2_floor) {
m_k = pow(m0, k + 1);
} else {
m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1);
}
device char * dst_row = (device char *) dst + i3*nb3 + i2*nb2 + i1*nb1;
device const char * src_row = (device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01;
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
const float src_v = *(device float *)(src_row + i00*nb00);
device float * dst_v = (device float *)(dst_row + i00*nb0);
*dst_v = i00 * m_k + src_v;
}
}
static float rope_yarn_ramp(const float low, const float high, const int i0) { static float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low); const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y)); return 1.0f - min(1.0f, max(0.0f, y));
@ -1691,6 +1640,7 @@ static void rope_yarn_corr_dims(
typedef void (rope_t)( typedef void (rope_t)(
device const void * src0, device const void * src0,
device const int32_t * src1, device const int32_t * src1,
device const float * src2,
device float * dst, device float * dst,
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
@ -1726,6 +1676,7 @@ template<typename T>
kernel void kernel_rope( kernel void kernel_rope(
device const void * src0, device const void * src0,
device const int32_t * src1, device const int32_t * src1,
device const float * src2,
device float * dst, device float * dst,
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
@ -1795,8 +1746,10 @@ kernel void kernel_rope(
// simplified from `(ib * n_dims + ic) * inv_ndims` // simplified from `(ib * n_dims + ic) * inv_ndims`
const float cur_rot = inv_ndims*ic - ib; const float cur_rot = inv_ndims*ic - ib;
const float freq_factor = src2 != src0 ? src2[ic/2] : 1.0f;
const float theta = theta_0 * pow(freq_base, cur_rot) / freq_factor;
const float theta = theta_0 * pow(freq_base, cur_rot);
float cos_theta, sin_theta; float cos_theta, sin_theta;
rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
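The optional src2 tensor carries per-dimension frequency factors that divide the base rotation frequency; when it is absent the divisor is 1 and the angle is unchanged. A small sketch of the per-pair angle, matching the kernel change above (names are placeholders, not part of the commit):

#include <math.h>

// freq_factors may be NULL when src2 is absent.
static float rope_theta(float theta_0, float freq_base, float cur_rot,
                        const float * freq_factors, int ic) {
    const float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
    return theta_0 * powf(freq_base, cur_rot) / freq_factor;
}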
@ -1903,7 +1856,10 @@ kernel void kernel_upscale_f32(
constant uint64_t & nb1, constant uint64_t & nb1,
constant uint64_t & nb2, constant uint64_t & nb2,
constant uint64_t & nb3, constant uint64_t & nb3,
constant int32_t & sf, constant float & sf0,
constant float & sf1,
constant float & sf2,
constant float & sf3,
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) { uint3 ntg[[threads_per_threadgroup]]) {
@ -1912,15 +1868,17 @@ kernel void kernel_upscale_f32(
const int64_t i2 = tgpig.y; const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x; const int64_t i1 = tgpig.x;
const int64_t i03 = i3; const int64_t i03 = i3/sf3;
const int64_t i02 = i2; const int64_t i02 = i2/sf2;
const int64_t i01 = i1/sf; const int64_t i01 = i1/sf1;
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
dst_ptr[i0] = src0_ptr[i0/sf]; const int64_t i00 = i0/sf0;
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_ptr[0] = src0_ptr[0];
} }
} }
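The upscale kernel now takes independent (possibly fractional) scale factors per dimension instead of a single integer factor; each destination index maps back to the source by dividing with the float factor and truncating. A host-side sketch of the nearest-neighbor mapping under that assumption (names are illustrative, not part of the commit):

#include <stdint.h>

// sfX = dst_ne[X] / (float) src_ne[X], as computed on the host side of this diff.
static void upscale_ref(const float * src, float * dst,
                        const int64_t src_ne[4], const int64_t dst_ne[4]) {
    const float sf0 = (float) dst_ne[0] / src_ne[0];
    const float sf1 = (float) dst_ne[1] / src_ne[1];
    const float sf2 = (float) dst_ne[2] / src_ne[2];
    const float sf3 = (float) dst_ne[3] / src_ne[3];
    for (int64_t i3 = 0; i3 < dst_ne[3]; ++i3)
    for (int64_t i2 = 0; i2 < dst_ne[2]; ++i2)
    for (int64_t i1 = 0; i1 < dst_ne[1]; ++i1)
    for (int64_t i0 = 0; i0 < dst_ne[0]; ++i0) {
        const int64_t j3 = (int64_t)(i3/sf3), j2 = (int64_t)(i2/sf2);
        const int64_t j1 = (int64_t)(i1/sf1), j0 = (int64_t)(i0/sf0);
        dst[((i3*dst_ne[2] + i2)*dst_ne[1] + i1)*dst_ne[0] + i0] =
        src[((j3*src_ne[2] + j2)*src_ne[1] + j1)*src_ne[0] + j0];
    }
}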
@ -2100,29 +2058,29 @@ typedef void (flash_attn_ext_f16_t)(
device const char * v, device const char * v,
device const char * mask, device const char * mask,
device float * dst, device float * dst,
constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
constant int64_t & ne02, constant int64_t & ne02,
constant int64_t & ne03, constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01, constant uint64_t & nb01,
constant uint64_t & nb02, constant uint64_t & nb02,
constant uint64_t & nb03, constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11, constant int64_t & ne11,
constant int64_t & ne12, constant int64_t & ne12,
constant int64_t & ne13, constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11, constant uint64_t & nb11,
constant uint64_t & nb12, constant uint64_t & nb12,
constant uint64_t & nb13, constant uint64_t & nb13,
constant int64_t & ne31, constant uint64_t & nb21,
constant uint64_t & nb22,
constant uint64_t & nb23,
constant uint64_t & nb31, constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1, constant int64_t & ne1,
constant int64_t & ne2, constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale, constant float & scale,
constant float & max_bias,
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
threadgroup half * shared, threadgroup half * shared,
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
@ -2138,29 +2096,29 @@ kernel void kernel_flash_attn_ext_f16(
device const char * v, device const char * v,
device const char * mask, device const char * mask,
device float * dst, device float * dst,
constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
constant int64_t & ne02, constant int64_t & ne02,
constant int64_t & ne03, constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01, constant uint64_t & nb01,
constant uint64_t & nb02, constant uint64_t & nb02,
constant uint64_t & nb03, constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11, constant int64_t & ne11,
constant int64_t & ne12, constant int64_t & ne12,
constant int64_t & ne13, constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11, constant uint64_t & nb11,
constant uint64_t & nb12, constant uint64_t & nb12,
constant uint64_t & nb13, constant uint64_t & nb13,
constant int64_t & ne31, constant uint64_t & nb21,
constant uint64_t & nb22,
constant uint64_t & nb23,
constant uint64_t & nb31, constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1, constant int64_t & ne1,
constant int64_t & ne2, constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale, constant float & scale,
constant float & max_bias,
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
threadgroup half * shared [[threadgroup(0)]], threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
@ -2225,10 +2183,6 @@ kernel void kernel_flash_attn_ext_f16(
const short ne22 = ne12; const short ne22 = ne12;
const short ne23 = ne13; const short ne23 = ne13;
const uint nb21 = nb11;
const uint nb22 = nb12;
const uint nb23 = nb13;
// broadcast // broadcast
const short rk2 = ne02/ne12; const short rk2 = ne02/ne12;
const short rk3 = ne03/ne13; const short rk3 = ne03/ne13;
@ -2254,8 +2208,17 @@ kernel void kernel_flash_attn_ext_f16(
// pointer to the mask // pointer to the mask
device const half * mp = (device const half *) (mask + iq1*nb31); device const half * mp = (device const half *) (mask + iq1*nb31);
// prepare diagonal scale matrix float slope = 1.0f;
simdgroup_float8x8 mscale(scale);
// ALiBi
if (max_bias > 0.0f) {
const uint32_t h = iq2;
const float base = h < n_head_log2 ? m0 : m1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = pow(base, exph);
}
// loop over the KV cache // loop over the KV cache
// each simdgroup handles blocks of Q rows and C columns // each simdgroup handles blocks of Q rows and C columns
@ -2279,12 +2242,20 @@ kernel void kernel_flash_attn_ext_f16(
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
} }
// mqk = mqk*scale + mask
simdgroup_half8x8 mm;
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
simdgroup_store(mqk, ss + 8*cc, TF, 0, false); simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
const short tx = tiisg%4;
const short ty = tiisg/4;
if (mask != q) {
// mqk = mqk*scale + mask*slope
ss[8*cc + ty*TF + 2*tx + 0] = scale*ss[8*cc + ty*TF + 2*tx + 0] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 0];
ss[8*cc + ty*TF + 2*tx + 1] = scale*ss[8*cc + ty*TF + 2*tx + 1] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 1];
} else {
// mqk = mqk*scale
ss[8*cc + ty*TF + 2*tx + 0] *= scale;
ss[8*cc + ty*TF + 2*tx + 1] *= scale;
}
} }
} }
@ -2456,29 +2427,29 @@ kernel void kernel_flash_attn_ext_vec_f16(
device const char * v, device const char * v,
device const char * mask, device const char * mask,
device float * dst, device float * dst,
constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
constant int64_t & ne02, constant int64_t & ne02,
constant int64_t & ne03, constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01, constant uint64_t & nb01,
constant uint64_t & nb02, constant uint64_t & nb02,
constant uint64_t & nb03, constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11, constant int64_t & ne11,
constant int64_t & ne12, constant int64_t & ne12,
constant int64_t & ne13, constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11, constant uint64_t & nb11,
constant uint64_t & nb12, constant uint64_t & nb12,
constant uint64_t & nb13, constant uint64_t & nb13,
constant int64_t & ne31, constant uint64_t & nb21,
constant uint64_t & nb22,
constant uint64_t & nb23,
constant uint64_t & nb31, constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1, constant int64_t & ne1,
constant int64_t & ne2, constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale, constant float & scale,
constant float & max_bias,
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
threadgroup half * shared [[threadgroup(0)]], threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
@ -2497,6 +2468,18 @@ kernel void kernel_flash_attn_ext_vec_f16(
const short T = D + 2*nsg*SH; // shared memory size per query in (half) const short T = D + 2*nsg*SH; // shared memory size per query in (half)
float slope = 1.0f;
// ALiBi
if (max_bias > 0.0f) {
const uint32_t h = iq2;
const float base = h < n_head_log2 ? m0 : m1;
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = pow(base, exp);
}
//threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data //threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data
threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4
threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
@ -2537,10 +2520,6 @@ kernel void kernel_flash_attn_ext_vec_f16(
const short ne22 = ne12; const short ne22 = ne12;
const short ne23 = ne13; const short ne23 = ne13;
const uint nb21 = nb11;
const uint nb22 = nb12;
const uint nb23 = nb13;
// broadcast // broadcast
const short rk2 = ne02/ne12; const short rk2 = ne02/ne12;
const short rk3 = ne03/ne13; const short rk3 = ne03/ne13;
@ -2603,10 +2582,9 @@ kernel void kernel_flash_attn_ext_vec_f16(
mqk += simd_shuffle_down(mqk, 2); mqk += simd_shuffle_down(mqk, 2);
mqk += simd_shuffle_down(mqk, 1); mqk += simd_shuffle_down(mqk, 1);
// mqk = mqk*scale + mask // mqk = mqk*scale + mask*slope
if (tiisg == 0) { if (tiisg == 0) {
float4 mm = (float4) mp4[ic/4 + cc]; mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f);
mqk = mqk*scale + mm;
ss4[cc] = mqk; ss4[cc] = mqk;
} }

View File

@ -1,216 +0,0 @@
#include "ggml-mpi.h"
#include "ggml.h"
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define UNUSED GGML_UNUSED
struct ggml_mpi_context {
int rank;
int size;
};
void ggml_mpi_backend_init(void) {
MPI_Init(NULL, NULL);
}
void ggml_mpi_backend_free(void) {
MPI_Finalize();
}
struct ggml_mpi_context * ggml_mpi_init(void) {
struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
return ctx;
}
void ggml_mpi_free(struct ggml_mpi_context * ctx) {
free(ctx);
}
int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
return ctx->rank;
}
void ggml_mpi_eval_init(
struct ggml_mpi_context * ctx_mpi,
int * n_tokens,
int * n_past,
int * n_threads) {
UNUSED(ctx_mpi);
// synchronize the worker node parameters with the root node
MPI_Barrier(MPI_COMM_WORLD);
MPI_Bcast(n_tokens, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(n_past, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
if (t == NULL) {
fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
return -1;
}
for (int i = 0; i < gf->n_nodes; i++) {
if (gf->nodes[i] == t) {
return i;
}
}
fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name);
return -1;
}
static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
MPI_Datatype mpi_type;
switch (t->type) {
case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
default: GGML_ASSERT(false && "not implemented");
}
const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
GGML_ASSERT(retval == MPI_SUCCESS);
}
static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
MPI_Datatype mpi_type;
switch (t->type) {
case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
default: GGML_ASSERT(false && "not implemented");
}
MPI_Status status; UNUSED(status);
const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
GGML_ASSERT(retval == MPI_SUCCESS);
}
// TODO: there are many improvements that can be done to this implementation
void ggml_mpi_graph_compute_pre(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers) {
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
if (inp_tokens == NULL) {
fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
return;
}
struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
if (inp0 == NULL) {
fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
return;
}
GGML_ASSERT(inp0 == gf->nodes[0]);
// distribute the compute graph into slices across the MPI nodes
//
// the main node (0) processes the last layers + the remainder of the compute graph
// and is responsible to pass the input tokens to the first node (1)
//
// node 1: [( 0) * n_per_node, ( 1) * n_per_node)
// node 2: [( 1) * n_per_node, ( 2) * n_per_node)
// ...
// node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
// node 0: [(n-1) * n_per_node, n_nodes)
//
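// (Illustrative worked example, not part of the original file.) With n_layers = 32
// and mpi_size = 4, n_per_node = (32 + 3) / 4 = 8, so the slices above become:
//   node 1: layers [ 0,  8)
//   node 2: layers [ 8, 16)
//   node 3: layers [16, 24)
//   node 0: layers [24, 32) + the remainder of the compute graph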
if (mpi_rank > 0) {
if (mpi_rank == 1) {
// the first node (1) receives the input tokens from the main node (0)
ggml_mpi_tensor_recv(inp_tokens, 0);
} else {
// recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
}
} else if (mpi_size > 1) {
// node 0 sends the input tokens to node 1
ggml_mpi_tensor_send(inp_tokens, 1);
// recv the output data from the last node
ggml_mpi_tensor_recv(inp0, mpi_size - 1);
}
{
const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
const int il0 = (mpi_idx + 0) * n_per_node;
const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
char name_l0[GGML_MAX_NAME];
char name_l1[GGML_MAX_NAME];
snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0);
const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
if (idx_l0 < 0 || idx_l1 < 0) {
fprintf(stderr, "%s: layer input nodes not found\n", __func__);
return;
}
// attach the input data to all nodes that need it
// TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
for (int i = idx_l0; i < idx_l1; i++) {
if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
gf->nodes[i]->src[0] = inp0;
}
if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
gf->nodes[i]->src[1] = inp0;
}
}
// TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
for (int i = 1; i < idx_l1 - idx_l0; i++) {
gf->nodes[i] = gf->nodes[idx_l0 + i];
gf->grads[i] = gf->grads[idx_l0 + i];
}
// the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
if (mpi_idx != 0) {
gf->nodes[0]->op = GGML_OP_NONE;
}
gf->n_nodes = idx_l1 - idx_l0;
//fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
}
}
void ggml_mpi_graph_compute_post(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers) {
UNUSED(n_layers);
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
// send the output data to the next node
if (mpi_rank > 0) {
ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
}
}

View File

@ -1,39 +0,0 @@
#pragma once
struct ggml_context;
struct ggml_tensor;
struct ggml_cgraph;
#ifdef __cplusplus
extern "C" {
#endif
struct ggml_mpi_context;
void ggml_mpi_backend_init(void);
void ggml_mpi_backend_free(void);
struct ggml_mpi_context * ggml_mpi_init(void);
void ggml_mpi_free(struct ggml_mpi_context * ctx);
int ggml_mpi_rank(struct ggml_mpi_context * ctx);
void ggml_mpi_eval_init(
struct ggml_mpi_context * ctx_mpi,
int * n_tokens,
int * n_past,
int * n_threads);
void ggml_mpi_graph_compute_pre(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers);
void ggml_mpi_graph_compute_post(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers);
#ifdef __cplusplus
}
#endif

View File

@ -1,4 +1,4 @@
#include "ggml.h" #include "ggml.h"
#include "ggml-opencl.h" #include "ggml-opencl.h"
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
@ -1835,7 +1835,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
} }
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { int64_t i12 = i02 * r2;
int64_t e12 = i12 + r2;
events.reserve(e12 - i12);
for (; i12 < e12; i12++) {
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
// copy src1 to device // copy src1 to device
events.emplace_back(); events.emplace_back();

File diff suppressed because it is too large

1155
ggml-rpc.cpp Normal file

File diff suppressed because it is too large

24
ggml-rpc.h Normal file
View File

@ -0,0 +1,24 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_RPC_MAX_SERVERS 16
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
#ifdef __cplusplus
}
#endif
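A hedged usage sketch for the new RPC backend API declared above; the endpoint format ("host:port") and the error handling are assumptions, and ggml_backend_free comes from ggml-backend.h:

#include "ggml-backend.h"
#include "ggml-rpc.h"
#include <stdio.h>

int main(void) {
    // Connect to a running RPC server (endpoint format assumed to be "host:port").
    ggml_backend_t backend = ggml_backend_rpc_init("127.0.0.1:50052");
    if (backend == NULL) {
        fprintf(stderr, "failed to create RPC backend\n");
        return 1;
    }
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_rpc_get_device_memory("127.0.0.1:50052", &free_mem, &total_mem);
    printf("remote device memory: %zu free / %zu total bytes\n", free_mem, total_mem);
    // ... allocate buffers via ggml_backend_rpc_buffer_type() and run graphs as with any other backend ...
    ggml_backend_free(backend);
    return 0;
}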

View File

@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
#define SYCL_SCALE_BLOCK_SIZE 256 #define SYCL_SCALE_BLOCK_SIZE 256
#define SYCL_CLAMP_BLOCK_SIZE 256 #define SYCL_CLAMP_BLOCK_SIZE 256
#define SYCL_ROPE_BLOCK_SIZE 256 #define SYCL_ROPE_BLOCK_SIZE 256
#define SYCL_ALIBI_BLOCK_SIZE 32
#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
#define SYCL_QUANTIZE_BLOCK_SIZE 256 #define SYCL_QUANTIZE_BLOCK_SIZE 256
#define SYCL_DEQUANTIZE_BLOCK_SIZE 256 #define SYCL_DEQUANTIZE_BLOCK_SIZE 256
@ -3848,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
    }
}

-static void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor,
-                        const sycl::nd_item<3> &item_ct1) {
-    int ne0 = ne00 * scale_factor;
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
+static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
+                        const int nb02, const int nb03, const int ne10, const int ne11,
+                        const int ne12, const int ne13, const float sf0, const float sf1,
+                        const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
+    int index = item_ct1.get_local_id(0) +
+                item_ct1.get_group(0) * item_ct1.get_local_range(0);
+    if (index >= ne10 * ne11 * ne12 * ne13) {
        return;
    }

    // operation
-    int i00 = nidx / scale_factor;
-    int i01 = item_ct1.get_group(1) / scale_factor;
-    int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    dst[offset_dst] = x[offset_src];
+    int i10 = index % ne10;
+    int i11 = (index / ne10) % ne11;
+    int i12 = (index / (ne10 * ne11)) % ne12;
+    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+    int i00 = i10 / sf0;
+    int i01 = i11 / sf1;
+    int i02 = i12 / sf2;
+    int i03 = i13 / sf3;
+
+    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
}

static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
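
As a side note on the kernel above: the rewritten upscale_f32 flattens the destination tensor into a single index, decodes the four coordinates with modulo/division, and divides each coordinate by its scale factor to locate the source element, which it then reads through the source byte strides. A small host-side sketch of that index arithmetic (a hedged illustration, not code from this diff) is:

    // Sketch of the index decomposition used by the new upscale_f32 kernel.
    // ne1x are destination extents, sfx = dst_extent / src_extent, nb0x are source byte strides.
    #include <cstddef>

    static size_t upscale_src_offset(int index,
                                     int ne10, int ne11, int ne12, int ne13,
                                     float sf0, float sf1, float sf2, float sf3,
                                     size_t nb00, size_t nb01, size_t nb02, size_t nb03) {
        int i10 =  index                         % ne10;
        int i11 = (index / ne10)                 % ne11;
        int i12 = (index / (ne10 * ne11))        % ne12;
        int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
        // nearest-neighbour mapping back into the source tensor
        int i00 = (int)(i10 / sf0);
        int i01 = (int)(i11 / sf1);
        int i02 = (int)(i12 / sf2);
        int i03 = (int)(i13 / sf3);
        return (size_t)i03 * nb03 + (size_t)i02 * nb02 + (size_t)i01 * nb01 + (size_t)i00 * nb00;
    }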
@ -8330,24 +8335,26 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;

+   const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block

    // partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

-   for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+   for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
         i += blocks_per_warp) {
-       const int ibx = row*blocks_per_row + i; // x block index
+       const int ibx = row * blocks_per_row + i; // x block index
-       const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+       const int iby = i * (qk / QK8_1); // y block index that aligns with ibx

        const int iqs =
            vdr *
-           (item_ct1.get_local_id(2) %
-            (qi / vdr)); // x block quant index when casting the quants to int
+           (item_ct1.get_local_id(2) -
+            i * qi_vdr); // x block quant index when casting the quants to int

        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
@ -9314,32 +9321,6 @@ static void rope_glm_f32(
    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
}
static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
const int n_heads_log2_floor, const float m0, const float m1,
const sycl::nd_item<3> &item_ct1) {
const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
if (col >= ncols) {
return;
}
const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
item_ct1.get_local_id(1);
const int i = row*ncols + col;
const int k = row/k_rows;
float m_k;
if (k < n_heads_log2_floor) {
m_k = dpct::pow(m0, k + 1);
} else {
m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
}
dst[i] = col * m_k + x[i];
}
static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
                           const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(1);
@ -9441,7 +9422,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
                         const int nrows_y, const float scale, const float max_bias, const float m0,
                         const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@ -9455,7 +9436,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
    const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
    const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;

-   float slope = 0.0f;
+   float slope = 1.0f;

    // ALiBi
    if (max_bias > 0.0f) {
@ -9480,7 +9461,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
        const int ix = rowx*ncols + col;
        const int iy = rowy*ncols + col;

-       const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+       const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);

        vals[col] = val;
        max_val = sycl::max(max_val, val);
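
The soft-max kernel no longer receives a separate positions tensor: slope now defaults to 1.0f and, when max_bias > 0, the per-head ALiBi slope is folded directly into the mask term, so the biased logit becomes x*scale + slope*mask instead of x*scale + mask + slope*pos. A scalar sketch of the updated formula (illustrative only, not the SYCL kernel itself) is:

    // Scalar sketch of the updated soft-max input term.
    // slope is the per-head ALiBi slope; it stays 1.0f when max_bias == 0, so the mask is added unchanged.
    inline float softmax_input(float x, const float * mask, int iy, float scale, float slope) {
        return x * scale + (mask ? slope * mask[iy] : 0.0f);
    }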
@ -10110,18 +10091,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
        });
}

-static void upscale_f32_sycl(const float *x, float *dst, const int ne00,
-                             const int ne01, const int ne02,
-                             const int scale_factor, dpct::queue_ptr stream) {
-    int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
+static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+                             const int nb02, const int nb03, const int ne10, const int ne11,
+                             const int ne12, const int ne13, const float sf0, const float sf1,
+                             const float sf2, const float sf3, dpct::queue_ptr stream) {
+    int dst_size = ne10 * ne11 * ne12 * ne13;
+    int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
+    sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
    stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
+        sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<1> item_ct1) {
+            upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
        });
}
@ -12962,20 +12942,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
        });
}
static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
const int nrows, const int k_rows,
const int n_heads_log2_floor, const float m0,
const float m1, dpct::queue_ptr stream) {
const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
const sycl::range<3> block_nums(1, nrows, num_blocks_x);
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
alibi_f32(x, dst, ncols, k_rows,
n_heads_log2_floor, m0, m1, item_ct1);
});
}
static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                              const int nrows, dpct::queue_ptr stream) {
    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@ -13056,7 +13022,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
}

template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
                                   const int nrows_y, const float scale, const float max_bias, const float m0,
                                   const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
                                   const size_t n_local_scratch, dpct::queue_ptr stream) {
@ -13066,7 +13032,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
        cgh.parallel_for(
            sycl::nd_range<3>(block_nums * block_dims, block_dims),
            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-               soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
+               soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
                                                                             nrows_y, scale, max_bias, m0,
                                                                             m1, n_head_log2, item_ct1,
                                                                             local_buf_acc.get_pointer());
@ -13074,7 +13040,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
        });
}

-static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
+static void soft_max_f32_sycl(const float * x, const float * mask,
                              float * dst, const int ncols_x, const int nrows_x,
                              const int nrows_y, const float scale, const float max_bias,
                              dpct::queue_ptr stream) {
@ -13096,60 +13062,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
    const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
    if (n_local_scratch*sizeof(float) < local_mem_size) {
        if (ncols_x > max_block_size) {
-           soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+           soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                               max_bias, m0, m1, n_head_log2, block_nums,
                                               block_dims, n_local_scratch, stream);
            return;
        }
        switch (ncols_x) {
            case 32:
-               soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
                                                     max_bias, m0, m1, n_head_log2, block_nums,
                                                     block_dims, n_local_scratch, stream);
                break;
            case 64:
-               soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
                                                     max_bias, m0, m1, n_head_log2, block_nums,
                                                     block_dims, n_local_scratch, stream);
                break;
            case 128:
-               soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
                                                       max_bias, m0, m1, n_head_log2, block_nums,
                                                       block_dims, n_local_scratch, stream);
                break;
            case 256:
-               soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
                                                       max_bias, m0, m1, n_head_log2, block_nums,
                                                       block_dims, n_local_scratch, stream);
                break;
            case 512:
-               soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
                                                       max_bias, m0, m1, n_head_log2, block_nums,
                                                       block_dims, n_local_scratch, stream);
                break;
            case 1024:
-               soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                         max_bias, m0, m1, n_head_log2, block_nums,
                                                         block_dims, n_local_scratch, stream);
                break;
            case 2048:
-               soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                         max_bias, m0, m1, n_head_log2, block_nums,
                                                         block_dims, n_local_scratch, stream);
                break;
            case 4096:
-               soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                         max_bias, m0, m1, n_head_log2, block_nums,
                                                         block_dims, n_local_scratch, stream);
                break;
            default:
-               soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                                   max_bias, m0, m1, n_head_log2, block_nums,
                                                   block_dims, n_local_scratch, stream);
                break;
        }
    } else {
-       soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+       soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                            max_bias, m0, m1, n_head_log2, block_nums,
                                            block_dims, WARP_SIZE, stream);
    }
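
The dispatch pattern itself is unchanged by this edit: a runtime switch on ncols_x picks a template instantiation whose column count and block size are compile-time constants, with the generic <0, 0> variant (or the non-shared-memory path) as the fallback. Reduced to a minimal hedged sketch, with placeholder names rather than the real kernel, the idiom looks like:

    #include <cstdio>

    // Minimal sketch: a runtime switch selects a specialization whose size is a compile-time constant (0 = generic).
    template <int ncols_template>
    void soft_max_impl(int ncols_runtime) {
        const int ncols = ncols_template == 0 ? ncols_runtime : ncols_template;
        std::printf("running specialization ncols=%d\n", ncols);
    }

    void soft_max_dispatch(int ncols_x) {
        switch (ncols_x) {
            case  32: soft_max_impl<32>(ncols_x);  break;
            case  64: soft_max_impl<64>(ncols_x);  break;
            case 128: soft_max_impl<128>(ncols_x); break;
            default:  soft_max_impl<0>(ncols_x);   break; // generic fallback
        }
    }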
@ -14024,11 +13990,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-   GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

-   const int scale_factor = dst->op_params[0];
+   const float sf0 = (float)dst->ne[0]/src0->ne[0];
+   const float sf1 = (float)dst->ne[1]/src0->ne[1];
+   const float sf2 = (float)dst->ne[2]/src0->ne[2];
+   const float sf3 = (float)dst->ne[3]/src0->ne[3];

-   upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+   upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                    dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+                    main_stream);

    (void) src1;
    (void) dst;
@ -14484,6 +14454,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
                              ggml_tensor *dst, const float *src0_dd,
                              const float *src1_dd, float *dst_dd,
                              const dpct::queue_ptr &main_stream) {
#pragma message("TODO: implement phi3 frequency factors support")
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
@ -14560,36 +14533,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
    (void) src1_dd;
}
inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
ggml_tensor *dst, const float *src0_dd,
const float *src1_dd, float *dst_dd,
const dpct::queue_ptr &main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
const int64_t nrows = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
//GGML_ASSERT(ne01 + n_past == ne00);
GGML_ASSERT(n_head == ne02);
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
(void) src1;
(void) src1_dd;
}
static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
                                const ggml_tensor *src1, ggml_tensor *dst,
                                const float *src0_dd, const float *src1_dd,
@ -14744,12 +14687,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

-   const ggml_tensor * src2 = dst->src[2];

-#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
+#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")

    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-   GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional

    const int64_t ne00 = src0->ne[0];
    const int64_t nrows_x = ggml_nrows(src0);
@ -14761,25 +14701,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
    memcpy(&scale, dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, dst->op_params + 1, sizeof(float));

-   // positions tensor
-   float * src2_dd = nullptr;
-   sycl_pool_alloc<float> src2_f;
-
-   const bool use_src2 = src2 != nullptr;
-
-   if (use_src2) {
-       const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-
-       if (src2_on_device) {
-           ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
-           src2_dd = (float *) src2_extra->data_device[g_main_device];
-       } else {
-           src2_dd = src2_f.alloc(ggml_nelements(src2));
-           SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
-       }
-   }
-
-   soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
+   soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
                      nrows_x, nrows_y, scale, max_bias, main_stream);
}
@ -15654,26 +15576,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;
#if 0
// use syclGemmEx
{
for (int i13 = 0; i13 < ne13; ++i13) {
for (int i12 = 0; i12 < ne12; ++i12) {
int i03 = i13 / r3;
int i02 = i12 / r2;
SYCL_CHECK(
syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
}
}
#else
    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
@ -15685,7 +15587,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
            nb11 / nb10, nb12 / nb10, beta,
            (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
            ne12 * ne13, cu_compute_type)));
-       g_sycl_handles[g_main_device]->wait();
    } else {
        const int ne23 = ne12*ne13;
@ -15716,7 +15617,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
                          nb02, nb03, nb12_scaled, nb13_scaled,
                          nbd2, nbd3, r2, r3, item_ct1);
                });
-       }).wait();
+       });
    }

    SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
        *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@ -15727,9 +15628,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
        dpct::library_data_t::real_half, nb11 / nb10, beta,
        (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
        cu_compute_type)));
-       g_sycl_handles[g_main_device]->wait();
    }
-#endif

    if (no_mixed_dtypes) {
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
@ -16230,10 +16129,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
}

-static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
-}

static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
}
@ -16610,9 +16505,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
        case GGML_OP_ROPE:
            func = ggml_sycl_rope;
            break;
-       case GGML_OP_ALIBI:
-           func = ggml_sycl_alibi;
-           break;
        case GGML_OP_IM2COL:
            func = ggml_sycl_im2col;
            break;
@ -17742,7 +17634,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_ROPE:
-       case GGML_OP_ALIBI:
        case GGML_OP_IM2COL:
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:

File diff suppressed because it is too large

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff