Merge remote-tracking branch 'ngxson/xsn/server_model_management_v1_2' into allozaur/server_model_management_v1_2
This commit is contained in:
commit
6282537a8b
|
|
@ -69,13 +69,6 @@ jobs:
|
||||||
key: macOS-latest-cmake-arm64
|
key: macOS-latest-cmake-arm64
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: Dependencies
|
|
||||||
id: depends
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
brew update
|
|
||||||
brew install curl
|
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
|
|
@ -83,6 +76,8 @@ jobs:
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||||
-DLLAMA_FATAL_WARNINGS=ON \
|
-DLLAMA_FATAL_WARNINGS=ON \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||||
-DGGML_METAL_USE_BF16=ON \
|
-DGGML_METAL_USE_BF16=ON \
|
||||||
-DGGML_METAL_EMBED_LIBRARY=OFF \
|
-DGGML_METAL_EMBED_LIBRARY=OFF \
|
||||||
-DGGML_METAL_SHADER_DEBUG=ON \
|
-DGGML_METAL_SHADER_DEBUG=ON \
|
||||||
|
|
@ -110,13 +105,6 @@ jobs:
|
||||||
key: macOS-latest-cmake-x64
|
key: macOS-latest-cmake-x64
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: Dependencies
|
|
||||||
id: depends
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
brew update
|
|
||||||
brew install curl
|
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
|
|
@ -126,6 +114,8 @@ jobs:
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||||
-DLLAMA_FATAL_WARNINGS=ON \
|
-DLLAMA_FATAL_WARNINGS=ON \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||||
-DGGML_METAL=OFF \
|
-DGGML_METAL=OFF \
|
||||||
-DGGML_RPC=ON \
|
-DGGML_RPC=ON \
|
||||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
|
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
|
||||||
|
|
@ -151,13 +141,6 @@ jobs:
|
||||||
key: macOS-latest-cmake-arm64-webgpu
|
key: macOS-latest-cmake-arm64-webgpu
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: Dependencies
|
|
||||||
id: depends
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
brew update
|
|
||||||
brew install curl
|
|
||||||
|
|
||||||
- name: Dawn Dependency
|
- name: Dawn Dependency
|
||||||
id: dawn-depends
|
id: dawn-depends
|
||||||
run: |
|
run: |
|
||||||
|
|
@ -217,7 +200,7 @@ jobs:
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install -y --no-install-recommends \
|
sudo apt-get install -y --no-install-recommends \
|
||||||
python3 python3-pip python3-dev \
|
python3 python3-pip python3-dev \
|
||||||
libjpeg-dev build-essential libcurl4-openssl-dev \
|
libjpeg-dev build-essential libssl-dev \
|
||||||
git-lfs
|
git-lfs
|
||||||
|
|
||||||
- name: Python Dependencies
|
- name: Python Dependencies
|
||||||
|
|
@ -238,6 +221,8 @@ jobs:
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DLLAMA_FATAL_WARNINGS=ON \
|
-DLLAMA_FATAL_WARNINGS=ON \
|
||||||
-DGGML_RPC=ON
|
-DGGML_RPC=ON
|
||||||
cmake --build build --config Release -j $(nproc)
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
@ -294,13 +279,15 @@ jobs:
|
||||||
id: depends
|
id: depends
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
sudo apt-get install build-essential libssl-dev
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DLLAMA_FATAL_WARNINGS=ON \
|
-DLLAMA_FATAL_WARNINGS=ON \
|
||||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||||
|
|
@ -311,6 +298,8 @@ jobs:
|
||||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DLLAMA_FATAL_WARNINGS=ON \
|
-DLLAMA_FATAL_WARNINGS=ON \
|
||||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
|
|
@ -335,7 +324,7 @@ jobs:
|
||||||
id: depends
|
id: depends
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
sudo apt-get install build-essential libssl-dev
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
|
|
@ -343,6 +332,8 @@ jobs:
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake .. \
|
cmake .. \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DLLAMA_FATAL_WARNINGS=ON \
|
-DLLAMA_FATAL_WARNINGS=ON \
|
||||||
-DLLAMA_LLGUIDANCE=ON
|
-DLLAMA_LLGUIDANCE=ON
|
||||||
cmake --build . --config Release -j $(nproc)
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
@ -373,12 +364,14 @@ jobs:
|
||||||
id: depends
|
id: depends
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
sudo apt-get install build-essential libssl-dev
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DGGML_RPC=ON
|
-DGGML_RPC=ON
|
||||||
cmake --build build --config Release -j $(nproc)
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
|
|
@ -405,12 +398,14 @@ jobs:
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
id: depends
|
id: depends
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get install -y glslc libvulkan-dev libcurl4-openssl-dev
|
sudo apt-get install -y glslc libvulkan-dev libssl-dev
|
||||||
|
|
||||||
- name: Configure
|
- name: Configure
|
||||||
id: cmake_configure
|
id: cmake_configure
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||||
-DGGML_BACKEND_DL=ON \
|
-DGGML_BACKEND_DL=ON \
|
||||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||||
|
|
@ -440,7 +435,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
sudo add-apt-repository -y ppa:kisak/kisak-mesa
|
sudo add-apt-repository -y ppa:kisak/kisak-mesa
|
||||||
sudo apt-get update -y
|
sudo apt-get update -y
|
||||||
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
|
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
|
||||||
|
|
||||||
- name: Get latest Vulkan SDK version
|
- name: Get latest Vulkan SDK version
|
||||||
id: vulkan_sdk_version
|
id: vulkan_sdk_version
|
||||||
|
|
@ -466,6 +461,8 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
source ./vulkan_sdk/setup-env.sh
|
source ./vulkan_sdk/setup-env.sh
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DGGML_VULKAN=ON
|
-DGGML_VULKAN=ON
|
||||||
cmake --build build --config Release -j $(nproc)
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
|
|
@ -497,7 +494,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
sudo add-apt-repository -y ppa:kisak/kisak-mesa
|
sudo add-apt-repository -y ppa:kisak/kisak-mesa
|
||||||
sudo apt-get update -y
|
sudo apt-get update -y
|
||||||
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
|
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
|
||||||
|
|
||||||
- name: Get latest Vulkan SDK version
|
- name: Get latest Vulkan SDK version
|
||||||
id: vulkan_sdk_version
|
id: vulkan_sdk_version
|
||||||
|
|
@ -537,7 +534,10 @@ jobs:
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
export Dawn_DIR=dawn/lib64/cmake/Dawn
|
export Dawn_DIR=dawn/lib64/cmake/Dawn
|
||||||
cmake -B build -DGGML_WEBGPU=ON
|
cmake -B build \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
|
-DGGML_WEBGPU=ON
|
||||||
cmake --build build --config Release -j $(nproc)
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
|
|
@ -560,7 +560,7 @@ jobs:
|
||||||
id: depends
|
id: depends
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev rocwmma-dev
|
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
|
@ -572,6 +572,8 @@ jobs:
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
cmake -B build -S . \
|
cmake -B build -S . \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||||
-DGGML_HIP=ON
|
-DGGML_HIP=ON
|
||||||
|
|
@ -590,7 +592,7 @@ jobs:
|
||||||
id: depends
|
id: depends
|
||||||
run: |
|
run: |
|
||||||
apt-get update
|
apt-get update
|
||||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
apt-get install -y build-essential git cmake libssl-dev
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
|
@ -602,6 +604,8 @@ jobs:
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
cmake -B build -S . \
|
cmake -B build -S . \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DGGML_MUSA=ON
|
-DGGML_MUSA=ON
|
||||||
cmake --build build --config Release -j $(nproc)
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
|
|
@ -626,7 +630,7 @@ jobs:
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
sudo apt update
|
sudo apt update
|
||||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
|
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
|
||||||
|
|
||||||
- name: install oneAPI MKL library
|
- name: install oneAPI MKL library
|
||||||
shell: bash
|
shell: bash
|
||||||
|
|
@ -648,6 +652,8 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DGGML_SYCL=ON \
|
-DGGML_SYCL=ON \
|
||||||
-DCMAKE_C_COMPILER=icx \
|
-DCMAKE_C_COMPILER=icx \
|
||||||
-DCMAKE_CXX_COMPILER=icpx
|
-DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
@ -674,7 +680,7 @@ jobs:
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
sudo apt update
|
sudo apt update
|
||||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
|
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
|
||||||
|
|
||||||
- name: install oneAPI MKL library
|
- name: install oneAPI MKL library
|
||||||
shell: bash
|
shell: bash
|
||||||
|
|
@ -696,6 +702,8 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DGGML_SYCL=ON \
|
-DGGML_SYCL=ON \
|
||||||
-DCMAKE_C_COMPILER=icx \
|
-DCMAKE_C_COMPILER=icx \
|
||||||
-DCMAKE_CXX_COMPILER=icpx \
|
-DCMAKE_CXX_COMPILER=icpx \
|
||||||
|
|
@ -722,12 +730,6 @@ jobs:
|
||||||
key: macOS-latest-cmake-ios
|
key: macOS-latest-cmake-ios
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: Dependencies
|
|
||||||
id: depends
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
brew update
|
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
|
|
@ -759,12 +761,6 @@ jobs:
|
||||||
key: macOS-latest-cmake-tvos
|
key: macOS-latest-cmake-tvos
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: Dependencies
|
|
||||||
id: depends
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
brew update
|
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
|
|
@ -790,12 +786,6 @@ jobs:
|
||||||
id: checkout
|
id: checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Dependencies
|
|
||||||
id: depends
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
brew update
|
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
|
|
@ -838,12 +828,6 @@ jobs:
|
||||||
name: llama-xcframework
|
name: llama-xcframework
|
||||||
path: build-apple/llama.xcframework/
|
path: build-apple/llama.xcframework/
|
||||||
|
|
||||||
- name: Dependencies
|
|
||||||
id: depends
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
brew update
|
|
||||||
|
|
||||||
- name: Build llama.cpp with CMake
|
- name: Build llama.cpp with CMake
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
|
|
@ -995,21 +979,12 @@ jobs:
|
||||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||||
cmake --build build-arm64-release --target install --config release
|
cmake --build build-arm64-release --target install --config release
|
||||||
|
|
||||||
- name: libCURL
|
|
||||||
id: get_libcurl
|
|
||||||
uses: ./.github/actions/windows-setup-curl
|
|
||||||
with:
|
|
||||||
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
|
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
env:
|
|
||||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
|
||||||
run: |
|
run: |
|
||||||
cmake -S . -B build ${{ matrix.defines }} `
|
cmake -S . -B build ${{ matrix.defines }} `
|
||||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
|
-DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
|
||||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||||
cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release
|
|
||||||
|
|
||||||
- name: Add libopenblas.dll
|
- name: Add libopenblas.dll
|
||||||
id: add_libopenblas_dll
|
id: add_libopenblas_dll
|
||||||
|
|
@ -1053,7 +1028,7 @@ jobs:
|
||||||
DEBIAN_FRONTEND: noninteractive
|
DEBIAN_FRONTEND: noninteractive
|
||||||
run: |
|
run: |
|
||||||
apt update
|
apt update
|
||||||
apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev
|
apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
|
@ -1064,10 +1039,12 @@ jobs:
|
||||||
- name: Build with CMake
|
- name: Build with CMake
|
||||||
run: |
|
run: |
|
||||||
cmake -S . -B build -G Ninja \
|
cmake -S . -B build -G Ninja \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
|
-DLLAMA_FATAL_WARNINGS=ON \
|
||||||
-DCMAKE_BUILD_TYPE=Release \
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
-DCMAKE_CUDA_ARCHITECTURES=89-real \
|
-DCMAKE_CUDA_ARCHITECTURES=89-real \
|
||||||
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
|
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
|
||||||
-DLLAMA_FATAL_WARNINGS=ON \
|
|
||||||
-DGGML_NATIVE=OFF \
|
-DGGML_NATIVE=OFF \
|
||||||
-DGGML_CUDA=ON
|
-DGGML_CUDA=ON
|
||||||
cmake --build build
|
cmake --build build
|
||||||
|
|
@ -1101,25 +1078,20 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
choco install ninja
|
choco install ninja
|
||||||
|
|
||||||
- name: libCURL
|
|
||||||
id: get_libcurl
|
|
||||||
uses: ./.github/actions/windows-setup-curl
|
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
shell: cmd
|
shell: cmd
|
||||||
env:
|
|
||||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
|
||||||
run: |
|
run: |
|
||||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
|
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||||
-DLLAMA_BUILD_SERVER=ON ^
|
-DLLAMA_BUILD_SERVER=ON ^
|
||||||
|
-DLLAMA_CURL=OFF ^
|
||||||
|
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||||
-DGGML_NATIVE=OFF ^
|
-DGGML_NATIVE=OFF ^
|
||||||
-DGGML_BACKEND_DL=ON ^
|
-DGGML_BACKEND_DL=ON ^
|
||||||
-DGGML_CPU_ALL_VARIANTS=ON ^
|
-DGGML_CPU_ALL_VARIANTS=ON ^
|
||||||
-DGGML_CUDA=ON ^
|
-DGGML_CUDA=ON ^
|
||||||
-DGGML_RPC=ON ^
|
-DGGML_RPC=ON
|
||||||
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include"
|
|
||||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
|
|
@ -1151,7 +1123,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||||
|
|
||||||
# TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
|
|
@ -1208,14 +1180,8 @@ jobs:
|
||||||
key: ${{ github.job }}
|
key: ${{ github.job }}
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: libCURL
|
|
||||||
id: get_libcurl
|
|
||||||
uses: ./.github/actions/windows-setup-curl
|
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
env:
|
|
||||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
|
||||||
run: |
|
run: |
|
||||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||||
|
|
@ -1224,11 +1190,12 @@ jobs:
|
||||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
|
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
|
||||||
-DCMAKE_BUILD_TYPE=Release `
|
-DCMAKE_BUILD_TYPE=Release `
|
||||||
|
-DLLAMA_CURL=OFF `
|
||||||
|
-DLLAMA_BUILD_BORINGSSL=ON `
|
||||||
-DROCM_DIR="${env:HIP_PATH}" `
|
-DROCM_DIR="${env:HIP_PATH}" `
|
||||||
-DGGML_HIP=ON `
|
-DGGML_HIP=ON `
|
||||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||||
-DGGML_RPC=ON `
|
-DGGML_RPC=ON
|
||||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
|
|
||||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||||
|
|
||||||
ios-xcode-build:
|
ios-xcode-build:
|
||||||
|
|
|
||||||
|
|
@ -56,7 +56,7 @@ jobs:
|
||||||
curl \
|
curl \
|
||||||
wget \
|
wget \
|
||||||
language-pack-en \
|
language-pack-en \
|
||||||
libcurl4-openssl-dev
|
libssl-dev
|
||||||
|
|
||||||
- name: Clone
|
- name: Clone
|
||||||
id: checkout
|
id: checkout
|
||||||
|
|
@ -242,7 +242,7 @@ jobs:
|
||||||
curl \
|
curl \
|
||||||
wget \
|
wget \
|
||||||
language-pack-en \
|
language-pack-en \
|
||||||
libcurl4-openssl-dev
|
libssl-dev
|
||||||
|
|
||||||
- name: Clone
|
- name: Clone
|
||||||
id: checkout
|
id: checkout
|
||||||
|
|
@ -283,6 +283,8 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DGGML_NATIVE=OFF \
|
-DGGML_NATIVE=OFF \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DLLAMA_BUILD_SERVER=ON \
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
|
|
@ -295,6 +297,8 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DGGML_NATIVE=OFF \
|
-DGGML_NATIVE=OFF \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DLLAMA_BUILD_SERVER=ON \
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||||
|
|
@ -306,6 +310,8 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DGGML_NATIVE=OFF \
|
-DGGML_NATIVE=OFF \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DLLAMA_OPENSSL=ON \
|
||||||
-DLLAMA_BUILD_SERVER=ON \
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
|
||||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||||
|
|
|
||||||
|
|
@ -242,6 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||||
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
||||||
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
|
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
|
||||||
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
|
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
|
||||||
|
- [unslothai/unsloth](https://github.com/unslothai/unsloth) – 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0)
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -212,7 +212,6 @@ struct handle_model_result {
|
||||||
static handle_model_result common_params_handle_model(
|
static handle_model_result common_params_handle_model(
|
||||||
struct common_params_model & model,
|
struct common_params_model & model,
|
||||||
const std::string & bearer_token,
|
const std::string & bearer_token,
|
||||||
const std::string & model_path_default,
|
|
||||||
bool offline) {
|
bool offline) {
|
||||||
handle_model_result result;
|
handle_model_result result;
|
||||||
// handle pre-fill default model path and url based on hf_repo and hf_file
|
// handle pre-fill default model path and url based on hf_repo and hf_file
|
||||||
|
|
@ -257,8 +256,6 @@ static handle_model_result common_params_handle_model(
|
||||||
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
|
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (model.path.empty()) {
|
|
||||||
model.path = model_path_default;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -405,7 +402,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||||
|
|
||||||
// handle model and download
|
// handle model and download
|
||||||
{
|
{
|
||||||
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
|
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
|
||||||
if (params.no_mmproj) {
|
if (params.no_mmproj) {
|
||||||
params.mmproj = {};
|
params.mmproj = {};
|
||||||
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
||||||
|
|
@ -415,12 +412,18 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||||
// only download mmproj if the current example is using it
|
// only download mmproj if the current example is using it
|
||||||
for (auto & ex : mmproj_examples) {
|
for (auto & ex : mmproj_examples) {
|
||||||
if (ctx_arg.ex == ex) {
|
if (ctx_arg.ex == ex) {
|
||||||
common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
|
common_params_handle_model(params.mmproj, params.hf_token, params.offline);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
|
common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
|
||||||
common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
|
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
|
||||||
|
}
|
||||||
|
|
||||||
|
// model is required (except for server)
|
||||||
|
// TODO @ngxson : maybe show a list of available models in CLI in this case
|
||||||
|
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER) {
|
||||||
|
throw std::invalid_argument("error: --model is required\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.escape) {
|
if (params.escape) {
|
||||||
|
|
@ -2072,11 +2075,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-m", "--model"}, "FNAME",
|
{"-m", "--model"}, "FNAME",
|
||||||
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
||||||
? std::string("model path from which to load base model")
|
? "model path from which to load base model"
|
||||||
: string_format(
|
: "model path to load",
|
||||||
"model path (default: `models/$filename` with filename from `--hf-file` "
|
|
||||||
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
|
|
||||||
),
|
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.model.path = value;
|
params.model.path = value;
|
||||||
}
|
}
|
||||||
|
|
@ -2474,6 +2474,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--models-dir"}, "PATH",
|
||||||
|
"directory containing models for the router server (default: disabled)",
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
params.models_dir = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--max-models"}, "N",
|
||||||
|
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.max_models),
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.max_models = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MAX_MODELS"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--jinja"},
|
{"--jinja"},
|
||||||
"use jinja template for chat (default: disabled)",
|
"use jinja template for chat (default: disabled)",
|
||||||
|
|
|
||||||
|
|
@ -911,7 +911,7 @@ std::string fs_get_cache_file(const std::string & filename) {
|
||||||
return cache_directory + filename;
|
return cache_directory + filename;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<common_file_info> fs_list_files(const std::string & path) {
|
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
|
||||||
std::vector<common_file_info> files;
|
std::vector<common_file_info> files;
|
||||||
if (path.empty()) return files;
|
if (path.empty()) return files;
|
||||||
|
|
||||||
|
|
@ -928,12 +928,20 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
|
||||||
common_file_info info;
|
common_file_info info;
|
||||||
info.path = p.string();
|
info.path = p.string();
|
||||||
info.name = p.filename().string();
|
info.name = p.filename().string();
|
||||||
|
info.is_dir = false;
|
||||||
try {
|
try {
|
||||||
info.size = static_cast<size_t>(std::filesystem::file_size(p));
|
info.size = static_cast<size_t>(std::filesystem::file_size(p));
|
||||||
} catch (const std::filesystem::filesystem_error &) {
|
} catch (const std::filesystem::filesystem_error &) {
|
||||||
info.size = 0;
|
info.size = 0;
|
||||||
}
|
}
|
||||||
files.push_back(std::move(info));
|
files.push_back(std::move(info));
|
||||||
|
} else if (include_directories && std::filesystem::is_directory(p)) {
|
||||||
|
common_file_info info;
|
||||||
|
info.path = p.string();
|
||||||
|
info.name = p.filename().string();
|
||||||
|
info.size = 0; // Directories have no size
|
||||||
|
info.is_dir = true;
|
||||||
|
files.push_back(std::move(info));
|
||||||
}
|
}
|
||||||
} catch (const std::filesystem::filesystem_error &) {
|
} catch (const std::filesystem::filesystem_error &) {
|
||||||
// skip entries we cannot inspect
|
// skip entries we cannot inspect
|
||||||
|
|
|
||||||
|
|
@ -26,8 +26,6 @@
|
||||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
|
||||||
|
|
||||||
struct common_time_meas {
|
struct common_time_meas {
|
||||||
common_time_meas(int64_t & t_acc, bool disable = false);
|
common_time_meas(int64_t & t_acc, bool disable = false);
|
||||||
~common_time_meas();
|
~common_time_meas();
|
||||||
|
|
@ -460,6 +458,10 @@ struct common_params {
|
||||||
bool endpoint_props = false; // only control POST requests, not GET
|
bool endpoint_props = false; // only control POST requests, not GET
|
||||||
bool endpoint_metrics = false;
|
bool endpoint_metrics = false;
|
||||||
|
|
||||||
|
// router server configs
|
||||||
|
std::string models_dir = ""; // directory containing models for the router server
|
||||||
|
int max_models = 4; // maximum number of models to load simultaneously
|
||||||
|
|
||||||
bool log_json = false;
|
bool log_json = false;
|
||||||
|
|
||||||
std::string slot_save_path;
|
std::string slot_save_path;
|
||||||
|
|
@ -623,8 +625,9 @@ struct common_file_info {
|
||||||
std::string path;
|
std::string path;
|
||||||
std::string name;
|
std::string name;
|
||||||
size_t size = 0; // in bytes
|
size_t size = 0; // in bytes
|
||||||
|
bool is_dir = false;
|
||||||
};
|
};
|
||||||
std::vector<common_file_info> fs_list_files(const std::string & path);
|
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Model utils
|
// Model utils
|
||||||
|
|
|
||||||
|
|
@ -1047,7 +1047,7 @@ std::string common_docker_resolve_model(const std::string &) {
|
||||||
std::vector<common_cached_model_info> common_list_cached_models() {
|
std::vector<common_cached_model_info> common_list_cached_models() {
|
||||||
std::vector<common_cached_model_info> models;
|
std::vector<common_cached_model_info> models;
|
||||||
const std::string cache_dir = fs_get_cache_directory();
|
const std::string cache_dir = fs_get_cache_directory();
|
||||||
const std::vector<common_file_info> files = fs_list_files(cache_dir);
|
const std::vector<common_file_info> files = fs_list(cache_dir, false);
|
||||||
for (const auto & file : files) {
|
for (const auto & file : files) {
|
||||||
if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
|
if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
|
||||||
common_cached_model_info model_info;
|
common_cached_model_info model_info;
|
||||||
|
|
|
||||||
|
|
@ -2246,8 +2246,7 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
|
||||||
bool & use_cann_graph,
|
bool & use_cann_graph,
|
||||||
bool & cann_graph_update_required) {
|
bool & cann_graph_update_required) {
|
||||||
#ifdef USE_ACL_GRAPH
|
#ifdef USE_ACL_GRAPH
|
||||||
ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
|
if (use_cann_graph && cann_graph_update_required) { // Begin CANN graph capture
|
||||||
if (use_cann_graph && cann_graph_update_required) {
|
|
||||||
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
|
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
|
||||||
}
|
}
|
||||||
#endif // USE_ACL_GRAPH
|
#endif // USE_ACL_GRAPH
|
||||||
|
|
@ -2271,12 +2270,14 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_ACL_GRAPH
|
#ifdef USE_ACL_GRAPH
|
||||||
if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
|
if (use_cann_graph) {
|
||||||
|
ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
|
||||||
|
|
||||||
|
if (cann_graph_update_required) { // End CANN graph capture
|
||||||
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
|
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (use_cann_graph) {
|
// Execute CANN graph
|
||||||
// Execute graph
|
|
||||||
ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
|
ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
|
||||||
}
|
}
|
||||||
#endif // USE_ACL_GRAPH
|
#endif // USE_ACL_GRAPH
|
||||||
|
|
|
||||||
|
|
@ -224,6 +224,10 @@ static const char * cu_get_error_str(CUresult err) {
|
||||||
#define AMD_MFMA_AVAILABLE
|
#define AMD_MFMA_AVAILABLE
|
||||||
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
|
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
|
||||||
|
|
||||||
|
#if defined(GGML_USE_HIP) && defined(RDNA4)
|
||||||
|
#define AMD_WMMA_AVAILABLE
|
||||||
|
#endif // defined(GGML_USE_HIP) && defined(RDNA4)
|
||||||
|
|
||||||
// The Volta instructions are in principle available on Turing or newer but they are effectively unusable:
|
// The Volta instructions are in principle available on Turing or newer but they are effectively unusable:
|
||||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||||
#define VOLTA_MMA_AVAILABLE
|
#define VOLTA_MMA_AVAILABLE
|
||||||
|
|
@ -283,6 +287,10 @@ static bool amd_mfma_available(const int cc) {
|
||||||
#endif //!defined(GGML_HIP_NO_MMQ_MFMA)
|
#endif //!defined(GGML_HIP_NO_MMQ_MFMA)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool amd_wmma_available(const int cc) {
|
||||||
|
return GGML_CUDA_CC_IS_RDNA4(cc);
|
||||||
|
}
|
||||||
|
|
||||||
static bool volta_mma_available(const int cc) {
|
static bool volta_mma_available(const int cc) {
|
||||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
|
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -39,6 +39,15 @@ template<typename dst_t, typename src_t>
|
||||||
return __float2bfloat16(float(x));
|
return __float2bfloat16(float(x));
|
||||||
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
|
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
|
||||||
return __bfloat162float(x);
|
return __bfloat162float(x);
|
||||||
|
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, half2>) {
|
||||||
|
return __float22half2_rn(x);
|
||||||
|
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, nv_bfloat162>) {
|
||||||
|
// bypass compile error on cuda 12.0.1
|
||||||
|
#ifdef GGML_USE_HIP
|
||||||
|
return __float22bfloat162_rn(x);
|
||||||
|
#else
|
||||||
|
return {x.x, x.y};
|
||||||
|
#endif // GGML_USE_HIP
|
||||||
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
|
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
|
||||||
return int32_t(x);
|
return int32_t(x);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@ -74,6 +74,33 @@ namespace ggml_cuda_mma {
|
||||||
static constexpr int J = J_;
|
static constexpr int J = J_;
|
||||||
|
|
||||||
#if defined(GGML_USE_HIP)
|
#if defined(GGML_USE_HIP)
|
||||||
|
#if defined(RDNA4)
|
||||||
|
static constexpr int ne = I * J / 32;
|
||||||
|
T x[ne] = {0};
|
||||||
|
|
||||||
|
static constexpr __device__ bool supported() {
|
||||||
|
if (I == 16 && J == 16) return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int get_i(const int l) {
|
||||||
|
if constexpr (I == 16 && J == 16) {
|
||||||
|
return 8 * (threadIdx.x / 16) + l;
|
||||||
|
} else {
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int get_j(const int l) {
|
||||||
|
if constexpr (I == 16 && J == 16) {
|
||||||
|
return threadIdx.x % 16;
|
||||||
|
} else {
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
static constexpr int ne = I * J / 64;
|
static constexpr int ne = I * J / 64;
|
||||||
T x[ne] = {0};
|
T x[ne] = {0};
|
||||||
|
|
||||||
|
|
@ -119,6 +146,7 @@ namespace ggml_cuda_mma {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif // defined(RDNA4)
|
||||||
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||||
static constexpr int ne = I * J / 32;
|
static constexpr int ne = I * J / 32;
|
||||||
T x[ne] = {0};
|
T x[ne] = {0};
|
||||||
|
|
@ -236,6 +264,32 @@ namespace ggml_cuda_mma {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#elif defined(AMD_WMMA_AVAILABLE)
|
||||||
|
static constexpr int ne = I * J / 32;
|
||||||
|
half2 x[ne] = {{0.0f, 0.0f}};
|
||||||
|
|
||||||
|
static constexpr __device__ bool supported() {
|
||||||
|
if (I == 16 && J == 8) return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int get_i(const int l) {
|
||||||
|
if constexpr (I == 16 && J == 8) {
|
||||||
|
return threadIdx.x % 16;
|
||||||
|
} else {
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int get_j(const int l) {
|
||||||
|
if constexpr (I == 16 && J == 8) {
|
||||||
|
return 4 * (threadIdx.x / 16) + l;
|
||||||
|
} else {
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
static constexpr int ne = I * J / WARP_SIZE;
|
static constexpr int ne = I * J / WARP_SIZE;
|
||||||
half2 x[ne] = {{0.0f, 0.0f}};
|
half2 x[ne] = {{0.0f, 0.0f}};
|
||||||
|
|
@ -285,6 +339,34 @@ namespace ggml_cuda_mma {
|
||||||
struct tile<I_, J_, nv_bfloat162> {
|
struct tile<I_, J_, nv_bfloat162> {
|
||||||
static constexpr int I = I_;
|
static constexpr int I = I_;
|
||||||
static constexpr int J = J_;
|
static constexpr int J = J_;
|
||||||
|
|
||||||
|
#if defined(AMD_WMMA_AVAILABLE)
|
||||||
|
static constexpr int ne = I * J / 32;
|
||||||
|
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||||
|
|
||||||
|
static constexpr __device__ bool supported() {
|
||||||
|
if (I == 16 && J == 8) return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int get_i(const int l) {
|
||||||
|
if constexpr (I == 16 && J == 8) {
|
||||||
|
return threadIdx.x % 16;
|
||||||
|
} else {
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int get_j(const int l) {
|
||||||
|
if constexpr (I == 16 && J == 8) {
|
||||||
|
return 4 * (threadIdx.x / 16) + l;
|
||||||
|
} else {
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
static constexpr int ne = I * J / WARP_SIZE;
|
static constexpr int ne = I * J / WARP_SIZE;
|
||||||
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||||
|
|
||||||
|
|
@ -320,6 +402,7 @@ namespace ggml_cuda_mma {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif // defined(AMD_WMMA_AVAILABLE)
|
||||||
};
|
};
|
||||||
|
|
||||||
template <int I, int J>
|
template <int I, int J>
|
||||||
|
|
@ -353,6 +436,8 @@ namespace ggml_cuda_mma {
|
||||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
|
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
|
||||||
xi[0] = xs[0];
|
xi[0] = xs[0];
|
||||||
}
|
}
|
||||||
|
#elif defined(AMD_WMMA_AVAILABLE)
|
||||||
|
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||||
#else
|
#else
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int l = 0; l < t.ne; ++l) {
|
for (int l = 0; l < t.ne; ++l) {
|
||||||
|
|
@ -639,12 +724,34 @@ namespace ggml_cuda_mma {
|
||||||
: "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
|
: "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
|
||||||
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
|
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
|
||||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||||
|
#elif defined(AMD_WMMA_AVAILABLE)
|
||||||
|
using halfx8_t = __attribute__((ext_vector_type(8))) _Float16;
|
||||||
|
using floatx8_t = __attribute__((ext_vector_type(8))) float;
|
||||||
|
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
|
||||||
|
const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
|
||||||
|
const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
|
||||||
|
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
|
||||||
#else
|
#else
|
||||||
GGML_UNUSED_VARS(D, A, B);
|
GGML_UNUSED_VARS(D, A, B);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif // TURING_MMA_AVAILABLE
|
#endif // TURING_MMA_AVAILABLE
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ void mma(
|
||||||
|
tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) {
|
||||||
|
#if defined(AMD_WMMA_AVAILABLE)
|
||||||
|
using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
|
||||||
|
using floatx8_t = __attribute__((ext_vector_type(8))) float;
|
||||||
|
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
|
||||||
|
const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
|
||||||
|
const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
|
||||||
|
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
|
||||||
|
#else
|
||||||
|
GGML_UNUSED_VARS(D, A, B);
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
#endif // AMPERE_MMA_AVAILABLE
|
||||||
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ void mma(
|
static __device__ __forceinline__ void mma(
|
||||||
tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
|
tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
|
||||||
#if defined(AMD_MFMA_AVAILABLE)
|
#if defined(AMD_MFMA_AVAILABLE)
|
||||||
|
|
|
||||||
|
|
@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (src1_ncols > 16) {
|
if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -160,9 +160,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
return ampere_mma_available(cc);
|
return ampere_mma_available(cc);
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
return volta_mma_available(cc) || turing_mma_available(cc);
|
return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
|
||||||
case GGML_TYPE_BF16:
|
case GGML_TYPE_BF16:
|
||||||
return ampere_mma_available(cc);
|
return ampere_mma_available(cc) || amd_wmma_available(cc);
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
#include "mma.cuh"
|
#include "mma.cuh"
|
||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
|
#include "convert.cuh"
|
||||||
|
|
||||||
using namespace ggml_cuda_mma;
|
using namespace ggml_cuda_mma;
|
||||||
|
|
||||||
|
|
@ -27,20 +28,35 @@ static __global__ void mul_mat_f(
|
||||||
const int stride_col_id, const int stride_row_id,
|
const int stride_col_id, const int stride_row_id,
|
||||||
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||||
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added
|
||||||
|
#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
|
||||||
|
#if defined(AMD_WMMA_AVAILABLE)
|
||||||
|
// Special case for tf32, just dummy mma layout as wmma doesn't support it.
|
||||||
|
constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16;
|
||||||
|
constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16;
|
||||||
|
typedef tile<16, 8, T> tile_A;
|
||||||
|
typedef tile<tile_B_I, 8, T> tile_B;
|
||||||
|
typedef tile<16, tile_C_J, float> tile_C;
|
||||||
|
|
||||||
|
constexpr bool a_supported = tile_A::supported();
|
||||||
|
constexpr bool b_supported = tile_B::supported();
|
||||||
|
constexpr bool c_supported = tile_C::supported();
|
||||||
|
constexpr bool supported = a_supported && b_supported && c_supported;
|
||||||
|
#else
|
||||||
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
|
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
|
||||||
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
|
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
|
||||||
|
constexpr bool supported = I_16_supported || I_32_supported;
|
||||||
if (!I_16_supported && !I_32_supported) {
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
|
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
|
||||||
|
|
||||||
typedef tile<I_preferred, 8, T> tile_A;
|
typedef tile<I_preferred, 8, T> tile_A;
|
||||||
typedef tile<8, 8, T> tile_B;
|
typedef tile<8, 8, T> tile_B;
|
||||||
typedef tile<I_preferred, 8, float> tile_C;
|
typedef tile<I_preferred, 8, float> tile_C;
|
||||||
|
#endif // defined(AMD_WMMA_AVAILABLE)
|
||||||
|
if constexpr (!supported) {
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||||
constexpr int tile_k_padded = warp_size + 4;
|
constexpr int tile_k_padded = warp_size + 4;
|
||||||
|
|
@ -161,11 +177,11 @@ static __global__ void mul_mat_f(
|
||||||
|
|
||||||
if constexpr (!has_ids) {
|
if constexpr (!has_ids) {
|
||||||
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
|
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
|
||||||
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
|
tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast<T>(tmp);
|
||||||
} else {
|
} else {
|
||||||
const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0;
|
const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0;
|
||||||
float2 tmp = valid ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
|
float2 tmp = valid ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
|
||||||
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
|
tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast<T>(tmp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -239,7 +255,7 @@ static __global__ void mul_mat_f(
|
||||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
|
||||||
}
|
}
|
||||||
|
|
||||||
//This kernel is for larger batch sizes of mul_mat_id
|
//This kernel is for larger batch sizes of mul_mat_id
|
||||||
|
|
@ -253,20 +269,35 @@ static __global__ void mul_mat_f_ids(
|
||||||
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||||
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||||
const uint3 sis1_fd, const uint3 nch_fd) {
|
const uint3 sis1_fd, const uint3 nch_fd) {
|
||||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added
|
||||||
|
#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
|
||||||
|
#if defined(AMD_WMMA_AVAILABLE)
|
||||||
|
// Special case for tf32, just dummy mma layout as wmma doesn't support it.
|
||||||
|
constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16;
|
||||||
|
constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16;
|
||||||
|
typedef tile<16, 8, T> tile_A;
|
||||||
|
typedef tile<tile_B_I, 8, T> tile_B;
|
||||||
|
typedef tile<16, tile_C_J, float> tile_C;
|
||||||
|
|
||||||
|
constexpr bool a_supported = tile_A::supported();
|
||||||
|
constexpr bool b_supported = tile_B::supported();
|
||||||
|
constexpr bool c_supported = tile_C::supported();
|
||||||
|
constexpr bool supported = a_supported && b_supported && c_supported;
|
||||||
|
#else
|
||||||
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
|
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
|
||||||
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
|
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
|
||||||
|
constexpr bool supported = I_16_supported || I_32_supported;
|
||||||
|
|
||||||
if (!I_16_supported && !I_32_supported) {
|
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
|
||||||
NO_DEVICE_CODE;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work butr 16 is ~1% faster.
|
|
||||||
|
|
||||||
typedef tile<I_preferred, 8, T> tile_A;
|
typedef tile<I_preferred, 8, T> tile_A;
|
||||||
typedef tile<8, 8, T> tile_B;
|
typedef tile<8, 8, T> tile_B;
|
||||||
typedef tile<I_preferred, 8, float> tile_C;
|
typedef tile<I_preferred, 8, float> tile_C;
|
||||||
|
#endif // defined(AMD_WMMA_AVAILABLE)
|
||||||
|
if constexpr (!supported) {
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||||
constexpr int tile_k_padded = warp_size + 4;
|
constexpr int tile_k_padded = warp_size + 4;
|
||||||
|
|
@ -408,7 +439,7 @@ static __global__ void mul_mat_f_ids(
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j0 = 0; j0 < tile_B::I; ++j0) {
|
for (int j0 = 0; j0 < tile_B::I; ++j0) {
|
||||||
const float2 tmp = vals_buf[curr_buf][j0];
|
const float2 tmp = vals_buf[curr_buf][j0];
|
||||||
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
|
tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast<T>(tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (itB + 1 < ntB) {
|
if (itB + 1 < ntB) {
|
||||||
|
|
@ -492,7 +523,7 @@ static __global__ void mul_mat_f_ids(
|
||||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, sis1_fd, nch_fd);
|
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, sis1_fd, nch_fd);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T, int cols_per_block, int nwarps>
|
template<typename T, int cols_per_block, int nwarps>
|
||||||
|
|
@ -554,7 +585,8 @@ void mul_mat_f_cuda(
|
||||||
cudaStream_t stream, const mmf_ids_data * ids_data) {
|
cudaStream_t stream, const mmf_ids_data * ids_data) {
|
||||||
typedef tile<16, 8, T> tile_A_16;
|
typedef tile<16, 8, T> tile_A_16;
|
||||||
typedef tile<32, 8, T> tile_A_32;
|
typedef tile<32, 8, T> tile_A_32;
|
||||||
typedef tile< 8, 8, T> tile_B;
|
typedef tile<16, 8, T> tile_B_16;
|
||||||
|
typedef tile< 8, 8, T> tile_B_8;
|
||||||
|
|
||||||
GGML_ASSERT(ncols_x % 2 == 0);
|
GGML_ASSERT(ncols_x % 2 == 0);
|
||||||
GGML_ASSERT(stride_row % 2 == 0);
|
GGML_ASSERT(stride_row % 2 == 0);
|
||||||
|
|
@ -581,7 +613,8 @@ void mul_mat_f_cuda(
|
||||||
|
|
||||||
constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
|
constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
|
||||||
const int nbytes_shared_iter = nwarps_best * (volta_mma_available(cc) ? tile_A_32::I : tile_A_16::I) * (warp_size + 4) * 4;
|
const int nbytes_shared_iter = nwarps_best * (volta_mma_available(cc) ? tile_A_32::I : tile_A_16::I) * (warp_size + 4) * 4;
|
||||||
const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4;
|
const int nbytes_cols_per_block_pad = amd_wmma_available(cc) ? tile_B_16::I : tile_B_8::I;
|
||||||
|
const int nbytes_shared_combine = GGML_PAD(cols_per_block, nbytes_cols_per_block_pad) * (nwarps_best*rows_per_block + 4) * 4;
|
||||||
const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
|
const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
|
||||||
const int nbytes_slotmap = ids ? GGML_PAD(cols_per_block, 16) * sizeof(int) : 0;
|
const int nbytes_slotmap = ids ? GGML_PAD(cols_per_block, 16) * sizeof(int) : 0;
|
||||||
const int nbytes_shared_total = nbytes_shared + nbytes_slotmap;
|
const int nbytes_shared_total = nbytes_shared + nbytes_slotmap;
|
||||||
|
|
|
||||||
|
|
@ -106,33 +106,32 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
|
||||||
t1 = HAP_perf_get_qtimer_count();
|
t1 = HAP_perf_get_qtimer_count();
|
||||||
|
|
||||||
int is_aligned = 1;
|
int is_aligned = 1;
|
||||||
int opt_path = 0;
|
|
||||||
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
|
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
|
||||||
is_aligned = 0;
|
is_aligned = 0;
|
||||||
FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
|
FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
|
||||||
}
|
}
|
||||||
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
|
|
||||||
opt_path = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
|
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
|
||||||
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
|
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
|
||||||
uint8_t * restrict data_dst = (uint8_t *) dst->data;
|
uint8_t * restrict data_dst = (uint8_t *) dst->data;
|
||||||
|
|
||||||
bool src1_valid = src1->ne[0];
|
const bool src1_valid = src1->ne[0];
|
||||||
|
const int nc = (src1_valid) ? ne00 : ne00 / 2;
|
||||||
if (!src1_valid) {
|
if (!src1_valid) {
|
||||||
|
const int32_t swapped = op_params[1];
|
||||||
data_src1 = data_src0;
|
data_src1 = data_src0;
|
||||||
src1_row_size = src0_row_size;
|
src1_row_size = src0_row_size;
|
||||||
|
|
||||||
|
const size_t nc_in_bytes = nc * SIZEOF_FP32;
|
||||||
|
data_src0 += swapped ? nc_in_bytes : 0;
|
||||||
|
data_src1 += swapped ? 0 : nc_in_bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
|
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
|
||||||
uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size);
|
uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size);
|
||||||
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);
|
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);
|
||||||
|
|
||||||
const int32_t swapped = op_params[1];
|
const bool opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1)));
|
||||||
|
|
||||||
const int nc = (src1_valid) ? ne0 : ne0 / 2;
|
|
||||||
|
|
||||||
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
|
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
|
||||||
const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
|
const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
|
||||||
const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size));
|
const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size));
|
||||||
|
|
@ -142,12 +141,7 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
|
||||||
htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
|
htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!src1_valid) {
|
if (opt_path) {
|
||||||
src0 += swapped ? nc : 0;
|
|
||||||
src1 += swapped ? 0 : nc;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (1 == opt_path) {
|
|
||||||
hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc);
|
hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc);
|
||||||
hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1,
|
hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1,
|
||||||
(uint8_t *) dst, nc);
|
(uint8_t *) dst, nc);
|
||||||
|
|
@ -218,7 +212,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
|
||||||
const float alpha = ((const float *) (op_params))[2];
|
const float alpha = ((const float *) (op_params))[2];
|
||||||
const float limit = ((const float *) (op_params))[3];
|
const float limit = ((const float *) (op_params))[3];
|
||||||
|
|
||||||
const int nc = (src1_valid) ? ne0 : ne0 / 2;
|
const int nc = (src1_valid) ? ne00 : ne00 / 2;
|
||||||
|
|
||||||
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
|
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
|
||||||
const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
|
const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,19 @@
|
||||||
#include "hvx-utils.h"
|
#include "hvx-utils.h"
|
||||||
#include "ops-utils.h"
|
#include "ops-utils.h"
|
||||||
|
|
||||||
|
static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) {
|
||||||
|
static const float kInf = INFINITY;
|
||||||
|
static const float kMaxExp = 88.02f; // log(INF)
|
||||||
|
|
||||||
|
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
|
||||||
|
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
|
||||||
|
const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
|
||||||
|
|
||||||
|
HVX_Vector out = hvx_vec_exp_fp32(in_vec);
|
||||||
|
|
||||||
|
return Q6_V_vmux_QVV(pred0, inf, out);
|
||||||
|
}
|
||||||
|
|
||||||
void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) {
|
void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) {
|
||||||
int left_over = num_elems & (VLEN_FP32 - 1);
|
int left_over = num_elems & (VLEN_FP32 - 1);
|
||||||
int num_elems_whole = num_elems - left_over;
|
int num_elems_whole = num_elems - left_over;
|
||||||
|
|
@ -42,9 +55,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
|
||||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||||
if (true == negate) {
|
if (true == negate) {
|
||||||
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
|
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
|
||||||
*p_vec_out++ = hvx_vec_exp_fp32(neg_vec_in);
|
*p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in);
|
||||||
} else {
|
} else {
|
||||||
*p_vec_out++ = hvx_vec_exp_fp32(*p_vec_in1++);
|
*p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -54,9 +67,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
|
||||||
|
|
||||||
if (true == negate) {
|
if (true == negate) {
|
||||||
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
|
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
|
||||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(neg_vec_in);
|
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in);
|
||||||
} else {
|
} else {
|
||||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(in);
|
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -70,9 +83,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
|
||||||
if (true == negate) {
|
if (true == negate) {
|
||||||
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
|
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
|
||||||
|
|
||||||
vec_out = hvx_vec_exp_fp32(neg_vec_in);
|
vec_out = hvx_vec_exp_fp32_guard(neg_vec_in);
|
||||||
} else {
|
} else {
|
||||||
vec_out = hvx_vec_exp_fp32(in);
|
vec_out = hvx_vec_exp_fp32_guard(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
|
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
|
||||||
|
|
|
||||||
|
|
@ -38,13 +38,13 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
|
||||||
|
|
||||||
#pragma unroll(4)
|
#pragma unroll(4)
|
||||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||||
*p_vec_out++ = hvx_vec_inverse_fp32(*p_vec_in++);
|
*p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#pragma unroll(4)
|
#pragma unroll(4)
|
||||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||||
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
|
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
|
||||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32(in);
|
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -53,7 +53,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
|
||||||
float * dstf = (float *) dst + num_elems_whole;
|
float * dstf = (float *) dst + num_elems_whole;
|
||||||
|
|
||||||
HVX_Vector in = *(HVX_UVector *) srcf;
|
HVX_Vector in = *(HVX_UVector *) srcf;
|
||||||
HVX_Vector out = hvx_vec_inverse_fp32(in);
|
HVX_Vector out = hvx_vec_inverse_fp32_guard(in);
|
||||||
|
|
||||||
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
|
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -401,6 +401,8 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
|
||||||
FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
|
FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const float kInf = INFINITY;
|
||||||
|
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
|
||||||
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
|
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
|
||||||
|
|
||||||
if (0 == unaligned_loop) {
|
if (0 == unaligned_loop) {
|
||||||
|
|
@ -409,17 +411,24 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
|
||||||
|
|
||||||
#pragma unroll(4)
|
#pragma unroll(4)
|
||||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||||
HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, val_vec);
|
HVX_Vector in = *vec_in1++;
|
||||||
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
|
const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);
|
||||||
|
HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
|
||||||
|
v = Q6_Vsf_equals_Vqf32(v);
|
||||||
|
v = Q6_V_vmux_QVV(pred_inf, inf, v);
|
||||||
|
*vec_out++ = v;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#pragma unroll(4)
|
#pragma unroll(4)
|
||||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||||
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
|
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
|
||||||
|
|
||||||
|
const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);
|
||||||
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
|
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
|
||||||
|
out = Q6_Vsf_equals_Vqf32(out);
|
||||||
|
out = Q6_V_vmux_QVV(pred_inf, inf, out);
|
||||||
|
|
||||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
|
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = out;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -429,8 +438,12 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
|
||||||
|
|
||||||
HVX_Vector in = *(HVX_UVector *) srcf;
|
HVX_Vector in = *(HVX_UVector *) srcf;
|
||||||
|
|
||||||
|
const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);
|
||||||
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
|
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
|
||||||
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
|
out = Q6_Vsf_equals_Vqf32(out);
|
||||||
|
out = Q6_V_vmux_QVV(pred_inf, inf, out);
|
||||||
|
|
||||||
|
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,15 @@
|
||||||
#define VLEN_FP32 (VLEN / SIZEOF_FP32)
|
#define VLEN_FP32 (VLEN / SIZEOF_FP32)
|
||||||
#define VLEN_FP16 (VLEN / SIZEOF_FP16)
|
#define VLEN_FP16 (VLEN / SIZEOF_FP16)
|
||||||
|
|
||||||
|
typedef union {
|
||||||
|
HVX_Vector v;
|
||||||
|
uint8_t b[VLEN];
|
||||||
|
uint16_t h[VLEN_FP16];
|
||||||
|
uint32_t w[VLEN_FP32];
|
||||||
|
__fp16 fp16[VLEN_FP16];
|
||||||
|
float fp32[VLEN_FP32];
|
||||||
|
} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias;
|
||||||
|
|
||||||
static inline HVX_Vector hvx_vec_splat_fp32(float i) {
|
static inline HVX_Vector hvx_vec_splat_fp32(float i) {
|
||||||
union {
|
union {
|
||||||
float f;
|
float f;
|
||||||
|
|
@ -243,19 +252,16 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3
|
||||||
}
|
}
|
||||||
|
|
||||||
static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
|
static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
|
||||||
union {
|
HVX_VectorAlias u = { .v = v };
|
||||||
HVX_Vector v;
|
|
||||||
__fp16 d[64];
|
|
||||||
} u = { .v = v };
|
|
||||||
|
|
||||||
const uint32_t n0 = n / 16;
|
const uint32_t n0 = n / 16;
|
||||||
const uint32_t n1 = n % 16;
|
const uint32_t n1 = n % 16;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (; i < n0; i++) {
|
for (; i < n0; i++) {
|
||||||
htp_dump_fp16_line(pref, u.d + (16 * i), 16);
|
htp_dump_fp16_line(pref, u.fp16 + (16 * i), 16);
|
||||||
}
|
}
|
||||||
if (n1) {
|
if (n1) {
|
||||||
htp_dump_fp16_line(pref, u.d + (16 * i), n1);
|
htp_dump_fp16_line(pref, u.fp16 + (16 * i), n1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -491,7 +497,7 @@ static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) {
|
||||||
static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) {
|
static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) {
|
||||||
// neg by setting the fp16 sign bit
|
// neg by setting the fp16 sign bit
|
||||||
HVX_Vector mask = Q6_Vh_vsplat_R(0x8000);
|
HVX_Vector mask = Q6_Vh_vsplat_R(0x8000);
|
||||||
return Q6_V_vor_VV(v, mask);
|
return Q6_V_vxor_VV(v, mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) {
|
static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) {
|
||||||
|
|
@ -506,7 +512,7 @@ static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) {
|
||||||
#else
|
#else
|
||||||
// neg by setting the fp32 sign bit
|
// neg by setting the fp32 sign bit
|
||||||
HVX_Vector mask = Q6_V_vsplat_R(0x80000000);
|
HVX_Vector mask = Q6_V_vsplat_R(0x80000000);
|
||||||
return Q6_V_vor_VV(v, mask);
|
return Q6_V_vxor_VV(v, mask);
|
||||||
#endif // __HTP_ARCH__ > 75
|
#endif // __HTP_ARCH__ > 75
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -720,6 +726,24 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
|
||||||
return Q6_Vsf_equals_Vqf32(r_qf);
|
return Q6_Vsf_equals_Vqf32(r_qf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
|
||||||
|
static const float kInf = INFINITY;
|
||||||
|
static const uint32_t kNanMask = 0x7fffffff;
|
||||||
|
static const uint32_t kNanMin = 0x7f800000;
|
||||||
|
|
||||||
|
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
|
||||||
|
const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
|
||||||
|
|
||||||
|
HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
|
||||||
|
|
||||||
|
const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask);
|
||||||
|
const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin);
|
||||||
|
HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask);
|
||||||
|
const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out);
|
||||||
|
|
||||||
|
return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
|
||||||
|
}
|
||||||
|
|
||||||
#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
|
#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
|
||||||
#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777
|
#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777
|
||||||
#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267
|
#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267
|
||||||
|
|
@ -934,6 +958,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
|
||||||
return Q6_Vsf_equals_Vqf32(temp);
|
return Q6_Vsf_equals_Vqf32(temp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
|
||||||
|
static const float kMaxExp = -88.02f; // log(INF)
|
||||||
|
|
||||||
|
const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
|
||||||
|
const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
|
||||||
|
|
||||||
|
HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
|
||||||
|
return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
|
||||||
|
}
|
||||||
|
|
||||||
static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
|
static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
|
||||||
int step_of_1 = num_elems >> 5;
|
int step_of_1 = num_elems >> 5;
|
||||||
int remaining = num_elems - step_of_1 * VLEN_FP32;
|
int remaining = num_elems - step_of_1 * VLEN_FP32;
|
||||||
|
|
@ -945,7 +979,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
|
||||||
|
|
||||||
#pragma unroll(4)
|
#pragma unroll(4)
|
||||||
for (int i = 0; i < step_of_1; i++) {
|
for (int i = 0; i < step_of_1; i++) {
|
||||||
v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]);
|
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6895,10 +6895,24 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||||
cl_context context = backend_ctx->context;
|
cl_context context = backend_ctx->context;
|
||||||
|
|
||||||
if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
|
if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
|
||||||
if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0){
|
if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) {
|
||||||
|
// For KQ
|
||||||
|
if (ggml_is_permuted(src0) && ggml_is_permuted(src1) &&
|
||||||
|
nb00 <= nb02 &&
|
||||||
|
nb02 <= nb01 &&
|
||||||
|
nb01 <= nb03 &&
|
||||||
|
nb10 <= nb12 &&
|
||||||
|
nb12 <= nb11 &&
|
||||||
|
nb11 <= nb13) {
|
||||||
ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
|
ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
// For KQV
|
||||||
|
if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||||
|
ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) {
|
if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) {
|
||||||
|
|
|
||||||
|
|
@ -513,6 +513,7 @@ struct vk_device_struct {
|
||||||
vk_queue compute_queue;
|
vk_queue compute_queue;
|
||||||
vk_queue transfer_queue;
|
vk_queue transfer_queue;
|
||||||
bool single_queue;
|
bool single_queue;
|
||||||
|
bool support_async;
|
||||||
uint32_t subgroup_size;
|
uint32_t subgroup_size;
|
||||||
uint32_t shader_core_count;
|
uint32_t shader_core_count;
|
||||||
bool uma;
|
bool uma;
|
||||||
|
|
@ -4273,6 +4274,16 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
device->vendor_id = device->properties.vendorID;
|
device->vendor_id = device->properties.vendorID;
|
||||||
device->driver_id = driver_props.driverID;
|
device->driver_id = driver_props.driverID;
|
||||||
|
|
||||||
|
// Implementing the async backend interfaces seems broken on older Intel HW,
|
||||||
|
// see https://github.com/ggml-org/llama.cpp/issues/17302.
|
||||||
|
device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL ||
|
||||||
|
std::string(device->properties.deviceName.data()).find("(DG1)") == std::string::npos) &&
|
||||||
|
getenv("GGML_VK_DISABLE_ASYNC") == nullptr;
|
||||||
|
|
||||||
|
if (!device->support_async) {
|
||||||
|
GGML_LOG_DEBUG("ggml_vulkan: WARNING: Async execution disabled on certain Intel devices.\n");
|
||||||
|
}
|
||||||
|
|
||||||
const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
|
const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
|
||||||
|
|
||||||
if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) {
|
if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) {
|
||||||
|
|
@ -13187,6 +13198,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
ctx->device->perf_logger->print_timings();
|
ctx->device->perf_logger->print_timings();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!ctx->device->support_async) {
|
||||||
|
ggml_vk_synchronize(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
return GGML_STATUS_SUCCESS;
|
return GGML_STATUS_SUCCESS;
|
||||||
|
|
||||||
UNUSED(backend);
|
UNUSED(backend);
|
||||||
|
|
@ -13480,6 +13495,10 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
||||||
/* .context = */ ctx,
|
/* .context = */ ctx,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (!ctx->device->support_async) {
|
||||||
|
vk_backend->iface.get_tensor_async = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
return vk_backend;
|
return vk_backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1593,7 +1593,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_DEEPSEEK2:
|
case LLM_ARCH_DEEPSEEK2:
|
||||||
{
|
{
|
||||||
bool is_lite = (hparams.n_layer == 27);
|
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
|
||||||
|
bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||||
if (!is_lite) {
|
if (!is_lite) {
|
||||||
|
|
@ -4581,7 +4582,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_DEEPSEEK2:
|
case LLM_ARCH_DEEPSEEK2:
|
||||||
{
|
{
|
||||||
const bool is_lite = (hparams.n_layer == 27);
|
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
|
||||||
|
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
|
||||||
|
|
||||||
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
|
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,8 @@
|
||||||
|
|
||||||
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
|
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
|
||||||
llm_graph_context(params) {
|
llm_graph_context(params) {
|
||||||
bool is_lite = (hparams.n_layer == 27);
|
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
|
||||||
|
bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
|
||||||
|
|
||||||
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
|
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7819,6 +7819,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||||
for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) {
|
for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) {
|
||||||
for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) {
|
for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) {
|
||||||
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
||||||
|
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 128, 8, false, 768, bs, 2048));
|
||||||
test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 128, 8, false, 768, bs, 2048, 1));
|
test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 128, 8, false, 768, bs, 2048, 1));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -7827,6 +7828,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||||
for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) {
|
for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) {
|
||||||
for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) {
|
for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) {
|
||||||
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
||||||
|
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 1792, bs, 2048));
|
||||||
test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 1792, bs, 2048, 1));
|
test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 1792, bs, 2048, 1));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -7837,6 +7839,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||||
for (int bs : {1, 4, 8, 512}) {
|
for (int bs : {1, 4, 8, 512}) {
|
||||||
for (ggml_type type_a : {GGML_TYPE_MXFP4}) {
|
for (ggml_type type_a : {GGML_TYPE_MXFP4}) {
|
||||||
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
||||||
|
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 2880, bs, 2880));
|
||||||
test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 2880, bs, 2880, 1));
|
test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 2880, bs, 2880, 1));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -23,7 +23,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
struct quantize_stats_params {
|
struct quantize_stats_params {
|
||||||
std::string model = DEFAULT_MODEL_PATH;
|
std::string model = "models/7B/ggml-model-f16.gguf";
|
||||||
bool verbose = false;
|
bool verbose = false;
|
||||||
bool per_layer_stats = false;
|
bool per_layer_stats = false;
|
||||||
bool print_histogram = false;
|
bool print_histogram = false;
|
||||||
|
|
|
||||||
|
|
@ -1364,18 +1364,32 @@ llama-server -hf <user>/<model>:<tag>
|
||||||
|
|
||||||
*The server must be restarted after adding a new model.*
|
*The server must be restarted after adding a new model.*
|
||||||
|
|
||||||
Alternatively, you can point the router to a local directory containing your GGUF files using `--models-dir`. Files prefixed with `mmproj-` will automatically be treated as multimodal projection files **for the model with the matching base name**:
|
Alternatively, you can point the router to a local directory containing your GGUF files using `--models-dir`. Example command:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
llama-3.2-1b-Q4_K_M.gguf
|
llama-server --models-dir ./models_directory
|
||||||
gemma-3-4b-it-Q8_0.gguf
|
|
||||||
mmproj-gemma-3-4b-it-Q8_0.gguf # must be "mmproj-" + text model filename
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Example:
|
If the model contains multiple GGUF (for multimodal or multi-shard), files should be put into a subdirectory. The directory structure should look like this:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
llama-server --models-dir ./path/to/models
|
models_directory
|
||||||
|
│
|
||||||
|
│ # single file
|
||||||
|
├─ llama-3.2-1b-Q4_K_M.gguf
|
||||||
|
├─ Qwen3-8B-Q4_K_M.gguf
|
||||||
|
│
|
||||||
|
│ # multimodal
|
||||||
|
├─ gemma-3-4b-it-Q8_0
|
||||||
|
│ ├─ gemma-3-4b-it-Q8_0.gguf
|
||||||
|
│ └─ mmproj-F16.gguf # file name must start with "mmproj"
|
||||||
|
│
|
||||||
|
│ # multi-shard
|
||||||
|
├─ gemma-3-4b-it-Q8_0
|
||||||
|
│ ├─ Kimi-K2-Thinking-UD-IQ1_S-00001-of-00006.gguf
|
||||||
|
│ ├─ Kimi-K2-Thinking-UD-IQ1_S-00002-of-00006.gguf
|
||||||
|
│ ├─ ...
|
||||||
|
│ └─ Kimi-K2-Thinking-UD-IQ1_S-00006-of-00006.gguf
|
||||||
```
|
```
|
||||||
|
|
||||||
You may also specify default arguments that will be passed to every loaded model instance:
|
You may also specify default arguments that will be passed to every loaded model instance:
|
||||||
|
|
@ -1384,6 +1398,8 @@ You may also specify default arguments that will be passed to every loaded model
|
||||||
llama-server -ctx 8192 -n 1024 -np 2
|
llama-server -ctx 8192 -n 1024 -np 2
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Note: model instances inherit both command line arguments and environment variables from the router server.
|
||||||
|
|
||||||
### Routing requests
|
### Routing requests
|
||||||
|
|
||||||
Requests are routed according to the requested model name.
|
Requests are routed according to the requested model name.
|
||||||
|
|
@ -1393,7 +1409,12 @@ For **POST** endpoints (`/v1/chat/completions`, `/v1/completions`, `/infill`, et
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
|
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
|
||||||
...
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "hello"
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
@ -1405,15 +1426,92 @@ GET /props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M
|
||||||
|
|
||||||
### GET `/models`: List available models
|
### GET `/models`: List available models
|
||||||
|
|
||||||
TODO
|
Listing all models in cache. The model metadata will also include a field to indicate the status of the model:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"data": [{
|
||||||
|
"name": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
|
||||||
|
"id": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
|
||||||
|
"in_cache": true,
|
||||||
|
"path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf",
|
||||||
|
"status": {
|
||||||
|
"value": "loaded"
|
||||||
|
},
|
||||||
|
...
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
|
||||||
|
|
||||||
|
The `status` object can be:
|
||||||
|
|
||||||
|
```json
|
||||||
|
"status": {
|
||||||
|
"value": "unloaded"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```json
|
||||||
|
"status": {
|
||||||
|
"value": "loading"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```json
|
||||||
|
"status": {
|
||||||
|
"value": "failed"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```json
|
||||||
|
"status": {
|
||||||
|
"value": "loaded"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
### POST `/models/load`: Load a model
|
### POST `/models/load`: Load a model
|
||||||
|
|
||||||
TODO
|
|
||||||
|
Load a model
|
||||||
|
|
||||||
|
Payload:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Response:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"success": true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
### POST `/models/unload`: Unload a model
|
### POST `/models/unload`: Unload a model
|
||||||
|
|
||||||
TODO
|
|
||||||
|
Unload a model
|
||||||
|
|
||||||
|
Payload:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Response:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"success": true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## More examples
|
## More examples
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
|
#include <unordered_set>
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#include <winsock2.h>
|
#include <winsock2.h>
|
||||||
|
|
@ -57,7 +58,7 @@ static std::filesystem::path get_server_exec_path() {
|
||||||
return std::filesystem::path(buf.data());
|
return std::filesystem::path(buf.data());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return std::filesystem::path(std::string(buf.data(), (size > 0) ? size : 0));
|
throw std::runtime_error("_NSGetExecutablePath failed after buffer resize");
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
char path[FILENAME_MAX];
|
char path[FILENAME_MAX];
|
||||||
|
|
@ -69,6 +70,64 @@ static std::filesystem::path get_server_exec_path() {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct local_model {
|
||||||
|
std::string name;
|
||||||
|
std::string path;
|
||||||
|
std::string path_mmproj;
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::vector<local_model> list_local_models(const std::string & dir) {
|
||||||
|
if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
|
||||||
|
throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str()));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<local_model> models;
|
||||||
|
auto scan_subdir = [&models](const std::string & subdir_path, const std::string name) {
|
||||||
|
auto files = fs_list(subdir_path, false);
|
||||||
|
common_file_info model_file;
|
||||||
|
common_file_info first_shard_file;
|
||||||
|
common_file_info mmproj_file;
|
||||||
|
for (const auto & file : files) {
|
||||||
|
if (string_ends_with(file.name, ".gguf")) {
|
||||||
|
if (file.name.find("mmproj") != std::string::npos) {
|
||||||
|
mmproj_file = file;
|
||||||
|
} else if (file.name.find("-00001-of-") != std::string::npos) {
|
||||||
|
first_shard_file = file;
|
||||||
|
} else {
|
||||||
|
model_file = file;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// single file model
|
||||||
|
local_model model{
|
||||||
|
/* name */ name,
|
||||||
|
/* path */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
|
||||||
|
/* path_mmproj */ mmproj_file.path // can be empty
|
||||||
|
};
|
||||||
|
if (!model.path.empty()) {
|
||||||
|
models.push_back(model);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
auto files = fs_list(dir, true);
|
||||||
|
for (const auto & file : files) {
|
||||||
|
if (file.is_dir) {
|
||||||
|
scan_subdir(file.path, file.name);
|
||||||
|
} else if (string_ends_with(file.name, ".gguf")) {
|
||||||
|
// single file model
|
||||||
|
std::string name = file.name;
|
||||||
|
string_replace_all(name, ".gguf", "");
|
||||||
|
local_model model{
|
||||||
|
/* name */ name,
|
||||||
|
/* path */ file.path,
|
||||||
|
/* path_mmproj */ ""
|
||||||
|
};
|
||||||
|
models.push_back(model);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return models;
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// server_models
|
// server_models
|
||||||
//
|
//
|
||||||
|
|
@ -85,15 +144,17 @@ server_models::server_models(
|
||||||
base_env.push_back(std::string(*env));
|
base_env.push_back(std::string(*env));
|
||||||
}
|
}
|
||||||
// TODO: allow refreshing cached model list
|
// TODO: allow refreshing cached model list
|
||||||
|
// add cached models
|
||||||
auto cached_models = common_list_cached_models();
|
auto cached_models = common_list_cached_models();
|
||||||
for (const auto & model : cached_models) {
|
for (const auto & model : cached_models) {
|
||||||
server_model_meta meta{
|
server_model_meta meta{
|
||||||
/* name */ model.to_string(),
|
/* name */ model.to_string(),
|
||||||
/* path */ model.manifest_path,
|
/* path */ model.manifest_path,
|
||||||
/* path_mmproj */ "",
|
/* path_mmproj */ "", // auto-detected when loading
|
||||||
/* in_cache */ true,
|
/* in_cache */ true,
|
||||||
/* port */ 0,
|
/* port */ 0,
|
||||||
/* status */ SERVER_MODEL_STATUS_UNLOADED
|
/* status */ SERVER_MODEL_STATUS_UNLOADED,
|
||||||
|
/* last_used */ 0
|
||||||
};
|
};
|
||||||
mapping[meta.name] = instance_t{
|
mapping[meta.name] = instance_t{
|
||||||
/* subproc */ std::make_shared<subprocess_s>(),
|
/* subproc */ std::make_shared<subprocess_s>(),
|
||||||
|
|
@ -101,6 +162,30 @@ server_models::server_models(
|
||||||
/* meta */ meta
|
/* meta */ meta
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
// add local models specificed via --models-dir
|
||||||
|
if (!params.models_dir.empty()) {
|
||||||
|
auto local_models = list_local_models(params.models_dir);
|
||||||
|
for (const auto & model : local_models) {
|
||||||
|
if (mapping.find(model.name) != mapping.end()) {
|
||||||
|
// already exists in cached models, skip
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
server_model_meta meta{
|
||||||
|
/* name */ model.name,
|
||||||
|
/* path */ model.path,
|
||||||
|
/* path_mmproj */ model.path_mmproj,
|
||||||
|
/* in_cache */ false,
|
||||||
|
/* port */ 0,
|
||||||
|
/* status */ SERVER_MODEL_STATUS_UNLOADED,
|
||||||
|
/* last_used */ 0
|
||||||
|
};
|
||||||
|
mapping[meta.name] = instance_t{
|
||||||
|
/* subproc */ std::make_shared<subprocess_s>(),
|
||||||
|
/* th */ std::thread(),
|
||||||
|
/* meta */ meta
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
|
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
|
||||||
|
|
@ -207,11 +292,39 @@ std::vector<server_model_meta> server_models::get_all_meta() {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void server_models::load(const std::string & name) {
|
void server_models::unload_lru() {
|
||||||
|
if (base_params.max_models <= 0) {
|
||||||
|
return; // no limit
|
||||||
|
}
|
||||||
|
// remove one of the servers if we passed the max_models (least recently used - LRU)
|
||||||
|
std::string lru_model_name = "";
|
||||||
|
int64_t lru_last_used = ggml_time_ms();
|
||||||
|
size_t count_active = 0;
|
||||||
|
{
|
||||||
std::lock_guard<std::mutex> lk(mutex);
|
std::lock_guard<std::mutex> lk(mutex);
|
||||||
if (mapping.find(name) == mapping.end()) {
|
for (const auto & m : mapping) {
|
||||||
|
if (m.second.meta.is_active()) {
|
||||||
|
count_active++;
|
||||||
|
if (m.second.meta.last_used < lru_last_used) {
|
||||||
|
lru_model_name = m.first;
|
||||||
|
lru_last_used = m.second.meta.last_used;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!lru_model_name.empty() && count_active >= (size_t)base_params.max_models) {
|
||||||
|
SRV_INF("max_models limit reached, removing LRU name=%s\n", lru_model_name.c_str());
|
||||||
|
unload(lru_model_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void server_models::load(const std::string & name) {
|
||||||
|
if (!has_model(name)) {
|
||||||
throw std::runtime_error("model name=" + name + " is not found");
|
throw std::runtime_error("model name=" + name + " is not found");
|
||||||
}
|
}
|
||||||
|
unload_lru();
|
||||||
|
|
||||||
|
std::lock_guard<std::mutex> lk(mutex);
|
||||||
|
|
||||||
auto meta = mapping[name].meta;
|
auto meta = mapping[name].meta;
|
||||||
if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) {
|
if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) {
|
||||||
|
|
@ -224,6 +337,7 @@ void server_models::load(const std::string & name) {
|
||||||
inst.meta = meta;
|
inst.meta = meta;
|
||||||
inst.meta.port = get_free_port();
|
inst.meta.port = get_free_port();
|
||||||
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
|
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
|
||||||
|
inst.meta.last_used = ggml_time_ms();
|
||||||
|
|
||||||
if (inst.meta.port <= 0) {
|
if (inst.meta.port <= 0) {
|
||||||
throw std::runtime_error("failed to get a port number");
|
throw std::runtime_error("failed to get a port number");
|
||||||
|
|
@ -385,7 +499,7 @@ bool server_models::ensure_model_loaded(const std::string & name) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name) {
|
server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used) {
|
||||||
auto meta = get_meta(name);
|
auto meta = get_meta(name);
|
||||||
if (!meta.has_value()) {
|
if (!meta.has_value()) {
|
||||||
throw std::runtime_error("model name=" + name + " is not found");
|
throw std::runtime_error("model name=" + name + " is not found");
|
||||||
|
|
@ -393,6 +507,10 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
|
||||||
if (ensure_model_loaded(name)) {
|
if (ensure_model_loaded(name)) {
|
||||||
meta = get_meta(name); // refresh meta
|
meta = get_meta(name); // refresh meta
|
||||||
}
|
}
|
||||||
|
if (update_last_used) {
|
||||||
|
std::unique_lock<std::mutex> lk(mutex);
|
||||||
|
mapping[name].meta.last_used = ggml_time_ms();
|
||||||
|
}
|
||||||
SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port);
|
SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port);
|
||||||
auto proxy = std::make_unique<server_http_proxy>(
|
auto proxy = std::make_unique<server_http_proxy>(
|
||||||
method,
|
method,
|
||||||
|
|
@ -572,8 +690,11 @@ server_http_proxy::server_http_proxy(
|
||||||
|
|
||||||
// wait for the first chunk (headers)
|
// wait for the first chunk (headers)
|
||||||
msg_t header;
|
msg_t header;
|
||||||
pipe->read(header, should_stop);
|
if (pipe->read(header, should_stop)) {
|
||||||
SRV_DBG("%s", "received response headers\n");
|
SRV_DBG("%s", "received response headers\n");
|
||||||
this->status = header.status;
|
this->status = header.status;
|
||||||
this->headers = header.headers;
|
this->headers = header.headers;
|
||||||
|
} else {
|
||||||
|
SRV_DBG("%s", "no response headers received (request cancelled?)\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,8 @@ struct server_model_meta {
|
||||||
bool in_cache = false; // if true, use -hf; use -m otherwise
|
bool in_cache = false; // if true, use -hf; use -m otherwise
|
||||||
int port = 0;
|
int port = 0;
|
||||||
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
|
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
|
||||||
|
int64_t last_used = 0;
|
||||||
|
|
||||||
bool is_active() const {
|
bool is_active() const {
|
||||||
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
|
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
|
||||||
}
|
}
|
||||||
|
|
@ -81,6 +83,9 @@ private:
|
||||||
|
|
||||||
void update_meta(const std::string & name, const server_model_meta & meta);
|
void update_meta(const std::string & name, const server_model_meta & meta);
|
||||||
|
|
||||||
|
// unload least recently used models if the limit is reached
|
||||||
|
void unload_lru();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
server_models(const common_params & params, int argc, char ** argv, char ** envp);
|
server_models(const common_params & params, int argc, char ** argv, char ** envp);
|
||||||
|
|
||||||
|
|
@ -109,7 +114,7 @@ public:
|
||||||
bool ensure_model_loaded(const std::string & name);
|
bool ensure_model_loaded(const std::string & name);
|
||||||
|
|
||||||
// proxy an HTTP request to the model instance
|
// proxy an HTTP request to the model instance
|
||||||
server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name);
|
server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);
|
||||||
|
|
||||||
// notify the router server that a model instance is ready
|
// notify the router server that a model instance is ready
|
||||||
static void setup_child_server(const std::string & host, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler);
|
static void setup_child_server(const std::string & host, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler);
|
||||||
|
|
|
||||||
|
|
@ -5143,7 +5143,7 @@ public:
|
||||||
std::string method = "GET";
|
std::string method = "GET";
|
||||||
std::string name = req.get_param("model");
|
std::string name = req.get_param("model");
|
||||||
models->ensure_model_loaded(name);
|
models->ensure_model_loaded(name);
|
||||||
return models->proxy_request(req, method, name);
|
return models->proxy_request(req, method, name, false);
|
||||||
};
|
};
|
||||||
|
|
||||||
server_http_context::handler_t proxy_post = [this](const server_http_req & req) {
|
server_http_context::handler_t proxy_post = [this](const server_http_req & req) {
|
||||||
|
|
@ -5151,7 +5151,7 @@ public:
|
||||||
json body = json::parse(req.body);
|
json body = json::parse(req.body);
|
||||||
std::string name = json_value(body, "model", std::string());
|
std::string name = json_value(body, "model", std::string());
|
||||||
models->ensure_model_loaded(name);
|
models->ensure_model_loaded(name);
|
||||||
return models->proxy_request(req, method, name);
|
return models->proxy_request(req, method, name, true); // update last usage for POST request only
|
||||||
};
|
};
|
||||||
|
|
||||||
server_http_context::handler_t post_router_models_load = [this](const server_http_req & req) {
|
server_http_context::handler_t post_router_models_load = [this](const server_http_req & req) {
|
||||||
|
|
@ -5189,9 +5189,10 @@ public:
|
||||||
auto all_models = models->get_all_meta();
|
auto all_models = models->get_all_meta();
|
||||||
for (const auto & model : all_models) {
|
for (const auto & model : all_models) {
|
||||||
models_json.push_back(json {
|
models_json.push_back(json {
|
||||||
{"model", model.name},
|
|
||||||
{"name", model.name},
|
{"name", model.name},
|
||||||
{"id", model.name},
|
{"id", model.name},
|
||||||
|
{"in_cache", model.in_cache},
|
||||||
|
{"path", model.path},
|
||||||
// TODO: other fields...
|
// TODO: other fields...
|
||||||
{"status", {
|
{"status", {
|
||||||
{"value", server_model_status_to_string(model.status)}
|
{"value", server_model_status_to_string(model.status)}
|
||||||
|
|
@ -5662,8 +5663,7 @@ int main(int argc, char ** argv, char ** envp) {
|
||||||
// register API routes
|
// register API routes
|
||||||
server_routes routes(params, ctx_server, ctx_http);
|
server_routes routes(params, ctx_server, ctx_http);
|
||||||
|
|
||||||
// TODO: improve this by changing arg.cpp
|
bool is_router_server = params.model.path.empty();
|
||||||
bool is_router_server = params.model.path == DEFAULT_MODEL_PATH;
|
|
||||||
if (is_router_server) {
|
if (is_router_server) {
|
||||||
// setup server instances manager
|
// setup server instances manager
|
||||||
routes.models.reset(new server_models(params, argc, argv, envp));
|
routes.models.reset(new server_models(params, argc, argv, envp));
|
||||||
|
|
|
||||||
|
|
@ -267,7 +267,6 @@
|
||||||
if (activeId) {
|
if (activeId) {
|
||||||
return options.find((option) => option.id === activeId);
|
return options.find((option) => option.id === activeId);
|
||||||
}
|
}
|
||||||
|
|
||||||
return options[0];
|
return options[0];
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,38 @@ target_compile_definitions(${TARGET} PRIVATE
|
||||||
CPPHTTPLIB_TCP_NODELAY=1
|
CPPHTTPLIB_TCP_NODELAY=1
|
||||||
)
|
)
|
||||||
|
|
||||||
if (LLAMA_OPENSSL)
|
if (LLAMA_BUILD_BORINGSSL)
|
||||||
|
set(OPENSSL_NO_ASM ON CACHE BOOL "Disable OpenSSL ASM code (BoringSSL)")
|
||||||
|
set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")
|
||||||
|
|
||||||
|
set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
|
||||||
|
set(BORINGSSL_VERSION "0.20251002.0" CACHE STRING "BoringSSL version")
|
||||||
|
|
||||||
|
message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")
|
||||||
|
|
||||||
|
include(FetchContent)
|
||||||
|
FetchContent_Declare(
|
||||||
|
boringssl
|
||||||
|
GIT_REPOSITORY ${BORINGSSL_GIT}
|
||||||
|
GIT_TAG ${BORINGSSL_VERSION}
|
||||||
|
PATCH_COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/patch-boringssl.cmake"
|
||||||
|
)
|
||||||
|
|
||||||
|
set(SAVED_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
|
||||||
|
set(SAVED_BUILD_TESTING ${BUILD_TESTING})
|
||||||
|
|
||||||
|
set(BUILD_SHARED_LIBS OFF)
|
||||||
|
set(BUILD_TESTING OFF)
|
||||||
|
|
||||||
|
FetchContent_MakeAvailable(boringssl)
|
||||||
|
|
||||||
|
set(BUILD_SHARED_LIBS ${SAVED_BUILD_SHARED_LIBS})
|
||||||
|
set(BUILD_TESTING ${SAVED_BUILD_TESTING})
|
||||||
|
|
||||||
|
set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE)
|
||||||
|
target_link_libraries(${TARGET} PUBLIC ssl crypto)
|
||||||
|
|
||||||
|
elseif (LLAMA_OPENSSL)
|
||||||
find_package(OpenSSL)
|
find_package(OpenSSL)
|
||||||
if (OpenSSL_FOUND)
|
if (OpenSSL_FOUND)
|
||||||
include(CheckCSourceCompiles)
|
include(CheckCSourceCompiles)
|
||||||
|
|
@ -44,17 +75,20 @@ if (LLAMA_OPENSSL)
|
||||||
set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
|
set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
|
||||||
if (OPENSSL_VERSION_SUPPORTED)
|
if (OPENSSL_VERSION_SUPPORTED)
|
||||||
message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
|
message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
|
||||||
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
|
set(CPPHTTPLIB_OPENSSL_SUPPORT TRUE)
|
||||||
target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
|
target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
|
||||||
if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
|
|
||||||
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
|
|
||||||
find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
|
|
||||||
find_library(SECURITY_FRAMEWORK Security REQUIRED)
|
|
||||||
target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
|
|
||||||
endif()
|
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
message(STATUS "OpenSSL not found, SSL support disabled")
|
message(STATUS "OpenSSL not found, SSL support disabled")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (CPPHTTPLIB_OPENSSL_SUPPORT)
|
||||||
|
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT) # used in server.cpp
|
||||||
|
if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
|
||||||
|
target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
|
||||||
|
find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
|
||||||
|
find_library(SECURITY_FRAMEWORK Security REQUIRED)
|
||||||
|
target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
# Remove bssl
|
||||||
|
file(READ "CMakeLists.txt" content)
|
||||||
|
string(REPLACE "add_executable(bssl" "#add_executable(bssl" content "${content}")
|
||||||
|
string(REPLACE "target_link_libraries(bssl" "#target_link_libraries(bssl" content "${content}")
|
||||||
|
string(REPLACE "install(TARGETS bssl" "#install(TARGETS bssl" content "${content}")
|
||||||
|
file(WRITE "CMakeLists.txt" "${content}")
|
||||||
Loading…
Reference in New Issue