Build with CCCL 3.2 for CUDA backends

Gives the best performance for backend sampling on CUDA. The flag can be removed once CCCL 3.2 is bundled within the CUDA Toolkit (CTK) and that CTK version is used in llama.cpp.
2025-12-19 16:10:51 +01:00 · 2025-12-19 16:10:51 +01:00 · 1da013c66e
parent b5ec0fd76c
commit 1da013c66e
3 changed files with 11 additions and 4 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -1079,6 +1079,7 @@ jobs:
            evict-old-files: 1d

        - name: Build with CMake
+          # Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
          run: |
            cmake -S . -B build -G Ninja \
              -DLLAMA_CURL=OFF \
@ -1088,7 +1089,8 @@ jobs:
              -DCMAKE_CUDA_ARCHITECTURES=89-real \
              -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
              -DGGML_NATIVE=OFF \
-              -DGGML_CUDA=ON
+              -DGGML_CUDA=ON \
+              -DGGML_CUDA_CUB_3DOT2=ON
            cmake --build build

  windows-2022-cmake-cuda:
@ -1123,6 +1125,7 @@ jobs:
      - name: Build
        id: cmake_build
        shell: cmd
+        # Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          cmake -S . -B build -G "Ninja Multi-Config" ^
@ -1133,7 +1136,8 @@ jobs:
            -DGGML_BACKEND_DL=ON ^
            -DGGML_CPU_ALL_VARIANTS=ON ^
            -DGGML_CUDA=ON ^
-            -DGGML_RPC=ON
+            -DGGML_RPC=ON ^
+            -DGGML_CUDA_CUB_3DOT2=ON
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
          cmake --build build --config Release
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -448,6 +448,7 @@ jobs:
      - name: Build
        id: cmake_build
        shell: cmd
+        # Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          cmake -S . -B build -G "Ninja Multi-Config" ^
@ -455,7 +456,8 @@ jobs:
            -DGGML_NATIVE=OFF ^
            -DGGML_CPU=OFF ^
            -DGGML_CUDA=ON ^
-            -DLLAMA_CURL=OFF
+            -DLLAMA_CURL=OFF ^
+            -DGGML_CUDA_CUB_3DOT2=ON
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda

--- a/ci/run.sh
+++ b/ci/run.sh
@ -52,7 +52,8 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
+    # Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DGGML_CUDA_CUB_3DOT2=ON"

    if command -v nvidia-smi >/dev/null 2>&1; then
        CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')