From 027ef82d5cae40fd6ad433c1fa7bed3267c8f40f Mon Sep 17 00:00:00 2001
From: Gaurav Garg
Date: Sat, 31 Jan 2026 12:29:04 +0530
Subject: [PATCH] Fix issue #19219

Hangs were reported on Jetson Orin AGX when CUDA_SCALE_LAUNCH_QUEUES=4x
is set. Revert PR #19042, which set the variable automatically, and
update the documentation to suggest setting CUDA_SCALE_LAUNCH_QUEUES=4x
for higher throughput on multi-GPU systems.
---
 docs/build.md                   |  4 +---
 ggml/src/ggml-cuda/ggml-cuda.cu | 10 ----------
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/docs/build.md b/docs/build.md
index 3a43f2a45a..fd447424c7 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -252,9 +252,7 @@ CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.ggu
 
 The environment variable [`CUDA_SCALE_LAUNCH_QUEUES`](https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/environment-variables.html#cuda-scale-launch-queues) controls the size of CUDA's command buffer, which determines how many GPU operations can be queued before the CPU must wait for the GPU to catch up. A larger buffer reduces CPU-side stalls and allows more work to be queued on a GPU.
 
-**Default behavior:** llama.cpp automatically sets `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.
-
-See PR [#19042](https://github.com/ggml-org/llama.cpp/pull/19042) for performance benchmarks and technical details.
+Consider setting `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.
 
 ### Unified Memory
 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 08383edb40..1bcd1ab1f8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -5049,16 +5049,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
         static std::mutex mutex;
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
-            // Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance
-            // PR: https://github.com/ggml-org/llama.cpp/pull/19042
-            if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) {
-#ifdef _WIN32
-                _putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x");
-#else
-                setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set
-#endif // _WIN32
-            }
-
             ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
 
             const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
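
With the automatic override removed, the larger launch queues have to be requested explicitly by the user. A minimal usage sketch of the updated docs/build.md recommendation; the model path `/srv/models/model.gguf` is a placeholder, not a path from the repository:

```bash
# Set the variable for a single run; 4x quadruples the CUDA command buffer.
# /srv/models/model.gguf is a placeholder path; adjust to your setup.
CUDA_SCALE_LAUNCH_QUEUES=4x ./build/bin/llama-server --model /srv/models/model.gguf

# Or export it for the whole shell session, e.g. on a multi-GPU host:
export CUDA_SCALE_LAUNCH_QUEUES=4x
./build/bin/llama-server --model /srv/models/model.gguf
```

If hangs like those reported on Jetson Orin AGX appear, leave the variable unset to fall back to the CUDA default queue size.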