From f709c7a33fd89dd4de3f0eeccbfd2da8297fe9d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 14 Jan 2026 07:46:27 +0100 Subject: [PATCH 01/11] ci, tests : use cmake to download models and remove libcurl dependency (#18791) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ci, tests : use cmake to download models and remove libcurl dependency * llama_dl_model -> llama_download_model * use EXPECTED_HASH for robust model downloading * Move llama_download_model to cmake/common.cmake Signed-off-by: Adrien Gallouët --- .github/workflows/build.yml | 19 +++++++++---------- ci/run.sh | 2 +- cmake/common.cmake | 22 ++++++++++++++++++++++ examples/eval-callback/CMakeLists.txt | 8 +++----- tests/CMakeLists.txt | 16 +++++----------- tests/test-arg-parser.cpp | 2 +- 6 files changed, 41 insertions(+), 28 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3c89b4fab6..e2573fecf8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -92,7 +92,7 @@ jobs: id: cmake_test run: | cd build - ctest -L 'main|curl' --verbose --timeout 900 + ctest -L main --verbose --timeout 900 macOS-latest-cmake-x64: runs-on: macos-15-intel @@ -237,7 +237,7 @@ jobs: id: cmake_test run: | cd build - ctest -L 'main|curl' --verbose --timeout 900 + ctest -L main --verbose --timeout 900 - name: Test llama2c conversion id: llama2c_test @@ -1499,7 +1499,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential - name: Test id: ggml-ci @@ -1525,7 +1525,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential - name: Test id: ggml-ci @@ -1551,7 +1551,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential - name: Test id: ggml-ci @@ -1577,7 +1577,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential - name: Test id: ggml-ci @@ -1603,7 +1603,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential - name: Test id: ggml-ci @@ -1767,7 +1767,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install -y build-essential libcurl4-openssl-dev + sudo apt-get install -y build-essential - name: Test id: ggml-ci @@ -1853,7 +1853,7 @@ jobs: id: cmake_test run: | cd build - ctest -L 'main|curl' --verbose --timeout 900 + ctest -L main --verbose --timeout 900 - name: Test llama2c conversion id: llama2c_test @@ -2129,7 +2129,6 @@ jobs: sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \ apt-get install -y \ build-essential \ - libcurl4-openssl-dev \ python3-venv \ gpg \ wget \ diff --git a/ci/run.sh b/ci/run.sh index 67b9784ef4..d4ce6c9196 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -45,7 +45,7 @@ sd=`dirname $0` cd $sd/../ SRC=`pwd` -CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON" +CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=OFF -DGGML_SCHED_NO_REALLOC=ON" if [ ! 
-z ${GG_BUILD_METAL} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON" diff --git a/cmake/common.cmake b/cmake/common.cmake index a5bb787f15..76770817d9 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -33,3 +33,25 @@ function(llama_add_compile_flags) endif() endif() endfunction() + +function(llama_download_model NAME HASH) + set(DEST "${CMAKE_BINARY_DIR}/${NAME}") + get_filename_component(DEST_DIR "${DEST}" DIRECTORY) + file(MAKE_DIRECTORY "${DEST_DIR}") + if(NOT EXISTS "${DEST}") + message(STATUS "Downloading ${NAME} from ggml-org/models...") + endif() + file(DOWNLOAD + "https://huggingface.co/ggml-org/models/resolve/main/${NAME}?download=true" + "${DEST}" + TLS_VERIFY ON + EXPECTED_HASH ${HASH} + STATUS status + ) + list(GET status 0 code) + if(NOT code EQUAL 0) + list(GET status 1 msg) + message(FATAL_ERROR "Failed to download ${NAME}: ${msg}") + endif() + set(LLAMA_DOWNLOAD_MODEL "${DEST}" PARENT_SCOPE) +endfunction() diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index c514e4317e..454ce3a8a4 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -6,10 +6,8 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TEST_TARGET test-eval-callback) if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") - add_test(NAME ${TEST_TARGET} - COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) + llama_download_model("tinyllamas/stories15M-q4_0.gguf" SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739) else() - add_test(NAME ${TEST_TARGET} - COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0) + llama_download_model("tinyllamas/stories15M-be.Q4_0.gguf" SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d) endif() -set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) +add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback -m "${LLAMA_DOWNLOAD_MODEL}" --prompt hello --seed 42 -ngl 0) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a5ab25065b..58443be2da 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -202,15 +202,13 @@ llama_build_and_test( llama_build_and_test(test-regex-partial.cpp) if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") - llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2) + llama_download_model("tinyllamas/stories15M-q4_0.gguf" SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739) else() - llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2) + llama_download_model("tinyllamas/stories15M-be.Q4_0.gguf" SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d) endif() +llama_build_and_test(test-thread-safety.cpp ARGS -m "${LLAMA_DOWNLOAD_MODEL}" -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2) -# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135) -if (NOT WIN32) - llama_build_and_test(test-arg-parser.cpp) -endif() +llama_build_and_test(test-arg-parser.cpp) if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC) # TODO: repair known memory leaks @@ -225,11 
+223,7 @@ llama_build_and_test(test-backend-sampler.cpp LABEL "model") # Test for state restore with fragmented KV cache # Requires a model, uses same args pattern as test-thread-safety -if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") - llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf) -else() - llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf) -endif() +llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -m "${LLAMA_DOWNLOAD_MODEL}") if (NOT GGML_BACKEND_DL) # these tests use the backends directly and cannot be built with dynamic loading diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index c7be0021be..67f8ca632c 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -173,7 +173,7 @@ int main(void) { assert(params.cpuparams.n_threads == 1010); #endif // _WIN32 - printf("test-arg-parser: test curl-related functions\n\n"); + printf("test-arg-parser: test download functions\n\n"); const char * GOOD_URL = "http://ggml.ai/"; const char * BAD_URL = "http://ggml.ai/404"; From d34aa07193d27aa04da9a77c63ee125ec614714a Mon Sep 17 00:00:00 2001 From: Daniel Benjaminsson Date: Wed, 14 Jan 2026 08:11:05 +0100 Subject: [PATCH 02/11] mmap: add Haiku support by skipping RLIMIT_MEMLOCK check (#18819) Haiku OS does not support RLIMIT_MEMLOCK, similar to visionOS/tvOS. Skip the resource limit check on Haiku to allow mlock functionality to work without compile errors. Tested on Haiku with NVIDIA RTX 3080 Ti using Vulkan backend. --- src/llama-mmap.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 0c43495b11..fe0847fe1a 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -614,9 +614,9 @@ struct llama_mlock::impl { char* errmsg = std::strerror(errno); bool suggest = (errno == ENOMEM); -#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) - // visionOS/tvOS dont't support RLIMIT_MEMLOCK - // Skip resource limit checks on visionOS/tvOS +#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__) + // visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK + // Skip resource limit checks on these platforms suggest = false; #else struct rlimit lock_limit; From 7d587e5544bf9e781c198c55697b928663faf0b4 Mon Sep 17 00:00:00 2001 From: Perry Naseck <4472083+DaAwesomeP@users.noreply.github.com> Date: Wed, 14 Jan 2026 02:22:25 -0500 Subject: [PATCH 03/11] ggml-metal: do not copy headers for embedded, use current binary dir for embedded (#18705) --- ggml/src/ggml-metal/CMakeLists.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index 63418fe143..9c0b3db859 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -23,11 +23,6 @@ if (GGML_METAL_NDEBUG) add_compile_definitions(GGML_METAL_NDEBUG) endif() -# copy metal files to bin directory -configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) -configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) -configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY) - set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h") if (GGML_METAL_EMBED_LIBRARY) 
enable_language(ASM) @@ -37,12 +32,12 @@ if (GGML_METAL_EMBED_LIBRARY) set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h") - file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") + file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/autogenerated") # merge ggml-common.h and ggml-metal.metal into a single file - set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s") - set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") - set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp") + set(METALLIB_EMBED_ASM "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.s") + set(METALLIB_SOURCE_EMBED "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") + set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp") add_custom_command( OUTPUT "${METALLIB_EMBED_ASM}" @@ -62,6 +57,11 @@ if (GGML_METAL_EMBED_LIBRARY) target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}") else() + # copy metal files to bin directory + configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) + configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) + configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY) + if (GGML_METAL_SHADER_DEBUG) # custom command to do the following: # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air From 635ef78ec5dda84b0708b67d97291c7ab6740a8d Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Wed, 14 Jan 2026 09:41:23 +0100 Subject: [PATCH 04/11] vulkan: work around Intel fp16 bug in mmq (#18814) --- ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl index 7f32dadf17..9c297d1c60 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl @@ -264,7 +264,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) { const i8vec2 scales = i8vec2(unpack8(uint32_t(((data_a_packed16[ib_k].scales[(is % 8 ) / 2] >> (4 * (is / 8))) & 0x0F0F) | (((data_a_packed16[ib_k].scales[(8 + (is % 4)) / 2] >> (2 * (is / 4))) & 0x0303) << 4))).xy); // vec4 used due to #12147 - buf_a[buf_ib].d_scales = FLOAT_TYPE(data_a_packed16[ib_k].d) * FLOAT_TYPE_VEC2(scales - 32); + buf_a[buf_ib].d_scales = FLOAT_TYPE_VEC2(float(data_a_packed16[ib_k].d) * vec2(scales - 32)); } } @@ -334,7 +334,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) { (data_a[ib_k].scales[is+4] >> 4) | ((data_a[ib_k].scales[is ] & 0xC0) >> 2)); } - buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(data_a_packed32[ib_k].dm) * FLOAT_TYPE_VEC2(scale_dm); + buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(vec2(data_a_packed32[ib_k].dm) * vec2(scale_dm)); } } @@ -385,7 +385,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) { const uint is = iqs_k / 4; const i8vec2 scales = unpack8(int32_t(data_a_packed16[ib_k].scales[is / 2])).xy; - buf_a[buf_ib].d_scales = FLOAT_TYPE(data_a_packed16[ib_k].d) * FLOAT_TYPE_VEC2(scales); + buf_a[buf_ib].d_scales = FLOAT_TYPE_VEC2(float(data_a_packed16[ib_k].d) * vec2(scales)); } } From 01cbdfd7eb3dd6c0512daddb487b4cf382a9b016 Mon Sep 17 00:00:00 2001 From: Daniel 
Bevenius Date: Wed, 14 Jan 2026 10:31:49 +0100 Subject: [PATCH 05/11] CUDA : fix typo in clang pragma comment [no ci] (#18830) --- ggml/src/ggml-cuda/fattn-vec.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index 4d167b95a0..86f4dc0f7f 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -10,7 +10,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() { return 128; } -// Currenlty llvm with the amdgcn target dose not support unrolling loops +// Currenlty llvm with the amdgcn target does not support unrolling loops // that contain a break that can not be resolved at compile time. #ifdef __clang__ #pragma clang diagnostic push From 47f9612492eafe665b6781aefc1afa3c85bae458 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 14 Jan 2026 17:55:15 +0800 Subject: [PATCH 06/11] llama-model: fix unfortunate typo (#18832) --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2d0c589bf5..75f9691807 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -446,7 +446,7 @@ struct llama_model::impl { llama_mlocks mlock_bufs; llama_mlocks mlock_mmaps; - // contexts where the model tensors metadata is stored as well ass the corresponding buffers: + // contexts where the model tensors metadata is stored as well as the corresponding buffers: std::vector>> ctxs_bufs; buft_list_t cpu_buft_list; From 3e4bb2966685facd549ac99bde1e02633e024920 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 14 Jan 2026 03:59:05 -0600 Subject: [PATCH 07/11] vulkan: Check maxStorageBufferRange in supports_op (#18709) * vulkan: Check maxStorageBufferRange in supports_op * skip maxStorageBufferRange check when shader64BitIndexing is enabled --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index deed5055d5..0fabbcec31 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -14413,13 +14413,29 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; const vk_device& device = ggml_vk_get_device(ctx->device); + const bool uses_bda = (op->op == GGML_OP_IM2COL || op->op == GGML_OP_IM2COL_3D) && + device->shader_int64 && device->buffer_device_address; + + auto const & tensor_size_supported = [&](size_t tensor_size) { + if (tensor_size > device->max_buffer_size) { + return false; + } + // For im2col shaders using BDA, maxStorageBufferRange limit doesn't apply. + // If shader64BitIndexing is enabled, maxStorageBufferRange limit doesn't apply. 
+ if (!uses_bda && !device->shader_64b_indexing) { + if (tensor_size > device->properties.limits.maxStorageBufferRange) { + return false; + } + } + return true; + }; // reject any tensors larger than the max buffer size for (int i = 0; i < GGML_MAX_SRC; i++) { - if (op->src[i] && ggml_nbytes(op->src[i]) > device->max_buffer_size) { + if (op->src[i] && !tensor_size_supported(ggml_nbytes(op->src[i]))) { return false; } } - if (ggml_nbytes(op) > device->max_buffer_size) { + if (!tensor_size_supported(ggml_nbytes(op))) { return false; } From 516a4ca9b5f2fa72c2a71f412929a67cf76a6213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 14 Jan 2026 18:02:47 +0100 Subject: [PATCH 08/11] refactor : remove libcurl, use OpenSSL when available (#18828) --- .devops/cann.Dockerfile | 2 +- .devops/cpu.Dockerfile | 2 +- .devops/cuda-new.Dockerfile | 2 +- .devops/cuda.Dockerfile | 2 +- .devops/intel.Dockerfile | 2 +- .devops/llama-cli-cann.Dockerfile | 2 +- .devops/musa.Dockerfile | 2 +- .devops/nix/package.nix | 5 +- .devops/rocm.Dockerfile | 2 +- .devops/s390x.Dockerfile | 2 +- .devops/vulkan.Dockerfile | 4 +- .github/workflows/build-cmake-pkg.yml | 2 +- .github/workflows/build-linux-cross.yml | 12 +- .github/workflows/build.yml | 58 +--- .github/workflows/copilot-setup-steps.yml | 2 +- .github/workflows/release.yml | 19 +- .github/workflows/server-webui.yml | 6 - .github/workflows/server.yml | 4 +- CMakeLists.txt | 16 +- README.md | 1 - build-xcframework.sh | 14 +- ci/run.sh | 2 +- common/CMakeLists.txt | 12 +- common/arg.cpp | 2 +- common/download.cpp | 342 +------------------- docs/backend/hexagon/CMakeUserPresets.json | 4 +- docs/build-riscv64-spacemit.md | 2 +- docs/build.md | 8 +- examples/llama.android/lib/build.gradle.kts | 2 +- examples/sycl/build.sh | 4 +- examples/sycl/win-build-sycl.bat | 4 +- licenses/LICENSE-curl | 22 -- scripts/debug-test.sh | 3 +- scripts/serve-static.js | 2 +- scripts/tool_bench.py | 2 +- tools/tts/README.md | 2 +- 36 files changed, 74 insertions(+), 500 deletions(-) delete mode 100644 licenses/LICENSE-curl diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile index db221b0b81..97ee3eedb6 100644 --- a/.devops/cann.Dockerfile +++ b/.devops/cann.Dockerfile @@ -13,7 +13,7 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3. 
FROM ${CANN_BASE_IMAGE} AS build # -- Install build dependencies -- -RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \ +RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \ yum clean all && \ rm -rf /var/cache/yum diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index b9e84ab986..c70a2de562 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -5,7 +5,7 @@ FROM ubuntu:$UBUNTU_VERSION AS build ARG TARGETARCH RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev + apt-get install -y build-essential git cmake libssl-dev WORKDIR /app diff --git a/.devops/cuda-new.Dockerfile b/.devops/cuda-new.Dockerfile index 62443e17f2..98dc147d7e 100644 --- a/.devops/cuda-new.Dockerfile +++ b/.devops/cuda-new.Dockerfile @@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build ARG CUDA_DOCKER_ARCH=default RUN apt-get update && \ - apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 + apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1 WORKDIR /app diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index fed5863157..52f103bc31 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build ARG CUDA_DOCKER_ARCH=default RUN apt-get update && \ - apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 + apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1 WORKDIR /app diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index adebf08229..35ea4ade8e 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -6,7 +6,7 @@ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build ARG GGML_SYCL_F16=OFF RUN apt-get update && \ - apt-get install -y git libcurl4-openssl-dev + apt-get install -y git libssl-dev WORKDIR /app diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile index 6581187f32..5bbc9ee43b 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/llama-cli-cann.Dockerfile @@ -6,7 +6,7 @@ WORKDIR /app COPY . . -RUN yum install -y gcc g++ cmake make libcurl-devel +RUN yum install -y gcc g++ cmake make openssl-devel ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH} diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index 34d6ad9f40..9eb4985204 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -18,7 +18,7 @@ RUN apt-get update && \ python3 \ python3-pip \ git \ - libcurl4-openssl-dev \ + libssl-dev \ libgomp1 WORKDIR /app diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index a13996bd68..79a7270e5d 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -32,7 +32,6 @@ useMpi ? false, useRocm ? config.rocmSupport, rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets, - enableCurl ? true, useVulkan ? false, useRpc ? false, llamaVersion ? 
"0.0.0", # Arbitrary version, substituted by the flake @@ -160,15 +159,13 @@ effectiveStdenv.mkDerivation (finalAttrs: { ++ optionals useMpi [ mpi ] ++ optionals useRocm rocmBuildInputs ++ optionals useBlas [ blas ] - ++ optionals useVulkan vulkanBuildInputs - ++ optionals enableCurl [ curl ]; + ++ optionals useVulkan vulkanBuildInputs; cmakeFlags = [ (cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) - (cmakeBool "LLAMA_CURL" enableCurl) (cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_BLAS" useBlas) (cmakeBool "GGML_CUDA" useCuda) diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 53c3ed8d88..14936f8e9c 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -27,7 +27,7 @@ RUN apt-get update \ build-essential \ cmake \ git \ - libcurl4-openssl-dev \ + libssl-dev \ curl \ libgomp1 diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile index 1e66f061d5..757cd97cd4 100644 --- a/.devops/s390x.Dockerfile +++ b/.devops/s390x.Dockerfile @@ -11,7 +11,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ apt install -y --no-install-recommends \ git cmake ccache ninja-build \ # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster. - libopenblas-dev libcurl4-openssl-dev && \ + libopenblas-dev libssl-dev && \ rm -rf /var/lib/apt/lists/* WORKDIR /app diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index 89831ed5c2..9797c5e0f3 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -5,8 +5,8 @@ FROM ubuntu:$UBUNTU_VERSION AS build # Install build tools RUN apt update && apt install -y git build-essential cmake wget xz-utils -# Install cURL and Vulkan SDK dependencies -RUN apt install -y libcurl4-openssl-dev curl \ +# Install SSL and Vulkan SDK dependencies +RUN apt install -y libssl-dev curl \ libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc # Build it diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml index fee2ab96bd..510352a5cc 100644 --- a/.github/workflows/build-cmake-pkg.yml +++ b/.github/workflows/build-cmake-pkg.yml @@ -20,7 +20,7 @@ jobs: run: | PREFIX="$(pwd)"/inst cmake -S . 
-B build -DCMAKE_PREFIX_PATH="$PREFIX" \ - -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release cmake --build build --config Release cmake --install build --prefix "$PREFIX" --config Release diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml index c2c6ea12ae..4d3b687a51 100644 --- a/.github/workflows/build-linux-cross.yml +++ b/.github/workflows/build-linux-cross.yml @@ -30,7 +30,7 @@ jobs: # - name: Build # run: | - # cmake -B build -DLLAMA_CURL=OFF \ + # cmake -B build -DLLAMA_OPENSSL=OFF \ # -DCMAKE_BUILD_TYPE=Release \ # -DGGML_OPENMP=OFF \ # -DLLAMA_BUILD_EXAMPLES=ON \ @@ -76,7 +76,7 @@ jobs: # - name: Build # run: | - # cmake -B build -DLLAMA_CURL=OFF \ + # cmake -B build -DLLAMA_OPENSSL=OFF \ # -DCMAKE_BUILD_TYPE=Release \ # -DGGML_VULKAN=ON \ # -DGGML_OPENMP=OFF \ @@ -122,7 +122,7 @@ jobs: # - name: Build # run: | - # cmake -B build -DLLAMA_CURL=OFF \ + # cmake -B build -DLLAMA_OPENSSL=OFF \ # -DCMAKE_BUILD_TYPE=Release \ # -DGGML_VULKAN=ON \ # -DGGML_OPENMP=OFF \ @@ -178,7 +178,7 @@ jobs: - name: Build run: | - cmake -B build -DLLAMA_CURL=OFF \ + cmake -B build -DLLAMA_OPENSSL=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_EXAMPLES=ON \ @@ -235,7 +235,7 @@ jobs: - name: Build run: | - cmake -B build -DLLAMA_CURL=OFF \ + cmake -B build -DLLAMA_OPENSSL=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_VULKAN=ON \ -DGGML_OPENMP=OFF \ @@ -281,7 +281,7 @@ jobs: - name: Build run: | export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain - cmake -B build -DLLAMA_CURL=OFF \ + cmake -B build -DLLAMA_OPENSSL=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_EXAMPLES=ON \ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e2573fecf8..e3b120fcda 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -79,7 +79,6 @@ jobs: cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_CURL=OFF \ -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=OFF \ @@ -118,7 +117,6 @@ jobs: cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_CURL=OFF \ -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_METAL=OFF \ -DGGML_RPC=ON \ @@ -227,8 +225,6 @@ jobs: id: cmake_build run: | cmake -B build \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_RPC=ON cmake --build build --config Release -j $(nproc) @@ -293,8 +289,6 @@ jobs: if: ${{ matrix.sanitizer != 'THREAD' }} run: | cmake -B build \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} @@ -305,8 +299,6 @@ jobs: if: ${{ matrix.sanitizer == 'THREAD' }} run: | cmake -B build \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ @@ -336,14 +328,10 @@ jobs: - name: Build id: cmake_build run: | - mkdir build - cd build - cmake .. \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ + cmake -B build \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_LLGUIDANCE=ON - cmake --build . 
--config Release -j $(nproc) + cmake --build build --config Release -j $(nproc) - name: Test id: cmake_test @@ -377,8 +365,6 @@ jobs: id: cmake_build run: | cmake -B build \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DGGML_RPC=ON cmake --build build --config Release -j $(nproc) @@ -412,8 +398,6 @@ jobs: id: cmake_configure run: | cmake -B build \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DGGML_BACKEND_DL=ON \ -DGGML_CPU_ALL_VARIANTS=ON \ @@ -470,8 +454,6 @@ jobs: run: | source ./vulkan_sdk/setup-env.sh cmake -B build \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DGGML_VULKAN=ON cmake --build build --config Release -j $(nproc) @@ -545,8 +527,6 @@ jobs: run: | export Dawn_DIR=dawn/lib64/cmake/Dawn cmake -B build \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DGGML_WEBGPU=ON cmake --build build --config Release -j $(nproc) @@ -593,7 +573,7 @@ jobs: source emsdk/emsdk_env.sh emcmake cmake -B build-wasm \ -DGGML_WEBGPU=ON \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg cmake --build build-wasm --target test-backend-ops -j $(nproc) @@ -624,8 +604,6 @@ jobs: id: cmake_build run: | cmake -B build -S . \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ -DGGML_HIP_ROCWMMA_FATTN=ON \ -DGGML_HIP=ON @@ -657,8 +635,6 @@ jobs: id: cmake_build run: | cmake -B build -S . \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DGGML_MUSA=ON cmake --build build --config Release -j $(nproc) @@ -706,8 +682,6 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh cmake -B build \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DGGML_SYCL=ON \ -DCMAKE_C_COMPILER=icx \ -DCMAKE_CXX_COMPILER=icpx @@ -757,8 +731,6 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh cmake -B build \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DGGML_SYCL=ON \ -DCMAKE_C_COMPILER=icx \ -DCMAKE_CXX_COMPILER=icpx \ @@ -893,7 +865,7 @@ jobs: cmake -B build -G Xcode \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -1043,7 +1015,7 @@ jobs: id: cmake_build run: | cmake -S . -B build ${{ matrix.defines }} ` - -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON + -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Add libopenblas.dll @@ -1101,8 +1073,6 @@ jobs: # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project run: | cmake -S . -B build -G Ninja \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CUDA_ARCHITECTURES=89-real \ @@ -1150,7 +1120,6 @@ jobs: call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 cmake -S . 
-B build -G "Ninja Multi-Config" ^ -DLLAMA_BUILD_SERVER=ON ^ - -DLLAMA_CURL=OFF ^ -DLLAMA_BUILD_BORINGSSL=ON ^ -DGGML_NATIVE=OFF ^ -DGGML_BACKEND_DL=ON ^ @@ -1258,7 +1227,6 @@ jobs: -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" ` -DCMAKE_BUILD_TYPE=Release ` - -DLLAMA_CURL=OFF ` -DLLAMA_BUILD_BORINGSSL=ON ` -DROCM_DIR="${env:HIP_PATH}" ` -DGGML_HIP=ON ` @@ -1285,7 +1253,7 @@ jobs: cmake -B build -G Xcode \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -1352,7 +1320,7 @@ jobs: matrix: include: - build: 'arm64-cpu' - defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_CURL=OFF -D GGML_OPENMP=OFF' + defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF' - build: 'arm64-snapdragon' defines: '--preset arm64-android-snapdragon-release' @@ -1469,8 +1437,6 @@ jobs: export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} cmake -S . -B build \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DGGML_CANN=on \ -DSOC_TYPE=${SOC_TYPE} cmake --build build -j $(nproc) @@ -1834,8 +1800,6 @@ jobs: id: cmake_build run: | cmake -B build \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_EXAMPLES=ON \ @@ -1928,7 +1892,7 @@ jobs: if: ${{ matrix.sanitizer != 'THREAD' }} run: | cmake -B build \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DGGML_OPENMP=ON \ -DLLAMA_BUILD_EXAMPLES=ON \ @@ -1947,7 +1911,7 @@ jobs: if: ${{ matrix.sanitizer == 'THREAD' }} run: | cmake -B build \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_EXAMPLES=ON \ @@ -2018,7 +1982,7 @@ jobs: id: cmake_build run: | cmake -B build \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_EXAMPLES=ON \ @@ -2092,8 +2056,6 @@ jobs: id: cmake_build run: | cmake -B build \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_EXAMPLES=ON \ diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml index 3645e30378..5f733e684e 100644 --- a/.github/workflows/copilot-setup-steps.yml +++ b/.github/workflows/copilot-setup-steps.yml @@ -38,7 +38,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev + sudo apt-get install build-essential libssl-dev # Install git-clang-format script for formatting only changed code wget -O /tmp/git-clang-format https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 35e1fae697..272701fb9e 100644 --- a/.github/workflows/release.yml +++ 
b/.github/workflows/release.yml @@ -45,7 +45,6 @@ jobs: -DCMAKE_INSTALL_RPATH='@loader_path' \ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_CURL=OFF \ -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ @@ -95,7 +94,6 @@ jobs: -DCMAKE_INSTALL_RPATH='@loader_path' \ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_CURL=OFF \ -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_METAL=OFF \ -DGGML_RPC=ON \ @@ -161,8 +159,6 @@ jobs: -DGGML_NATIVE=OFF \ -DGGML_CPU_ALL_VARIANTS=ON \ -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) @@ -212,8 +208,6 @@ jobs: cmake -B build \ -DCMAKE_INSTALL_RPATH='$ORIGIN' \ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DGGML_BACKEND_DL=ON \ -DGGML_NATIVE=OFF \ -DGGML_CPU_ALL_VARIANTS=ON \ @@ -269,7 +263,6 @@ jobs: call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }} cmake -S . -B build -G "Ninja Multi-Config" ^ -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^ - -DLLAMA_CURL=OFF ^ -DLLAMA_BUILD_BORINGSSL=ON ^ -DGGML_NATIVE=OFF ^ -DGGML_BACKEND_DL=ON ^ @@ -358,7 +351,7 @@ jobs: - name: Build id: cmake_build run: | - cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF + cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --config Release --target ${{ matrix.target }} - name: Pack artifacts @@ -412,7 +405,7 @@ jobs: -DGGML_NATIVE=OFF ^ -DGGML_CPU=OFF ^ -DGGML_CUDA=ON ^ - -DLLAMA_CURL=OFF ^ + -DLLAMA_BUILD_BORINGSSL=ON ^ -DGGML_CUDA_CUB_3DOT2=ON set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda @@ -481,7 +474,7 @@ jobs: -DCMAKE_BUILD_TYPE=Release ^ -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^ -DGGML_CPU=OFF -DGGML_SYCL=ON ^ - -DLLAMA_CURL=OFF + -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --target ggml-sycl -j - name: Build the release package @@ -608,7 +601,7 @@ jobs: -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" ` -DGGML_HIP_ROCWMMA_FATTN=ON ` -DGGML_HIP=ON ` - -DLLAMA_CURL=OFF + -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" md "build\bin\hipblaslt\library" @@ -649,7 +642,7 @@ jobs: cmake -B build -G Xcode \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -734,8 +727,6 @@ jobs: export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} cmake -S . 
-B build \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DGGML_CANN=on \ -DSOC_TYPE=${SOC_TYPE} cmake --build build -j $(nproc) diff --git a/.github/workflows/server-webui.yml b/.github/workflows/server-webui.yml index 544c4ad408..318003c5cc 100644 --- a/.github/workflows/server-webui.yml +++ b/.github/workflows/server-webui.yml @@ -168,8 +168,6 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ @@ -182,8 +180,6 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; @@ -195,8 +191,6 @@ jobs: run: | cmake -B build \ -DGGML_NATIVE=OFF \ - -DLLAMA_CURL=OFF \ - -DLLAMA_OPENSSL=ON \ -DLLAMA_BUILD_SERVER=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 5694feb2c9..ab7c520e11 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -72,7 +72,7 @@ jobs: - name: Build id: cmake_build run: | - cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON + cmake -B build -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server - name: Python setup @@ -108,7 +108,7 @@ jobs: - name: Build id: cmake_build run: | - cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON + cmake -B build -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server - name: Python setup diff --git a/CMakeLists.txt b/CMakeLists.txt index 44c2166210..d24fa080ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,11 +111,16 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) # 3rd party libs -option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) -option(LLAMA_HTTPLIB "llama: if libcurl is disabled, use httplib to download model from an URL" ON) -option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF) +option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON) +option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON) option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) +# deprecated +option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) +if (LLAMA_CURL) + message(WARNING "LLAMA_CURL option is deprecated and will be ignored") +endif() + # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) @@ -212,11 +217,6 @@ add_subdirectory(src) # utils, programs, examples and tests # -if (NOT LLAMA_BUILD_COMMON) - message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL") - set(LLAMA_CURL OFF) -endif() - if (LLAMA_BUILD_COMMON) add_subdirectory(common) if (LLAMA_HTTPLIB) diff --git a/README.md b/README.md index 0d9d1ef6b4..42b1432a99 100644 --- a/README.md +++ b/README.md @@ -586,6 +586,5 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc - [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, 
used by multimodal subsystem - Public domain - [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License -- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html) - [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain - [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain diff --git a/build-xcframework.sh b/build-xcframework.sh index 81280f7497..0eec871139 100755 --- a/build-xcframework.sh +++ b/build-xcframework.sh @@ -414,7 +414,7 @@ cmake -B build-ios-sim -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -S . cmake --build build-ios-sim --config Release -- -quiet @@ -428,7 +428,7 @@ cmake -B build-ios-device -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -S . cmake --build build-ios-device --config Release -- -quiet @@ -439,7 +439,7 @@ cmake -B build-macos -G Xcode \ -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -S . cmake --build build-macos --config Release -- -quiet @@ -453,7 +453,7 @@ cmake -B build-visionos -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \ -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -DLLAMA_HTTPLIB=OFF \ -DLLAMA_BUILD_SERVER=OFF \ -S . @@ -469,7 +469,7 @@ cmake -B build-visionos-sim -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \ -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -DLLAMA_HTTPLIB=OFF \ -DLLAMA_BUILD_SERVER=OFF \ -S . @@ -487,7 +487,7 @@ cmake -B build-tvos-sim -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -S . cmake --build build-tvos-sim --config Release -- -quiet @@ -502,7 +502,7 @@ cmake -B build-tvos-device -G Xcode \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -S . cmake --build build-tvos-device --config Release -- -quiet diff --git a/ci/run.sh b/ci/run.sh index d4ce6c9196..6ca6ea5669 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -45,7 +45,7 @@ sd=`dirname $0` cd $sd/../ SRC=`pwd` -CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=OFF -DGGML_SCHED_NO_REALLOC=ON" +CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON" if [ ! 
-z ${GG_BUILD_METAL} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON" diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 55222bdf61..3451a311d0 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -95,17 +95,7 @@ endif() # TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...) set(LLAMA_COMMON_EXTRA_LIBS build_info) -if (LLAMA_CURL) - # Use curl to download model url - find_package(CURL) - if (NOT CURL_FOUND) - message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF") - endif() - target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL) - include_directories(${CURL_INCLUDE_DIRS}) - set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES}) -elseif (LLAMA_HTTPLIB) - # otherwise, use cpp-httplib +if (LLAMA_HTTPLIB) target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB) set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib) endif() diff --git a/common/arg.cpp b/common/arg.cpp index 4b96c312f3..ceb4d74111 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -341,7 +341,7 @@ static handle_model_result common_params_handle_model( if (model.path.empty()) { auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline); if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) { - exit(1); // built without CURL, error message already printed + exit(1); // error message already printed } model.name = model.hf_repo; // repo name with tag model.hf_repo = auto_detected.repo; // repo name without tag diff --git a/common/download.cpp b/common/download.cpp index dc7d5c8478..a37780421a 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -19,10 +19,7 @@ #include #include -#if defined(LLAMA_USE_CURL) -#include -#include -#elif defined(LLAMA_USE_HTTPLIB) +#if defined(LLAMA_USE_HTTPLIB) #include "http.h" #endif @@ -171,336 +168,7 @@ std::pair common_download_split_repo_tag(const std::st return {hf_repo, tag}; } -#ifdef LLAMA_USE_CURL - -// -// CURL utils -// - -using curl_ptr = std::unique_ptr; - -// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one -struct curl_slist_ptr { - struct curl_slist * ptr = nullptr; - ~curl_slist_ptr() { - if (ptr) { - curl_slist_free_all(ptr); - } - } -}; - -static CURLcode common_curl_perf(CURL * curl) { - CURLcode res = curl_easy_perform(curl); - if (res != CURLE_OK) { - LOG_ERR("%s: curl_easy_perform() failed\n", __func__); - } - - return res; -} - -// Send a HEAD request to retrieve the etag and last-modified headers -struct common_load_model_from_url_headers { - std::string etag; - std::string last_modified; - std::string accept_ranges; -}; - -struct FILE_deleter { - void operator()(FILE * f) const { fclose(f); } -}; - -static size_t common_header_callback(char * buffer, size_t, size_t n_items, void * userdata) { - common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; - static std::regex header_regex("([^:]+): (.*)\r\n"); - static std::regex etag_regex("ETag", std::regex_constants::icase); - static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); - static std::regex accept_ranges_regex("Accept-Ranges", std::regex_constants::icase); - std::string header(buffer, n_items); - std::smatch match; - if (std::regex_match(header, match, header_regex)) { - const std::string & key = match[1]; - const std::string & value = match[2]; - if (std::regex_match(key, match, etag_regex)) { - headers->etag = value; - } else if (std::regex_match(key, 
match, last_modified_regex)) { - headers->last_modified = value; - } else if (std::regex_match(key, match, accept_ranges_regex)) { - headers->accept_ranges = value; - } - } - - return n_items; -} - -static size_t common_write_callback(void * data, size_t size, size_t nmemb, void * fd) { - return std::fwrite(data, size, nmemb, static_cast(fd)); -} - -// helper function to hide password in URL -static std::string llama_download_hide_password_in_url(const std::string & url) { - // Use regex to match and replace the user[:password]@ pattern in URLs - // Pattern: scheme://[user[:password]@]host[...] - static const std::regex url_regex(R"(^(?:[A-Za-z][A-Za-z0-9+.-]://)(?:[^/@]+@)?.$)"); - std::smatch match; - - if (std::regex_match(url, match, url_regex)) { - // match[1] = scheme (e.g., "https://") - // match[2] = user[:password]@ part - // match[3] = rest of URL (host and path) - return match[1].str() + "********@" + match[3].str(); - } - - return url; // No credentials found or malformed URL -} - -static void common_curl_easy_setopt_head(CURL * curl, const std::string & url) { - // Set the URL, allow to follow http redirection - curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - -# if defined(_WIN32) - // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of - // operating system. Currently implemented under MS-Windows. - curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); -# endif - - curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, common_header_callback); -} - -static void common_curl_easy_setopt_get(CURL * curl) { - curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, common_write_callback); - - // display download progress - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); -} - -static bool common_pull_file(CURL * curl, const std::string & path_temporary) { - if (std::filesystem::exists(path_temporary)) { - const std::string partial_size = std::to_string(std::filesystem::file_size(path_temporary)); - LOG_INF("%s: server supports range requests, resuming download from byte %s\n", __func__, partial_size.c_str()); - const std::string range_str = partial_size + "-"; - curl_easy_setopt(curl, CURLOPT_RANGE, range_str.c_str()); - } - - // Always open file in append mode could be resuming - std::unique_ptr outfile(fopen(path_temporary.c_str(), "ab")); - if (!outfile) { - LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_temporary.c_str()); - return false; - } - - common_curl_easy_setopt_get(curl); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile.get()); - - return common_curl_perf(curl) == CURLE_OK; -} - -static bool common_download_head(CURL * curl, - curl_slist_ptr & http_headers, - const std::string & url, - const std::string & bearer_token) { - if (!curl) { - LOG_ERR("%s: error initializing libcurl\n", __func__); - return false; - } - - http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); - // Check if hf-token or bearer-token was specified - if (!bearer_token.empty()) { - std::string auth_header = "Authorization: Bearer " + bearer_token; - http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); - } - - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, http_headers.ptr); - common_curl_easy_setopt_head(curl, url); - return common_curl_perf(curl) == 
CURLE_OK; -} - -// download one single file from remote URL to local path -// returns status code or -1 on error -static int common_download_file_single_online(const std::string & url, - const std::string & path, - const std::string & bearer_token, - const common_header_list & custom_headers) { - static const int max_attempts = 3; - static const int retry_delay_seconds = 2; - - for (int i = 0; i < max_attempts; ++i) { - std::string etag; - - // Check if the file already exists locally - const auto file_exists = std::filesystem::exists(path); - if (file_exists) { - etag = read_etag(path); - } else { - LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); - } - - bool head_request_ok = false; - bool should_download = !file_exists; // by default, we should download if the file does not exist - - // Initialize libcurl - curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); - common_load_model_from_url_headers headers; - curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - curl_slist_ptr http_headers; - - for (const auto & h : custom_headers) { - std::string s = h.first + ": " + h.second; - http_headers.ptr = curl_slist_append(http_headers.ptr, s.c_str()); - } - const bool was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token); - if (!was_perform_successful) { - head_request_ok = false; - } - - long http_code = 0; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); - if (http_code == 200) { - head_request_ok = true; - } else { - LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); - head_request_ok = false; - } - - // if head_request_ok is false, we don't have the etag or last-modified headers - // we leave should_download as-is, which is true if the file does not exist - bool should_download_from_scratch = false; - if (head_request_ok) { - // check if ETag or Last-Modified headers are different - // if it is, we need to download the file again - if (!etag.empty() && etag != headers.etag) { - LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), - headers.etag.c_str()); - should_download = true; - should_download_from_scratch = true; - } - } - - const bool accept_ranges_supported = !headers.accept_ranges.empty() && headers.accept_ranges != "none"; - if (should_download) { - if (file_exists && - !accept_ranges_supported) { // Resumable downloads not supported, delete and start again. 
- LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); - if (remove(path.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); - return -1; - } - } - - const std::string path_temporary = path + ".downloadInProgress"; - if (should_download_from_scratch) { - if (std::filesystem::exists(path_temporary)) { - if (remove(path_temporary.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str()); - return -1; - } - } - - if (std::filesystem::exists(path)) { - if (remove(path.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); - return -1; - } - } - } - if (head_request_ok) { - write_etag(path, headers.etag); - } - - // start the download - LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", - __func__, llama_download_hide_password_in_url(url).c_str(), path_temporary.c_str(), - headers.etag.c_str(), headers.last_modified.c_str()); - const bool was_pull_successful = common_pull_file(curl.get(), path_temporary); - if (!was_pull_successful) { - if (i + 1 < max_attempts) { - const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000; - LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay); - std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); - } else { - LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); - } - - continue; - } - - long http_code = 0; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); - - int status = static_cast(http_code); - if (!is_http_status_ok(http_code)) { - LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code); - return status; // TODO: maybe only return on certain codes - } - - if (rename(path_temporary.c_str(), path.c_str()) != 0) { - LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); - return -1; - } - - return static_cast(http_code); - } else { - LOG_INF("%s: using cached file: %s\n", __func__, path.c_str()); - - return 304; // Not Modified - fake cached response - } - } - - return -1; // max attempts reached -} - -std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params) { - curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); - curl_slist_ptr http_headers; - std::vector res_buffer; - - curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); - curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 0L); - typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); - auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { - auto data_vec = static_cast *>(data); - data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb); - return size * nmemb; - }; - curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer); -#if defined(_WIN32) - curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); -#endif - if (params.timeout > 0) { - curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout); - } - if (params.max_size > 0) { - curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size); - } - http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: 
llama-cpp"); - - for (const auto & header : params.headers) { - std::string header_ = header.first + ": " + header.second; - http_headers.ptr = curl_slist_append(http_headers.ptr, header_.c_str()); - } - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); - - CURLcode res = curl_easy_perform(curl.get()); - - if (res != CURLE_OK) { - std::string error_msg = curl_easy_strerror(res); - throw std::runtime_error("error: cannot make GET request: " + error_msg); - } - - long res_code; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); - - return { res_code, std::move(res_buffer) }; -} - -#elif defined(LLAMA_USE_HTTPLIB) +#if defined(LLAMA_USE_HTTPLIB) class ProgressBar { static inline std::mutex mutex; @@ -797,10 +465,6 @@ std::pair> common_remote_get_content(const std::string return { res->status, std::move(buf) }; } -#endif // LLAMA_USE_CURL - -#if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB) - int common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, @@ -1151,7 +815,7 @@ int common_download_file_single(const std::string &, throw std::runtime_error("download functionality is not enabled in this build"); } -#endif // LLAMA_USE_CURL || LLAMA_USE_HTTPLIB +#endif // defined(LLAMA_USE_HTTPLIB) std::vector common_list_cached_models() { std::vector models; diff --git a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/hexagon/CMakeUserPresets.json index 98d7221b3a..a1d99018b1 100644 --- a/docs/backend/hexagon/CMakeUserPresets.json +++ b/docs/backend/hexagon/CMakeUserPresets.json @@ -23,7 +23,7 @@ "GGML_OPENCL": "ON", "GGML_HEXAGON": "ON", "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128", - "LLAMA_CURL": "OFF" + "LLAMA_OPENSSL": "OFF" } }, @@ -38,7 +38,7 @@ "GGML_OPENCL": "ON", "GGML_HEXAGON": "ON", "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128", - "LLAMA_CURL": "OFF" + "LLAMA_OPENSSL": "OFF" } }, diff --git a/docs/build-riscv64-spacemit.md b/docs/build-riscv64-spacemit.md index 79bd4de63a..cd6bbe199d 100644 --- a/docs/build-riscv64-spacemit.md +++ b/docs/build-riscv64-spacemit.md @@ -15,7 +15,7 @@ Below is the build script: it requires utilizing RISC-V vector instructions for cmake -B build \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_CPU_RISCV64_SPACEMIT=ON \ - -DLLAMA_CURL=OFF \ + -DLLAMA_OPENSSL=OFF \ -DGGML_RVV=ON \ -DGGML_RV_ZFH=ON \ -DGGML_RV_ZICBOP=ON \ diff --git a/docs/build.md b/docs/build.md index 63fd8b4fcd..fce9361b2d 100644 --- a/docs/build.md +++ b/docs/build.md @@ -65,10 +65,10 @@ cmake --build build --config Release cmake --preset x64-windows-llvm-release cmake --build build-x64-windows-llvm-release ``` -- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl. - - **Debian / Ubuntu:** `sudo apt-get install libcurl4-openssl-dev` # (or `libcurl4-gnutls-dev` if you prefer GnuTLS) - - **Fedora / RHEL / Rocky / Alma:** `sudo dnf install libcurl-devel` - - **Arch / Manjaro:** `sudo pacman -S curl` # includes libcurl headers +- If you want HTTPS/TLS features, you may install OpenSSL development libraries. If not installed, the project will build and run without SSL support. 
+ - **Debian / Ubuntu:** `sudo apt-get install libssl-dev` + - **Fedora / RHEL / Rocky / Alma:** `sudo dnf install openssl-devel` + - **Arch / Manjaro:** `sudo pacman -S openssl` ## BLAS Build diff --git a/examples/llama.android/lib/build.gradle.kts b/examples/llama.android/lib/build.gradle.kts index 5255f0c17b..9b290d6d4a 100644 --- a/examples/llama.android/lib/build.gradle.kts +++ b/examples/llama.android/lib/build.gradle.kts @@ -26,7 +26,7 @@ android { arguments += "-DBUILD_SHARED_LIBS=ON" arguments += "-DLLAMA_BUILD_COMMON=ON" - arguments += "-DLLAMA_CURL=OFF" + arguments += "-DLLAMA_OPENSSL=OFF" arguments += "-DGGML_NATIVE=OFF" arguments += "-DGGML_BACKEND_DL=ON" diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh index 1993520ebd..635e74fe64 100755 --- a/examples/sycl/build.sh +++ b/examples/sycl/build.sh @@ -8,10 +8,10 @@ cd build source /opt/intel/oneapi/setvars.sh #for FP16 -#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference +#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference #for FP32 -cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF +cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF #build example/main #cmake --build . --config Release --target main diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat index 862998e737..fc8b33bbc2 100644 --- a/examples/sycl/win-build-sycl.bat +++ b/examples/sycl/win-build-sycl.bat @@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR :: for FP16 :: faster for long-prompt inference -:: cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON +:: cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON :: for FP32 -cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release +cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release if %errorlevel% neq 0 goto ERROR :: build all binary diff --git a/licenses/LICENSE-curl b/licenses/LICENSE-curl deleted file mode 100644 index 2f71d999a9..0000000000 --- a/licenses/LICENSE-curl +++ /dev/null @@ -1,22 +0,0 @@ -COPYRIGHT AND PERMISSION NOTICE - -Copyright (c) 1996 - 2026, Daniel Stenberg, , and many -contributors, see the THANKS file. - -All rights reserved. - -Permission to use, copy, modify, and distribute this software for any purpose -with or without fee is hereby granted, provided that the above copyright -notice and this permission notice appear in all copies. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN -NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR -OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE -OR OTHER DEALINGS IN THE SOFTWARE. 
- -Except as contained in this notice, the name of a copyright holder shall not -be used in advertising or otherwise to promote the sale, use or other dealings -in this Software without prior written authorization of the copyright holder. diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh index 7e9e8421b0..ead7ea15d1 100755 --- a/scripts/debug-test.sh +++ b/scripts/debug-test.sh @@ -109,8 +109,7 @@ rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir" # Step 2: Setup Build Environment and Compile Test Binaries ########################################################### -# Note: test-eval-callback requires -DLLAMA_CURL -cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build environment" +cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 || abort "Failed to build environment" pushd "$build_dir" make -j || abort "Failed to compile" popd > /dev/null || exit 1 diff --git a/scripts/serve-static.js b/scripts/serve-static.js index 8ddc04aad9..df4953e61e 100644 --- a/scripts/serve-static.js +++ b/scripts/serve-static.js @@ -4,7 +4,7 @@ const path = require('path'); // This file is used for testing wasm build from emscripten // Example build command: -// emcmake cmake -B build-wasm -DGGML_WEBGPU=ON -DLLAMA_CURL=OFF +// emcmake cmake -B build-wasm -DGGML_WEBGPU=ON -DLLAMA_OPENSSL=OFF // cmake --build build-wasm --target test-backend-ops -j const PORT = 8080; diff --git a/scripts/tool_bench.py b/scripts/tool_bench.py index e1512a49fd..d9f5583d4a 100755 --- a/scripts/tool_bench.py +++ b/scripts/tool_bench.py @@ -7,7 +7,7 @@ Simple usage example: - cmake -B build -DLLAMA_CURL=1 && cmake --build build --config Release -j -t llama-server + cmake -B build && cmake --build build --config Release -j -t llama-server export LLAMA_SERVER_BIN_PATH=$PWD/build/bin/llama-server export LLAMA_CACHE=${LLAMA_CACHE:-$HOME/Library/Caches/llama.cpp} diff --git a/tools/tts/README.md b/tools/tts/README.md index 557014aebb..48302c070b 100644 --- a/tools/tts/README.md +++ b/tools/tts/README.md @@ -4,7 +4,7 @@ This example demonstrates the Text To Speech feature. It uses a [outeai](https://www.outeai.com/). 
## Quickstart -If you have built llama.cpp with `-DLLAMA_CURL=ON` you can simply run the +If you have built llama.cpp with SSL support you can simply run the following command and the required models will be downloaded automatically: ```console $ build/bin/llama-tts --tts-oute-default -p "Hello world" && aplay output.wav From 8fb717557638f819e668e87f6d7dc0f39eb09c68 Mon Sep 17 00:00:00 2001 From: Junwon Hwang Date: Thu, 15 Jan 2026 03:38:21 +0900 Subject: [PATCH 09/11] model : clean up and fix EXAONE-MoE configuration (#18840) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix mismatch of EXAONE-MoE configuration * ensure gating func is set, cleanup --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 6 +----- src/llama-model.cpp | 8 ++------ 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index be83e3108e..464ecbaab9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8772,11 +8772,7 @@ class ExaoneMoEModel(Exaone4Model): self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0)) self.gguf_writer.add_leading_dense_block_count(n_dense_layer) - # For here, we hard-code the number of NextN/MTP layers to 1 for K-EXAONE, - # so that we can convert MTP weights to GGUF format for speculative decoding. - # This is because HF config of K-EXAONE does not have `num_nextn_predict_layers` at now. - # Will be updated when HF config is updated. - self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 1)) + self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0)) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 75f9691807..eaedc66b63 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1942,16 +1942,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, true); + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert); - ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used); ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); - ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); - ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); From d98b548120eecf98f0f6eaa1ba7e29b3afda9f2e Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Wed, 14 Jan 2026 20:29:35 +0100 Subject: [PATCH 10/11] Restore clip's cb() to its rightful glory - extract common debugging elements in llama (#17914) 
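In short, this change moves the ad-hoc `ggml_debug` callback and tensor printer that eval-callback, the debug example and clip each carried into `common/debug.{h,cpp}`, so all of them (including mtmd's `MTMD_DEBUG_GRAPH` path) share a single implementation. A minimal sketch of how a caller wires it up, condensed from the eval-callback changes further down; the `<false>` template argument (meaning "do not abort on NaN") is an assumption, since the exact instantiations are not fully legible in this diff:

```cpp
// Sketch only: hooking the shared debug callback from common/debug.h into
// common_params, as the updated eval-callback example does.
#include "arg.h"
#include "common.h"
#include "debug.h"

int main(int argc, char ** argv) {
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    // default-constructed: print every tensor evaluated in the graph
    base_callback_data cb_data;
    params.cb_eval           = common_debug_cb_eval<false>; // assumed template argument
    params.cb_eval_user_data = &cb_data;

    // alternatively, filter by tensor-name regexes; this constructor sets
    // params.cb_eval / cb_eval_user_data itself:
    //   base_callback_data cb_data(params, params.tensor_filter);

    // ... load the model and decode as usual; the backend scheduler invokes
    // the callback for every node it evaluates.
    return 0;
}
```
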
* Extract common debugging functions; plug eval-callback and mtmd's MTMD_DEBUG_GRAPH with same functionality * Move to common * Remove unneeded header * Unlink from common * chore: update webui build output * Cleanup; properly pass params to mtmd without depending on common; factorize debug.cpp to use common debug code. * Revert change to webapp * Post-merge adjust * Apply suggestions from code review Co-authored-by: Xuan-Son Nguyen * Apply code review changes * Remove changes to server-context * Remove mtmd.h include * Remove utility functions from header * Apply suggestions from code review Co-authored-by: Xuan-Son Nguyen * Rename functions * Update tools/mtmd/clip.cpp Co-authored-by: Xuan-Son Nguyen * Update tools/mtmd/clip.cpp Co-authored-by: Xuan-Son Nguyen * Update tools/mtmd/clip.cpp Co-authored-by: Xuan-Son Nguyen --------- Co-authored-by: Xuan-Son Nguyen --- common/CMakeLists.txt | 2 + common/debug.cpp | 165 ++++++++++++++++++ common/debug.h | 43 +++++ docs/backend/hexagon/CMakeUserPresets.json | 2 +- examples/debug/debug.cpp | 192 +-------------------- examples/eval-callback/eval-callback.cpp | 161 +---------------- tools/mtmd/clip-graph.h | 4 - tools/mtmd/clip.cpp | 52 ++---- tools/mtmd/clip.h | 3 + tools/mtmd/mtmd-cli.cpp | 7 + tools/mtmd/mtmd.cpp | 4 + tools/mtmd/mtmd.h | 20 ++- 12 files changed, 259 insertions(+), 396 deletions(-) create mode 100644 common/debug.cpp create mode 100644 common/debug.h diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 3451a311d0..723973ed70 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -60,6 +60,8 @@ add_library(${TARGET} STATIC common.h console.cpp console.h + debug.cpp + debug.h download.cpp download.h http.h diff --git a/common/debug.cpp b/common/debug.cpp new file mode 100644 index 0000000000..fdaddb1443 --- /dev/null +++ b/common/debug.cpp @@ -0,0 +1,165 @@ +#include "debug.h" + +#include "log.h" + +#include +#include + +static std::string common_ggml_ne_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +static float common_ggml_get_float_value(const uint8_t * data, + ggml_type type, + const size_t * nb, + size_t i0, + size_t i1, + size_t i2, + size_t i3) { + size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; + float v; + if (type == GGML_TYPE_F16) { + v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]); + } else if (type == GGML_TYPE_F32) { + v = *(const float *) &data[i]; + } else if (type == GGML_TYPE_I64) { + v = (float) *(const int64_t *) &data[i]; + } else if (type == GGML_TYPE_I32) { + v = (float) *(const int32_t *) &data[i]; + } else if (type == GGML_TYPE_I16) { + v = (float) *(const int16_t *) &data[i]; + } else if (type == GGML_TYPE_I8) { + v = (float) *(const int8_t *) &data[i]; + } else if (type == GGML_TYPE_BF16) { + v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]); + } else { + GGML_ABORT("fatal error"); + } + return v; +} + +template +void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { + GGML_ASSERT(n > 0); + float sum = 0; + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + for (int64_t i2 = 0; i2 < ne[2]; i2++) { + for (int64_t i1 = 0; i1 < ne[1]; i1++) { + for (int64_t i0 = 0; i0 < ne[0]; i0++) { + const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3); + sum += v; + } + } + } + } + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + LOG_ERR(" [\n"); + 
for (int64_t i2 = 0; i2 < ne[2]; i2++) { + if (i2 == n && ne[2] > 2 * n) { + LOG_ERR(" ..., \n"); + i2 = ne[2] - n; + } + LOG_ERR(" [\n"); + for (int64_t i1 = 0; i1 < ne[1]; i1++) { + if (i1 == n && ne[1] > 2 * n) { + LOG_ERR(" ..., \n"); + i1 = ne[1] - n; + } + LOG_ERR(" ["); + for (int64_t i0 = 0; i0 < ne[0]; i0++) { + if (i0 == n && ne[0] > 2 * n) { + LOG_ERR("..., "); + i0 = ne[0] - n; + } + const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3); + LOG_ERR("%12.4f", v); + if (i0 < ne[0] - 1) { + LOG_ERR(", "); + } + } + LOG_ERR("],\n"); + } + LOG_ERR(" ],\n"); + } + LOG_ERR(" ]\n"); + LOG_ERR(" sum = %f\n", sum); + } + + if constexpr (abort) { + if (std::isnan(sum)) { + LOG_ERR("encountered NaN - aborting\n"); + exit(0); + } + } +} + +/** + * GGML operations callback during the graph execution. + * + * @param t current tensor + * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor + * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. + * see ggml_backend_sched_eval_callback + * @param user_data user data to pass at each call back + * @return true to receive data or continue the graph, false otherwise + */ +template bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { + auto * cb_data = (base_callback_data *) user_data; + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + if (ask) { + return true; // Always retrieve data + } + + bool matches_filter = cb_data->tensor_filters.empty(); + + if (!matches_filter) { + for (const auto & filter : cb_data->tensor_filters) { + if (std::regex_search(t->name, filter)) { + matches_filter = true; + break; + } + } + } + + char src1_str[128] = { 0 }; + if (src1) { + snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str()); + } + + if (matches_filter) { + LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type), + ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "", + common_ggml_ne_string(t).c_str()); + } + + const bool is_host = ggml_backend_buffer_is_host(t->buffer); + + if (!is_host) { + auto n_bytes = ggml_nbytes(t); + cb_data->data.resize(n_bytes); + ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); + } + + if (!ggml_is_quantized(t->type) && matches_filter) { + uint8_t * data = is_host ? 
(uint8_t *) t->data : cb_data->data.data(); + common_debug_print_tensor(data, t->type, t->ne, t->nb, 3); + } + + return true; +} + +// Explicit template instantiations +template bool common_debug_cb_eval(ggml_tensor *, bool, void *); +template bool common_debug_cb_eval(ggml_tensor *, bool, void *); +template void common_debug_print_tensor(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t); +template void common_debug_print_tensor(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t); diff --git a/common/debug.h b/common/debug.h new file mode 100644 index 0000000000..0c55963258 --- /dev/null +++ b/common/debug.h @@ -0,0 +1,43 @@ +#pragma once +#include "common.h" +#include +#include +#include + +// common debug functions and structs + +// Print a tensor's detailed data +// data - the tensor's data in byte format +// type - the tensor's quantization type +// ne - the tensor dimensions array +// nb - the tensor strides array +// n - the number of rows/columns to fully print +template void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n); + +// Intended to use as callback for ggml_backend_sched_eval_callback +// prints tensors that are processed in the computation graph +// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with +// non-empty filter_patterns. See examples/debug.ccp for possible usage patterns +// The template parameter determins whether an error should be thrown whenever a NaN is encountered +// in a tensor (useful for stopping debug sessions on first erroneous tensor) +// The callback data will be passed as the third parameter (user_data) +template bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data); +struct base_callback_data { + std::vector data; + std::vector tensor_filters; + + base_callback_data() = default; + + base_callback_data(common_params & params, const std::vector & filter_patterns) { + for (const auto & pattern : filter_patterns) { + try { + std::string anchored_pattern = "^" + pattern; + tensor_filters.emplace_back(anchored_pattern, std::regex::optimize); + } catch (const std::regex_error & e) { + throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what()); + } + } + params.cb_eval = common_debug_cb_eval; + params.cb_eval_user_data = this; + } +}; diff --git a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/hexagon/CMakeUserPresets.json index a1d99018b1..1f2676c0bc 100644 --- a/docs/backend/hexagon/CMakeUserPresets.json +++ b/docs/backend/hexagon/CMakeUserPresets.json @@ -1,4 +1,4 @@ -{ +{ "version": 4, "configurePresets": [ { diff --git a/examples/debug/debug.cpp b/examples/debug/debug.cpp index 63be40c842..88947acbd3 100644 --- a/examples/debug/debug.cpp +++ b/examples/debug/debug.cpp @@ -1,11 +1,9 @@ +#include "debug.h" #include "arg.h" #include "common.h" #include "log.h" #include "llama.h" -#include "ggml.h" -#include -#include #include #include #include @@ -13,7 +11,7 @@ #include #include -static void print_usage(int, char ** argv) { +static void print_usage(int /*argc*/, char ** argv) { const std::string usage_template = R"( example usage: @@ -35,28 +33,6 @@ static void print_usage(int, char ** argv) { LOG("%s\n", usage.c_str()); } -static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data); - -struct callback_data { - std::vector data; - std::vector tensor_filters; - - callback_data() = default; - - callback_data(common_params & params, const std::vector & 
filter_patterns) { - for (const auto & pattern : filter_patterns) { - try { - std::string anchored_pattern = "^" + pattern; - tensor_filters.emplace_back(anchored_pattern, std::regex::optimize); - } catch (const std::regex_error & e) { - throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what()); - } - } - params.cb_eval = ggml_debug; - params.cb_eval_user_data = this; - } -}; - static bool has_pooling(llama_context * ctx) { switch (llama_pooling_type(ctx)) { case LLAMA_POOLING_TYPE_NONE: @@ -120,168 +96,6 @@ struct output_data { } }; -static std::string ggml_ne_string(const ggml_tensor * t) { - std::string str; - for (int i = 0; i < GGML_MAX_DIMS; ++i) { - str += std::to_string(t->ne[i]); - if (i + 1 < GGML_MAX_DIMS) { - str += ", "; - } - } - return str; -} - -static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { - union { - float f; - uint32_t i; - } u; - u.i = (uint32_t)h.bits << 16; - return u.f; -} - -static float ggml_get_float_value(const uint8_t * data, ggml_type type, - const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) { - size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; - switch (type) { - case GGML_TYPE_F16: - return ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]); - case GGML_TYPE_F32: - return *(const float *) &data[i]; - case GGML_TYPE_I64: - return (float) *(const int64_t *) &data[i]; - case GGML_TYPE_I32: - return (float) *(const int32_t *) &data[i]; - case GGML_TYPE_I16: - return (float) *(const int16_t *) &data[i]; - case GGML_TYPE_I8: - return (float) *(const int8_t *) &data[i]; - case GGML_TYPE_BF16: - return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]); - default: - GGML_ABORT("fatal error"); - } -} - -static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { - GGML_ASSERT(n > 0); - float sum = 0; - float sum_sq = 0.0; - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3); - sum += v; - sum_sq += v * v; - } - } - } - } - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - LOG_DBG(" [\n"); - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - if (i2 == n && ne[2] > 2*n) { - LOG_DBG(" ..., \n"); - i2 = ne[2] - n; - } - LOG_DBG(" [\n"); - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - if (i1 == n && ne[1] > 2*n) { - LOG_DBG(" ..., \n"); - i1 = ne[1] - n; - } - LOG_DBG(" ["); - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - if (i0 == n && ne[0] > 2*n) { - LOG_DBG("..., "); - i0 = ne[0] - n; - } - const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3); - LOG_DBG("%12.4f", v); - if (i0 < ne[0] - 1) { - LOG_DBG(", "); - } - } - LOG_DBG("],\n"); - } - LOG_DBG(" ],\n"); - } - LOG_DBG(" ]\n"); - LOG_DBG(" sum = %f\n", sum); - LOG_DBG(" sum_sq = %f\n", sum_sq); - } - - if (std::isnan(sum)) { - LOG_ERR("encountered NaN - aborting\n"); - exit(0); - } -} - -/** - * GGML operations callback during the graph execution. - * - * @param t current tensor - * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor - * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. 
- * see ggml_backend_sched_eval_callback - * @param user_data user data to pass at each call back - * @return true to receive data or continue the graph, false otherwise - */ -static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { - auto * cb_data = (callback_data *) user_data; - - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - - if (ask) { - return true; // Always retrieve data - } - - bool matches_filter = cb_data->tensor_filters.empty(); - - if (!matches_filter) { - for (const auto & filter : cb_data->tensor_filters) { - if (std::regex_search(t->name, filter)) { - matches_filter = true; - break; - } - } - } - - char src1_str[128] = {0}; - if (src1) { - snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); - } - - if (matches_filter) { - LOG_DBG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, - ggml_type_name(t->type), - ggml_op_desc(t), - src0->name, - ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); - } - - const bool is_host = ggml_backend_buffer_is_host(t->buffer); - - if (!is_host) { - auto n_bytes = ggml_nbytes(t); - cb_data->data.resize(n_bytes); - ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); - } - - if (!ggml_is_quantized(t->type) && matches_filter) { - uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); - ggml_print_tensor(data, t->type, t->ne, t->nb, 3); - } - - return true; -} - - static void save_output_data(const output_data & output, const std::string & model_name, const std::string & output_dir) { std::filesystem::create_directory(output_dir); auto base_path = std::filesystem::path{output_dir} / ("llamacpp-" + model_name + output.type_suffix); @@ -408,7 +222,7 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - callback_data cb_data(params, params.tensor_filter); + base_callback_data cb_data(params, params.tensor_filter); auto llama_init = common_init_from_params(params); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 408338f1af..bd58734979 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,165 +1,12 @@ #include "arg.h" #include "common.h" +#include "debug.h" #include "log.h" #include "llama.h" -#include "ggml.h" - -#include -#include +#include "llama-cpp.h" #include #include -/** - * This the arbitrary data which will be passed to each callback. - * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor. 
- */ -struct callback_data { - std::vector data; -}; - -static std::string ggml_ne_string(const ggml_tensor * t) { - std::string str; - for (int i = 0; i < GGML_MAX_DIMS; ++i) { - str += std::to_string(t->ne[i]); - if (i + 1 < GGML_MAX_DIMS) { - str += ", "; - } - } - return str; -} - -static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { - union { - float f; - uint32_t i; - } u; - u.i = (uint32_t)h.bits << 16; - return u.f; -} - -static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) { - size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; - float v; - if (type == GGML_TYPE_F16) { - v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]); - } else if (type == GGML_TYPE_F32) { - v = *(const float *) &data[i]; - } else if (type == GGML_TYPE_I64) { - v = (float) *(const int64_t *) &data[i]; - } else if (type == GGML_TYPE_I32) { - v = (float) *(const int32_t *) &data[i]; - } else if (type == GGML_TYPE_I16) { - v = (float) *(const int16_t *) &data[i]; - } else if (type == GGML_TYPE_I8) { - v = (float) *(const int8_t *) &data[i]; - } else if (type == GGML_TYPE_BF16) { - v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]); - } else { - GGML_ABORT("fatal error"); - } - return v; -} - -static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { - GGML_ASSERT(n > 0); - float sum = 0; - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3); - sum += v; - } - } - } - } - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - LOG(" [\n"); - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - if (i2 == n && ne[2] > 2*n) { - LOG(" ..., \n"); - i2 = ne[2] - n; - } - LOG(" [\n"); - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - if (i1 == n && ne[1] > 2*n) { - LOG(" ..., \n"); - i1 = ne[1] - n; - } - LOG(" ["); - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - if (i0 == n && ne[0] > 2*n) { - LOG("..., "); - i0 = ne[0] - n; - } - const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3); - LOG("%12.4f", v); - if (i0 < ne[0] - 1) LOG(", "); - } - LOG("],\n"); - } - LOG(" ],\n"); - } - LOG(" ]\n"); - LOG(" sum = %f\n", sum); - } - - // TODO: make this abort configurable/optional? - if (std::isnan(sum)) { - LOG_ERR("encountered NaN - aborting\n"); - exit(0); - } -} - -/** - * GGML operations callback during the graph execution. - * - * @param t current tensor - * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor - * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. 
- * see ggml_backend_sched_eval_callback - * @param user_data user data to pass at each call back - * @return true to receive data or continue the graph, false otherwise - */ -static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { - auto * cb_data = (callback_data *) user_data; - - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - - if (ask) { - return true; // Always retrieve data - } - - char src1_str[128] = {0}; - if (src1) { - snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); - } - - LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); - - - // copy the data from the GPU memory if needed - const bool is_host = ggml_backend_buffer_is_host(t->buffer); - - if (!is_host) { - auto n_bytes = ggml_nbytes(t); - cb_data->data.resize(n_bytes); - ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); - } - - if (!ggml_is_quantized(t->type)) { - uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); - ggml_print_tensor(data, t->type, t->ne, t->nb, 3); - } - - return true; -} - static bool run(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -182,7 +29,7 @@ static bool run(llama_context * ctx, const common_params & params) { } int main(int argc, char ** argv) { - callback_data cb_data; + base_callback_data cb_data; common_params params; @@ -197,7 +44,7 @@ int main(int argc, char ** argv) { // pass the callback to the backend scheduler // it will be executed for each node during the graph computation - params.cb_eval = ggml_debug; + params.cb_eval = common_debug_cb_eval; params.cb_eval_user_data = &cb_data; params.warmup = false; diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 2b1915779f..4c7f7504cf 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -32,10 +32,6 @@ struct clip_graph { const float kq_scale; const clip_flash_attn_type flash_attn_type; - // for debugging - const bool debug_graph; - std::vector & debug_print_tensors; - ggml_context_ptr ctx0_ptr; ggml_context * ctx0; ggml_cgraph * gf; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index fd2fb07fd2..9b076e0c56 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -152,18 +152,14 @@ struct clip_ctx { ggml_backend_t backend_cpu = nullptr; ggml_backend_buffer_ptr buf; + int max_nodes = 8192; ggml_backend_sched_ptr sched; clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO; bool is_allocated = false; - // for debugging - bool debug_graph = false; - std::vector debug_print_tensors; - clip_ctx(clip_context_params & ctx_params) { flash_attn_type = ctx_params.flash_attn_type; - debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr; backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); if (!backend_cpu) { throw std::runtime_error("failed to initialize CPU backend"); @@ -204,6 +200,10 @@ struct clip_ctx { sched.reset( ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) ); + + if (ctx_params.cb_eval != nullptr) { + ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data); + } } ~clip_ctx() { @@ -239,9 +239,7 @@ clip_graph::clip_graph(clip_ctx * ctx, const 
clip_image_f32 & img) : n_mmproj_embd(clip_n_mmproj_embd(ctx)), eps(hparams.eps), kq_scale(1.0f / sqrtf((float)d_head)), - flash_attn_type(ctx->flash_attn_type), - debug_graph(ctx->debug_graph), - debug_print_tensors(ctx->debug_print_tensors) { + flash_attn_type(ctx->flash_attn_type) { struct ggml_init_params params = { /*.mem_size =*/ ctx->buf_compute_meta.size(), /*.mem_buffer =*/ ctx->buf_compute_meta.data(), @@ -252,14 +250,11 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); } -void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { - if (debug_graph) { - ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); - std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name; - ggml_set_name(cur, cur_name.c_str()); - ggml_set_output(cur); - ggml_build_forward_expand(gf, cur); - debug_print_tensors.push_back(cur); +void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); } } @@ -1519,8 +1514,8 @@ struct clip_model_loader { model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); - model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight")); - model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight")); + model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI)); + model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI)); } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: @@ -1761,8 +1756,8 @@ struct clip_model_loader { model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias")); - model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight")); - model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight")); + model.mm_boi = get_tensor(string_format(TN_TOK_BOI)); + model.mm_eoi = get_tensor(string_format(TN_TOK_EOI)); } break; case PROJECTOR_TYPE_LLAMA4: { @@ -3339,7 +3334,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } // build the inference graph - ctx->debug_print_tensors.clear(); ggml_backend_sched_reset(ctx->sched.get()); ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); @@ -3709,18 +3703,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima return false; } - // print debug nodes - if (ctx->debug_graph) { - LOG_INF("\n\n---\n\n"); - LOG_INF("\n\nDebug graph:\n\n"); - for (ggml_tensor * t : ctx->debug_print_tensors) { - std::vector data(ggml_nbytes(t)); - ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); - print_tensor_shape(t); - print_tensor_data(t, data.data(), 3); - } - } - // the last node is the embedding tensor ggml_tensor * embeddings = ggml_graph_node(gf, -1); @@ -3872,7 +3854,6 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { // // API for debugging // - void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { clip_image_f32 img; img.nx = w; @@ -3881,9 +3862,6 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { for (int i = 0; i < h * w * 3; i++) 
{ img.buf[i] = static_cast(fill_value); } - bool cur_debug_graph = ctx->debug_graph; - ctx->debug_graph = true; clip_image_encode(ctx, 1, &img, nullptr); - ctx->debug_graph = cur_debug_graph; GGML_ASSERT(img.buf.empty() && "expected, always stop here"); } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 27ee020182..71b58484d6 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "mtmd.h" #include #include @@ -37,6 +38,8 @@ struct clip_context_params { int image_min_tokens; int image_max_tokens; bool warmup; + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; }; struct clip_init_result { diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 1ba02a5233..054c7faa6a 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -1,4 +1,5 @@ #include "arg.h" +#include "debug.h" #include "log.h" #include "common.h" #include "sampling.h" @@ -88,6 +89,8 @@ struct mtmd_cli_context { int n_threads = 1; llama_pos n_past = 0; + base_callback_data cb_data; + mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) { model = llama_init->model(); lctx = llama_init->context(); @@ -139,6 +142,10 @@ struct mtmd_cli_context { mparams.warmup = params.warmup; mparams.image_min_tokens = params.image_min_tokens; mparams.image_max_tokens = params.image_max_tokens; + if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) { + mparams.cb_eval_user_data = &cb_data; + mparams.cb_eval = common_debug_cb_eval; + } ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams)); if (!ctx_vision.get()) { LOG_ERR("Failed to load vision model from %s\n", clip_path); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index f25706987e..32a24bfcea 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -111,6 +111,8 @@ mtmd_context_params mtmd_context_params_default() { /* warmup */ true, /* image_min_tokens */ -1, /* image_max_tokens */ -1, + /* cb_eval */ nullptr, + /* cb_eval_user_data */ nullptr, }; return params; } @@ -176,6 +178,8 @@ struct mtmd_context { /* image_min_tokens */ ctx_params.image_min_tokens, /* image_max_tokens */ ctx_params.image_max_tokens, /* warmup */ ctx_params.warmup, + /* cb_eval */ ctx_params.cb_eval, + /* cb_eval_user_data */ ctx_params.cb_eval_user_data, }; auto res = clip_init(mmproj_fname, ctx_clip_params); diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 44d05ceaee..a12c28ef22 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -95,6 +95,10 @@ struct mtmd_context_params { // limit number of image tokens, only for vision models with dynamic resolution int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) int image_max_tokens; // maximum number of tokens for image input (default: read from metadata) + + // callback function passed over to mtmd proper + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; }; MTMD_API const char * mtmd_default_marker(void); @@ -273,12 +277,12 @@ struct bitmap { ptr.reset(mtmd_bitmap_init(nx, ny, data)); } ~bitmap() = default; - uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); } - uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); } - const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); } - size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); } - std::string id() { return mtmd_bitmap_get_id(ptr.get()); } - void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); } + uint32_t nx() const { return 
mtmd_bitmap_get_nx(ptr.get()); } + uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); } + const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); } + size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); } + std::string id() const { return mtmd_bitmap_get_id(ptr.get()); } + void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); } }; struct bitmaps { @@ -302,8 +306,8 @@ struct input_chunks { input_chunks() = default; input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {} ~input_chunks() = default; - size_t size() { return mtmd_input_chunks_size(ptr.get()); } - const mtmd_input_chunk * operator[](size_t idx) { + size_t size() const { return mtmd_input_chunks_size(ptr.get()); } + const mtmd_input_chunk * operator[](size_t idx) const { return mtmd_input_chunks_get(ptr.get(), idx); } }; From 36f0132464096e49ed344cdeeee65e39e2b43b14 Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Thu, 15 Jan 2026 03:44:54 +0100 Subject: [PATCH 11/11] CUDA: Factor out and re-use `block_reduce` function (#18785) * CUDA: Refactor and expose two_stage_warp_reduce_* function * Use `two_stage_warp_reduce` also in softmax kernel, move smem out of it Moving smem out of `__device__` function to `__global__` function allows for explicit smem reuse, as either compiler or cuda rt seem to not free it afterwards (`cudaFuncSetAttribute` fails when not accounting for it once for each call to two_stage_warp_reduce) * Update ggml/src/ggml-cuda/common.cuh Co-authored-by: Aman Gupta * Use two_stage_warp_reduce in group_norm_f32 * Use two_stage_warp_reduce in rms_norm_f32 * Fix smem calculation which expects bytes * Make `two_stage_warp_reduce` accept all values warp_reduce accepts Also integrate it into norm_f32 function * Use two_stage_warp_reduce in l2_norm_f32 * Use type traits for block reduction for better legibility Also adresss other requests by @am17an such as variable renaming * Make norm tests cover all cuda paths * Mark columns % WARP_SIZE !=0 as supported for RMS_NORM_BACK Unit-tests passed locally, let's see if they pass in the CI as well * Use `enum class` for `block_reduce_method` This is more type-safe than plain enum * Rename variables as suggested in code review by @am17an * Rename two_stage_warp_reduce -> block_reduce * Fix trailing whitespace in common.cuh * Make condition of static_assert type-dependent This delays evaluation until the template is actually instantiated. 
Otherwise, some compilers may evaluate the assert when parsing the template, resulting in build errors as observed here: https://github.com/ggml-org/llama.cpp/actions/runs/20960323123/job/60235530068?pr=18785 * Inline definitions --------- Co-authored-by: Aman Gupta --- ggml/src/ggml-cuda/common.cuh | 80 +++++++++++++++++++++++++ ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- ggml/src/ggml-cuda/norm.cu | 94 ++++++------------------------ ggml/src/ggml-cuda/reduce_rows.cuh | 18 +----- ggml/src/ggml-cuda/softmax.cu | 89 +++------------------------- tests/test-backend-ops.cpp | 33 ++++++----- 6 files changed, 125 insertions(+), 191 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 90794ff264..eaaf87612d 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -530,6 +530,86 @@ static __device__ __forceinline__ half2 warp_prefix_inclusive_sum(half2 a) { #endif // FP16_AVAILABLE } +enum class block_reduce_method { + MAX, + SUM, +}; + +template +struct block_reduce_policy; + +template +inline constexpr bool is_any = (std::is_same_v || ...); + +template +inline constexpr bool ggml_cuda_dependent_false_v = false; + +template struct block_reduce_policy { + static __device__ T reduce(T val) { + if constexpr(is_any) { + return warp_reduce_sum(val); + } else { + static_assert(ggml_cuda_dependent_false_v, "Unsupported type for block reduce sum"); + } + } + + static __device__ T sentinel() { + if constexpr (std::is_same_v) { + return 0.0f; + } else if constexpr (std::is_same_v) { + return make_float2(0.0f, 0.0f); + } else if constexpr (std::is_same_v) { + return make_half2(0.0f, 0.0f); + } else if constexpr (std::is_same_v) { + return 0; + } else { + static_assert(ggml_cuda_dependent_false_v, "Unsupported type for block reduce sum"); + } + } +}; + +template struct block_reduce_policy { + static __device__ T reduce(T val) { + if constexpr (is_any) { + return warp_reduce_max(val); + } else { + static_assert(ggml_cuda_dependent_false_v, "Unsupported type for block reduce max"); + } + } + + static __device__ T sentinel() { + if constexpr (std::is_same_v) { + return -INFINITY; + } else if constexpr (std::is_same_v) { + return make_half2(-INFINITY, -INFINITY); + } else { + static_assert(ggml_cuda_dependent_false_v, "Unsupported type for block reduce max"); + } + } +}; + +template +static __device__ T block_reduce(T val, T * shared_vals) { + val = block_reduce_policy::reduce(val); + const unsigned int block_size = block_size_template == 0 ? 
blockDim.x : block_size_template; + if (block_size > WARP_SIZE) { + assert((block_size <= 1024) && (block_size % WARP_SIZE) == 0); + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + shared_vals[warp_id] = val; + } + __syncthreads(); + val = block_reduce_policy::sentinel(); + if (lane_id < (static_cast(block_size) / WARP_SIZE)) { + val = shared_vals[lane_id]; + } + return block_reduce_policy::reduce(val); + } + + return val; +} + static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) { #ifdef FP16_AVAILABLE diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index c3ee2ea066..553623fbd4 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4551,7 +4551,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_L2_NORM: return true; case GGML_OP_RMS_NORM_BACK: - return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0; + return ggml_is_contiguous(op->src[0]); break; case GGML_OP_NONE: case GGML_OP_RESHAPE: diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index 4f153c5718..ef98f675aa 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -25,19 +25,8 @@ static __global__ void norm_f32( } // sum up partial sums - mean_var = warp_reduce_sum(mean_var); - if constexpr (block_size > WARP_SIZE) { - static_assert(block_size == 1024, "unexpected block_size"); - __shared__ float2 s_sum[32]; - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = mean_var; - } - __syncthreads(); - mean_var = s_sum[lane_id]; - mean_var = warp_reduce_sum(mean_var); - } + extern __shared__ float2 s_sum2[]; + mean_var = block_reduce(mean_var, s_sum2); const float mean = mean_var.x / ncols; const float var = mean_var.y / ncols - mean * mean; @@ -61,19 +50,8 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr tmp += x[j]; } - tmp = warp_reduce_sum(tmp); - if constexpr (block_size > WARP_SIZE) { - static_assert(block_size == 1024, "unexpected block_size"); - __shared__ float s_sum[32]; - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = tmp; - } - __syncthreads(); - tmp = s_sum[lane_id]; - tmp = warp_reduce_sum(tmp); - } + extern __shared__ float s_sum[]; + tmp = block_reduce(tmp, s_sum); const float mean = tmp / group_size; tmp = 0.0f; @@ -84,18 +62,7 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr tmp += xi * xi; } - tmp = warp_reduce_sum(tmp); - if (block_size > WARP_SIZE) { - __shared__ float s_sum[32]; - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = tmp; - } - __syncthreads(); - tmp = s_sum[lane_id]; - tmp = warp_reduce_sum(tmp); - } + tmp = block_reduce(tmp, s_sum); const float variance = tmp / group_size; const float scale = rsqrtf(variance + eps); @@ -163,22 +130,8 @@ static __global__ void rms_norm_f32(const float * x, } // sum up partial sums - tmp = warp_reduce_sum(tmp); - if constexpr (block_size > WARP_SIZE) { - static_assert((block_size <= 1024) && (block_size % 32 == 0), "unexpected block_size"); - __shared__ float s_sum[32]; - const int warp_id = tid / WARP_SIZE; - const int lane_id = tid % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = tmp; - } - 
__syncthreads(); - tmp = 0.0f; - if (lane_id < (block_size / WARP_SIZE)) { - tmp = s_sum[lane_id]; - } - tmp = warp_reduce_sum(tmp); - } + extern __shared__ float s_sum[]; + tmp = block_reduce(tmp, s_sum); const float mean = tmp / ncols; const float scale = rsqrtf(mean + eps); @@ -306,19 +259,8 @@ static __global__ void l2_norm_f32( } // sum up partial sums - tmp = warp_reduce_sum(tmp); - if constexpr (block_size > WARP_SIZE) { - static_assert(block_size == 1024, "unexpected block_size"); - __shared__ float s_sum[32]; - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = tmp; - } - __syncthreads(); - tmp = s_sum[lane_id]; - tmp = warp_reduce_sum(tmp); - } + extern __shared__ float s_sum[]; + tmp = block_reduce(tmp, s_sum); // from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html const float scale = rsqrtf(fmaxf(tmp, eps * eps)); @@ -337,7 +279,7 @@ static void norm_f32_cuda( norm_f32<<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); } else { const dim3 block_dims(1024, 1, 1); - norm_f32<1024><<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + norm_f32<1024><< WARP_SIZE ? 32 * sizeof(float2): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); } } @@ -348,7 +290,7 @@ static void group_norm_f32_cuda( group_norm_f32<<>>(x, dst, group_size, ne_elements, eps); } else { const dim3 block_dims(1024, 1, 1); - group_norm_f32<1024><<>>(x, dst, group_size, ne_elements, eps); + group_norm_f32<1024><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, group_size, ne_elements, eps); } } @@ -358,10 +300,10 @@ static void rms_norm_f32_cuda( const dim3 blocks_num(nrows, nchannels, nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, false><<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + rms_norm_f32<256, false><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024, false><<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + rms_norm_f32<1024, false><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); } } @@ -404,12 +346,12 @@ static void rms_norm_mul_f32_cuda(const float * x, const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, true><<>>( + rms_norm_f32<256, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024, true><<>>( + rms_norm_f32<1024, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed); } @@ -425,14 +367,14 @@ static void rms_norm_mul_f32_cuda(const float * x, const uint3 add_nsamples_packed = init_fastdiv_values(add_nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, true, true><<>>( + rms_norm_f32<256, true, true><< WARP_SIZE ? 
32 * sizeof(float): 0, stream>>>(
             x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed,
             add, add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed, add_nchannels_packed, add_nsamples_packed);
     } else {
         const dim3 block_dims(1024, 1, 1);
-        rms_norm_f32<1024, true, true><<<blocks_num, block_dims, 0, stream>>>(
+        rms_norm_f32<1024, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
             x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed,
             add, add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
@@ -460,7 +402,7 @@ static void l2_norm_f32_cuda(
         l2_norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     } else {
         const dim3 block_dims(1024, 1, 1);
-        l2_norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        l2_norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     }
 }
diff --git a/ggml/src/ggml-cuda/reduce_rows.cuh b/ggml/src/ggml-cuda/reduce_rows.cuh
index 6bcae9e52f..de240fd441 100644
--- a/ggml/src/ggml-cuda/reduce_rows.cuh
+++ b/ggml/src/ggml-cuda/reduce_rows.cuh
@@ -28,22 +28,8 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
     }

     // sum up partial sums
-    sum = warp_reduce_sum(sum);
-    if (blockDim.x > WARP_SIZE) {
-        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
-        __shared__ float s_sum[32];
-        const int warp_id = threadIdx.x / WARP_SIZE;
-        const int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = sum;
-        }
-        __syncthreads();
-        sum = 0.0f;
-        if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
-            sum = s_sum[lane_id];
-        }
-        sum = warp_reduce_sum(sum);
-    }
+    __shared__ float shared_vals[32];
+    sum = block_reduce(sum, shared_vals);

     if (col != 0) {
         return;
diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu
index 1ae84ebf63..dc06d06930 100644
--- a/ggml/src/ggml-cuda/softmax.cu
+++ b/ggml/src/ggml-cuda/softmax.cu
@@ -75,9 +75,6 @@ static __global__ void soft_max_f32(
     const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;

-    const int warp_id = threadIdx.x / WARP_SIZE;
-    const int lane_id = threadIdx.x % WARP_SIZE;
-
     const float slope = get_alibi_slope(p.max_bias, i02, p.n_head_log2, p.m0, p.m1);

     extern __shared__ float data_soft_max_f32[];
@@ -102,21 +99,7 @@ static __global__ void soft_max_f32(
     }

     // find the max value in the block
-    max_val = warp_reduce_max(max_val);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = -INFINITY;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = max_val;
-        }
-        __syncthreads();
-
-        max_val = buf_iw[lane_id];
-        max_val = warp_reduce_max(max_val);
-    }
+    max_val = block_reduce(max_val, buf_iw);

     float tmp = 0.0f; // partial sum
@@ -134,22 +117,7 @@ static __global__ void soft_max_f32(
     }

     // find the sum of exps in the block
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __syncthreads();
-        if (warp_id == 0) {
-            buf_iw[lane_id] = 0.0f;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = tmp;
-        }
-        __syncthreads();
-
-        tmp = buf_iw[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
+    tmp = block_reduce(tmp, buf_iw);

     if (sinks) {
         tmp += expf(sinks[i02] - max_val);
@@ -169,50 +137,6 @@ static __global__ void soft_max_f32(
     }
 }
-
-// TODO: This is a common pattern used across kernels that could be moved to common.cuh + templated
-static __device__ float two_stage_warp_reduce_max(float val) {
-    val = warp_reduce_max(val);
-    if (blockDim.x > WARP_SIZE) {
-        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
-        __shared__ float local_vals[32];
-        const int warp_id = threadIdx.x / WARP_SIZE;
-        const int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            local_vals[warp_id] = val;
-        }
-        __syncthreads();
-        val = -INFINITY;
-        if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
-            val = local_vals[lane_id];
-        }
-        return warp_reduce_max(val);
-    } else {
-        return val;
-    }
-}
-
-static __device__ float two_stage_warp_reduce_sum(float val) {
-    val = warp_reduce_sum(val);
-    if (blockDim.x > WARP_SIZE) {
-        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
-        __shared__ float local_vals[32];
-        const int warp_id = threadIdx.x / WARP_SIZE;
-        const int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            local_vals[warp_id] = val;
-        }
-        __syncthreads();
-        val = 0.0f;
-        if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
-            val = local_vals[lane_id];
-        }
-        return warp_reduce_sum(val);
-    } else {
-        return val;
-    }
-}
-
 // TODO: Template to allow keeping ncols in registers if they fit
 static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __restrict__ x,
                                                                 float * __restrict__ dst,
@@ -230,6 +154,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
     float local_vals[n_elem_per_thread] = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
     float local_max = -INFINITY;
     const int step_size = gridDim.x * blockDim.x;
+    __shared__ float shared_vals[32];

     // Compute thread-local max
     for (int col = col_start; col < p.ncols;) {
@@ -246,7 +171,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
     }

     // Compute CTA-level max
-    local_max = two_stage_warp_reduce_max(local_max);
+    local_max = block_reduce(local_max, shared_vals);

     // Store CTA-level max to GMEM
     if (tid == 0) {
@@ -261,7 +186,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
     } else {
         local_max = -INFINITY;
     }
-    local_max = two_stage_warp_reduce_max(local_max);
+    local_max = block_reduce(local_max, shared_vals);

     // Compute softmax dividends, accumulate divisor
     float tmp_expf = 0.0f;
@@ -284,7 +209,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
     }

     // Reduce divisor within CTA
-    tmp_expf = two_stage_warp_reduce_sum(tmp_expf);
+    tmp_expf = block_reduce(tmp_expf, shared_vals);

     // Store CTA-level sum to GMEM
     if (tid == 0) {
@@ -298,7 +223,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
     } else {
         tmp_expf = 0.0f;
     }
-    tmp_expf = two_stage_warp_reduce_sum(tmp_expf);
+    tmp_expf = block_reduce(tmp_expf, shared_vals);

     // Divide dividend by global sum + store data
     for (int col = col_start; col < p.ncols;) {
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 19ef58404e..188ffdf3db 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -7482,25 +7482,29 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_softcap(GGML_TYPE_F32, {10, 10, 10, 10}, 50.0f));
     test_cases.emplace_back(new test_silu_back());

-    for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f}) {
-        for (bool v : {false, true}) {
-            test_cases.emplace_back(new test_norm    (GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
-            test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
+    for (float eps : { 0.0f, 1e-6f, 1e-4f, 1e-1f }) {
+        for (uint32_t n : { 64, 1025 }) {
+            for (bool v : { false, true }) {
+                test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps));
+                test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps));
+            }
+            test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, { n, 5, 4, 3 }, eps));
+            test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps));
         }
-        test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
-        test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
     }

     // in-place tests
     test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, false, 1e-6f, true));

-    for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
-        test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
-        test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
-        test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
-        test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
-        test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
-        test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
+    for (float eps : { 0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f }) {
+        for (uint32_t n : { 64, 1025 }) {
+            test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false));
+            test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true));
+            test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false));
+            test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true));
+            test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false));
+            test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true));
+        }
     }
     for (uint32_t n : {1, 511, 1025, 8192, 33*512}) {
         for (bool multi_add : {false, true}) {
@@ -7524,9 +7528,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             }
         }
     }
-
-    test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));
-
     for (int64_t d_conv : {3, 4, 9}) {
         for (int64_t d_inner: {1024, 1536, 2048}) {
             test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {d_conv, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));