Merge branch 'master' of https://github.com/ggml-org/llama.cpp into paddleocr-vl

megemini 2026-01-15 13:33:44 +08:00
commit b9fd3c8578
63 changed files with 536 additions and 1144 deletions

View File

@ -13,7 +13,7 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.
FROM ${CANN_BASE_IMAGE} AS build
# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
yum clean all && \
rm -rf /var/cache/yum

View File

@ -5,7 +5,7 @@ FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
apt-get install -y build-essential git cmake libssl-dev
WORKDIR /app

View File

@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
WORKDIR /app

View File

@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
WORKDIR /app

View File

@ -6,7 +6,7 @@ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
apt-get install -y git libssl-dev
WORKDIR /app

View File

@ -6,7 +6,7 @@ WORKDIR /app
COPY . .
RUN yum install -y gcc g++ cmake make libcurl-devel
RUN yum install -y gcc g++ cmake make openssl-devel
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}

View File

@ -18,7 +18,7 @@ RUN apt-get update && \
python3 \
python3-pip \
git \
libcurl4-openssl-dev \
libssl-dev \
libgomp1
WORKDIR /app

View File

@ -32,7 +32,6 @@
useMpi ? false,
useRocm ? config.rocmSupport,
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
enableCurl ? true,
useVulkan ? false,
useRpc ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@ -160,15 +159,13 @@ effectiveStdenv.mkDerivation (finalAttrs: {
++ optionals useMpi [ mpi ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs
++ optionals enableCurl [ curl ];
++ optionals useVulkan vulkanBuildInputs;
cmakeFlags =
[
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_CURL" enableCurl)
(cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CUDA" useCuda)

View File

@ -27,7 +27,7 @@ RUN apt-get update \
build-essential \
cmake \
git \
libcurl4-openssl-dev \
libssl-dev \
curl \
libgomp1

View File

@ -11,7 +11,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
apt install -y --no-install-recommends \
git cmake ccache ninja-build \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
libopenblas-dev libcurl4-openssl-dev && \
libopenblas-dev libssl-dev && \
rm -rf /var/lib/apt/lists/*
WORKDIR /app

View File

@ -5,8 +5,8 @@ FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget xz-utils
# Install cURL and Vulkan SDK dependencies
RUN apt install -y libcurl4-openssl-dev curl \
# Install SSL and Vulkan SDK dependencies
RUN apt install -y libssl-dev curl \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
# Build it

View File

@ -20,7 +20,7 @@ jobs:
run: |
PREFIX="$(pwd)"/inst
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
cmake --install build --prefix "$PREFIX" --config Release

View File

@ -30,7 +30,7 @@ jobs:
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# cmake -B build -DLLAMA_OPENSSL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
@ -76,7 +76,7 @@ jobs:
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# cmake -B build -DLLAMA_OPENSSL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
@ -122,7 +122,7 @@ jobs:
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# cmake -B build -DLLAMA_OPENSSL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
@ -178,7 +178,7 @@ jobs:
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
cmake -B build -DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@ -235,7 +235,7 @@ jobs:
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
cmake -B build -DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
@ -281,7 +281,7 @@ jobs:
- name: Build
run: |
export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
cmake -B build -DLLAMA_CURL=OFF \
cmake -B build -DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \

View File

@ -79,7 +79,6 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=OFF \
@ -92,7 +91,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest -L 'main|curl' --verbose --timeout 900
ctest -L main --verbose --timeout 900
macOS-latest-cmake-x64:
runs-on: macos-15-intel
@ -118,7 +117,6 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
@ -227,8 +225,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@ -237,7 +233,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest -L 'main|curl' --verbose --timeout 900
ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@ -293,8 +289,6 @@ jobs:
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@ -305,8 +299,6 @@ jobs:
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
@ -336,14 +328,10 @@ jobs:
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_LLGUIDANCE=ON
cmake --build . --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
@ -377,8 +365,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@ -412,8 +398,6 @@ jobs:
id: cmake_configure
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
@ -470,8 +454,6 @@ jobs:
run: |
source ./vulkan_sdk/setup-env.sh
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_VULKAN=ON
cmake --build build --config Release -j $(nproc)
@ -545,8 +527,6 @@ jobs:
run: |
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_WEBGPU=ON
cmake --build build --config Release -j $(nproc)
@ -593,7 +573,7 @@ jobs:
source emsdk/emsdk_env.sh
emcmake cmake -B build-wasm \
-DGGML_WEBGPU=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
cmake --build build-wasm --target test-backend-ops -j $(nproc)
@ -624,8 +604,6 @@ jobs:
id: cmake_build
run: |
cmake -B build -S . \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGGML_HIP=ON
@ -657,8 +635,6 @@ jobs:
id: cmake_build
run: |
cmake -B build -S . \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
@ -706,8 +682,6 @@ jobs:
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
@ -757,8 +731,6 @@ jobs:
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
@ -893,7 +865,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
@ -1043,7 +1015,7 @@ jobs:
id: cmake_build
run: |
cmake -S . -B build ${{ matrix.defines }} `
-DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
-DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add libopenblas.dll
@ -1101,8 +1073,6 @@ jobs:
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
run: |
cmake -S . -B build -G Ninja \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES=89-real \
@ -1150,7 +1120,6 @@ jobs:
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
-DLLAMA_CURL=OFF ^
-DLLAMA_BUILD_BORINGSSL=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
@ -1258,7 +1227,6 @@ jobs:
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DLLAMA_CURL=OFF `
-DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `
-DGGML_HIP=ON `
@ -1285,7 +1253,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
@ -1352,7 +1320,7 @@ jobs:
matrix:
include:
- build: 'arm64-cpu'
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_CURL=OFF -D GGML_OPENMP=OFF'
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
- build: 'arm64-snapdragon'
defines: '--preset arm64-android-snapdragon-release'
@ -1469,8 +1437,6 @@ jobs:
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_CANN=on \
-DSOC_TYPE=${SOC_TYPE}
cmake --build build -j $(nproc)
@ -1499,7 +1465,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential
- name: Test
id: ggml-ci
@ -1525,7 +1491,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential
- name: Test
id: ggml-ci
@ -1551,7 +1517,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential
- name: Test
id: ggml-ci
@ -1577,7 +1543,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential
- name: Test
id: ggml-ci
@ -1603,7 +1569,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential
- name: Test
id: ggml-ci
@ -1767,7 +1733,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev
sudo apt-get install -y build-essential
- name: Test
id: ggml-ci
@ -1834,8 +1800,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@ -1853,7 +1817,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest -L 'main|curl' --verbose --timeout 900
ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@ -1928,7 +1892,7 @@ jobs:
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=ON \
-DLLAMA_BUILD_EXAMPLES=ON \
@ -1947,7 +1911,7 @@ jobs:
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@ -2018,7 +1982,7 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@ -2092,8 +2056,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@ -2129,7 +2091,6 @@ jobs:
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
apt-get install -y \
build-essential \
libcurl4-openssl-dev \
python3-venv \
gpg \
wget \

View File

@ -38,7 +38,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential libssl-dev
# Install git-clang-format script for formatting only changed code
wget -O /tmp/git-clang-format https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format
sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format

View File

@ -45,7 +45,6 @@ jobs:
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
@ -95,7 +94,6 @@ jobs:
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
@ -161,8 +159,6 @@ jobs:
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
@ -212,8 +208,6 @@ jobs:
cmake -B build \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
@ -269,7 +263,6 @@ jobs:
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
cmake -S . -B build -G "Ninja Multi-Config" ^
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-DLLAMA_CURL=OFF ^
-DLLAMA_BUILD_BORINGSSL=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
@ -358,7 +351,7 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release --target ${{ matrix.target }}
- name: Pack artifacts
@ -412,7 +405,7 @@ jobs:
-DGGML_NATIVE=OFF ^
-DGGML_CPU=OFF ^
-DGGML_CUDA=ON ^
-DLLAMA_CURL=OFF ^
-DLLAMA_BUILD_BORINGSSL=ON ^
-DGGML_CUDA_CUB_3DOT2=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
@ -481,7 +474,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
-DLLAMA_CURL=OFF
-DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --target ggml-sycl -j
- name: Build the release package
@ -608,7 +601,7 @@ jobs:
-DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DLLAMA_CURL=OFF
-DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
@ -649,7 +642,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
@ -734,8 +727,6 @@ jobs:
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_CANN=on \
-DSOC_TYPE=${SOC_TYPE}
cmake --build build -j $(nproc)

View File

@ -168,8 +168,6 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
@ -182,8 +180,6 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
@ -195,8 +191,6 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

View File

@ -72,7 +72,7 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
@ -108,7 +108,7 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup

View File

@ -111,11 +111,16 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
option(LLAMA_HTTPLIB "llama: if libcurl is disabled, use httplib to download model from an URL" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
# deprecated
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
if (LLAMA_CURL)
message(WARNING "LLAMA_CURL option is deprecated and will be ignored")
endif()
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
@ -212,11 +217,6 @@ add_subdirectory(src)
# utils, programs, examples and tests
#
if (NOT LLAMA_BUILD_COMMON)
message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
set(LLAMA_CURL OFF)
endif()
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
if (LLAMA_HTTPLIB)
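
The net effect of this hunk: model downloads now go through httplib, with OpenSSL enabled by default for HTTPS, while LLAMA_CURL survives only as a deprecated no-op. A configure sketch using the options introduced above (the build directory and flag combinations are illustrative):

cmake -B build -DLLAMA_HTTPLIB=ON -DLLAMA_OPENSSL=ON   # defaults: HTTPS-capable downloads
cmake -B build -DLLAMA_OPENSSL=OFF                     # no TLS: HTTP-only downloads
cmake -B build -DLLAMA_CURL=ON                         # deprecated: warns and is ignored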

View File

@ -586,6 +586,5 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain

View File

@ -414,7 +414,7 @@ cmake -B build-ios-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-S .
cmake --build build-ios-sim --config Release -- -quiet
@ -428,7 +428,7 @@ cmake -B build-ios-device -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-S .
cmake --build build-ios-device --config Release -- -quiet
@ -439,7 +439,7 @@ cmake -B build-macos -G Xcode \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-S .
cmake --build build-macos --config Release -- -quiet
@ -453,7 +453,7 @@ cmake -B build-visionos -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
@ -469,7 +469,7 @@ cmake -B build-visionos-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
@ -487,7 +487,7 @@ cmake -B build-tvos-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-S .
cmake --build build-tvos-sim --config Release -- -quiet
@ -502,7 +502,7 @@ cmake -B build-tvos-device -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-S .
cmake --build build-tvos-device --config Release -- -quiet

View File

@ -45,7 +45,7 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"

View File

@ -33,3 +33,25 @@ function(llama_add_compile_flags)
endif()
endif()
endfunction()
function(llama_download_model NAME HASH)
set(DEST "${CMAKE_BINARY_DIR}/${NAME}")
get_filename_component(DEST_DIR "${DEST}" DIRECTORY)
file(MAKE_DIRECTORY "${DEST_DIR}")
if(NOT EXISTS "${DEST}")
message(STATUS "Downloading ${NAME} from ggml-org/models...")
endif()
file(DOWNLOAD
"https://huggingface.co/ggml-org/models/resolve/main/${NAME}?download=true"
"${DEST}"
TLS_VERIFY ON
EXPECTED_HASH ${HASH}
STATUS status
)
list(GET status 0 code)
if(NOT code EQUAL 0)
list(GET status 1 msg)
message(FATAL_ERROR "Failed to download ${NAME}: ${msg}")
endif()
set(LLAMA_DOWNLOAD_MODEL "${DEST}" PARENT_SCOPE)
endfunction()
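
A usage sketch for this helper, pairing it with a test the way the eval-callback CMakeLists does later in this commit (the model name and SHA256 hash are copied from that change; the test name is illustrative):

llama_download_model("tinyllamas/stories15M-q4_0.gguf"
    SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739)
add_test(NAME my-eval-test
    COMMAND llama-eval-callback -m "${LLAMA_DOWNLOAD_MODEL}" --prompt hello --seed 42 -ngl 0)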

View File

@ -60,6 +60,8 @@ add_library(${TARGET} STATIC
common.h
console.cpp
console.h
debug.cpp
debug.h
download.cpp
download.h
http.h
@ -95,17 +97,7 @@ endif()
# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
set(LLAMA_COMMON_EXTRA_LIBS build_info)
if (LLAMA_CURL)
# Use curl to download model url
find_package(CURL)
if (NOT CURL_FOUND)
message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
endif()
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
elseif (LLAMA_HTTPLIB)
# otherwise, use cpp-httplib
if (LLAMA_HTTPLIB)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
endif()

View File

@ -341,7 +341,7 @@ static handle_model_result common_params_handle_model(
if (model.path.empty()) {
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1); // built without CURL, error message already printed
exit(1); // error message already printed
}
model.name = model.hf_repo; // repo name with tag
model.hf_repo = auto_detected.repo; // repo name without tag

common/debug.cpp (new file, 165 lines)
View File

@ -0,0 +1,165 @@
#include "debug.h"
#include "log.h"
#include <cmath>
#include <string>
static std::string common_ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static float common_ggml_get_float_value(const uint8_t * data,
ggml_type type,
const size_t * nb,
size_t i0,
size_t i1,
size_t i2,
size_t i3) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(const float *) &data[i];
} else if (type == GGML_TYPE_I64) {
v = (float) *(const int64_t *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(const int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(const int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(const int8_t *) &data[i];
} else if (type == GGML_TYPE_BF16) {
v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
} else {
GGML_ABORT("fatal error");
}
return v;
}
template <bool abort>
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
sum += v;
}
}
}
}
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
LOG_ERR(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2 * n) {
LOG_ERR(" ..., \n");
i2 = ne[2] - n;
}
LOG_ERR(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2 * n) {
LOG_ERR(" ..., \n");
i1 = ne[1] - n;
}
LOG_ERR(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2 * n) {
LOG_ERR("..., ");
i0 = ne[0] - n;
}
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
LOG_ERR("%12.4f", v);
if (i0 < ne[0] - 1) {
LOG_ERR(", ");
}
}
LOG_ERR("],\n");
}
LOG_ERR(" ],\n");
}
LOG_ERR(" ]\n");
LOG_ERR(" sum = %f\n", sum);
}
if constexpr (abort) {
if (std::isnan(sum)) {
LOG_ERR("encountered NaN - aborting\n");
exit(0);
}
}
}
/**
* GGML operations callback during the graph execution.
*
* @param t current tensor
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
* see ggml_backend_sched_eval_callback
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (base_callback_data *) user_data;
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
if (ask) {
return true; // Always retrieve data
}
bool matches_filter = cb_data->tensor_filters.empty();
if (!matches_filter) {
for (const auto & filter : cb_data->tensor_filters) {
if (std::regex_search(t->name, filter)) {
matches_filter = true;
break;
}
}
}
char src1_str[128] = { 0 };
if (src1) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
}
if (matches_filter) {
LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
common_ggml_ne_string(t).c_str());
}
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
if (!is_host) {
auto n_bytes = ggml_nbytes(t);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
}
if (!ggml_is_quantized(t->type) && matches_filter) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
}
return true;
}
// Explicit template instantiations
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);

common/debug.h (new file, 43 lines)
View File

@ -0,0 +1,43 @@
#pragma once
#include "common.h"
#include <string>
#include <vector>
#include <regex>
// common debug functions and structs
// Print a tensor's detailed data
// data - the tensor's data in byte format
// type - the tensor's quantization type
// ne - the tensor dimensions array
// nb - the tensor strides array
// n - the number of rows/columns to fully print
template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
// Intended to be used as a callback for ggml_backend_sched_eval_callback;
// prints tensors that are processed in the computation graph.
// By default prints all tensors, but can be configured by creating a `base_callback_data` instance with
// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns.
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
// in a tensor (useful for stopping debug sessions on the first erroneous tensor).
// The callback data will be passed as the third parameter (user_data)
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
struct base_callback_data {
std::vector<uint8_t> data;
std::vector<std::regex> tensor_filters;
base_callback_data() = default;
base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
for (const auto & pattern : filter_patterns) {
try {
std::string anchored_pattern = "^" + pattern;
tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
} catch (const std::regex_error & e) {
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
}
}
params.cb_eval = common_debug_cb_eval<false>;
params.cb_eval_user_data = this;
}
};
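
A minimal wiring sketch for this header, mirroring how the eval-callback example is updated later in this commit (`common_params` and its `cb_eval`/`cb_eval_user_data` fields come from common.h; use the `<true>` instantiation instead to abort on the first NaN):

#include "common.h"
#include "debug.h"

common_params params;
base_callback_data cb_data;                       // default: no filters, print every tensor
params.cb_eval           = common_debug_cb_eval<false>;
params.cb_eval_user_data = &cb_data;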

View File

@ -19,10 +19,7 @@
#include <thread>
#include <vector>
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#elif defined(LLAMA_USE_HTTPLIB)
#if defined(LLAMA_USE_HTTPLIB)
#include "http.h"
#endif
@ -171,336 +168,7 @@ std::pair<std::string, std::string> common_download_split_repo_tag(const std::st
return {hf_repo, tag};
}
#ifdef LLAMA_USE_CURL
//
// CURL utils
//
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
struct curl_slist_ptr {
struct curl_slist * ptr = nullptr;
~curl_slist_ptr() {
if (ptr) {
curl_slist_free_all(ptr);
}
}
};
static CURLcode common_curl_perf(CURL * curl) {
CURLcode res = curl_easy_perform(curl);
if (res != CURLE_OK) {
LOG_ERR("%s: curl_easy_perform() failed\n", __func__);
}
return res;
}
// Send a HEAD request to retrieve the etag and last-modified headers
struct common_load_model_from_url_headers {
std::string etag;
std::string last_modified;
std::string accept_ranges;
};
struct FILE_deleter {
void operator()(FILE * f) const { fclose(f); }
};
static size_t common_header_callback(char * buffer, size_t, size_t n_items, void * userdata) {
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase);
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
static std::regex accept_ranges_regex("Accept-Ranges", std::regex_constants::icase);
std::string header(buffer, n_items);
std::smatch match;
if (std::regex_match(header, match, header_regex)) {
const std::string & key = match[1];
const std::string & value = match[2];
if (std::regex_match(key, match, etag_regex)) {
headers->etag = value;
} else if (std::regex_match(key, match, last_modified_regex)) {
headers->last_modified = value;
} else if (std::regex_match(key, match, accept_ranges_regex)) {
headers->accept_ranges = value;
}
}
return n_items;
}
static size_t common_write_callback(void * data, size_t size, size_t nmemb, void * fd) {
return std::fwrite(data, size, nmemb, static_cast<FILE *>(fd));
}
// helper function to hide password in URL
static std::string llama_download_hide_password_in_url(const std::string & url) {
// Use regex to match and replace the user[:password]@ pattern in URLs
// Pattern: scheme://[user[:password]@]host[...]
static const std::regex url_regex(R"(^([A-Za-z][A-Za-z0-9+.-]*://)([^/@]+@)?(.*)$)");
std::smatch match;
if (std::regex_match(url, match, url_regex)) {
// match[1] = scheme (e.g., "https://")
// match[2] = user[:password]@ part
// match[3] = rest of URL (host and path)
return match[1].str() + "********@" + match[3].str();
}
return url; // No credentials found or malformed URL
}
static void common_curl_easy_setopt_head(CURL * curl, const std::string & url) {
// Set the URL, allow to follow http redirection
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
# if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows.
curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
# endif
curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, common_header_callback);
}
static void common_curl_easy_setopt_get(CURL * curl) {
curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, common_write_callback);
// display download progress
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
}
static bool common_pull_file(CURL * curl, const std::string & path_temporary) {
if (std::filesystem::exists(path_temporary)) {
const std::string partial_size = std::to_string(std::filesystem::file_size(path_temporary));
LOG_INF("%s: server supports range requests, resuming download from byte %s\n", __func__, partial_size.c_str());
const std::string range_str = partial_size + "-";
curl_easy_setopt(curl, CURLOPT_RANGE, range_str.c_str());
}
// Always open file in append mode could be resuming
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "ab"));
if (!outfile) {
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_temporary.c_str());
return false;
}
common_curl_easy_setopt_get(curl);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile.get());
return common_curl_perf(curl) == CURLE_OK;
}
static bool common_download_head(CURL * curl,
curl_slist_ptr & http_headers,
const std::string & url,
const std::string & bearer_token) {
if (!curl) {
LOG_ERR("%s: error initializing libcurl\n", __func__);
return false;
}
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
// Check if hf-token or bearer-token was specified
if (!bearer_token.empty()) {
std::string auth_header = "Authorization: Bearer " + bearer_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
}
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, http_headers.ptr);
common_curl_easy_setopt_head(curl, url);
return common_curl_perf(curl) == CURLE_OK;
}
// download one single file from remote URL to local path
// returns status code or -1 on error
static int common_download_file_single_online(const std::string & url,
const std::string & path,
const std::string & bearer_token,
const common_header_list & custom_headers) {
static const int max_attempts = 3;
static const int retry_delay_seconds = 2;
for (int i = 0; i < max_attempts; ++i) {
std::string etag;
// Check if the file already exists locally
const auto file_exists = std::filesystem::exists(path);
if (file_exists) {
etag = read_etag(path);
} else {
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
bool head_request_ok = false;
bool should_download = !file_exists; // by default, we should download if the file does not exist
// Initialize libcurl
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
common_load_model_from_url_headers headers;
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
curl_slist_ptr http_headers;
for (const auto & h : custom_headers) {
std::string s = h.first + ": " + h.second;
http_headers.ptr = curl_slist_append(http_headers.ptr, s.c_str());
}
const bool was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
if (!was_perform_successful) {
head_request_ok = false;
}
long http_code = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code == 200) {
head_request_ok = true;
} else {
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
head_request_ok = false;
}
// if head_request_ok is false, we don't have the etag or last-modified headers
// we leave should_download as-is, which is true if the file does not exist
bool should_download_from_scratch = false;
if (head_request_ok) {
// check if ETag or Last-Modified headers are different
// if it is, we need to download the file again
if (!etag.empty() && etag != headers.etag) {
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(),
headers.etag.c_str());
should_download = true;
should_download_from_scratch = true;
}
}
const bool accept_ranges_supported = !headers.accept_ranges.empty() && headers.accept_ranges != "none";
if (should_download) {
if (file_exists &&
!accept_ranges_supported) { // Resumable downloads not supported, delete and start again.
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return -1;
}
}
const std::string path_temporary = path + ".downloadInProgress";
if (should_download_from_scratch) {
if (std::filesystem::exists(path_temporary)) {
if (remove(path_temporary.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
return -1;
}
}
if (std::filesystem::exists(path)) {
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return -1;
}
}
}
if (head_request_ok) {
write_etag(path, headers.etag);
}
// start the download
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
__func__, llama_download_hide_password_in_url(url).c_str(), path_temporary.c_str(),
headers.etag.c_str(), headers.last_modified.c_str());
const bool was_pull_successful = common_pull_file(curl.get(), path_temporary);
if (!was_pull_successful) {
if (i + 1 < max_attempts) {
const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
} else {
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
}
continue;
}
long http_code = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
int status = static_cast<int>(http_code);
if (!is_http_status_ok(http_code)) {
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
return status; // TODO: maybe only return on certain codes
}
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return -1;
}
return static_cast<int>(http_code);
} else {
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
return 304; // Not Modified - fake cached response
}
}
return -1; // max attempts reached
}
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::vector<char> res_buffer;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 0L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
auto data_vec = static_cast<std::vector<char> *>(data);
data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
return size * nmemb;
};
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
#if defined(_WIN32)
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
if (params.timeout > 0) {
curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
}
if (params.max_size > 0) {
curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
}
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
for (const auto & header : params.headers) {
std::string header_ = header.first + ": " + header.second;
http_headers.ptr = curl_slist_append(http_headers.ptr, header_.c_str());
}
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
std::string error_msg = curl_easy_strerror(res);
throw std::runtime_error("error: cannot make GET request: " + error_msg);
}
long res_code;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
return { res_code, std::move(res_buffer) };
}
#elif defined(LLAMA_USE_HTTPLIB)
#if defined(LLAMA_USE_HTTPLIB)
class ProgressBar {
static inline std::mutex mutex;
@ -797,10 +465,6 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
return { res->status, std::move(buf) };
}
#endif // LLAMA_USE_CURL
#if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB)
int common_download_file_single(const std::string & url,
const std::string & path,
const std::string & bearer_token,
@ -1151,7 +815,7 @@ int common_download_file_single(const std::string &,
throw std::runtime_error("download functionality is not enabled in this build");
}
#endif // LLAMA_USE_CURL || LLAMA_USE_HTTPLIB
#endif // defined(LLAMA_USE_HTTPLIB)
std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;

View File

@ -8826,11 +8826,7 @@ class ExaoneMoEModel(Exaone4Model):
self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0))
self.gguf_writer.add_leading_dense_block_count(n_dense_layer)
# For here, we hard-code the number of NextN/MTP layers to 1 for K-EXAONE,
# so that we can convert MTP weights to GGUF format for speculative decoding.
# This is because HF config of K-EXAONE does not have `num_nextn_predict_layers` at now.
# Will be updated when HF config is updated.
self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 1))
self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0))
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)

View File

@ -1,4 +1,4 @@
{
{
"version": 4,
"configurePresets": [
{
@ -23,7 +23,7 @@
"GGML_OPENCL": "ON",
"GGML_HEXAGON": "ON",
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
"LLAMA_CURL": "OFF"
"LLAMA_OPENSSL": "OFF"
}
},
@ -38,7 +38,7 @@
"GGML_OPENCL": "ON",
"GGML_HEXAGON": "ON",
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
"LLAMA_CURL": "OFF"
"LLAMA_OPENSSL": "OFF"
}
},

View File

@ -15,7 +15,7 @@ Below is the build script: it requires utilizing RISC-V vector instructions for
cmake -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_CPU_RISCV64_SPACEMIT=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=OFF \
-DGGML_RVV=ON \
-DGGML_RV_ZFH=ON \
-DGGML_RV_ZICBOP=ON \

View File

@ -65,10 +65,10 @@ cmake --build build --config Release
cmake --preset x64-windows-llvm-release
cmake --build build-x64-windows-llvm-release
```
- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl.
- **Debian / Ubuntu:** `sudo apt-get install libcurl4-openssl-dev` # (or `libcurl4-gnutls-dev` if you prefer GnuTLS)
- **Fedora / RHEL / Rocky / Alma:** `sudo dnf install libcurl-devel`
- **Arch / Manjaro:** `sudo pacman -S curl` # includes libcurl headers
- If you want HTTPS/TLS features, you may install OpenSSL development libraries. If not installed, the project will build and run without SSL support.
- **Debian / Ubuntu:** `sudo apt-get install libssl-dev`
- **Fedora / RHEL / Rocky / Alma:** `sudo dnf install openssl-devel`
- **Arch / Manjaro:** `sudo pacman -S openssl`
## BLAS Build

View File

@ -1,11 +1,9 @@
#include "debug.h"
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "ggml.h"
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <string>
#include <vector>
@ -13,7 +11,7 @@
#include <fstream>
#include <regex>
static void print_usage(int, char ** argv) {
static void print_usage(int /*argc*/, char ** argv) {
const std::string usage_template = R"(
example usage:
@ -35,28 +33,6 @@ static void print_usage(int, char ** argv) {
LOG("%s\n", usage.c_str());
}
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data);
struct callback_data {
std::vector<uint8_t> data;
std::vector<std::regex> tensor_filters;
callback_data() = default;
callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
for (const auto & pattern : filter_patterns) {
try {
std::string anchored_pattern = "^" + pattern;
tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
} catch (const std::regex_error & e) {
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
}
}
params.cb_eval = ggml_debug;
params.cb_eval_user_data = this;
}
};
static bool has_pooling(llama_context * ctx) {
switch (llama_pooling_type(ctx)) {
case LLAMA_POOLING_TYPE_NONE:
@ -120,168 +96,6 @@ struct output_data {
}
};
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
static float ggml_get_float_value(const uint8_t * data, ggml_type type,
const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
switch (type) {
case GGML_TYPE_F16:
return ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
case GGML_TYPE_F32:
return *(const float *) &data[i];
case GGML_TYPE_I64:
return (float) *(const int64_t *) &data[i];
case GGML_TYPE_I32:
return (float) *(const int32_t *) &data[i];
case GGML_TYPE_I16:
return (float) *(const int16_t *) &data[i];
case GGML_TYPE_I8:
return (float) *(const int8_t *) &data[i];
case GGML_TYPE_BF16:
return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
default:
GGML_ABORT("fatal error");
}
}
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
float sum = 0;
float sum_sq = 0.0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
sum += v;
sum_sq += v * v;
}
}
}
}
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
LOG_DBG(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
LOG_DBG(" ..., \n");
i2 = ne[2] - n;
}
LOG_DBG(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
LOG_DBG(" ..., \n");
i1 = ne[1] - n;
}
LOG_DBG(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
LOG_DBG("..., ");
i0 = ne[0] - n;
}
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
LOG_DBG("%12.4f", v);
if (i0 < ne[0] - 1) {
LOG_DBG(", ");
}
}
LOG_DBG("],\n");
}
LOG_DBG(" ],\n");
}
LOG_DBG(" ]\n");
LOG_DBG(" sum = %f\n", sum);
LOG_DBG(" sum_sq = %f\n", sum_sq);
}
if (std::isnan(sum)) {
LOG_ERR("encountered NaN - aborting\n");
exit(0);
}
}
/**
* GGML operations callback during the graph execution.
*
* @param t current tensor
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
* see ggml_backend_sched_eval_callback
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
if (ask) {
return true; // Always retrieve data
}
bool matches_filter = cb_data->tensor_filters.empty();
if (!matches_filter) {
for (const auto & filter : cb_data->tensor_filters) {
if (std::regex_search(t->name, filter)) {
matches_filter = true;
break;
}
}
}
char src1_str[128] = {0};
if (src1) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
if (matches_filter) {
LOG_DBG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name,
ggml_type_name(t->type),
ggml_op_desc(t),
src0->name,
ggml_ne_string(src0).c_str(),
src1 ? src1_str : "",
ggml_ne_string(t).c_str());
}
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
if (!is_host) {
auto n_bytes = ggml_nbytes(t);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
}
if (!ggml_is_quantized(t->type) && matches_filter) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
}
return true;
}
static void save_output_data(const output_data & output, const std::string & model_name, const std::string & output_dir) {
std::filesystem::create_directory(output_dir);
auto base_path = std::filesystem::path{output_dir} / ("llamacpp-" + model_name + output.type_suffix);
@ -408,7 +222,7 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
callback_data cb_data(params, params.tensor_filter);
base_callback_data cb_data(params, params.tensor_filter);
auto llama_init = common_init_from_params(params);

View File

@ -6,10 +6,8 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TEST_TARGET test-eval-callback)
if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
add_test(NAME ${TEST_TARGET}
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
llama_download_model("tinyllamas/stories15M-q4_0.gguf" SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739)
else()
add_test(NAME ${TEST_TARGET}
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
llama_download_model("tinyllamas/stories15M-be.Q4_0.gguf" SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d)
endif()
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback -m "${LLAMA_DOWNLOAD_MODEL}" --prompt hello --seed 42 -ngl 0)

View File

@ -1,165 +1,12 @@
#include "arg.h"
#include "common.h"
#include "debug.h"
#include "log.h"
#include "llama.h"
#include "ggml.h"
#include <cmath>
#include <cstdio>
#include "llama-cpp.h"
#include <string>
#include <vector>
/**
* This the arbitrary data which will be passed to each callback.
* Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
*/
struct callback_data {
std::vector<uint8_t> data;
};
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(const float *) &data[i];
} else if (type == GGML_TYPE_I64) {
v = (float) *(const int64_t *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(const int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(const int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(const int8_t *) &data[i];
} else if (type == GGML_TYPE_BF16) {
v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
} else {
GGML_ABORT("fatal error");
}
return v;
}
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
sum += v;
}
}
}
}
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
LOG(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
LOG(" ..., \n");
i2 = ne[2] - n;
}
LOG(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
LOG(" ..., \n");
i1 = ne[1] - n;
}
LOG(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
LOG("..., ");
i0 = ne[0] - n;
}
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
LOG("%12.4f", v);
if (i0 < ne[0] - 1) LOG(", ");
}
LOG("],\n");
}
LOG(" ],\n");
}
LOG(" ]\n");
LOG(" sum = %f\n", sum);
}
// TODO: make this abort configurable/optional?
if (std::isnan(sum)) {
LOG_ERR("encountered NaN - aborting\n");
exit(0);
}
}
/**
* GGML operations callback during the graph execution.
*
* @param t current tensor
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
* see ggml_backend_sched_eval_callback
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
if (ask) {
return true; // Always retrieve data
}
char src1_str[128] = {0};
if (src1) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, ggml_ne_string(src0).c_str(),
src1 ? src1_str : "",
ggml_ne_string(t).c_str());
// copy the data from the GPU memory if needed
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
if (!is_host) {
auto n_bytes = ggml_nbytes(t);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
}
if (!ggml_is_quantized(t->type)) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
}
return true;
}
static bool run(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
@ -182,7 +29,7 @@ static bool run(llama_context * ctx, const common_params & params) {
}
int main(int argc, char ** argv) {
callback_data cb_data;
base_callback_data cb_data;
common_params params;
@ -197,7 +44,7 @@ int main(int argc, char ** argv) {
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = ggml_debug;
params.cb_eval = common_debug_cb_eval<false>;
params.cb_eval_user_data = &cb_data;
params.warmup = false;

View File

@ -26,7 +26,7 @@ android {
arguments += "-DBUILD_SHARED_LIBS=ON"
arguments += "-DLLAMA_BUILD_COMMON=ON"
arguments += "-DLLAMA_CURL=OFF"
arguments += "-DLLAMA_OPENSSL=OFF"
arguments += "-DGGML_NATIVE=OFF"
arguments += "-DGGML_BACKEND_DL=ON"

View File

@ -8,10 +8,10 @@ cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
#for FP32
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
#build example/main
#cmake --build . --config Release --target main

View File

@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
:: cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
:: for FP32
cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
if %errorlevel% neq 0 goto ERROR
:: build all binary

View File

@ -530,6 +530,86 @@ static __device__ __forceinline__ half2 warp_prefix_inclusive_sum(half2 a) {
#endif // FP16_AVAILABLE
}
enum class block_reduce_method {
MAX,
SUM,
};
template<block_reduce_method method_t, typename T>
struct block_reduce_policy;
template <typename T, typename... Ts>
inline constexpr bool is_any = (std::is_same_v<T, Ts> || ...);
template<typename...>
inline constexpr bool ggml_cuda_dependent_false_v = false;
template <typename T> struct block_reduce_policy<block_reduce_method::SUM, T> {
static __device__ T reduce(T val) {
if constexpr(is_any<T, float, float2, half2, int>) {
return warp_reduce_sum(val);
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
}
}
static __device__ T sentinel() {
if constexpr (std::is_same_v<T, float>) {
return 0.0f;
} else if constexpr (std::is_same_v<T, float2>) {
return make_float2(0.0f, 0.0f);
} else if constexpr (std::is_same_v<T, half2>) {
return make_half2(0.0f, 0.0f);
} else if constexpr (std::is_same_v<T, int>) {
return 0;
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
}
}
};
template <typename T> struct block_reduce_policy<block_reduce_method::MAX, T> {
static __device__ T reduce(T val) {
if constexpr (is_any<T, float, half2>) {
return warp_reduce_max(val);
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
}
}
static __device__ T sentinel() {
if constexpr (std::is_same_v<T, float>) {
return -INFINITY;
} else if constexpr (std::is_same_v<T, half2>) {
return make_half2(-INFINITY, -INFINITY);
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
}
}
};
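The policy pattern above makes further reductions cheap to add. A hypothetical sketch, assuming a MIN enumerator were added to block_reduce_method (it is not part of this change) and spelling out the shuffle loop, since no warp_reduce_min helper appears here:

template <typename T> struct block_reduce_policy<block_reduce_method::MIN, T> {
    static __device__ T reduce(T val) {
        if constexpr (std::is_same_v<T, float>) {
#pragma unroll
            for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) {
                // butterfly exchange across the warp, keeping the smaller value
                val = fminf(val, __shfl_xor_sync(0xFFFFFFFF, val, offset, WARP_SIZE));
            }
            return val;
        } else {
            static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce min");
        }
    }
    static __device__ T sentinel() {
        if constexpr (std::is_same_v<T, float>) {
            return INFINITY; // neutral element for min
        } else {
            static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce min");
        }
    }
};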
template <block_reduce_method reduce_method_t, const unsigned int block_size_template = 0, typename T>
static __device__ T block_reduce(T val, T * shared_vals) {
val = block_reduce_policy<reduce_method_t, T>::reduce(val);
const unsigned int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
if (block_size > WARP_SIZE) {
assert((block_size <= 1024) && (block_size % WARP_SIZE) == 0);
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
shared_vals[warp_id] = val;
}
__syncthreads();
val = block_reduce_policy<reduce_method_t, T>::sentinel();
if (lane_id < (static_cast<int>(block_size) / WARP_SIZE)) {
val = shared_vals[lane_id];
}
return block_reduce_policy<reduce_method_t, T>::reduce(val);
}
return val;
}
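For context, a minimal usage sketch of block_reduce. The kernel name is hypothetical; it mirrors the reduce_rows_f32 rewrite later in this commit and relies on the blockDim.x <= 1024 and blockDim.x % WARP_SIZE == 0 preconditions asserted above:

// One block per row; each thread accumulates a strided partial sum,
// then block_reduce combines the partials across all warps in the block.
static __global__ void row_sum_f32(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x;
    float sum = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        sum += x[(int64_t) row * ncols + col];
    }
    __shared__ float shared_vals[32]; // one slot per warp, enough for 1024 threads
    sum = block_reduce<block_reduce_method::SUM>(sum, shared_vals);
    if (threadIdx.x == 0) {
        dst[row] = sum;
    }
}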
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE

View File

@ -10,7 +10,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
return 128;
}
// Currenlty llvm with the amdgcn target dose not support unrolling loops
// Currently llvm with the amdgcn target does not support unrolling loops
// that contain a break that can not be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push

View File

@ -4551,7 +4551,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_L2_NORM:
return true;
case GGML_OP_RMS_NORM_BACK:
return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
return ggml_is_contiguous(op->src[0]);
break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:

View File

@ -25,19 +25,8 @@ static __global__ void norm_f32(
}
// sum up partial sums
mean_var = warp_reduce_sum(mean_var);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
__shared__ float2 s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = mean_var;
}
__syncthreads();
mean_var = s_sum[lane_id];
mean_var = warp_reduce_sum(mean_var);
}
extern __shared__ float2 s_sum2[];
mean_var = block_reduce<block_reduce_method::SUM, block_size>(mean_var, s_sum2);
const float mean = mean_var.x / ncols;
const float var = mean_var.y / ncols - mean * mean;
@ -61,19 +50,8 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
tmp += x[j];
}
tmp = warp_reduce_sum(tmp);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
__syncthreads();
tmp = s_sum[lane_id];
tmp = warp_reduce_sum(tmp);
}
extern __shared__ float s_sum[];
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
const float mean = tmp / group_size;
tmp = 0.0f;
@ -84,18 +62,7 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
tmp += xi * xi;
}
tmp = warp_reduce_sum(tmp);
if (block_size > WARP_SIZE) {
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
__syncthreads();
tmp = s_sum[lane_id];
tmp = warp_reduce_sum(tmp);
}
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
const float variance = tmp / group_size;
const float scale = rsqrtf(variance + eps);
@ -163,22 +130,8 @@ static __global__ void rms_norm_f32(const float * x,
}
// sum up partial sums
tmp = warp_reduce_sum(tmp);
if constexpr (block_size > WARP_SIZE) {
static_assert((block_size <= 1024) && (block_size % 32 == 0), "unexpected block_size");
__shared__ float s_sum[32];
const int warp_id = tid / WARP_SIZE;
const int lane_id = tid % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
__syncthreads();
tmp = 0.0f;
if (lane_id < (block_size / WARP_SIZE)) {
tmp = s_sum[lane_id];
}
tmp = warp_reduce_sum(tmp);
}
extern __shared__ float s_sum[];
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
const float mean = tmp / ncols;
const float scale = rsqrtf(mean + eps);
@ -306,19 +259,8 @@ static __global__ void l2_norm_f32(
}
// sum up partial sums
tmp = warp_reduce_sum(tmp);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
__syncthreads();
tmp = s_sum[lane_id];
tmp = warp_reduce_sum(tmp);
}
extern __shared__ float s_sum[];
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
// from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html
const float scale = rsqrtf(fmaxf(tmp, eps * eps));
@ -337,7 +279,7 @@ static void norm_f32_cuda(
norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
} else {
const dim3 block_dims(1024, 1, 1);
norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float2): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
}
}
@ -348,7 +290,7 @@ static void group_norm_f32_cuda(
group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
} else {
const dim3 block_dims(1024, 1, 1);
group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
group_norm_f32<1024><<<num_groups, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, group_size, ne_elements, eps);
}
}
@ -358,10 +300,10 @@ static void rms_norm_f32_cuda(
const dim3 blocks_num(nrows, nchannels, nsamples);
if (ncols < 1024) {
const dim3 block_dims(256, 1, 1);
rms_norm_f32<256, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
rms_norm_f32<256, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_f32<1024, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
rms_norm_f32<1024, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
}
}
@ -404,12 +346,12 @@ static void rms_norm_mul_f32_cuda(const float * x,
const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples);
if (ncols < 1024) {
const dim3 block_dims(256, 1, 1);
rms_norm_f32<256, true><<<blocks_num, block_dims, 0, stream>>>(
rms_norm_f32<256, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_f32<1024, true><<<blocks_num, block_dims, 0, stream>>>(
rms_norm_f32<1024, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
}
@ -425,14 +367,14 @@ static void rms_norm_mul_f32_cuda(const float * x,
const uint3 add_nsamples_packed = init_fastdiv_values(add_nsamples);
if (ncols < 1024) {
const dim3 block_dims(256, 1, 1);
rms_norm_f32<256, true, true><<<blocks_num, block_dims, 0, stream>>>(
rms_norm_f32<256, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
add_nchannels_packed, add_nsamples_packed);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_f32<1024, true, true><<<blocks_num, block_dims, 0, stream>>>(
rms_norm_f32<1024, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
@ -460,7 +402,7 @@ static void l2_norm_f32_cuda(
l2_norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
} else {
const dim3 block_dims(1024, 1, 1);
l2_norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
l2_norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
}
}
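All launchers above now size the reduction scratch at launch time instead of declaring a static __shared__ array inside each kernel: 32 * sizeof(T) bytes are requested via the third <<<>>> parameter only when the block is wider than a warp. A minimal sketch of the pattern, with a hypothetical kernel name:

// The kernel declares an unsized extern array; its byte count comes from the launch.
static __global__ void my_mean_f32(const float * x, float * dst, const int ncols) {
    extern __shared__ float s_scratch[]; // 32 floats when blockDim.x > WARP_SIZE, else unused
    float tmp = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        tmp += x[blockIdx.x * (int64_t) ncols + col];
    }
    tmp = block_reduce<block_reduce_method::SUM>(tmp, s_scratch);
    if (threadIdx.x == 0) {
        dst[blockIdx.x] = tmp / ncols;
    }
}

// Host side: request shared memory only for multi-warp blocks.
// const dim3 block_dims(1024, 1, 1);
// const size_t smem = block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0;
// my_mean_f32<<<nrows, block_dims, smem, stream>>>(x, dst, ncols);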

View File

@ -28,22 +28,8 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
}
// sum up partial sums
sum = warp_reduce_sum(sum);
if (blockDim.x > WARP_SIZE) {
assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = sum;
}
__syncthreads();
sum = 0.0f;
if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
sum = s_sum[lane_id];
}
sum = warp_reduce_sum(sum);
}
__shared__ float shared_vals[32];
sum = block_reduce<block_reduce_method::SUM>(sum, shared_vals);
if (col != 0) {
return;

View File

@ -75,9 +75,6 @@ static __global__ void soft_max_f32(
const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
const float slope = get_alibi_slope(p.max_bias, i02, p.n_head_log2, p.m0, p.m1);
extern __shared__ float data_soft_max_f32[];
@ -102,21 +99,7 @@ static __global__ void soft_max_f32(
}
// find the max value in the block
max_val = warp_reduce_max(max_val);
if (block_size > WARP_SIZE) {
if (warp_id == 0) {
buf_iw[lane_id] = -INFINITY;
}
__syncthreads();
if (lane_id == 0) {
buf_iw[warp_id] = max_val;
}
__syncthreads();
max_val = buf_iw[lane_id];
max_val = warp_reduce_max(max_val);
}
max_val = block_reduce<block_reduce_method::MAX, block_size_template>(max_val, buf_iw);
float tmp = 0.0f; // partial sum
@ -134,22 +117,7 @@ static __global__ void soft_max_f32(
}
// find the sum of exps in the block
tmp = warp_reduce_sum(tmp);
if (block_size > WARP_SIZE) {
__syncthreads();
if (warp_id == 0) {
buf_iw[lane_id] = 0.0f;
}
__syncthreads();
if (lane_id == 0) {
buf_iw[warp_id] = tmp;
}
__syncthreads();
tmp = buf_iw[lane_id];
tmp = warp_reduce_sum(tmp);
}
tmp = block_reduce<block_reduce_method::SUM, block_size_template>(tmp, buf_iw);
if (sinks) {
tmp += expf(sinks[i02] - max_val);
@ -169,50 +137,6 @@ static __global__ void soft_max_f32(
}
}
// TODO: This is a common pattern used across kernels that could be moved to common.cuh + templated
static __device__ float two_stage_warp_reduce_max(float val) {
val = warp_reduce_max(val);
if (blockDim.x > WARP_SIZE) {
assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
__shared__ float local_vals[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
local_vals[warp_id] = val;
}
__syncthreads();
val = -INFINITY;
if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
val = local_vals[lane_id];
}
return warp_reduce_max(val);
} else {
return val;
}
}
static __device__ float two_stage_warp_reduce_sum(float val) {
val = warp_reduce_sum(val);
if (blockDim.x > WARP_SIZE) {
assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
__shared__ float local_vals[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
local_vals[warp_id] = val;
}
__syncthreads();
val = 0.0f;
if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
val = local_vals[lane_id];
}
return warp_reduce_sum(val);
} else {
return val;
}
}
// TODO: Template to allow keeping ncols in registers if they fit
static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __restrict__ x,
float * __restrict__ dst,
@ -230,6 +154,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
float local_vals[n_elem_per_thread] = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
float local_max = -INFINITY;
const int step_size = gridDim.x * blockDim.x;
__shared__ float shared_vals[32];
// Compute thread-local max
for (int col = col_start; col < p.ncols;) {
@ -246,7 +171,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
}
// Compute CTA-level max
local_max = two_stage_warp_reduce_max(local_max);
local_max = block_reduce<block_reduce_method::MAX>(local_max, shared_vals);
// Store CTA-level max to GMEM
if (tid == 0) {
@ -261,7 +186,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
} else {
local_max = -INFINITY;
}
local_max = two_stage_warp_reduce_max(local_max);
local_max = block_reduce<block_reduce_method::MAX>(local_max, shared_vals);
// Compute softmax dividends, accumulate divisor
float tmp_expf = 0.0f;
@ -284,7 +209,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
}
// Reduce divisor within CTA
tmp_expf = two_stage_warp_reduce_sum(tmp_expf);
tmp_expf = block_reduce<block_reduce_method::SUM>(tmp_expf, shared_vals);
// Store CTA-level sum to GMEM
if (tid == 0) {
@ -298,7 +223,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
} else {
tmp_expf = 0.0f;
}
tmp_expf = two_stage_warp_reduce_sum(tmp_expf);
tmp_expf = block_reduce<block_reduce_method::SUM>(tmp_expf, shared_vals);
// Divide dividend by global sum + store data
for (int col = col_start; col < p.ncols;) {

View File

@ -23,11 +23,6 @@ if (GGML_METAL_NDEBUG)
add_compile_definitions(GGML_METAL_NDEBUG)
endif()
# copy metal files to bin directory
configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)
set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
if (GGML_METAL_EMBED_LIBRARY)
enable_language(ASM)
@ -37,12 +32,12 @@ if (GGML_METAL_EMBED_LIBRARY)
set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h")
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/autogenerated")
# merge ggml-common.h and ggml-metal.metal into a single file
set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
set(METALLIB_EMBED_ASM "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
set(METALLIB_SOURCE_EMBED "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
add_custom_command(
OUTPUT "${METALLIB_EMBED_ASM}"
@ -62,6 +57,11 @@ if (GGML_METAL_EMBED_LIBRARY)
target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}")
else()
# copy metal files to bin directory
configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)
if (GGML_METAL_SHADER_DEBUG)
# custom command to do the following:
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air

View File

@ -14413,13 +14413,29 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
const vk_device& device = ggml_vk_get_device(ctx->device);
const bool uses_bda = (op->op == GGML_OP_IM2COL || op->op == GGML_OP_IM2COL_3D) &&
device->shader_int64 && device->buffer_device_address;
auto const & tensor_size_supported = [&](size_t tensor_size) {
if (tensor_size > device->max_buffer_size) {
return false;
}
// For im2col shaders using BDA, maxStorageBufferRange limit doesn't apply.
// If shader64BitIndexing is enabled, maxStorageBufferRange limit doesn't apply.
if (!uses_bda && !device->shader_64b_indexing) {
if (tensor_size > device->properties.limits.maxStorageBufferRange) {
return false;
}
}
return true;
};
// reject any tensors larger than the max buffer size
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op->src[i] && ggml_nbytes(op->src[i]) > device->max_buffer_size) {
if (op->src[i] && !tensor_size_supported(ggml_nbytes(op->src[i]))) {
return false;
}
}
if (ggml_nbytes(op) > device->max_buffer_size) {
if (!tensor_size_supported(ggml_nbytes(op))) {
return false;
}

View File

@ -264,7 +264,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
const i8vec2 scales = i8vec2(unpack8(uint32_t(((data_a_packed16[ib_k].scales[(is % 8 ) / 2] >> (4 * (is / 8))) & 0x0F0F) |
(((data_a_packed16[ib_k].scales[(8 + (is % 4)) / 2] >> (2 * (is / 4))) & 0x0303) << 4))).xy); // vec4 used due to #12147
buf_a[buf_ib].d_scales = FLOAT_TYPE(data_a_packed16[ib_k].d) * FLOAT_TYPE_VEC2(scales - 32);
buf_a[buf_ib].d_scales = FLOAT_TYPE_VEC2(float(data_a_packed16[ib_k].d) * vec2(scales - 32));
}
}
@ -334,7 +334,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
(data_a[ib_k].scales[is+4] >> 4) | ((data_a[ib_k].scales[is ] & 0xC0) >> 2));
}
buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(data_a_packed32[ib_k].dm) * FLOAT_TYPE_VEC2(scale_dm);
buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(vec2(data_a_packed32[ib_k].dm) * vec2(scale_dm));
}
}
@ -385,7 +385,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
const uint is = iqs_k / 4;
const i8vec2 scales = unpack8(int32_t(data_a_packed16[ib_k].scales[is / 2])).xy;
buf_a[buf_ib].d_scales = FLOAT_TYPE(data_a_packed16[ib_k].d) * FLOAT_TYPE_VEC2(scales);
buf_a[buf_ib].d_scales = FLOAT_TYPE_VEC2(float(data_a_packed16[ib_k].d) * vec2(scales));
}
}

View File

@ -1,22 +0,0 @@
COPYRIGHT AND PERMISSION NOTICE
Copyright (c) 1996 - 2026, Daniel Stenberg, <daniel@haxx.se>, and many
contributors, see the THANKS file.
All rights reserved.
Permission to use, copy, modify, and distribute this software for any purpose
with or without fee is hereby granted, provided that the above copyright
notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall not
be used in advertising or otherwise to promote the sale, use or other dealings
in this Software without prior written authorization of the copyright holder.

View File

@ -109,8 +109,7 @@ rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
# Step 2: Setup Build Environment and Compile Test Binaries
###########################################################
# Note: test-eval-callback requires -DLLAMA_CURL
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build environment"
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 || abort "Failed to build environment"
pushd "$build_dir"
make -j || abort "Failed to compile"
popd > /dev/null || exit 1

View File

@ -4,7 +4,7 @@ const path = require('path');
// This file is used for testing wasm build from emscripten
// Example build command:
// emcmake cmake -B build-wasm -DGGML_WEBGPU=ON -DLLAMA_CURL=OFF
// emcmake cmake -B build-wasm -DGGML_WEBGPU=ON -DLLAMA_OPENSSL=OFF
// cmake --build build-wasm --target test-backend-ops -j
const PORT = 8080;

View File

@ -7,7 +7,7 @@
Simple usage example:
cmake -B build -DLLAMA_CURL=1 && cmake --build build --config Release -j -t llama-server
cmake -B build && cmake --build build --config Release -j -t llama-server
export LLAMA_SERVER_BIN_PATH=$PWD/build/bin/llama-server
export LLAMA_CACHE=${LLAMA_CACHE:-$HOME/Library/Caches/llama.cpp}

View File

@ -614,9 +614,9 @@ struct llama_mlock::impl {
char* errmsg = std::strerror(errno);
bool suggest = (errno == ENOMEM);
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
// visionOS/tvOS dont't support RLIMIT_MEMLOCK
// Skip resource limit checks on visionOS/tvOS
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__)
// visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK
// Skip resource limit checks on these platforms
suggest = false;
#else
struct rlimit lock_limit;

View File

@ -446,7 +446,7 @@ struct llama_model::impl {
llama_mlocks mlock_bufs;
llama_mlocks mlock_mmaps;
// contexts where the model tensors metadata is stored as well ass the corresponding buffers:
// contexts where the model tensors metadata is stored as well as the corresponding buffers:
std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
buft_list_t cpu_buft_list;
@ -1942,16 +1942,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, true);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);

View File

@ -202,15 +202,13 @@ llama_build_and_test(
llama_build_and_test(test-regex-partial.cpp)
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2)
llama_download_model("tinyllamas/stories15M-q4_0.gguf" SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739)
else()
llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2)
llama_download_model("tinyllamas/stories15M-be.Q4_0.gguf" SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d)
endif()
llama_build_and_test(test-thread-safety.cpp ARGS -m "${LLAMA_DOWNLOAD_MODEL}" -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2)
# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
if (NOT WIN32)
llama_build_and_test(test-arg-parser.cpp)
endif()
llama_build_and_test(test-arg-parser.cpp)
if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
# TODO: repair known memory leaks
@ -225,11 +223,7 @@ llama_build_and_test(test-backend-sampler.cpp LABEL "model")
# Test for state restore with fragmented KV cache
# Requires a model, uses same args pattern as test-thread-safety
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf)
else()
llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf)
endif()
llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -m "${LLAMA_DOWNLOAD_MODEL}")
if (NOT GGML_BACKEND_DL)
# these tests use the backends directly and cannot be built with dynamic loading

View File

@ -173,7 +173,7 @@ int main(void) {
assert(params.cpuparams.n_threads == 1010);
#endif // _WIN32
printf("test-arg-parser: test curl-related functions\n\n");
printf("test-arg-parser: test download functions\n\n");
const char * GOOD_URL = "http://ggml.ai/";
const char * BAD_URL = "http://ggml.ai/404";

View File

@ -7482,25 +7482,29 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_softcap(GGML_TYPE_F32, {10, 10, 10, 10}, 50.0f));
test_cases.emplace_back(new test_silu_back());
for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f}) {
for (bool v : {false, true}) {
test_cases.emplace_back(new test_norm (GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
for (float eps : { 0.0f, 1e-6f, 1e-4f, 1e-1f }) {
for (uint32_t n : { 64, 1025 }) {
for (bool v : { false, true }) {
test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps));
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps));
}
test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, { n, 5, 4, 3 }, eps));
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps));
}
test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
}
// in-place tests
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, false, 1e-6f, true));
for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
for (float eps : { 0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f }) {
for (uint32_t n : { 64, 1025 }) {
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false));
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true));
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false));
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true));
test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false));
test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true));
}
}
for (uint32_t n : {1, 511, 1025, 8192, 33*512}) {
for (bool multi_add : {false, true}) {
@ -7524,9 +7528,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
}
}
}
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));
for (int64_t d_conv : {3, 4, 9}) {
for (int64_t d_inner: {1024, 1536, 2048}) {
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {d_conv, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));

View File

@ -32,10 +32,6 @@ struct clip_graph {
const float kq_scale;
const clip_flash_attn_type flash_attn_type;
// for debugging
const bool debug_graph;
std::vector<ggml_tensor *> & debug_print_tensors;
ggml_context_ptr ctx0_ptr;
ggml_context * ctx0;
ggml_cgraph * gf;

View File

@ -152,18 +152,14 @@ struct clip_ctx {
ggml_backend_t backend_cpu = nullptr;
ggml_backend_buffer_ptr buf;
int max_nodes = 8192;
ggml_backend_sched_ptr sched;
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
bool is_allocated = false;
// for debugging
bool debug_graph = false;
std::vector<ggml_tensor *> debug_print_tensors;
clip_ctx(clip_context_params & ctx_params) {
flash_attn_type = ctx_params.flash_attn_type;
debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (!backend_cpu) {
throw std::runtime_error("failed to initialize CPU backend");
@ -204,6 +200,10 @@ struct clip_ctx {
sched.reset(
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
);
if (ctx_params.cb_eval != nullptr) {
ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
}
}
~clip_ctx() {
@ -239,9 +239,7 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
n_mmproj_embd(clip_n_mmproj_embd(ctx)),
eps(hparams.eps),
kq_scale(1.0f / sqrtf((float)d_head)),
flash_attn_type(ctx->flash_attn_type),
debug_graph(ctx->debug_graph),
debug_print_tensors(ctx->debug_print_tensors) {
flash_attn_type(ctx->flash_attn_type) {
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
@ -252,14 +250,11 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
}
void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
if (debug_graph) {
ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
ggml_set_name(cur, cur_name.c_str());
ggml_set_output(cur);
ggml_build_forward_expand(gf, cur);
debug_print_tensors.push_back(cur);
void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
if (il >= 0) {
ggml_format_name(cur, "%s-%d", name, il);
} else {
ggml_set_name(cur, name);
}
}
@ -1533,8 +1528,8 @@ struct clip_model_loader {
model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
@ -1776,8 +1771,8 @@ struct clip_model_loader {
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
} break;
case PROJECTOR_TYPE_LLAMA4:
{
@ -3364,7 +3359,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
// build the inference graph
ctx->debug_print_tensors.clear();
ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@ -3757,18 +3751,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
return false;
}
// print debug nodes
if (ctx->debug_graph) {
LOG_INF("\n\n---\n\n");
LOG_INF("\n\nDebug graph:\n\n");
for (ggml_tensor * t : ctx->debug_print_tensors) {
std::vector<uint8_t> data(ggml_nbytes(t));
ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
print_tensor_shape(t);
print_tensor_data(t, data.data(), 3);
}
}
// the last node is the embedding tensor
ggml_tensor * embeddings = ggml_graph_node(gf, -1);
@ -3921,7 +3903,6 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
//
// API for debugging
//
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
clip_image_f32 img;
img.nx = w;
@ -3930,9 +3911,6 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
for (int i = 0; i < h * w * 3; i++) {
img.buf[i] = static_cast<float>(fill_value);
}
bool cur_debug_graph = ctx->debug_graph;
ctx->debug_graph = true;
clip_image_encode(ctx, 1, &img, nullptr);
ctx->debug_graph = cur_debug_graph;
GGML_ASSERT(img.buf.empty() && "expected, always stop here");
}

View File

@ -1,6 +1,7 @@
#pragma once
#include "ggml.h"
#include "mtmd.h"
#include <stddef.h>
#include <stdint.h>
@ -37,6 +38,8 @@ struct clip_context_params {
int image_min_tokens;
int image_max_tokens;
bool warmup;
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
};
struct clip_init_result {

View File

@ -1,4 +1,5 @@
#include "arg.h"
#include "debug.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
@ -88,6 +89,8 @@ struct mtmd_cli_context {
int n_threads = 1;
llama_pos n_past = 0;
base_callback_data cb_data;
mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
model = llama_init->model();
lctx = llama_init->context();
@ -139,6 +142,10 @@ struct mtmd_cli_context {
mparams.warmup = params.warmup;
mparams.image_min_tokens = params.image_min_tokens;
mparams.image_max_tokens = params.image_max_tokens;
if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) {
mparams.cb_eval_user_data = &cb_data;
mparams.cb_eval = common_debug_cb_eval<false>;
}
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
if (!ctx_vision.get()) {
LOG_ERR("Failed to load vision model from %s\n", clip_path);

View File

@ -111,6 +111,8 @@ mtmd_context_params mtmd_context_params_default() {
/* warmup */ true,
/* image_min_tokens */ -1,
/* image_max_tokens */ -1,
/* cb_eval */ nullptr,
/* cb_eval_user_data */ nullptr,
};
return params;
}
@ -176,6 +178,8 @@ struct mtmd_context {
/* image_min_tokens */ ctx_params.image_min_tokens,
/* image_max_tokens */ ctx_params.image_max_tokens,
/* warmup */ ctx_params.warmup,
/* cb_eval */ ctx_params.cb_eval,
/* cb_eval_user_data */ ctx_params.cb_eval_user_data,
};
auto res = clip_init(mmproj_fname, ctx_clip_params);

View File

@ -95,6 +95,10 @@ struct mtmd_context_params {
// limit number of image tokens, only for vision models with dynamic resolution
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
// callback function passed through to the underlying clip context
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
};
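A minimal sketch of wiring the new fields, mirroring the mtmd-cli change above (base_callback_data and common_debug_cb_eval<false> come from common/debug.h; the mmproj path is a placeholder):

mtmd_context_params mparams = mtmd_context_params_default();
base_callback_data cb_data; // must outlive the mtmd context
if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) {
    mparams.cb_eval           = common_debug_cb_eval<false>;
    mparams.cb_eval_user_data = &cb_data;
}
mtmd_context * ctx = mtmd_init_from_file("mmproj.gguf", model, mparams);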
MTMD_API const char * mtmd_default_marker(void);
@ -273,12 +277,12 @@ struct bitmap {
ptr.reset(mtmd_bitmap_init(nx, ny, data));
}
~bitmap() = default;
uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
};
struct bitmaps {
@ -302,8 +306,8 @@ struct input_chunks {
input_chunks() = default;
input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
~input_chunks() = default;
size_t size() { return mtmd_input_chunks_size(ptr.get()); }
const mtmd_input_chunk * operator[](size_t idx) {
size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
const mtmd_input_chunk * operator[](size_t idx) const {
return mtmd_input_chunks_get(ptr.get(), idx);
}
};
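For illustration, a short use of these wrappers; the mtmd namespace and the 3-bytes-per-pixel layout are assumptions here, and only accessors visible in this fragment are called:

unsigned char rgb[64 * 64 * 3] = {0}; // dummy 64x64 RGB image
const mtmd::bitmap bmp(64, 64, rgb);  // wraps mtmd_bitmap_init
bmp.set_id("img-0");                  // usable through const references now
printf("%u x %u, %zu bytes, id=%s\n", bmp.nx(), bmp.ny(), bmp.n_bytes(), bmp.id().c_str());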

View File

@ -4,7 +4,7 @@ This example demonstrates the Text To Speech feature. It uses a
[outeai](https://www.outeai.com/).
## Quickstart
If you have built llama.cpp with `-DLLAMA_CURL=ON` you can simply run the
If you have built llama.cpp with SSL support, you can simply run the
following command and the required models will be downloaded automatically:
```console
$ build/bin/llama-tts --tts-oute-default -p "Hello world" && aplay output.wav