Merge branch 'master' of https://github.com/ggml-org/llama.cpp into paddleocr-vl

commit b9fd3c8578
@@ -13,7 +13,7 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.
 FROM ${CANN_BASE_IMAGE} AS build

 # -- Install build dependencies --
-RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
+RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
     yum clean all && \
     rm -rf /var/cache/yum
@@ -5,7 +5,7 @@ FROM ubuntu:$UBUNTU_VERSION AS build
 ARG TARGETARCH

 RUN apt-get update && \
-    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+    apt-get install -y build-essential git cmake libssl-dev

 WORKDIR /app
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1

 WORKDIR /app
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1

 WORKDIR /app
@@ -6,7 +6,7 @@ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
+    apt-get install -y git libssl-dev

 WORKDIR /app
@@ -6,7 +6,7 @@ WORKDIR /app

 COPY . .

-RUN yum install -y gcc g++ cmake make libcurl-devel
+RUN yum install -y gcc g++ cmake make openssl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
@@ -18,7 +18,7 @@ RUN apt-get update && \
     python3 \
     python3-pip \
     git \
-    libcurl4-openssl-dev \
+    libssl-dev \
     libgomp1

 WORKDIR /app
@@ -32,7 +32,6 @@
   useMpi ? false,
   useRocm ? config.rocmSupport,
   rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
-  enableCurl ? true,
   useVulkan ? false,
   useRpc ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

@@ -160,15 +159,13 @@ effectiveStdenv.mkDerivation (finalAttrs: {
       ++ optionals useMpi [ mpi ]
       ++ optionals useRocm rocmBuildInputs
       ++ optionals useBlas [ blas ]
-      ++ optionals useVulkan vulkanBuildInputs
-      ++ optionals enableCurl [ curl ];
+      ++ optionals useVulkan vulkanBuildInputs;

     cmakeFlags =
       [
         (cmakeBool "LLAMA_BUILD_SERVER" true)
         (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
         (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_CURL" enableCurl)
         (cmakeBool "GGML_NATIVE" false)
         (cmakeBool "GGML_BLAS" useBlas)
         (cmakeBool "GGML_CUDA" useCuda)
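Downstream Nix expressions that passed `enableCurl` to this derivation have to drop that argument after this hunk; the remaining feature flags are overridden as before. An illustrative override (attribute names taken from the arguments shown above, not from this commit):

```nix
# Illustrative only: `enableCurl` is no longer an accepted argument of the package.
llama-cpp.override { useVulkan = true; useRpc = true; }
```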
@@ -27,7 +27,7 @@ RUN apt-get update \
     build-essential \
     cmake \
     git \
-    libcurl4-openssl-dev \
+    libssl-dev \
     curl \
     libgomp1

@@ -11,7 +11,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     apt install -y --no-install-recommends \
     git cmake ccache ninja-build \
     # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
-    libopenblas-dev libcurl4-openssl-dev && \
+    libopenblas-dev libssl-dev && \
     rm -rf /var/lib/apt/lists/*

 WORKDIR /app
@@ -5,8 +5,8 @@ FROM ubuntu:$UBUNTU_VERSION AS build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils

-# Install cURL and Vulkan SDK dependencies
-RUN apt install -y libcurl4-openssl-dev curl \
+# Install SSL and Vulkan SDK dependencies
+RUN apt install -y libssl-dev curl \
     libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc

 # Build it
@@ -20,7 +20,7 @@ jobs:
        run: |
          PREFIX="$(pwd)"/inst
          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-           -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
+           -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release
          cmake --install build --prefix "$PREFIX" --config Release
@ -30,7 +30,7 @@ jobs:
|
|||
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cmake -B build -DLLAMA_CURL=OFF \
|
||||
# cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_OPENMP=OFF \
|
||||
# -DLLAMA_BUILD_EXAMPLES=ON \
|
||||
|
|
@ -76,7 +76,7 @@ jobs:
|
|||
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cmake -B build -DLLAMA_CURL=OFF \
|
||||
# cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_VULKAN=ON \
|
||||
# -DGGML_OPENMP=OFF \
|
||||
|
|
@ -122,7 +122,7 @@ jobs:
|
|||
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cmake -B build -DLLAMA_CURL=OFF \
|
||||
# cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_VULKAN=ON \
|
||||
# -DGGML_OPENMP=OFF \
|
||||
|
|
@ -178,7 +178,7 @@ jobs:
|
|||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_CURL=OFF \
|
||||
cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
|
|
@ -235,7 +235,7 @@ jobs:
|
|||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_CURL=OFF \
|
||||
cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_VULKAN=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
|
|
@ -281,7 +281,7 @@ jobs:
|
|||
- name: Build
|
||||
run: |
|
||||
export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
|
||||
cmake -B build -DLLAMA_CURL=OFF \
|
||||
cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
|
|
|
|||
|
|
@ -79,7 +79,6 @@ jobs:
|
|||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=OFF \
|
||||
|
|
@ -92,7 +91,7 @@ jobs:
|
|||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L 'main|curl' --verbose --timeout 900
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
macOS-latest-cmake-x64:
|
||||
runs-on: macos-15-intel
|
||||
|
|
@ -118,7 +117,6 @@ jobs:
|
|||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON \
|
||||
|
|
@ -227,8 +225,6 @@ jobs:
|
|||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
|
@ -237,7 +233,7 @@ jobs:
|
|||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L 'main|curl' --verbose --timeout 900
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
|
|
@ -293,8 +289,6 @@ jobs:
|
|||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
|
|
@ -305,8 +299,6 @@ jobs:
|
|||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
|
|
@ -336,14 +328,10 @@ jobs:
|
|||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_LLGUIDANCE=ON
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
|
|
@ -377,8 +365,6 @@ jobs:
|
|||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
|
|
@ -412,8 +398,6 @@ jobs:
|
|||
id: cmake_configure
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
|
|
@ -470,8 +454,6 @@ jobs:
|
|||
run: |
|
||||
source ./vulkan_sdk/setup-env.sh
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DGGML_VULKAN=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
|
|
@ -545,8 +527,6 @@ jobs:
|
|||
run: |
|
||||
export Dawn_DIR=dawn/lib64/cmake/Dawn
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DGGML_WEBGPU=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
|
|
@ -593,7 +573,7 @@ jobs:
|
|||
source emsdk/emsdk_env.sh
|
||||
emcmake cmake -B build-wasm \
|
||||
-DGGML_WEBGPU=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
|
||||
|
||||
cmake --build build-wasm --target test-backend-ops -j $(nproc)
|
||||
|
|
@ -624,8 +604,6 @@ jobs:
|
|||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DGGML_HIP=ON
|
||||
|
|
@ -657,8 +635,6 @@ jobs:
|
|||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DGGML_MUSA=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
|
|
@ -706,8 +682,6 @@ jobs:
|
|||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx
|
||||
|
|
@ -757,8 +731,6 @@ jobs:
|
|||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
|
|
@ -893,7 +865,7 @@ jobs:
|
|||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
|
|
@ -1043,7 +1015,7 @@ jobs:
|
|||
id: cmake_build
|
||||
run: |
|
||||
cmake -S . -B build ${{ matrix.defines }} `
|
||||
-DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
|
|
@ -1101,8 +1073,6 @@ jobs:
|
|||
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
|
||||
run: |
|
||||
cmake -S . -B build -G Ninja \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_CUDA_ARCHITECTURES=89-real \
|
||||
|
|
@ -1150,7 +1120,6 @@ jobs:
|
|||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DLLAMA_BUILD_SERVER=ON ^
|
||||
-DLLAMA_CURL=OFF ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
|
|
@ -1258,7 +1227,6 @@ jobs:
|
|||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DLLAMA_CURL=OFF `
|
||||
-DLLAMA_BUILD_BORINGSSL=ON `
|
||||
-DROCM_DIR="${env:HIP_PATH}" `
|
||||
-DGGML_HIP=ON `
|
||||
|
|
@ -1285,7 +1253,7 @@ jobs:
|
|||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
|
|
@ -1352,7 +1320,7 @@ jobs:
|
|||
matrix:
|
||||
include:
|
||||
- build: 'arm64-cpu'
|
||||
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_CURL=OFF -D GGML_OPENMP=OFF'
|
||||
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
|
||||
- build: 'arm64-snapdragon'
|
||||
defines: '--preset arm64-android-snapdragon-release'
|
||||
|
||||
|
|
@ -1469,8 +1437,6 @@ jobs:
|
|||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE}
|
||||
cmake --build build -j $(nproc)
|
||||
|
|
@ -1499,7 +1465,7 @@ jobs:
|
|||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
|
|
@ -1525,7 +1491,7 @@ jobs:
|
|||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
|
|
@ -1551,7 +1517,7 @@ jobs:
|
|||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
|
|
@ -1577,7 +1543,7 @@ jobs:
|
|||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
|
|
@ -1603,7 +1569,7 @@ jobs:
|
|||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
|
|
@ -1767,7 +1733,7 @@ jobs:
|
|||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential libcurl4-openssl-dev
|
||||
sudo apt-get install -y build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
|
|
@ -1834,8 +1800,6 @@ jobs:
|
|||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
|
|
@ -1853,7 +1817,7 @@ jobs:
|
|||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L 'main|curl' --verbose --timeout 900
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
|
|
@ -1928,7 +1892,7 @@ jobs:
|
|||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
|
|
@ -1947,7 +1911,7 @@ jobs:
|
|||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
|
|
@ -2018,7 +1982,7 @@ jobs:
|
|||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
|
|
@ -2092,8 +2056,6 @@ jobs:
|
|||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
|
|
@ -2129,7 +2091,6 @@ jobs:
|
|||
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
libcurl4-openssl-dev \
|
||||
python3-venv \
|
||||
gpg \
|
||||
wget \
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ jobs:
|
|||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
# Install git-clang-format script for formatting only changed code
|
||||
wget -O /tmp/git-clang-format https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format
|
||||
sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format
|
||||
|
|
|
|||
|
|
@ -45,7 +45,6 @@ jobs:
|
|||
-DCMAKE_INSTALL_RPATH='@loader_path' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
|
|
@ -95,7 +94,6 @@ jobs:
|
|||
-DCMAKE_INSTALL_RPATH='@loader_path' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON \
|
||||
|
|
@ -161,8 +159,6 @@ jobs:
|
|||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
|
|
@ -212,8 +208,6 @@ jobs:
|
|||
cmake -B build \
|
||||
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
|
|
@ -269,7 +263,6 @@ jobs:
|
|||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
|
||||
-DLLAMA_CURL=OFF ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
|
|
@ -358,7 +351,7 @@ jobs:
|
|||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
|
||||
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --config Release --target ${{ matrix.target }}
|
||||
|
||||
- name: Pack artifacts
|
||||
|
|
@ -412,7 +405,7 @@ jobs:
|
|||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_CPU=OFF ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DLLAMA_CURL=OFF ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_CUDA_CUB_3DOT2=ON
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
|
||||
|
|
@ -481,7 +474,7 @@ jobs:
|
|||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
-DLLAMA_CURL=OFF
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --target ggml-sycl -j
|
||||
|
||||
- name: Build the release package
|
||||
|
|
@ -608,7 +601,7 @@ jobs:
|
|||
-DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DLLAMA_CURL=OFF
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
md "build\bin\hipblaslt\library"
|
||||
|
|
@ -649,7 +642,7 @@ jobs:
|
|||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
|
|
@ -734,8 +727,6 @@ jobs:
|
|||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE}
|
||||
cmake --build build -j $(nproc)
|
||||
|
|
|
|||
|
|
@ -168,8 +168,6 @@ jobs:
|
|||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
|
|
@ -182,8 +180,6 @@ jobs:
|
|||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||
|
|
@ -195,8 +191,6 @@ jobs:
|
|||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ jobs:
|
|||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
|
|
@ -108,7 +108,7 @@ jobs:
|
|||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
|
|
|
|||
|
|
@@ -111,11 +111,16 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})

 # 3rd party libs
-option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
-option(LLAMA_HTTPLIB "llama: if libcurl is disabled, use httplib to download model from an URL" ON)
-option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
+option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
+option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

+# deprecated
+option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
+if (LLAMA_CURL)
+    message(WARNING "LLAMA_CURL option is deprecated and will be ignored")
+endif()
+
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
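With this hunk, `-DLLAMA_CURL` becomes a deprecated no-op that only emits a warning; download and HTTPS support are now governed by `LLAMA_HTTPLIB` and `LLAMA_OPENSSL`, both ON by default. A minimal configure sketch under the new options (illustrative, not part of this commit):

```sh
# Sketch only: httplib-based downloader with OpenSSL TLS (the new defaults, stated explicitly)
cmake -B build \
    -DLLAMA_HTTPLIB=ON \
    -DLLAMA_OPENSSL=ON \
    -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
```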
@@ -212,11 +217,6 @@ add_subdirectory(src)
 # utils, programs, examples and tests
 #

-if (NOT LLAMA_BUILD_COMMON)
-    message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
-    set(LLAMA_CURL OFF)
-endif()
-
 if (LLAMA_BUILD_COMMON)
     add_subdirectory(common)
     if (LLAMA_HTTPLIB)
@@ -586,6 +586,5 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 - [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
 - [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
 - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
-- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
 - [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
 - [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
@ -414,7 +414,7 @@ cmake -B build-ios-sim -G Xcode \
|
|||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-S .
|
||||
cmake --build build-ios-sim --config Release -- -quiet
|
||||
|
||||
|
|
@ -428,7 +428,7 @@ cmake -B build-ios-device -G Xcode \
|
|||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-S .
|
||||
cmake --build build-ios-device --config Release -- -quiet
|
||||
|
||||
|
|
@ -439,7 +439,7 @@ cmake -B build-macos -G Xcode \
|
|||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-S .
|
||||
cmake --build build-macos --config Release -- -quiet
|
||||
|
||||
|
|
@ -453,7 +453,7 @@ cmake -B build-visionos -G Xcode \
|
|||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_HTTPLIB=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-S .
|
||||
|
|
@ -469,7 +469,7 @@ cmake -B build-visionos-sim -G Xcode \
|
|||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_HTTPLIB=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-S .
|
||||
|
|
@ -487,7 +487,7 @@ cmake -B build-tvos-sim -G Xcode \
|
|||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-S .
|
||||
cmake --build build-tvos-sim --config Release -- -quiet
|
||||
|
||||
|
|
@ -502,7 +502,7 @@ cmake -B build-tvos-device -G Xcode \
|
|||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-S .
|
||||
cmake --build build-tvos-device --config Release -- -quiet
|
||||
|
||||
|
|
|
|||
|
|
@@ -45,7 +45,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@@ -33,3 +33,25 @@ function(llama_add_compile_flags)
         endif()
     endif()
 endfunction()
+
+function(llama_download_model NAME HASH)
+    set(DEST "${CMAKE_BINARY_DIR}/${NAME}")
+    get_filename_component(DEST_DIR "${DEST}" DIRECTORY)
+    file(MAKE_DIRECTORY "${DEST_DIR}")
+    if(NOT EXISTS "${DEST}")
+        message(STATUS "Downloading ${NAME} from ggml-org/models...")
+    endif()
+    file(DOWNLOAD
+        "https://huggingface.co/ggml-org/models/resolve/main/${NAME}?download=true"
+        "${DEST}"
+        TLS_VERIFY ON
+        EXPECTED_HASH ${HASH}
+        STATUS status
+    )
+    list(GET status 0 code)
+    if(NOT code EQUAL 0)
+        list(GET status 1 msg)
+        message(FATAL_ERROR "Failed to download ${NAME}: ${msg}")
+    endif()
+    set(LLAMA_DOWNLOAD_MODEL "${DEST}" PARENT_SCOPE)
+endfunction()
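The new `llama_download_model` helper fetches a file from the `ggml-org/models` repository on Hugging Face into the build directory, verifies it against the supplied hash, and exports the resulting path as `LLAMA_DOWNLOAD_MODEL` in the caller's scope. A hypothetical call site, with a placeholder model path and hash that are not taken from this commit:

```cmake
# Hypothetical usage sketch; the model path and SHA-256 value below are placeholders.
llama_download_model("example-model/stories-q4_0.gguf"
    "SHA256=0000000000000000000000000000000000000000000000000000000000000000")
message(STATUS "test model available at ${LLAMA_DOWNLOAD_MODEL}")
```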
@@ -60,6 +60,8 @@ add_library(${TARGET} STATIC
     common.h
     console.cpp
     console.h
+    debug.cpp
+    debug.h
     download.cpp
     download.h
     http.h

@@ -95,17 +97,7 @@ endif()
 # TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
 set(LLAMA_COMMON_EXTRA_LIBS build_info)

-if (LLAMA_CURL)
-    # Use curl to download model url
-    find_package(CURL)
-    if (NOT CURL_FOUND)
-        message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
-    endif()
-    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
-    include_directories(${CURL_INCLUDE_DIRS})
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
-elseif (LLAMA_HTTPLIB)
-    # otherwise, use cpp-httplib
+if (LLAMA_HTTPLIB)
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
 endif()
@@ -341,7 +341,7 @@ static handle_model_result common_params_handle_model(
     if (model.path.empty()) {
         auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
         if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
-            exit(1); // built without CURL, error message already printed
+            exit(1); // error message already printed
         }
         model.name = model.hf_repo; // repo name with tag
         model.hf_repo = auto_detected.repo; // repo name without tag
@ -0,0 +1,165 @@
|
|||
#include "debug.h"
|
||||
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
|
||||
static std::string common_ggml_ne_string(const ggml_tensor * t) {
|
||||
std::string str;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
str += std::to_string(t->ne[i]);
|
||||
if (i + 1 < GGML_MAX_DIMS) {
|
||||
str += ", ";
|
||||
}
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
static float common_ggml_get_float_value(const uint8_t * data,
|
||||
ggml_type type,
|
||||
const size_t * nb,
|
||||
size_t i0,
|
||||
size_t i1,
|
||||
size_t i2,
|
||||
size_t i3) {
|
||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||
float v;
|
||||
if (type == GGML_TYPE_F16) {
|
||||
v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
|
||||
} else if (type == GGML_TYPE_F32) {
|
||||
v = *(const float *) &data[i];
|
||||
} else if (type == GGML_TYPE_I64) {
|
||||
v = (float) *(const int64_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I32) {
|
||||
v = (float) *(const int32_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I16) {
|
||||
v = (float) *(const int16_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I8) {
|
||||
v = (float) *(const int8_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_BF16) {
|
||||
v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
template <bool abort>
|
||||
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||
GGML_ASSERT(n > 0);
|
||||
float sum = 0;
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||
sum += v;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
LOG_ERR(" [\n");
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
if (i2 == n && ne[2] > 2 * n) {
|
||||
LOG_ERR(" ..., \n");
|
||||
i2 = ne[2] - n;
|
||||
}
|
||||
LOG_ERR(" [\n");
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
if (i1 == n && ne[1] > 2 * n) {
|
||||
LOG_ERR(" ..., \n");
|
||||
i1 = ne[1] - n;
|
||||
}
|
||||
LOG_ERR(" [");
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
if (i0 == n && ne[0] > 2 * n) {
|
||||
LOG_ERR("..., ");
|
||||
i0 = ne[0] - n;
|
||||
}
|
||||
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||
LOG_ERR("%12.4f", v);
|
||||
if (i0 < ne[0] - 1) {
|
||||
LOG_ERR(", ");
|
||||
}
|
||||
}
|
||||
LOG_ERR("],\n");
|
||||
}
|
||||
LOG_ERR(" ],\n");
|
||||
}
|
||||
LOG_ERR(" ]\n");
|
||||
LOG_ERR(" sum = %f\n", sum);
|
||||
}
|
||||
|
||||
if constexpr (abort) {
|
||||
if (std::isnan(sum)) {
|
||||
LOG_ERR("encountered NaN - aborting\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* GGML operations callback during the graph execution.
|
||||
*
|
||||
* @param t current tensor
|
||||
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
|
||||
* see ggml_backend_sched_eval_callback
|
||||
* @param user_data user data to pass at each call back
|
||||
* @return true to receive data or continue the graph, false otherwise
|
||||
*/
|
||||
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
auto * cb_data = (base_callback_data *) user_data;
|
||||
|
||||
const struct ggml_tensor * src0 = t->src[0];
|
||||
const struct ggml_tensor * src1 = t->src[1];
|
||||
|
||||
if (ask) {
|
||||
return true; // Always retrieve data
|
||||
}
|
||||
|
||||
bool matches_filter = cb_data->tensor_filters.empty();
|
||||
|
||||
if (!matches_filter) {
|
||||
for (const auto & filter : cb_data->tensor_filters) {
|
||||
if (std::regex_search(t->name, filter)) {
|
||||
matches_filter = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
char src1_str[128] = { 0 };
|
||||
if (src1) {
|
||||
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
|
||||
}
|
||||
|
||||
if (matches_filter) {
|
||||
LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
|
||||
ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
|
||||
common_ggml_ne_string(t).c_str());
|
||||
}
|
||||
|
||||
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
|
||||
|
||||
if (!is_host) {
|
||||
auto n_bytes = ggml_nbytes(t);
|
||||
cb_data->data.resize(n_bytes);
|
||||
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
|
||||
}
|
||||
|
||||
if (!ggml_is_quantized(t->type) && matches_filter) {
|
||||
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
|
||||
common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Explicit template instantiations
|
||||
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
|
||||
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
|
||||
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
|
||||
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
|
||||
|
|
@@ -0,0 +1,43 @@
+#pragma once
+#include "common.h"
+#include <string>
+#include <vector>
+#include <regex>
+
+// common debug functions and structs
+
+// Print a tensor's detailed data
+// data - the tensor's data in byte format
+// type - the tensor's quantization type
+// ne - the tensor dimensions array
+// nb - the tensor strides array
+// n - the number of rows/columns to fully print
+template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
+
+// Intended to use as callback for ggml_backend_sched_eval_callback
+// prints tensors that are processed in the computation graph
+// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
+// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
+// The template parameter determines whether an error should be thrown whenever a NaN is encountered
+// in a tensor (useful for stopping debug sessions on first erroneous tensor)
+// The callback data will be passed as the third parameter (user_data)
+template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
+struct base_callback_data {
+    std::vector<uint8_t> data;
+    std::vector<std::regex> tensor_filters;
+
+    base_callback_data() = default;
+
+    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
+        for (const auto & pattern : filter_patterns) {
+            try {
+                std::string anchored_pattern = "^" + pattern;
+                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
+            } catch (const std::regex_error & e) {
+                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
+            }
+        }
+        params.cb_eval = common_debug_cb_eval<false>;
+        params.cb_eval_user_data = this;
+    }
+};
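A minimal usage sketch for the new header (the surrounding tool setup is assumed and not shown in this hunk): the `base_callback_data` constructor installs `common_debug_cb_eval<false>` as `params.cb_eval` and stores a pointer to itself as the user data, so the instance has to outlive any context created from `params`.

```cpp
#include "debug.h"

int main() {
    common_params params;
    // ... argument parsing into `params` is assumed here ...

    // Print only tensors whose names start with "attn" or "ffn" while the graph runs;
    // the constructor sets params.cb_eval / params.cb_eval_user_data, and `cb_data`
    // must stay alive for as long as the context created from `params` is used.
    base_callback_data cb_data(params, {"attn", "ffn"});

    // ... common_init_from_params(params), decoding, etc. (assumed, not shown) ...
    return 0;
}
```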
@@ -19,10 +19,7 @@
 #include <thread>
 #include <vector>

-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#elif defined(LLAMA_USE_HTTPLIB)
+#if defined(LLAMA_USE_HTTPLIB)
 #include "http.h"
 #endif
@ -171,336 +168,7 @@ std::pair<std::string, std::string> common_download_split_repo_tag(const std::st
|
|||
return {hf_repo, tag};
|
||||
}
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
// CURL utils
|
||||
//
|
||||
|
||||
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
||||
|
||||
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
|
||||
struct curl_slist_ptr {
|
||||
struct curl_slist * ptr = nullptr;
|
||||
~curl_slist_ptr() {
|
||||
if (ptr) {
|
||||
curl_slist_free_all(ptr);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static CURLcode common_curl_perf(CURL * curl) {
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res != CURLE_OK) {
|
||||
LOG_ERR("%s: curl_easy_perform() failed\n", __func__);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// Send a HEAD request to retrieve the etag and last-modified headers
|
||||
struct common_load_model_from_url_headers {
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
std::string accept_ranges;
|
||||
};
|
||||
|
||||
struct FILE_deleter {
|
||||
void operator()(FILE * f) const { fclose(f); }
|
||||
};
|
||||
|
||||
static size_t common_header_callback(char * buffer, size_t, size_t n_items, void * userdata) {
|
||||
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
||||
static std::regex header_regex("([^:]+): (.*)\r\n");
|
||||
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
||||
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
||||
static std::regex accept_ranges_regex("Accept-Ranges", std::regex_constants::icase);
|
||||
std::string header(buffer, n_items);
|
||||
std::smatch match;
|
||||
if (std::regex_match(header, match, header_regex)) {
|
||||
const std::string & key = match[1];
|
||||
const std::string & value = match[2];
|
||||
if (std::regex_match(key, match, etag_regex)) {
|
||||
headers->etag = value;
|
||||
} else if (std::regex_match(key, match, last_modified_regex)) {
|
||||
headers->last_modified = value;
|
||||
} else if (std::regex_match(key, match, accept_ranges_regex)) {
|
||||
headers->accept_ranges = value;
|
||||
}
|
||||
}
|
||||
|
||||
return n_items;
|
||||
}
|
||||
|
||||
static size_t common_write_callback(void * data, size_t size, size_t nmemb, void * fd) {
|
||||
return std::fwrite(data, size, nmemb, static_cast<FILE *>(fd));
|
||||
}
|
||||
|
||||
// helper function to hide password in URL
|
||||
static std::string llama_download_hide_password_in_url(const std::string & url) {
|
||||
// Use regex to match and replace the user[:password]@ pattern in URLs
|
||||
// Pattern: scheme://[user[:password]@]host[...]
|
||||
static const std::regex url_regex(R"(^([A-Za-z][A-Za-z0-9+.-]*://)([^/@]+@)(.*)$)");
|
||||
std::smatch match;
|
||||
|
||||
if (std::regex_match(url, match, url_regex)) {
|
||||
// match[1] = scheme (e.g., "https://")
|
||||
// match[2] = user[:password]@ part
|
||||
// match[3] = rest of URL (host and path)
|
||||
return match[1].str() + "********@" + match[3].str();
|
||||
}
|
||||
|
||||
return url; // No credentials found or malformed URL
|
||||
}
|
||||
|
||||
static void common_curl_easy_setopt_head(CURL * curl, const std::string & url) {
|
||||
// Set the URL, allow to follow http redirection
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||
|
||||
# if defined(_WIN32)
|
||||
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
||||
// operating system. Currently implemented under MS-Windows.
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
# endif
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
||||
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
||||
curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, common_header_callback);
|
||||
}
|
||||
|
||||
static void common_curl_easy_setopt_get(CURL * curl) {
|
||||
curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, common_write_callback);
|
||||
|
||||
// display download progress
|
||||
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
|
||||
}
|
||||
|
||||
static bool common_pull_file(CURL * curl, const std::string & path_temporary) {
|
||||
if (std::filesystem::exists(path_temporary)) {
|
||||
const std::string partial_size = std::to_string(std::filesystem::file_size(path_temporary));
|
||||
LOG_INF("%s: server supports range requests, resuming download from byte %s\n", __func__, partial_size.c_str());
|
||||
const std::string range_str = partial_size + "-";
|
||||
curl_easy_setopt(curl, CURLOPT_RANGE, range_str.c_str());
|
||||
}
|
||||
|
||||
// Always open the file in append mode, since we could be resuming a partial download
|
||||
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "ab"));
|
||||
if (!outfile) {
|
||||
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_temporary.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
common_curl_easy_setopt_get(curl);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile.get());
|
||||
|
||||
return common_curl_perf(curl) == CURLE_OK;
|
||||
}
|
||||
|
||||
static bool common_download_head(CURL * curl,
|
||||
curl_slist_ptr & http_headers,
|
||||
const std::string & url,
|
||||
const std::string & bearer_token) {
|
||||
if (!curl) {
|
||||
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
// Check if hf-token or bearer-token was specified
|
||||
if (!bearer_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
}
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
common_curl_easy_setopt_head(curl, url);
|
||||
return common_curl_perf(curl) == CURLE_OK;
|
||||
}
|
||||
|
||||
// download one single file from remote URL to local path
|
||||
// returns status code or -1 on error
|
||||
static int common_download_file_single_online(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
const common_header_list & custom_headers) {
|
||||
static const int max_attempts = 3;
|
||||
static const int retry_delay_seconds = 2;
|
||||
|
||||
for (int i = 0; i < max_attempts; ++i) {
|
||||
std::string etag;
|
||||
|
||||
// Check if the file already exists locally
|
||||
const auto file_exists = std::filesystem::exists(path);
|
||||
if (file_exists) {
|
||||
etag = read_etag(path);
|
||||
} else {
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
bool head_request_ok = false;
|
||||
bool should_download = !file_exists; // by default, we should download if the file does not exist
|
||||
|
||||
// Initialize libcurl
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
common_load_model_from_url_headers headers;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
curl_slist_ptr http_headers;
|
||||
|
||||
for (const auto & h : custom_headers) {
|
||||
std::string s = h.first + ": " + h.second;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, s.c_str());
|
||||
}
|
||||
const bool was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
|
||||
if (!was_perform_successful) {
|
||||
head_request_ok = false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code == 200) {
|
||||
head_request_ok = true;
|
||||
} else {
|
||||
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
head_request_ok = false;
|
||||
}
|
||||
|
||||
// if head_request_ok is false, we don't have the etag or last-modified headers
|
||||
// we leave should_download as-is, which is true if the file does not exist
|
||||
bool should_download_from_scratch = false;
|
||||
if (head_request_ok) {
|
||||
// check if ETag or Last-Modified headers are different
|
||||
// if it is, we need to download the file again
|
||||
if (!etag.empty() && etag != headers.etag) {
|
||||
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(),
|
||||
headers.etag.c_str());
|
||||
should_download = true;
|
||||
should_download_from_scratch = true;
|
||||
}
|
||||
}
|
||||
|
||||
const bool accept_ranges_supported = !headers.accept_ranges.empty() && headers.accept_ranges != "none";
|
||||
if (should_download) {
|
||||
if (file_exists &&
|
||||
!accept_ranges_supported) { // Resumable downloads not supported, delete and start again.
|
||||
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
const std::string path_temporary = path + ".downloadInProgress";
|
||||
if (should_download_from_scratch) {
|
||||
if (std::filesystem::exists(path_temporary)) {
|
||||
if (remove(path_temporary.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (std::filesystem::exists(path)) {
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (head_request_ok) {
|
||||
write_etag(path, headers.etag);
|
||||
}
|
||||
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
|
||||
__func__, llama_download_hide_password_in_url(url).c_str(), path_temporary.c_str(),
|
||||
headers.etag.c_str(), headers.last_modified.c_str());
|
||||
const bool was_pull_successful = common_pull_file(curl.get(), path_temporary);
|
||||
if (!was_pull_successful) {
|
||||
if (i + 1 < max_attempts) {
|
||||
const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
|
||||
LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
||||
} else {
|
||||
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
|
||||
int status = static_cast<int>(http_code);
|
||||
if (!is_http_status_ok(http_code)) {
|
||||
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
||||
return status; // TODO: maybe only return on certain codes
|
||||
}
|
||||
|
||||
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return -1;
|
||||
}
|
||||
|
||||
return static_cast<int>(http_code);
|
||||
} else {
|
||||
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
|
||||
|
||||
return 304; // Not Modified - fake cached response
|
||||
}
|
||||
}
|
||||
|
||||
return -1; // max attempts reached
|
||||
}
|
||||
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
std::vector<char> res_buffer;
|
||||
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 0L);
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
||||
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
||||
auto data_vec = static_cast<std::vector<char> *>(data);
|
||||
data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
|
||||
#if defined(_WIN32)
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
if (params.timeout > 0) {
|
||||
curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
|
||||
}
|
||||
if (params.max_size > 0) {
|
||||
curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
|
||||
}
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
|
||||
for (const auto & header : params.headers) {
|
||||
std::string header_ = header.first + ": " + header.second;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, header_.c_str());
|
||||
}
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
std::string error_msg = curl_easy_strerror(res);
|
||||
throw std::runtime_error("error: cannot make GET request: " + error_msg);
|
||||
}
|
||||
|
||||
long res_code;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
||||
|
||||
return { res_code, std::move(res_buffer) };
|
||||
}
|
||||
|
||||
#elif defined(LLAMA_USE_HTTPLIB)
|
||||
#if defined(LLAMA_USE_HTTPLIB)
|
||||
|
||||
class ProgressBar {
|
||||
static inline std::mutex mutex;
|
||||
|
|
@ -797,10 +465,6 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
|
|||
return { res->status, std::move(buf) };
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
#if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB)
|
||||
|
||||
int common_download_file_single(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
|
|
@ -1151,7 +815,7 @@ int common_download_file_single(const std::string &,
|
|||
throw std::runtime_error("download functionality is not enabled in this build");
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL || LLAMA_USE_HTTPLIB
|
||||
#endif // defined(LLAMA_USE_HTTPLIB)
|
||||
|
||||
std::vector<common_cached_model_info> common_list_cached_models() {
|
||||
std::vector<common_cached_model_info> models;
|
||||
|
|
|
|||
|
|
@@ -8826,11 +8826,7 @@ class ExaoneMoEModel(Exaone4Model):
             self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
         n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0))
         self.gguf_writer.add_leading_dense_block_count(n_dense_layer)
-        # For here, we hard-code the number of NextN/MTP layers to 1 for K-EXAONE,
-        # so that we can convert MTP weights to GGUF format for speculative decoding.
-        # This is because HF config of K-EXAONE does not have `num_nextn_predict_layers` at now.
-        # Will be updated when HF config is updated.
-        self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 1))
+        self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0))

         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
@@ -1,4 +1,4 @@
-{
+{
     "version": 4,
     "configurePresets": [
         {

@@ -23,7 +23,7 @@
                "GGML_OPENCL": "ON",
                "GGML_HEXAGON": "ON",
                "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
-               "LLAMA_CURL": "OFF"
+               "LLAMA_OPENSSL": "OFF"
            }
        },

@@ -38,7 +38,7 @@
                "GGML_OPENCL": "ON",
                "GGML_HEXAGON": "ON",
                "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
-               "LLAMA_CURL": "OFF"
+               "LLAMA_OPENSSL": "OFF"
            }
        },
@@ -15,7 +15,7 @@ Below is the build script: it requires utilizing RISC-V vector instructions for
cmake -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_CPU_RISCV64_SPACEMIT=ON \
-   -DLLAMA_CURL=OFF \
+   -DLLAMA_OPENSSL=OFF \
    -DGGML_RVV=ON \
    -DGGML_RV_ZFH=ON \
    -DGGML_RV_ZICBOP=ON \
@ -65,10 +65,10 @@ cmake --build build --config Release
|
|||
cmake --preset x64-windows-llvm-release
|
||||
cmake --build build-x64-windows-llvm-release
|
||||
```
|
||||
- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl.
|
||||
- **Debian / Ubuntu:** `sudo apt-get install libcurl4-openssl-dev` # (or `libcurl4-gnutls-dev` if you prefer GnuTLS)
|
||||
- **Fedora / RHEL / Rocky / Alma:** `sudo dnf install libcurl-devel`
|
||||
- **Arch / Manjaro:** `sudo pacman -S curl` # includes libcurl headers
|
||||
- If you want HTTPS/TLS support, you may install the OpenSSL development libraries. If they are not installed, the project will still build and run, just without TLS support.
|
||||
- **Debian / Ubuntu:** `sudo apt-get install libssl-dev`
|
||||
- **Fedora / RHEL / Rocky / Alma:** `sudo dnf install openssl-devel`
|
||||
- **Arch / Manjaro:** `sudo pacman -S openssl`
|
||||
|
||||
## BLAS Build
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,9 @@
|
|||
#include "debug.h"
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
|
@ -13,7 +11,7 @@
|
|||
#include <fstream>
|
||||
#include <regex>
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
static void print_usage(int /*argc*/, char ** argv) {
|
||||
const std::string usage_template = R"(
|
||||
example usage:
|
||||
|
||||
|
|
@ -35,28 +33,6 @@ static void print_usage(int, char ** argv) {
|
|||
LOG("%s\n", usage.c_str());
|
||||
}
|
||||
|
||||
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
|
||||
struct callback_data {
|
||||
std::vector<uint8_t> data;
|
||||
std::vector<std::regex> tensor_filters;
|
||||
|
||||
callback_data() = default;
|
||||
|
||||
callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
|
||||
for (const auto & pattern : filter_patterns) {
|
||||
try {
|
||||
std::string anchored_pattern = "^" + pattern;
|
||||
tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
|
||||
} catch (const std::regex_error & e) {
|
||||
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
|
||||
}
|
||||
}
|
||||
params.cb_eval = ggml_debug;
|
||||
params.cb_eval_user_data = this;
|
||||
}
|
||||
};
|
||||
|
||||
static bool has_pooling(llama_context * ctx) {
|
||||
switch (llama_pooling_type(ctx)) {
|
||||
case LLAMA_POOLING_TYPE_NONE:
|
||||
|
|
@ -120,168 +96,6 @@ struct output_data {
|
|||
}
|
||||
};
|
||||
|
||||
static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||
std::string str;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
str += std::to_string(t->ne[i]);
|
||||
if (i + 1 < GGML_MAX_DIMS) {
|
||||
str += ", ";
|
||||
}
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
static float ggml_get_float_value(const uint8_t * data, ggml_type type,
|
||||
const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
|
||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||
switch (type) {
|
||||
case GGML_TYPE_F16:
|
||||
return ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
|
||||
case GGML_TYPE_F32:
|
||||
return *(const float *) &data[i];
|
||||
case GGML_TYPE_I64:
|
||||
return (float) *(const int64_t *) &data[i];
|
||||
case GGML_TYPE_I32:
|
||||
return (float) *(const int32_t *) &data[i];
|
||||
case GGML_TYPE_I16:
|
||||
return (float) *(const int16_t *) &data[i];
|
||||
case GGML_TYPE_I8:
|
||||
return (float) *(const int8_t *) &data[i];
|
||||
case GGML_TYPE_BF16:
|
||||
return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||
GGML_ASSERT(n > 0);
|
||||
float sum = 0;
|
||||
float sum_sq = 0.0;
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||
sum += v;
|
||||
sum_sq += v * v;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
LOG_DBG(" [\n");
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
if (i2 == n && ne[2] > 2*n) {
|
||||
LOG_DBG(" ..., \n");
|
||||
i2 = ne[2] - n;
|
||||
}
|
||||
LOG_DBG(" [\n");
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
if (i1 == n && ne[1] > 2*n) {
|
||||
LOG_DBG(" ..., \n");
|
||||
i1 = ne[1] - n;
|
||||
}
|
||||
LOG_DBG(" [");
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
if (i0 == n && ne[0] > 2*n) {
|
||||
LOG_DBG("..., ");
|
||||
i0 = ne[0] - n;
|
||||
}
|
||||
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||
LOG_DBG("%12.4f", v);
|
||||
if (i0 < ne[0] - 1) {
|
||||
LOG_DBG(", ");
|
||||
}
|
||||
}
|
||||
LOG_DBG("],\n");
|
||||
}
|
||||
LOG_DBG(" ],\n");
|
||||
}
|
||||
LOG_DBG(" ]\n");
|
||||
LOG_DBG(" sum = %f\n", sum);
|
||||
LOG_DBG(" sum_sq = %f\n", sum_sq);
|
||||
}
|
||||
|
||||
if (std::isnan(sum)) {
|
||||
LOG_ERR("encountered NaN - aborting\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* GGML operations callback during the graph execution.
|
||||
*
|
||||
* @param t current tensor
|
||||
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
|
||||
* see ggml_backend_sched_eval_callback
|
||||
* @param user_data user data to pass at each call back
|
||||
* @return true to receive data or continue the graph, false otherwise
|
||||
*/
|
||||
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
auto * cb_data = (callback_data *) user_data;
|
||||
|
||||
const struct ggml_tensor * src0 = t->src[0];
|
||||
const struct ggml_tensor * src1 = t->src[1];
|
||||
|
||||
if (ask) {
|
||||
return true; // Always retrieve data
|
||||
}
|
||||
|
||||
bool matches_filter = cb_data->tensor_filters.empty();
|
||||
|
||||
if (!matches_filter) {
|
||||
for (const auto & filter : cb_data->tensor_filters) {
|
||||
if (std::regex_search(t->name, filter)) {
|
||||
matches_filter = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
char src1_str[128] = {0};
|
||||
if (src1) {
|
||||
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||
}
|
||||
|
||||
if (matches_filter) {
|
||||
LOG_DBG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||
t->name,
|
||||
ggml_type_name(t->type),
|
||||
ggml_op_desc(t),
|
||||
src0->name,
|
||||
ggml_ne_string(src0).c_str(),
|
||||
src1 ? src1_str : "",
|
||||
ggml_ne_string(t).c_str());
|
||||
}
|
||||
|
||||
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
|
||||
|
||||
if (!is_host) {
|
||||
auto n_bytes = ggml_nbytes(t);
|
||||
cb_data->data.resize(n_bytes);
|
||||
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
|
||||
}
|
||||
|
||||
if (!ggml_is_quantized(t->type) && matches_filter) {
|
||||
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
|
||||
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static void save_output_data(const output_data & output, const std::string & model_name, const std::string & output_dir) {
|
||||
std::filesystem::create_directory(output_dir);
|
||||
auto base_path = std::filesystem::path{output_dir} / ("llamacpp-" + model_name + output.type_suffix);
|
||||
|
|
@ -408,7 +222,7 @@ int main(int argc, char ** argv) {
|
|||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
callback_data cb_data(params, params.tensor_filter);
|
||||
base_callback_data cb_data(params, params.tensor_filter);
|
||||
|
||||
auto llama_init = common_init_from_params(params);
|
||||
|
||||
|
|
|
|||
|
|
@ -6,10 +6,8 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
|||
|
||||
set(TEST_TARGET test-eval-callback)
|
||||
if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
|
||||
add_test(NAME ${TEST_TARGET}
|
||||
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||
llama_download_model("tinyllamas/stories15M-q4_0.gguf" SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739)
|
||||
else()
|
||||
add_test(NAME ${TEST_TARGET}
|
||||
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
|
||||
llama_download_model("tinyllamas/stories15M-be.Q4_0.gguf" SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d)
|
||||
endif()
|
||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback -m "${LLAMA_DOWNLOAD_MODEL}" --prompt hello --seed 42 -ngl 0)
|
||||
|
|
|
|||
|
|
@ -1,165 +1,12 @@
|
|||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "debug.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include "llama-cpp.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
* This is the arbitrary data which will be passed to each callback.
|
||||
* Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
|
||||
*/
|
||||
struct callback_data {
|
||||
std::vector<uint8_t> data;
|
||||
};
|
||||
|
||||
static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||
std::string str;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
str += std::to_string(t->ne[i]);
|
||||
if (i + 1 < GGML_MAX_DIMS) {
|
||||
str += ", ";
|
||||
}
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
|
||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||
float v;
|
||||
if (type == GGML_TYPE_F16) {
|
||||
v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
|
||||
} else if (type == GGML_TYPE_F32) {
|
||||
v = *(const float *) &data[i];
|
||||
} else if (type == GGML_TYPE_I64) {
|
||||
v = (float) *(const int64_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I32) {
|
||||
v = (float) *(const int32_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I16) {
|
||||
v = (float) *(const int16_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I8) {
|
||||
v = (float) *(const int8_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_BF16) {
|
||||
v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||
GGML_ASSERT(n > 0);
|
||||
float sum = 0;
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||
sum += v;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
LOG(" [\n");
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
if (i2 == n && ne[2] > 2*n) {
|
||||
LOG(" ..., \n");
|
||||
i2 = ne[2] - n;
|
||||
}
|
||||
LOG(" [\n");
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
if (i1 == n && ne[1] > 2*n) {
|
||||
LOG(" ..., \n");
|
||||
i1 = ne[1] - n;
|
||||
}
|
||||
LOG(" [");
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
if (i0 == n && ne[0] > 2*n) {
|
||||
LOG("..., ");
|
||||
i0 = ne[0] - n;
|
||||
}
|
||||
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||
LOG("%12.4f", v);
|
||||
if (i0 < ne[0] - 1) LOG(", ");
|
||||
}
|
||||
LOG("],\n");
|
||||
}
|
||||
LOG(" ],\n");
|
||||
}
|
||||
LOG(" ]\n");
|
||||
LOG(" sum = %f\n", sum);
|
||||
}
|
||||
|
||||
// TODO: make this abort configurable/optional?
|
||||
if (std::isnan(sum)) {
|
||||
LOG_ERR("encountered NaN - aborting\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* GGML operations callback during the graph execution.
|
||||
*
|
||||
* @param t current tensor
|
||||
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
|
||||
* see ggml_backend_sched_eval_callback
|
||||
* @param user_data user data to pass at each call back
|
||||
* @return true to receive data or continue the graph, false otherwise
|
||||
*/
|
||||
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
auto * cb_data = (callback_data *) user_data;
|
||||
|
||||
const struct ggml_tensor * src0 = t->src[0];
|
||||
const struct ggml_tensor * src1 = t->src[1];
|
||||
|
||||
if (ask) {
|
||||
return true; // Always retrieve data
|
||||
}
|
||||
|
||||
char src1_str[128] = {0};
|
||||
if (src1) {
|
||||
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||
}
|
||||
|
||||
LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||
t->name, ggml_type_name(t->type), ggml_op_desc(t),
|
||||
src0->name, ggml_ne_string(src0).c_str(),
|
||||
src1 ? src1_str : "",
|
||||
ggml_ne_string(t).c_str());
|
||||
|
||||
|
||||
// copy the data from the GPU memory if needed
|
||||
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
|
||||
|
||||
if (!is_host) {
|
||||
auto n_bytes = ggml_nbytes(t);
|
||||
cb_data->data.resize(n_bytes);
|
||||
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
|
||||
}
|
||||
|
||||
if (!ggml_is_quantized(t->type)) {
|
||||
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
|
||||
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool run(llama_context * ctx, const common_params & params) {
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
|
@ -182,7 +29,7 @@ static bool run(llama_context * ctx, const common_params & params) {
|
|||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
callback_data cb_data;
|
||||
base_callback_data cb_data;
|
||||
|
||||
common_params params;
|
||||
|
||||
|
|
@ -197,7 +44,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// pass the callback to the backend scheduler
|
||||
// it will be executed for each node during the graph computation
|
||||
params.cb_eval = ggml_debug;
|
||||
params.cb_eval = common_debug_cb_eval<false>;
|
||||
params.cb_eval_user_data = &cb_data;
|
||||
params.warmup = false;
|
||||
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ android {
|
|||
|
||||
arguments += "-DBUILD_SHARED_LIBS=ON"
|
||||
arguments += "-DLLAMA_BUILD_COMMON=ON"
|
||||
arguments += "-DLLAMA_CURL=OFF"
|
||||
arguments += "-DLLAMA_OPENSSL=OFF"
|
||||
|
||||
arguments += "-DGGML_NATIVE=OFF"
|
||||
arguments += "-DGGML_BACKEND_DL=ON"
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@ cd build
|
|||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference
|
||||
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF
|
||||
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
|
||||
|
||||
#build example/main
|
||||
#cmake --build . --config Release --target main
|
||||
|
|
|
|||
|
|
@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
|
|||
|
||||
:: for FP16
|
||||
:: faster for long-prompt inference
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||
|
||||
:: for FP32
|
||||
cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||
if %errorlevel% neq 0 goto ERROR
|
||||
|
||||
:: build all binary
|
||||
|
|
|
|||
|
|
@ -530,6 +530,86 @@ static __device__ __forceinline__ half2 warp_prefix_inclusive_sum(half2 a) {
|
|||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
|
||||
enum class block_reduce_method {
|
||||
MAX,
|
||||
SUM,
|
||||
};
|
||||
|
||||
template<block_reduce_method method_t, typename T>
|
||||
struct block_reduce_policy;
|
||||
|
||||
template <typename T, typename... Ts>
|
||||
inline constexpr bool is_any = (std::is_same_v<T, Ts> || ...);
|
||||
|
||||
template<typename...>
|
||||
inline constexpr bool ggml_cuda_dependent_false_v = false;
|
||||
|
||||
template <typename T> struct block_reduce_policy<block_reduce_method::SUM, T> {
|
||||
static __device__ T reduce(T val) {
|
||||
if constexpr(is_any<T, float, float2, half2, int>) {
|
||||
return warp_reduce_sum(val);
|
||||
} else {
|
||||
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ T sentinel() {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
return 0.0f;
|
||||
} else if constexpr (std::is_same_v<T, float2>) {
|
||||
return make_float2(0.0f, 0.0f);
|
||||
} else if constexpr (std::is_same_v<T, half2>) {
|
||||
return make_half2(0.0f, 0.0f);
|
||||
} else if constexpr (std::is_same_v<T, int>) {
|
||||
return 0;
|
||||
} else {
|
||||
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct block_reduce_policy<block_reduce_method::MAX, T> {
|
||||
static __device__ T reduce(T val) {
|
||||
if constexpr (is_any<T, float, half2>) {
|
||||
return warp_reduce_max(val);
|
||||
} else {
|
||||
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ T sentinel() {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
return -INFINITY;
|
||||
} else if constexpr (std::is_same_v<T, half2>) {
|
||||
return make_half2(-INFINITY, -INFINITY);
|
||||
} else {
|
||||
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <block_reduce_method reduce_method_t, const unsigned int block_size_template = 0, typename T>
|
||||
static __device__ T block_reduce(T val, T * shared_vals) {
|
||||
val = block_reduce_policy<reduce_method_t, T>::reduce(val);
|
||||
const unsigned int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
|
||||
if (block_size > WARP_SIZE) {
|
||||
assert((block_size <= 1024) && (block_size % WARP_SIZE) == 0);
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
shared_vals[warp_id] = val;
|
||||
}
|
||||
__syncthreads();
|
||||
val = block_reduce_policy<reduce_method_t, T>::sentinel();
|
||||
if (lane_id < (static_cast<int>(block_size) / WARP_SIZE)) {
|
||||
val = shared_vals[lane_id];
|
||||
}
|
||||
return block_reduce_policy<reduce_method_t, T>::reduce(val);
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
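As a point of reference, here is a minimal sketch (not part of this commit) of how a kernel can use the new `block_reduce` helper together with the dynamic shared-memory sizing used by the norm launchers further down. The kernel and launcher names (`row_sum_f32`, `row_sum_f32_cuda`) are made up for illustration; the sketch assumes `WARP_SIZE == 32` and that the `block_reduce`/`block_reduce_policy` helpers above are in scope.

```cpp
// Hypothetical usage sketch, not part of this commit: per-row sum via block_reduce.
template <unsigned int block_size>
static __global__ void row_sum_f32(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x;

    // each thread accumulates a strided partial sum over its row
    float sum = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += block_size) {
        sum += x[row * ncols + col];
    }

    // one float of scratch per warp is enough: at most 1024/32 = 32 partial sums
    extern __shared__ float s_partials[];
    sum = block_reduce<block_reduce_method::SUM, block_size>(sum, s_partials);

    if (threadIdx.x == 0) {
        dst[row] = sum;
    }
}

static void row_sum_f32_cuda(const float * x, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
    const dim3 block_dims(1024, 1, 1);
    // shared memory is only needed when the block spans more than one warp
    const size_t smem = block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0;
    row_sum_f32<1024><<<nrows, block_dims, smem, stream>>>(x, dst, ncols);
}
```

Kernels whose scratch size is known at compile time (for example `reduce_rows_f32` later in this diff) can instead declare a static `__shared__ float shared_vals[32]` and call `block_reduce` without the block-size template argument, in which case `blockDim.x` is used at run time.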
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
|
||||
#ifdef FP16_AVAILABLE
|
||||
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
|
|||
return 128;
|
||||
}
|
||||
|
||||
// Currenlty llvm with the amdgcn target dose not support unrolling loops
|
||||
// Currently llvm with the amdgcn target does not support unrolling loops
|
||||
// that contain a break that can not be resolved at compile time.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
|
|
|
|||
|
|
@ -4551,7 +4551,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
case GGML_OP_L2_NORM:
|
||||
return true;
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
break;
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
|
|
|
|||
|
|
@ -25,19 +25,8 @@ static __global__ void norm_f32(
|
|||
}
|
||||
|
||||
// sum up partial sums
|
||||
mean_var = warp_reduce_sum(mean_var);
|
||||
if constexpr (block_size > WARP_SIZE) {
|
||||
static_assert(block_size == 1024, "unexpected block_size");
|
||||
__shared__ float2 s_sum[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = mean_var;
|
||||
}
|
||||
__syncthreads();
|
||||
mean_var = s_sum[lane_id];
|
||||
mean_var = warp_reduce_sum(mean_var);
|
||||
}
|
||||
extern __shared__ float2 s_sum2[];
|
||||
mean_var = block_reduce<block_reduce_method::SUM, block_size>(mean_var, s_sum2);
|
||||
|
||||
const float mean = mean_var.x / ncols;
|
||||
const float var = mean_var.y / ncols - mean * mean;
|
||||
|
|
@ -61,19 +50,8 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
|
|||
tmp += x[j];
|
||||
}
|
||||
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
if constexpr (block_size > WARP_SIZE) {
|
||||
static_assert(block_size == 1024, "unexpected block_size");
|
||||
__shared__ float s_sum[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
__syncthreads();
|
||||
tmp = s_sum[lane_id];
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
}
|
||||
extern __shared__ float s_sum[];
|
||||
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
|
||||
|
||||
const float mean = tmp / group_size;
|
||||
tmp = 0.0f;
|
||||
|
|
@ -84,18 +62,7 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
|
|||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
if (block_size > WARP_SIZE) {
|
||||
__shared__ float s_sum[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
__syncthreads();
|
||||
tmp = s_sum[lane_id];
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
}
|
||||
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
|
||||
|
||||
const float variance = tmp / group_size;
|
||||
const float scale = rsqrtf(variance + eps);
|
||||
|
|
@ -163,22 +130,8 @@ static __global__ void rms_norm_f32(const float * x,
|
|||
}
|
||||
|
||||
// sum up partial sums
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
if constexpr (block_size > WARP_SIZE) {
|
||||
static_assert((block_size <= 1024) && (block_size % 32 == 0), "unexpected block_size");
|
||||
__shared__ float s_sum[32];
|
||||
const int warp_id = tid / WARP_SIZE;
|
||||
const int lane_id = tid % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
__syncthreads();
|
||||
tmp = 0.0f;
|
||||
if (lane_id < (block_size / WARP_SIZE)) {
|
||||
tmp = s_sum[lane_id];
|
||||
}
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
}
|
||||
extern __shared__ float s_sum[];
|
||||
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
|
||||
|
||||
const float mean = tmp / ncols;
|
||||
const float scale = rsqrtf(mean + eps);
|
||||
|
|
@ -306,19 +259,8 @@ static __global__ void l2_norm_f32(
|
|||
}
|
||||
|
||||
// sum up partial sums
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
if constexpr (block_size > WARP_SIZE) {
|
||||
static_assert(block_size == 1024, "unexpected block_size");
|
||||
__shared__ float s_sum[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
__syncthreads();
|
||||
tmp = s_sum[lane_id];
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
}
|
||||
extern __shared__ float s_sum[];
|
||||
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
|
||||
|
||||
// from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html
|
||||
const float scale = rsqrtf(fmaxf(tmp, eps * eps));
|
||||
|
|
@ -337,7 +279,7 @@ static void norm_f32_cuda(
|
|||
norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float2): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -348,7 +290,7 @@ static void group_norm_f32_cuda(
|
|||
group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
|
||||
group_norm_f32<1024><<<num_groups, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, group_size, ne_elements, eps);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -358,10 +300,10 @@ static void rms_norm_f32_cuda(
|
|||
const dim3 blocks_num(nrows, nchannels, nsamples);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(256, 1, 1);
|
||||
rms_norm_f32<256, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
rms_norm_f32<256, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
rms_norm_f32<1024, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
rms_norm_f32<1024, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -404,12 +346,12 @@ static void rms_norm_mul_f32_cuda(const float * x,
|
|||
const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(256, 1, 1);
|
||||
rms_norm_f32<256, true><<<blocks_num, block_dims, 0, stream>>>(
|
||||
rms_norm_f32<256, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
|
||||
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
|
||||
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
rms_norm_f32<1024, true><<<blocks_num, block_dims, 0, stream>>>(
|
||||
rms_norm_f32<1024, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
|
||||
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
|
||||
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
|
||||
}
|
||||
|
|
@ -425,14 +367,14 @@ static void rms_norm_mul_f32_cuda(const float * x,
|
|||
const uint3 add_nsamples_packed = init_fastdiv_values(add_nsamples);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(256, 1, 1);
|
||||
rms_norm_f32<256, true, true><<<blocks_num, block_dims, 0, stream>>>(
|
||||
rms_norm_f32<256, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
|
||||
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
|
||||
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
|
||||
add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
|
||||
add_nchannels_packed, add_nsamples_packed);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
rms_norm_f32<1024, true, true><<<blocks_num, block_dims, 0, stream>>>(
|
||||
rms_norm_f32<1024, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
|
||||
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
|
||||
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
|
||||
add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
|
||||
|
|
@ -460,7 +402,7 @@ static void l2_norm_f32_cuda(
|
|||
l2_norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
l2_norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
l2_norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -28,22 +28,8 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
|
|||
}
|
||||
|
||||
// sum up partial sums
|
||||
sum = warp_reduce_sum(sum);
|
||||
if (blockDim.x > WARP_SIZE) {
|
||||
assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
|
||||
__shared__ float s_sum[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = sum;
|
||||
}
|
||||
__syncthreads();
|
||||
sum = 0.0f;
|
||||
if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
|
||||
sum = s_sum[lane_id];
|
||||
}
|
||||
sum = warp_reduce_sum(sum);
|
||||
}
|
||||
__shared__ float shared_vals[32];
|
||||
sum = block_reduce<block_reduce_method::SUM>(sum, shared_vals);
|
||||
|
||||
if (col != 0) {
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -75,9 +75,6 @@ static __global__ void soft_max_f32(
|
|||
|
||||
const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
|
||||
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
const float slope = get_alibi_slope(p.max_bias, i02, p.n_head_log2, p.m0, p.m1);
|
||||
|
||||
extern __shared__ float data_soft_max_f32[];
|
||||
|
|
@ -102,21 +99,7 @@ static __global__ void soft_max_f32(
|
|||
}
|
||||
|
||||
// find the max value in the block
|
||||
max_val = warp_reduce_max(max_val);
|
||||
if (block_size > WARP_SIZE) {
|
||||
if (warp_id == 0) {
|
||||
buf_iw[lane_id] = -INFINITY;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (lane_id == 0) {
|
||||
buf_iw[warp_id] = max_val;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
max_val = buf_iw[lane_id];
|
||||
max_val = warp_reduce_max(max_val);
|
||||
}
|
||||
max_val = block_reduce<block_reduce_method::MAX, block_size_template>(max_val, buf_iw);
|
||||
|
||||
float tmp = 0.0f; // partial sum
|
||||
|
||||
|
|
@ -134,22 +117,7 @@ static __global__ void soft_max_f32(
|
|||
}
|
||||
|
||||
// find the sum of exps in the block
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
if (block_size > WARP_SIZE) {
|
||||
__syncthreads();
|
||||
if (warp_id == 0) {
|
||||
buf_iw[lane_id] = 0.0f;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (lane_id == 0) {
|
||||
buf_iw[warp_id] = tmp;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
tmp = buf_iw[lane_id];
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
}
|
||||
tmp = block_reduce<block_reduce_method::SUM, block_size_template>(tmp, buf_iw);
|
||||
|
||||
if (sinks) {
|
||||
tmp += expf(sinks[i02] - max_val);
|
||||
|
|
@ -169,50 +137,6 @@ static __global__ void soft_max_f32(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// TODO: This is a common pattern used across kernels that could be moved to common.cuh + templated
|
||||
static __device__ float two_stage_warp_reduce_max(float val) {
|
||||
val = warp_reduce_max(val);
|
||||
if (blockDim.x > WARP_SIZE) {
|
||||
assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
|
||||
__shared__ float local_vals[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
local_vals[warp_id] = val;
|
||||
}
|
||||
__syncthreads();
|
||||
val = -INFINITY;
|
||||
if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
|
||||
val = local_vals[lane_id];
|
||||
}
|
||||
return warp_reduce_max(val);
|
||||
} else {
|
||||
return val;
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ float two_stage_warp_reduce_sum(float val) {
|
||||
val = warp_reduce_sum(val);
|
||||
if (blockDim.x > WARP_SIZE) {
|
||||
assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
|
||||
__shared__ float local_vals[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
local_vals[warp_id] = val;
|
||||
}
|
||||
__syncthreads();
|
||||
val = 0.0f;
|
||||
if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
|
||||
val = local_vals[lane_id];
|
||||
}
|
||||
return warp_reduce_sum(val);
|
||||
} else {
|
||||
return val;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Template to allow keeping ncols in registers if they fit
|
||||
static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __restrict__ x,
|
||||
float * __restrict__ dst,
|
||||
|
|
@ -230,6 +154,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
|
|||
float local_vals[n_elem_per_thread] = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
|
||||
float local_max = -INFINITY;
|
||||
const int step_size = gridDim.x * blockDim.x;
|
||||
__shared__ float shared_vals[32];
|
||||
|
||||
// Compute thread-local max
|
||||
for (int col = col_start; col < p.ncols;) {
|
||||
|
|
@ -246,7 +171,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
|
|||
}
|
||||
|
||||
// Compute CTA-level max
|
||||
local_max = two_stage_warp_reduce_max(local_max);
|
||||
local_max = block_reduce<block_reduce_method::MAX>(local_max, shared_vals);
|
||||
|
||||
// Store CTA-level max to GMEM
|
||||
if (tid == 0) {
|
||||
|
|
@ -261,7 +186,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
|
|||
} else {
|
||||
local_max = -INFINITY;
|
||||
}
|
||||
local_max = two_stage_warp_reduce_max(local_max);
|
||||
local_max = block_reduce<block_reduce_method::MAX>(local_max, shared_vals);
|
||||
|
||||
// Compute softmax dividends, accumulate divisor
|
||||
float tmp_expf = 0.0f;
|
||||
|
|
@ -284,7 +209,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
|
|||
}
|
||||
|
||||
// Reduce divisor within CTA
|
||||
tmp_expf = two_stage_warp_reduce_sum(tmp_expf);
|
||||
tmp_expf = block_reduce<block_reduce_method::SUM>(tmp_expf, shared_vals);
|
||||
|
||||
// Store CTA-level sum to GMEM
|
||||
if (tid == 0) {
|
||||
|
|
@ -298,7 +223,7 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __
|
|||
} else {
|
||||
tmp_expf = 0.0f;
|
||||
}
|
||||
tmp_expf = two_stage_warp_reduce_sum(tmp_expf);
|
||||
tmp_expf = block_reduce<block_reduce_method::SUM>(tmp_expf, shared_vals);
|
||||
|
||||
// Divide dividend by global sum + store data
|
||||
for (int col = col_start; col < p.ncols;) {
|
||||
|
|
|
|||
|
|
@ -23,11 +23,6 @@ if (GGML_METAL_NDEBUG)
|
|||
add_compile_definitions(GGML_METAL_NDEBUG)
|
||||
endif()
|
||||
|
||||
# copy metal files to bin directory
|
||||
configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
|
||||
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
||||
configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)
|
||||
|
||||
set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
|
||||
if (GGML_METAL_EMBED_LIBRARY)
|
||||
enable_language(ASM)
|
||||
|
|
@ -37,12 +32,12 @@ if (GGML_METAL_EMBED_LIBRARY)
|
|||
set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
|
||||
set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h")
|
||||
|
||||
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
|
||||
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/autogenerated")
|
||||
|
||||
# merge ggml-common.h and ggml-metal.metal into a single file
|
||||
set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
|
||||
set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
|
||||
set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
|
||||
set(METALLIB_EMBED_ASM "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
|
||||
set(METALLIB_SOURCE_EMBED "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
|
||||
set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT "${METALLIB_EMBED_ASM}"
|
||||
|
|
@ -62,6 +57,11 @@ if (GGML_METAL_EMBED_LIBRARY)
|
|||
|
||||
target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}")
|
||||
else()
|
||||
# copy metal files to bin directory
|
||||
configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
|
||||
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
||||
configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)
|
||||
|
||||
if (GGML_METAL_SHADER_DEBUG)
|
||||
# custom command to do the following:
|
||||
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
|
||||
|
|
|
|||
|
|
@ -14413,13 +14413,29 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
const vk_device& device = ggml_vk_get_device(ctx->device);
|
||||
|
||||
const bool uses_bda = (op->op == GGML_OP_IM2COL || op->op == GGML_OP_IM2COL_3D) &&
|
||||
device->shader_int64 && device->buffer_device_address;
|
||||
|
||||
auto const & tensor_size_supported = [&](size_t tensor_size) {
|
||||
if (tensor_size > device->max_buffer_size) {
|
||||
return false;
|
||||
}
|
||||
// For im2col shaders using BDA, maxStorageBufferRange limit doesn't apply.
|
||||
// If shader64BitIndexing is enabled, maxStorageBufferRange limit doesn't apply.
|
||||
if (!uses_bda && !device->shader_64b_indexing) {
|
||||
if (tensor_size > device->properties.limits.maxStorageBufferRange) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
// reject any tensors larger than the max buffer size
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (op->src[i] && ggml_nbytes(op->src[i]) > device->max_buffer_size) {
|
||||
if (op->src[i] && !tensor_size_supported(ggml_nbytes(op->src[i]))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (ggml_nbytes(op) > device->max_buffer_size) {
|
||||
if (!tensor_size_supported(ggml_nbytes(op))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -264,7 +264,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
|
|||
const i8vec2 scales = i8vec2(unpack8(uint32_t(((data_a_packed16[ib_k].scales[(is % 8 ) / 2] >> (4 * (is / 8))) & 0x0F0F) |
|
||||
(((data_a_packed16[ib_k].scales[(8 + (is % 4)) / 2] >> (2 * (is / 4))) & 0x0303) << 4))).xy); // vec4 used due to #12147
|
||||
|
||||
buf_a[buf_ib].d_scales = FLOAT_TYPE(data_a_packed16[ib_k].d) * FLOAT_TYPE_VEC2(scales - 32);
|
||||
buf_a[buf_ib].d_scales = FLOAT_TYPE_VEC2(float(data_a_packed16[ib_k].d) * vec2(scales - 32));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -334,7 +334,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
|
|||
(data_a[ib_k].scales[is+4] >> 4) | ((data_a[ib_k].scales[is ] & 0xC0) >> 2));
|
||||
}
|
||||
|
||||
buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(data_a_packed32[ib_k].dm) * FLOAT_TYPE_VEC2(scale_dm);
|
||||
buf_a[buf_ib].dm = FLOAT_TYPE_VEC2(vec2(data_a_packed32[ib_k].dm) * vec2(scale_dm));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -385,7 +385,7 @@ void block_a_to_shmem(const uint buf_ib, const uint ib, const uint iqs) {
|
|||
const uint is = iqs_k / 4;
|
||||
const i8vec2 scales = unpack8(int32_t(data_a_packed16[ib_k].scales[is / 2])).xy;
|
||||
|
||||
buf_a[buf_ib].d_scales = FLOAT_TYPE(data_a_packed16[ib_k].d) * FLOAT_TYPE_VEC2(scales);
|
||||
buf_a[buf_ib].d_scales = FLOAT_TYPE_VEC2(float(data_a_packed16[ib_k].d) * vec2(scales));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,22 +0,0 @@
|
|||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright (c) 1996 - 2026, Daniel Stenberg, <daniel@haxx.se>, and many
|
||||
contributors, see the THANKS file.
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Permission to use, copy, modify, and distribute this software for any purpose
|
||||
with or without fee is hereby granted, provided that the above copyright
|
||||
notice and this permission notice appear in all copies.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
|
||||
NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
|
||||
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
||||
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
|
||||
OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
Except as contained in this notice, the name of a copyright holder shall not
|
||||
be used in advertising or otherwise to promote the sale, use or other dealings
|
||||
in this Software without prior written authorization of the copyright holder.
|
||||
|
|
@ -109,8 +109,7 @@ rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
|
|||
# Step 2: Setup Build Environment and Compile Test Binaries
|
||||
###########################################################
|
||||
|
||||
# Note: test-eval-callback requires -DLLAMA_CURL
|
||||
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build environment"
|
||||
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 || abort "Failed to build environment"
|
||||
pushd "$build_dir"
|
||||
make -j || abort "Failed to compile"
|
||||
popd > /dev/null || exit 1
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ const path = require('path');
|
|||
|
||||
// This file is used for testing wasm build from emscripten
|
||||
// Example build command:
|
||||
// emcmake cmake -B build-wasm -DGGML_WEBGPU=ON -DLLAMA_CURL=OFF
|
||||
// emcmake cmake -B build-wasm -DGGML_WEBGPU=ON -DLLAMA_OPENSSL=OFF
|
||||
// cmake --build build-wasm --target test-backend-ops -j
|
||||
|
||||
const PORT = 8080;
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@
|
|||
|
||||
Simple usage example:
|
||||
|
||||
cmake -B build -DLLAMA_CURL=1 && cmake --build build --config Release -j -t llama-server
|
||||
cmake -B build && cmake --build build --config Release -j -t llama-server
|
||||
|
||||
export LLAMA_SERVER_BIN_PATH=$PWD/build/bin/llama-server
|
||||
export LLAMA_CACHE=${LLAMA_CACHE:-$HOME/Library/Caches/llama.cpp}
|
||||
|
|
|
|||
|
|
@ -614,9 +614,9 @@ struct llama_mlock::impl {
|
|||
|
||||
char* errmsg = std::strerror(errno);
|
||||
bool suggest = (errno == ENOMEM);
|
||||
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
|
||||
// visionOS/tvOS dont't support RLIMIT_MEMLOCK
|
||||
// Skip resource limit checks on visionOS/tvOS
|
||||
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__)
|
||||
// visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK
|
||||
// Skip resource limit checks on these platforms
|
||||
suggest = false;
|
||||
#else
|
||||
struct rlimit lock_limit;
|
||||
|
|
|
|||
|
|
@ -446,7 +446,7 @@ struct llama_model::impl {
|
|||
llama_mlocks mlock_bufs;
|
||||
llama_mlocks mlock_mmaps;
|
||||
|
||||
// contexts where the model tensors metadata is stored as well ass the corresponding buffers:
|
||||
// contexts where the model tensors metadata is stored as well as the corresponding buffers:
|
||||
std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
|
||||
|
||||
buft_list_t cpu_buft_list;
|
||||
|
|
@ -1942,16 +1942,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, true);
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
|
||||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
|
|
|
|||
|
|
@ -202,15 +202,13 @@ llama_build_and_test(
|
|||
llama_build_and_test(test-regex-partial.cpp)
|
||||
|
||||
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
|
||||
llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2)
|
||||
llama_download_model("tinyllamas/stories15M-q4_0.gguf" SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739)
|
||||
else()
|
||||
llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2)
|
||||
llama_download_model("tinyllamas/stories15M-be.Q4_0.gguf" SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d)
|
||||
endif()
|
||||
llama_build_and_test(test-thread-safety.cpp ARGS -m "${LLAMA_DOWNLOAD_MODEL}" -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2)
|
||||
|
||||
# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
|
||||
if (NOT WIN32)
|
||||
llama_build_and_test(test-arg-parser.cpp)
|
||||
endif()
|
||||
llama_build_and_test(test-arg-parser.cpp)
|
||||
|
||||
if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
|
||||
# TODO: repair known memory leaks
|
||||
|
|
@ -225,11 +223,7 @@ llama_build_and_test(test-backend-sampler.cpp LABEL "model")
|
|||
|
||||
# Test for state restore with fragmented KV cache
|
||||
# Requires a model, uses same args pattern as test-thread-safety
|
||||
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
|
||||
llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf)
|
||||
else()
|
||||
llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf)
|
||||
endif()
|
||||
llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -m "${LLAMA_DOWNLOAD_MODEL}")
|
||||
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
# these tests use the backends directly and cannot be built with dynamic loading
|
||||
|
|
|
|||
|
|
@ -173,7 +173,7 @@ int main(void) {
|
|||
assert(params.cpuparams.n_threads == 1010);
|
||||
#endif // _WIN32
|
||||
|
||||
printf("test-arg-parser: test curl-related functions\n\n");
|
||||
printf("test-arg-parser: test download functions\n\n");
|
||||
const char * GOOD_URL = "http://ggml.ai/";
|
||||
const char * BAD_URL = "http://ggml.ai/404";
|
||||
|
||||
|
|
|
|||
|
|
@ -7482,25 +7482,29 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
test_cases.emplace_back(new test_softcap(GGML_TYPE_F32, {10, 10, 10, 10}, 50.0f));
|
||||
test_cases.emplace_back(new test_silu_back());
|
||||
|
||||
for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f}) {
|
||||
for (bool v : {false, true}) {
|
||||
test_cases.emplace_back(new test_norm (GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
|
||||
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
|
||||
for (float eps : { 0.0f, 1e-6f, 1e-4f, 1e-1f }) {
|
||||
for (uint32_t n : { 64, 1025 }) {
|
||||
for (bool v : { false, true }) {
|
||||
test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps));
|
||||
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps));
|
||||
}
|
||||
test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, { n, 5, 4, 3 }, eps));
|
||||
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps));
|
||||
}
|
||||
test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
||||
test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
||||
}
|
||||
|
||||
// in-place tests
|
||||
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, false, 1e-6f, true));
|
||||
|
||||
for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
|
||||
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
|
||||
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
|
||||
test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
|
||||
test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
|
||||
for (float eps : { 0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f }) {
|
||||
for (uint32_t n : { 64, 1025 }) {
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false));
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true));
|
||||
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false));
|
||||
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true));
|
||||
test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false));
|
||||
test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true));
|
||||
}
|
||||
}
|
||||
for (uint32_t n : {1, 511, 1025, 8192, 33*512}) {
|
||||
for (bool multi_add : {false, true}) {
|
||||
|
|
@ -7524,9 +7528,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));
|
||||
|
||||
for (int64_t d_conv : {3, 4, 9}) {
|
||||
for (int64_t d_inner: {1024, 1536, 2048}) {
|
||||
test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {d_conv, d_inner, 1, 1}, {d_conv, d_inner, 1, 1}));
|
||||
|
|
|
|||
|
|
@ -32,10 +32,6 @@ struct clip_graph {
|
|||
const float kq_scale;
|
||||
const clip_flash_attn_type flash_attn_type;
|
||||
|
||||
// for debugging
const bool debug_graph;
std::vector<ggml_tensor *> & debug_print_tensors;

ggml_context_ptr ctx0_ptr;
ggml_context * ctx0;
ggml_cgraph * gf;

@@ -152,18 +152,14 @@ struct clip_ctx {
ggml_backend_t backend_cpu = nullptr;
ggml_backend_buffer_ptr buf;

int max_nodes = 8192;
ggml_backend_sched_ptr sched;
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
bool is_allocated = false;

// for debugging
bool debug_graph = false;
std::vector<ggml_tensor *> debug_print_tensors;

clip_ctx(clip_context_params & ctx_params) {
flash_attn_type = ctx_params.flash_attn_type;
debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (!backend_cpu) {
throw std::runtime_error("failed to initialize CPU backend");

@@ -204,6 +200,10 @@ struct clip_ctx {
sched.reset(
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
);

if (ctx_params.cb_eval != nullptr) {
ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
}
}

~clip_ctx() {

@@ -239,9 +239,7 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
n_mmproj_embd(clip_n_mmproj_embd(ctx)),
eps(hparams.eps),
kq_scale(1.0f / sqrtf((float)d_head)),
flash_attn_type(ctx->flash_attn_type),
debug_graph(ctx->debug_graph),
debug_print_tensors(ctx->debug_print_tensors) {
flash_attn_type(ctx->flash_attn_type) {
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),

@@ -252,14 +250,11 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
}

void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
if (debug_graph) {
ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
ggml_set_name(cur, cur_name.c_str());
ggml_set_output(cur);
ggml_build_forward_expand(gf, cur);
debug_print_tensors.push_back(cur);
void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
if (il >= 0) {
ggml_format_name(cur, "%s-%d", name, il);
} else {
ggml_set_name(cur, name);
}
}

@@ -1533,8 +1528,8 @@ struct clip_model_loader {
model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:

@@ -1776,8 +1771,8 @@ struct clip_model_loader {
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
} break;
case PROJECTOR_TYPE_LLAMA4:
{

@@ -3364,7 +3359,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}

// build the inference graph
ctx->debug_print_tensors.clear();
ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);

@@ -3757,18 +3751,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
return false;
}

// print debug nodes
if (ctx->debug_graph) {
LOG_INF("\n\n---\n\n");
LOG_INF("\n\nDebug graph:\n\n");
for (ggml_tensor * t : ctx->debug_print_tensors) {
std::vector<uint8_t> data(ggml_nbytes(t));
ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
print_tensor_shape(t);
print_tensor_data(t, data.data(), 3);
}
}

// the last node is the embedding tensor
ggml_tensor * embeddings = ggml_graph_node(gf, -1);

@@ -3921,7 +3903,6 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
//
// API for debugging
//

void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
clip_image_f32 img;
img.nx = w;

@@ -3930,9 +3911,6 @@ void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
for (int i = 0; i < h * w * 3; i++) {
img.buf[i] = static_cast<float>(fill_value);
}
bool cur_debug_graph = ctx->debug_graph;
ctx->debug_graph = true;
clip_image_encode(ctx, 1, &img, nullptr);
ctx->debug_graph = cur_debug_graph;
GGML_ASSERT(img.buf.empty() && "expected, always stop here");
}
@@ -1,6 +1,7 @@
#pragma once

#include "ggml.h"
#include "mtmd.h"

#include <stddef.h>
#include <stdint.h>

@@ -37,6 +38,8 @@ struct clip_context_params {
int image_min_tokens;
int image_max_tokens;
bool warmup;
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
};

struct clip_init_result {
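The two fields added to `clip_context_params` above use the scheduler-level eval callback type from `ggml-backend.h`: the callback is invoked first with `ask == true` to decide whether a node should be observed, and then, if it returned true, with `ask == false` once the node's data is available. The sketch below shows a minimal compatible callback; the function name and the logging are illustrative only, while the diff itself wires up the existing `common_debug_cb_eval` helper from `debug.h` and attaches it via `ggml_backend_sched_set_eval_callback` in the `clip_ctx` constructor hunk earlier.

```cpp
// minimal sketch of a ggml_backend_sched_eval_callback (illustrative only)
#include "ggml.h"
#include "ggml-backend.h"

#include <cstdio>

static bool example_debug_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data; // unused in this sketch
    if (ask) {
        // the scheduler asks whether this node should be observed at all
        return true;
    }
    // the node has been computed; its data can now be inspected
    fprintf(stderr, "%s: op=%s ne=[%lld, %lld, %lld, %lld]\n",
            t->name, ggml_op_name(t->op),
            (long long) t->ne[0], (long long) t->ne[1],
            (long long) t->ne[2], (long long) t->ne[3]);
    return true; // returning false would abort the graph compute
}
```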
@@ -1,4 +1,5 @@
#include "arg.h"
#include "debug.h"
#include "log.h"
#include "common.h"
#include "sampling.h"

@@ -88,6 +89,8 @@ struct mtmd_cli_context {
int n_threads = 1;
llama_pos n_past = 0;

base_callback_data cb_data;

mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
model = llama_init->model();
lctx = llama_init->context();

@@ -139,6 +142,10 @@ struct mtmd_cli_context {
mparams.warmup = params.warmup;
mparams.image_min_tokens = params.image_min_tokens;
mparams.image_max_tokens = params.image_max_tokens;
if (std::getenv("MTMD_DEBUG_GRAPH") != nullptr) {
mparams.cb_eval_user_data = &cb_data;
mparams.cb_eval = common_debug_cb_eval<false>;
}
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
if (!ctx_vision.get()) {
LOG_ERR("Failed to load vision model from %s\n", clip_path);
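As a usage note, the debug callback above is installed only when the environment variable is present (any value works, since only `getenv` presence is checked). A run that enables it might look like the following, assuming the standard `llama-mtmd-cli` invocation; the file names are placeholders:

```console
$ MTMD_DEBUG_GRAPH=1 build/bin/llama-mtmd-cli -m model.gguf --mmproj mmproj.gguf --image input.png -p "describe this image"
```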
@@ -111,6 +111,8 @@ mtmd_context_params mtmd_context_params_default() {
/* warmup */ true,
/* image_min_tokens */ -1,
/* image_max_tokens */ -1,
/* cb_eval */ nullptr,
/* cb_eval_user_data */ nullptr,
};
return params;
}

@@ -176,6 +178,8 @@ struct mtmd_context {
/* image_min_tokens */ ctx_params.image_min_tokens,
/* image_max_tokens */ ctx_params.image_max_tokens,
/* warmup */ ctx_params.warmup,
/* cb_eval */ ctx_params.cb_eval,
/* cb_eval_user_data */ ctx_params.cb_eval_user_data,
};

auto res = clip_init(mmproj_fname, ctx_clip_params);

@@ -95,6 +95,10 @@ struct mtmd_context_params {
// limit number of image tokens, only for vision models with dynamic resolution
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)

// callback function passed over to mtmd proper
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
};

MTMD_API const char * mtmd_default_marker(void);

@@ -273,12 +277,12 @@ struct bitmap {
ptr.reset(mtmd_bitmap_init(nx, ny, data));
}
~bitmap() = default;
uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
};

struct bitmaps {

@@ -302,8 +306,8 @@ struct input_chunks {
input_chunks() = default;
input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
~input_chunks() = default;
size_t size() { return mtmd_input_chunks_size(ptr.get()); }
const mtmd_input_chunk * operator[](size_t idx) {
size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
const mtmd_input_chunk * operator[](size_t idx) const {
return mtmd_input_chunks_get(ptr.get(), idx);
}
};
@@ -4,7 +4,7 @@ This example demonstrates the Text To Speech feature. It uses a
[outeai](https://www.outeai.com/).

## Quickstart
If you have built llama.cpp with `-DLLAMA_CURL=ON` you can simply run the
If you have built llama.cpp with SSL support you can simply run the
following command and the required models will be downloaded automatically:
```console
$ build/bin/llama-tts --tts-oute-default -p "Hello world" && aplay output.wav