diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile
index b6b802a7c6..fd7195c5be 100644
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,9 +1,7 @@
-ARG UBUNTU_VERSION=25.10
+ARG UBUNTU_VERSION=26.04
FROM ubuntu:$UBUNTU_VERSION AS build
-# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
-
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget xz-utils
@@ -52,6 +50,7 @@ WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
+ build-essential \
git \
python3 \
python3-pip \
diff --git a/.github/actions/windows-setup-cuda/action.yml b/.github/actions/windows-setup-cuda/action.yml
index 5575caeca3..6ad61582a5 100644
--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@@ -65,3 +65,34 @@ runs:
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+ - name: Install Cuda Toolkit 13.1
+ if: ${{ inputs.cuda_version == '13.1' }}
+ shell: pwsh
+ run: |
+ mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
+ choco install unzip -y
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.2.0.9-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.1.68-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.1.68-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-13.1.78-archive.zip"
+ unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_crt-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cudart-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvcc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvrtc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libcublas-windows-x86_64-13.2.0.9-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libnvvm-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvtx-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_profiler_api-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\visual_studio_integration-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cccl-windows-x86_64-13.1.78-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+ echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+ echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml
index 36201281f0..c2c6ea12ae 100644
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -291,6 +291,7 @@ jobs:
-DGGML_RVV=ON \
-DGGML_RV_ZFH=ON \
-DGGML_RV_ZICBOP=ON \
+ -DGGML_RV_ZIHINTPAUSE=ON \
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
diff --git a/.github/workflows/build-riscv-native.yml b/.github/workflows/build-riscv-native.yml
deleted file mode 100644
index a3a0b0d663..0000000000
--- a/.github/workflows/build-riscv-native.yml
+++ /dev/null
@@ -1,120 +0,0 @@
-name: Build on RISCV Linux Machine by Cloud-V
-on:
- pull_request:
- workflow_dispatch:
- workflow_call:
-
-jobs:
- debian-13-riscv64-native: # Bianbu 2.2
- runs-on: [self-hosted, RISCV64]
-
- steps:
- - name: Install prerequisites
- run: |
- sudo apt-get update || true
- sudo apt-get install -y libatomic1
- - uses: actions/checkout@v4
- - name: Setup Riscv
- run: |
- sudo apt-get update || true
- sudo apt-get install -y --no-install-recommends \
- build-essential \
- gcc-14-riscv64-linux-gnu \
- g++-14-riscv64-linux-gnu \
- ccache \
- cmake
-
- - name: Setup ccache
- run: |
- mkdir -p $HOME/.ccache
- ccache -M 5G -d $HOME/.ccache
- export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
- export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
- echo "$GITHUB_WORKSPACE"
- echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
- echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
- echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
- echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
- - name: Build
- run: |
- cmake -B build \
- -DLLAMA_CURL=OFF \
- -DCMAKE_BUILD_TYPE=Release \
- -DGGML_OPENMP=OFF \
- -DLLAMA_BUILD_EXAMPLES=ON \
- -DLLAMA_BUILD_TOOLS=ON \
- -DLLAMA_BUILD_TESTS=OFF \
- -DCMAKE_SYSTEM_NAME=Linux \
- -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
- -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
- -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
- -DCMAKE_C_COMPILER_LAUNCHER=ccache \
- -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
- -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
- -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
- -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
- -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
- -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
- cmake --build build --config Release -j $(nproc)
-
- # debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
- # runs-on: [self-hosted, RISCV64]
-
- # steps:
- # - name: Install prerequisites
- # run: |
- # sudo apt-get update || true
- # sudo apt-get install -y libatomic1
- # - uses: actions/checkout@v4
- # - name: Setup Riscv
- # run: |
- # sudo apt-get update || true
- # sudo apt-get install -y --no-install-recommends \
- # build-essential \
- # gcc-14-riscv64-linux-gnu \
- # g++-14-riscv64-linux-gnu \
- # ccache \
- # cmake
- # sudo apt-get upgrade binutils -y
-
- # - name: Setup ccache
- # run: |
- # mkdir -p $HOME/.ccache
- # ccache -M 5G -d $HOME/.ccache
- # export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
- # export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
- # echo "$GITHUB_WORKSPACE"
- # echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
- # echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
- # echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
- # echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
- # - name: Build
- # run: |
- # cmake -B build \
- # -DLLAMA_CURL=OFF \
- # -DCMAKE_BUILD_TYPE=Release \
- # -DGGML_OPENMP=OFF \
- # -DLLAMA_BUILD_EXAMPLES=ON \
- # -DLLAMA_BUILD_TOOLS=ON \
- # -DLLAMA_BUILD_TESTS=OFF \
- # -DCMAKE_SYSTEM_NAME=Linux \
- # -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
- # -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
- # -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
- # -DCMAKE_C_COMPILER_LAUNCHER=ccache \
- # -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
- # -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
- # -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
- # -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
- # -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
- # -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
- # -DGGML_RVV=ON \
- # -DGGML_RV_ZFH=ON \
- # -DGGML_RV_ZICBOP=ON \
- # -DGGML_CPU_RISCV64_SPACEMIT=ON \
- # -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
-
- # cmake --build build --config Release -j $(nproc)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 5215cc3572..383427f36f 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -69,13 +69,6 @@ jobs:
key: macOS-latest-cmake-arm64
evict-old-files: 1d
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
- brew install curl
-
- name: Build
id: cmake_build
run: |
@@ -83,6 +76,8 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=OFF \
-DGGML_METAL_SHADER_DEBUG=ON \
@@ -110,13 +105,6 @@ jobs:
key: macOS-latest-cmake-x64
evict-old-files: 1d
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
- brew install curl
-
- name: Build
id: cmake_build
run: |
@@ -126,6 +114,8 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
@@ -151,13 +141,6 @@ jobs:
key: macOS-latest-cmake-arm64-webgpu
evict-old-files: 1d
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
- brew install curl
-
- name: Dawn Dependency
id: dawn-depends
run: |
@@ -217,7 +200,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
python3 python3-pip python3-dev \
- libjpeg-dev build-essential libcurl4-openssl-dev \
+ libjpeg-dev build-essential libssl-dev \
git-lfs
- name: Python Dependencies
@@ -238,6 +221,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@@ -258,7 +243,7 @@ jobs:
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
- ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+ ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Test llama2c (s390x)
id: llama2c_test_s390x
@@ -267,7 +252,7 @@ jobs:
cd build
echo "Fetch llama2c big-endian model"
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
- ./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+ ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
@@ -294,13 +279,15 @@ jobs:
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@@ -311,6 +298,8 @@ jobs:
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
@@ -335,7 +324,7 @@ jobs:
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
@@ -343,6 +332,8 @@ jobs:
mkdir build
cd build
cmake .. \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_LLGUIDANCE=ON
cmake --build . --config Release -j $(nproc)
@@ -373,12 +364,14 @@ jobs:
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
run: |
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@@ -405,12 +398,14 @@ jobs:
- name: Dependencies
id: depends
run: |
- sudo apt-get install -y glslc libvulkan-dev libcurl4-openssl-dev
+ sudo apt-get install -y glslc libvulkan-dev libssl-dev
- name: Configure
id: cmake_configure
run: |
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
@@ -440,7 +435,7 @@ jobs:
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
- sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
+ sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
@@ -466,6 +461,8 @@ jobs:
run: |
source ./vulkan_sdk/setup-env.sh
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DGGML_VULKAN=ON
cmake --build build --config Release -j $(nproc)
@@ -497,7 +494,7 @@ jobs:
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
- sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
+ sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
@@ -537,7 +534,10 @@ jobs:
id: cmake_build
run: |
export Dawn_DIR=dawn/lib64/cmake/Dawn
- cmake -B build -DGGML_WEBGPU=ON
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
+ -DGGML_WEBGPU=ON
cmake --build build --config Release -j $(nproc)
- name: Test
@@ -547,6 +547,46 @@ jobs:
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 3600
+ ubuntu-24-wasm-webgpu:
+ runs-on: ubuntu-24.04
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: ccache
+ uses: ggml-org/ccache-action@v1.2.16
+ with:
+ key: ubuntu-latest-wasm-webgpu
+ evict-old-files: 1d
+
+ - name: Install Emscripten
+ run: |
+ git clone https://github.com/emscripten-core/emsdk.git
+ cd emsdk
+ ./emsdk install latest
+ ./emsdk activate latest
+
+ - name: Fetch emdawnwebgpu
+ run: |
+ DAWN_TAG="v20251027.212519"
+ EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
+ echo "Downloading ${EMDAWN_PKG}"
+ curl -L -o emdawn.zip \
+ "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
+ unzip emdawn.zip
+
+ - name: Build WASM WebGPU
+ run: |
+ source emsdk/emsdk_env.sh
+ emcmake cmake -B build-wasm \
+ -DGGML_WEBGPU=ON \
+ -DLLAMA_CURL=OFF \
+ -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
+
+ cmake --build build-wasm --target test-backend-ops -j $(nproc)
+
ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.1.2
@@ -560,7 +600,7 @@ jobs:
id: depends
run: |
sudo apt-get update
- sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev rocwmma-dev
+ sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -572,6 +612,8 @@ jobs:
id: cmake_build
run: |
cmake -B build -S . \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGGML_HIP=ON
@@ -590,7 +632,7 @@ jobs:
id: depends
run: |
apt-get update
- apt-get install -y build-essential git cmake libcurl4-openssl-dev
+ apt-get install -y build-essential git cmake libssl-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -602,6 +644,8 @@ jobs:
id: cmake_build
run: |
cmake -B build -S . \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
@@ -626,7 +670,7 @@ jobs:
shell: bash
run: |
sudo apt update
- sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
+ sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
- name: install oneAPI MKL library
shell: bash
@@ -648,6 +692,8 @@ jobs:
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
@@ -674,7 +720,7 @@ jobs:
shell: bash
run: |
sudo apt update
- sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
+ sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
- name: install oneAPI MKL library
shell: bash
@@ -696,6 +742,8 @@ jobs:
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
@@ -722,12 +770,6 @@ jobs:
key: macOS-latest-cmake-ios
evict-old-files: 1d
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
-
- name: Build
id: cmake_build
run: |
@@ -759,12 +801,6 @@ jobs:
key: macOS-latest-cmake-tvos
evict-old-files: 1d
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
-
- name: Build
id: cmake_build
run: |
@@ -790,12 +826,6 @@ jobs:
id: checkout
uses: actions/checkout@v4
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
-
- name: Build
id: cmake_build
run: |
@@ -838,12 +868,6 @@ jobs:
name: llama-xcframework
path: build-apple/llama.xcframework/
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
-
- name: Build llama.cpp with CMake
id: cmake_build
run: |
@@ -995,21 +1019,12 @@ jobs:
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build-arm64-release --target install --config release
- - name: libCURL
- id: get_libcurl
- uses: ./.github/actions/windows-setup-curl
- with:
- architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
-
- name: Build
id: cmake_build
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cmake -S . -B build ${{ matrix.defines }} `
- -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+ -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release
- name: Add libopenblas.dll
id: add_libopenblas_dll
@@ -1053,7 +1068,7 @@ jobs:
DEBIAN_FRONTEND: noninteractive
run: |
apt update
- apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev
+ apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -1064,10 +1079,12 @@ jobs:
- name: Build with CMake
run: |
cmake -S . -B build -G Ninja \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
+ -DLLAMA_FATAL_WARNINGS=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES=89-real \
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
- -DLLAMA_FATAL_WARNINGS=ON \
-DGGML_NATIVE=OFF \
-DGGML_CUDA=ON
cmake --build build
@@ -1101,25 +1118,20 @@ jobs:
run: |
choco install ninja
- - name: libCURL
- id: get_libcurl
- uses: ./.github/actions/windows-setup-curl
-
- name: Build
id: cmake_build
shell: cmd
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
+ -DLLAMA_CURL=OFF ^
+ -DLLAMA_BUILD_BORINGSSL=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
-DGGML_CPU_ALL_VARIANTS=ON ^
-DGGML_CUDA=ON ^
- -DGGML_RPC=ON ^
- -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include"
+ -DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
@@ -1151,7 +1163,7 @@ jobs:
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+ # TODO: add SSL support; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
@@ -1208,14 +1220,8 @@ jobs:
key: ${{ github.job }}
evict-old-files: 1d
- - name: libCURL
- id: get_libcurl
- uses: ./.github/actions/windows-setup-curl
-
- name: Build
id: cmake_build
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
@@ -1224,11 +1230,12 @@ jobs:
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
-DCMAKE_BUILD_TYPE=Release `
+ -DLLAMA_CURL=OFF `
+ -DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `
-DGGML_HIP=ON `
-DGGML_HIP_ROCWMMA_FATTN=ON `
- -DGGML_RPC=ON `
- -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+ -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
ios-xcode-build:
@@ -1595,33 +1602,33 @@ jobs:
run: |
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
- ggml-ci-x64-amd-vulkan:
- runs-on: [self-hosted, Linux, X64, AMD]
+ # ggml-ci-x64-amd-vulkan:
+ # runs-on: [self-hosted, Linux, X64, AMD]
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v4
+ # steps:
+ # - name: Clone
+ # id: checkout
+ # uses: actions/checkout@v4
- - name: Test
- id: ggml-ci
- run: |
- vulkaninfo --summary
- GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+ # - name: Test
+ # id: ggml-ci
+ # run: |
+ # vulkaninfo --summary
+ # GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
- ggml-ci-x64-amd-rocm:
- runs-on: [self-hosted, Linux, X64, AMD]
+ # ggml-ci-x64-amd-rocm:
+ # runs-on: [self-hosted, Linux, X64, AMD]
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v4
+ # steps:
+ # - name: Clone
+ # id: checkout
+ # uses: actions/checkout@v4
- - name: Test
- id: ggml-ci
- run: |
- amd-smi static
- GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+ # - name: Test
+ # id: ggml-ci
+ # run: |
+ # amd-smi static
+ # GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]
@@ -1675,6 +1682,337 @@ jobs:
run: |
GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+ ubuntu-cpu-cmake-riscv64-native:
+ runs-on: RISCV64
+
+ steps:
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+
+ # Install necessary packages
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+
+ # Set gcc-14 and g++-14 as the default compilers
+ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+ sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+ sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+ sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+ # Install Rust stable version
+ rustup install stable
+ rustup default stable
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: Check environment
+ run: |
+ uname -a
+ gcc --version
+ g++ --version
+ ldd --version
+ cmake --version
+ rustc --version
+
+ - name: Setup ccache
+ run: |
+ # Set unique cache directory for this job
+ export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
+ mkdir -p "$CCACHE_DIR"
+
+ # Configure ccache for optimal performance
+ ccache --set-config=max_size=5G
+ ccache --set-config=compression=true
+ ccache --set-config=compression_level=6
+ ccache --set-config=cache_dir="$CCACHE_DIR"
+
+ # Enable more aggressive caching
+ ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+ ccache --set-config=hash_dir=false
+
+ # Export for subsequent steps (PATH entries are prepended via GITHUB_PATH, not by overriding PATH in GITHUB_ENV)
+ echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+ echo "/usr/lib/ccache" >> $GITHUB_PATH
+
+ - name: Build
+ id: cmake_build
+ run: |
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DGGML_OPENMP=OFF \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=ON \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DGGML_RPC=ON \
+ -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+ cmake --build build --config Release -j $(nproc)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L 'main|curl' --verbose --timeout 900
+
+ - name: Test llama2c conversion
+ id: llama2c_test
+ run: |
+ cd build
+ echo "Fetch tokenizer"
+ wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+ echo "Fetch llama2c model"
+ wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+ ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+ ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+ ubuntu-cmake-sanitizer-riscv64-native:
+ runs-on: RISCV64
+
+ continue-on-error: true
+
+ strategy:
+ matrix:
+ sanitizer: [ADDRESS, THREAD, UNDEFINED]
+ build_type: [Debug]
+
+ steps:
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+
+ # Install necessary packages
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+
+ # Set gcc-14 and g++-14 as the default compilers
+ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+ sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+ sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+ sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+ # Install Rust stable version
+ rustup install stable
+ rustup default stable
+
+ - name: GCC version check
+ run: |
+ gcc --version
+ g++ --version
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: Setup ccache
+ run: |
+ # Unique cache directory per matrix combination
+ export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
+ mkdir -p "$CCACHE_DIR"
+
+ # Configure ccache
+ ccache --set-config=max_size=5G
+ ccache --set-config=compression=true
+ ccache --set-config=compression_level=6
+ ccache --set-config=cache_dir="$CCACHE_DIR"
+ ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+ ccache --set-config=hash_dir=false
+
+ # Export for subsequent steps (PATH entries are prepended via GITHUB_PATH, not by overriding PATH in GITHUB_ENV)
+ echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+ echo "/usr/lib/ccache" >> $GITHUB_PATH
+
+ - name: Build
+ id: cmake_build
+ if: ${{ matrix.sanitizer != 'THREAD' }}
+ run: |
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+ -DGGML_OPENMP=ON \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=OFF \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+ -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+ - name: Build (no OpenMP)
+ id: cmake_build_no_openmp
+ if: ${{ matrix.sanitizer == 'THREAD' }}
+ run: |
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+ -DGGML_OPENMP=OFF \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=OFF \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+ -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose --timeout 900
+
+
+ ubuntu-llguidance-riscv64-native:
+ runs-on: RISCV64
+ steps:
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+
+ # Install necessary packages
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+
+ # Set gcc-14 and g++-14 as the default compilers
+ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+ sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+ sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+ sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+ # Install Rust stable version
+ rustup install stable
+ rustup default stable
+
+ - name: GCC version check
+ run: |
+ gcc --version
+ g++ --version
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: Setup ccache
+ run: |
+ export CCACHE_DIR="$HOME/.ccache/llguidance-riscv64"
+ mkdir -p "$CCACHE_DIR"
+
+ ccache --set-config=max_size=5G
+ ccache --set-config=compression=true
+ ccache --set-config=compression_level=6
+ ccache --set-config=cache_dir="$CCACHE_DIR"
+ ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+ ccache --set-config=hash_dir=false
+
+ echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+ echo "/usr/lib/ccache" >> $GITHUB_PATH
+
+ - name: Build
+ id: cmake_build
+ run: |
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DGGML_OPENMP=OFF \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=OFF \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DLLAMA_LLGUIDANCE=ON \
+ -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+ cmake --build build --config Release -j $(nproc)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose --timeout 900
+
+
+ ubuntu-cmake-rpc-riscv64-native:
+ runs-on: RISCV64
+
+ continue-on-error: true
+
+ steps:
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+
+ # Install necessary packages
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+
+ # Set gcc-14 and g++-14 as the default compilers
+ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+ sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+ sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+ sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+ # Install Rust stable version
+ rustup install stable
+ rustup default stable
+
+ - name: GCC version check
+ run: |
+ gcc --version
+ g++ --version
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: Setup ccache
+ run: |
+ export CCACHE_DIR="$HOME/.ccache/rpc-riscv64"
+ mkdir -p "$CCACHE_DIR"
+
+ ccache --set-config=max_size=5G
+ ccache --set-config=compression=true
+ ccache --set-config=compression_level=6
+ ccache --set-config=cache_dir="$CCACHE_DIR"
+ ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+ ccache --set-config=hash_dir=false
+
+ echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+ echo "/usr/lib/ccache" >> $GITHUB_PATH
+
+ - name: Build
+ id: cmake_build
+ run: |
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DGGML_OPENMP=OFF \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=ON \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+ -DGGML_RPC=ON
+
+ cmake --build build --config Release -j $(nproc)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose
+
ggml-ci-arm64-graviton4-kleidiai:
runs-on: ah-ubuntu_22_04-c8g_8x
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 0d5739c24b..77aec20c11 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -66,14 +66,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+ zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- - name: Upload artifacts
+ - name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip
+ - name: Upload artifacts (tar)
+ uses: actions/upload-artifact@v4
+ with:
+ path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
+ name: llama-bin-macos-arm64.tar.gz
+
macOS-x64:
runs-on: macos-15-intel
@@ -120,14 +127,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+ zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- - name: Upload artifacts
+ - name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
+ - name: Upload artifacts (tar)
+ uses: actions/upload-artifact@v4
+ with:
+ path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
+ name: llama-bin-macos-x64.tar.gz
+
ubuntu-22-cpu:
strategy:
matrix:
@@ -182,14 +196,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
+ zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
+ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- - name: Upload artifacts
+ - name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
name: llama-bin-ubuntu-${{ matrix.build }}.zip
+ - name: Upload artifacts (tar)
+ uses: actions/upload-artifact@v4
+ with:
+ path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
+ name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
+
ubuntu-22-vulkan:
runs-on: ubuntu-22.04
@@ -235,14 +256,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
+ zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
+ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- - name: Upload artifacts
+ - name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
name: llama-bin-ubuntu-vulkan-x64.zip
+ - name: Upload artifacts (tar)
+ uses: actions/upload-artifact@v4
+ with:
+ path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
+ name: llama-bin-ubuntu-vulkan-x64.tar.gz
+
windows-cpu:
runs-on: windows-2025
@@ -298,7 +326,7 @@ jobs:
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
- 7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
+ 7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
@@ -380,7 +408,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
- 7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
+ 7z a -snl llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
- name: Upload artifacts
uses: actions/upload-artifact@v4
@@ -393,7 +421,7 @@ jobs:
strategy:
matrix:
- cuda: ['12.4']
+ cuda: ['12.4', '13.1']
steps:
- name: Clone
@@ -434,7 +462,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
- 7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
+ 7z a -snl llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
- name: Upload artifacts
uses: actions/upload-artifact@v4
@@ -448,6 +476,7 @@ jobs:
$dst='.\build\bin\cudart\'
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+ robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime
@@ -517,6 +546,8 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
@@ -526,7 +557,7 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
- 7z a llama-bin-win-sycl-x64.zip ./build/bin/*
+ 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package
uses: actions/upload-artifact@v4
@@ -632,7 +663,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
- 7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
+ 7z a -snl llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
@@ -685,58 +716,20 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
- zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+ zip -y -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+ tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework
- - name: Upload artifacts
+ - name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
- name: llama-${{ steps.tag.outputs.name }}-xcframework
+ name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
- openEuler-cann:
- strategy:
- matrix:
- arch: [x86, aarch64]
- chip_type: ['910b', '310p']
- build: ['Release']
- runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
- container: ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
- steps:
- - name: Checkout
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
-
- - name: Dependencies
- run: |
- yum update -y
- yum install -y git gcc gcc-c++ make cmake libcurl-devel
- git config --global --add safe.directory "$GITHUB_WORKSPACE"
-
- - name: Build
- run: |
- export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-
- cmake -S . -B build \
- -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
- -DGGML_CANN=on \
- -DSOC_TYPE=ascend${{ matrix.chip_type }}
- cmake --build build -j $(nproc)
-
- - name: Determine tag name
- id: tag
- uses: ./.github/actions/get-tag-name
-
- - name: Pack artifacts
- run: |
- cp LICENSE ./build/bin/
- zip -r llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip ./build/bin/*
-
- - name: Upload artifacts
+ - name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
- path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
- name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
+ path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
+ name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -759,7 +752,6 @@ jobs:
- macOS-arm64
- macOS-x64
- ios-xcode-build
- - openEuler-cann
steps:
- name: Clone
@@ -814,6 +806,7 @@ jobs:
echo "Moving other artifacts..."
mv -v artifact/*.zip release
+ mv -v artifact/*.tar.gz release
- name: Create release
id: create_release
@@ -822,6 +815,34 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.tag.outputs.name }}
+ body: |
+ > [!WARNING]
+ > **Release Format Update**: Linux releases will soon use .tar.gz archives instead of .zip. Please make the necessary changes to your deployment scripts.
+
+
+
+ ${{ github.event.head_commit.message }}
+
+
+
+ **macOS/iOS:**
+ - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
+ - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
+ - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz)
+
+ **Linux:**
+ - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
+ - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
+ - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
+
+ **Windows:**
+ - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
+ - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
+ - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip)
+ - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip)
+ - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
+ - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
+ - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
- name: Upload release
id: upload_release
@@ -833,7 +854,7 @@ jobs:
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./release')) {
- if (path.extname(file) === '.zip') {
+ if (path.extname(file) === '.zip' || file.endsWith('.tar.gz')) {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
owner: context.repo.owner,
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index ebcd6424bc..a57d0e8b1c 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -56,7 +56,7 @@ jobs:
curl \
wget \
language-pack-en \
- libcurl4-openssl-dev
+ libssl-dev
- name: Clone
id: checkout
@@ -242,7 +242,7 @@ jobs:
curl \
wget \
language-pack-en \
- libcurl4-openssl-dev
+ libssl-dev
- name: Clone
id: checkout
@@ -283,6 +283,8 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
@@ -295,6 +297,8 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
@@ -306,6 +310,8 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
@@ -345,16 +351,10 @@ jobs:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- - name: libCURL
- id: get_libcurl
- uses: ./.github/actions/windows-setup-curl
-
- name: Build
id: cmake_build
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
- cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+ cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
@@ -368,13 +368,6 @@ jobs:
run: |
pip install -r tools/server/tests/requirements.txt
- - name: Copy Libcurl
- id: prepare_libcurl
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
- run: |
- cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
-
- name: Tests
id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
diff --git a/.github/workflows/winget.yml b/.github/workflows/winget.yml
index 5c28615595..d3d9be23ce 100644
--- a/.github/workflows/winget.yml
+++ b/.github/workflows/winget.yml
@@ -9,6 +9,7 @@ jobs:
update:
name: Update Winget Package
runs-on: ubuntu-latest
+ if: github.repository_owner == 'ggml-org'
steps:
- name: Install cargo binstall
diff --git a/.gitignore b/.gitignore
index 8575a141c4..428f084110 100644
--- a/.gitignore
+++ b/.gitignore
@@ -134,3 +134,5 @@ poetry.toml
# IDE
/*.code-workspace
/.windsurf/
+# emscripten
+a.out.*
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3278c4a72c..c231ec0e3f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,10 +33,24 @@ endif()
option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
+option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
+
if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
- option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
+ # Use 64-bit memory to support backend_get_memory queries
+ # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
+ if (LLAMA_WASM_MEM64)
+ add_compile_options("-sMEMORY64=1")
+ add_link_options("-sMEMORY64=1")
+ endif()
+ add_link_options("-sALLOW_MEMORY_GROWTH=1")
+
+ option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
+ option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
+ if (LLAMA_BUILD_HTML)
+ set(CMAKE_EXECUTABLE_SUFFIX ".html")
+ endif()
else()
if (MINGW)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
@@ -58,6 +72,12 @@ if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
endif()
+if (LLAMA_STANDALONE)
+ # msbuild: turn on parallel builds through the Visual Studio project globals
+ list(APPEND CMAKE_VS_GLOBALS
+ UseMultiToolTask=true EnforceProcessCountAcrossBuilds=true)
+endif()
+
if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
else()
@@ -179,11 +199,6 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
-if (MINGW)
- # Target Windows 8 for PrefetchVirtualMemory
- add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
#
# build the library
#
diff --git a/CODEOWNERS b/CODEOWNERS
index 908d13a35b..8e62a36e81 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -2,23 +2,25 @@
# multiplie collaborators per item can be specified
/.devops/*.Dockerfile @ngxson
-/.github/actions/ @slaren @CISC
+/.github/actions/ @CISC
/.github/workflows/ @CISC
-/.github/workflows/release.yml @slaren
-/.github/workflows/winget.yml @slaren
/ci/ @ggerganov
/cmake/ @ggerganov
/common/CMakeLists.txt @ggerganov
-/common/arg.* @ggerganov @ericcurtin
+/common/arg.* @ggerganov
/common/base64.hpp.* @ggerganov
/common/build-info.* @ggerganov
+/common/chat.* @pwilkin
+/common/chat-peg-parser.* @aldehir
/common/common.* @ggerganov
/common/console.* @ggerganov
/common/http.* @angt
/common/llguidance.* @ggerganov
/common/log.* @ggerganov
+/common/peg-parser.* @aldehir
/common/sampling.* @ggerganov
/common/speculative.* @ggerganov
+/common/unicode.* @aldehir
/convert_*.py @CISC
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
@@ -40,21 +42,14 @@
/examples/passkey/ @ggerganov
/examples/retrieval/ @ggerganov
/examples/save-load-state/ @ggerganov
-/examples/simple-chat/ @slaren
-/examples/simple/ @slaren
/examples/speculative-simple/ @ggerganov
/examples/speculative/ @ggerganov
/ggml/cmake/ @ggerganov
-/ggml/include/ @ggerganov @slaren
-/ggml/src/ggml-alloc.c @slaren
-/ggml/src/ggml-backend* @slaren
-/ggml/src/ggml-blas/ @slaren
-/ggml/src/ggml-common.h @ggerganov @slaren
-/ggml/src/ggml-cpu/ @ggerganov @slaren
+/ggml/include/ @ggerganov
+/ggml/src/ggml-common.h @ggerganov
+/ggml/src/ggml-cpu/ @ggerganov
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
-/ggml/src/ggml-cuda/common.cuh @slaren
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
-/ggml/src/ggml-cuda/ggml-cuda.cu @slaren
/ggml/src/ggml-cuda/mmf.* @JohannesGaessler @am17an
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvf.* @JohannesGaessler
@@ -62,19 +57,19 @@
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
/ggml/src/ggml-hip/ @IMbackK
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
-/ggml/src/ggml-impl.h @ggerganov @slaren
+/ggml/src/ggml-impl.h @ggerganov
/ggml/src/ggml-metal/ @ggerganov
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
/ggml/src/ggml-hexagon/ @max-krasnyansky @lhez
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @rgerganov
-/ggml/src/ggml-threading.* @ggerganov @slaren
+/ggml/src/ggml-threading.* @ggerganov
/ggml/src/ggml-vulkan/ @0cc4m
/ggml/src/ggml-webgpu/ @reeselevine
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml.c @ggerganov @slaren
-/ggml/src/ggml.cpp @ggerganov @slaren
+/ggml/src/ggml.c @ggerganov
+/ggml/src/ggml.cpp @ggerganov
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
/gguf-py/ @CISC
/media/ @ggerganov
@@ -86,28 +81,22 @@
/src/llama-arch.* @CISC
/src/llama-chat.* @ngxson
/src/llama-graph.* @CISC
-/src/llama-model-loader.* @slaren
/src/llama-model.* @CISC
/src/llama-vocab.* @CISC
/src/models/ @CISC
/tests/ @ggerganov
-/tests/test-backend-ops.cpp @slaren
-/tests/test-thread-safety.cpp @slaren
+/tests/test-chat-* @pwilkin
/tools/batched-bench/ @ggerganov
-/tools/llama-bench/ @slaren
/tools/main/ @ggerganov
/tools/mtmd/ @ngxson
/tools/perplexity/ @ggerganov
/tools/quantize/ @ggerganov
/tools/rpc/ @rgerganov
-/tools/run/ @ericcurtin
-/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
+/tools/server/* @ngxson @ggerganov # no subdir
/tools/server/webui/ @allozaur
/tools/tokenize/ @ggerganov
/tools/tts/ @ggerganov
/vendor/ @ggerganov
-/.clang-format @slaren
-/.clang-tidy @slaren
/AUTHORS @ggerganov
/CMakeLists.txt @ggerganov
/CONTRIBUTING.md @ggerganov
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b808fa31ea..4545ff8f9a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,10 +15,12 @@ The project differentiates between 3 levels of contributors:
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
+- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
-- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
+- If your PR becomes stale, rebase it on top of the latest `master` to get the maintainers' attention
- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
+- Using AI to generate PRs is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before publishing the PR. Note that trivial tab autocompletions do not require disclosure.
# Pull requests (for maintainers)
diff --git a/README.md b/README.md
index 2962783585..b7d24c9dd7 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ range of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2, AVX512 and AMX support for x86 architectures
-- RVV, ZVFH, ZFH and ZICBOP support for RISC-V architectures
+- RVV, ZVFH, ZFH, ZICBOP and ZIHINTPAUSE support for RISC-V architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
- Vulkan and SYCL backend support
@@ -242,6 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
+- [unslothai/unsloth](https://github.com/unslothai/unsloth) – 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0)
@@ -275,6 +276,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
| [HIP](docs/build.md#hip) | AMD GPU |
+| [ZenDNN](docs/build.md#zendnn) | AMD CPU |
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
@@ -345,19 +347,6 @@ To learn more about model quantization, [read this documentation](tools/quantize
--
- Run simple text completion
-
- To disable conversation mode explicitly, use `-no-cnv`
-
- ```bash
- llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
-
- # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
- ```
-
-
-
-
Constrain the output with a custom grammar
@@ -612,3 +601,4 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
+- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
diff --git a/SECURITY.md b/SECURITY.md
index 9749e95b71..9c86ae91b5 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -65,4 +65,6 @@ However, If you have discovered a security vulnerability in this project, please
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
+Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
+
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
diff --git a/ci/run.sh b/ci/run.sh
index 3fec8e9110..0676504b3e 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -45,7 +45,7 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@@ -398,18 +398,18 @@ function gg_run_qwen3_0_6b {
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
- (time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
- (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
- (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
- (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
- (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
- (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
- (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
- (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
- (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
- (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
- (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
- (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+ (time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
if [ -z ${GG_BUILD_NO_BF16} ]; then
@@ -428,10 +428,10 @@ function gg_run_qwen3_0_6b {
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@@ -523,8 +523,8 @@ function gg_run_embd_bge_small {
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
- (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
- (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e
}
@@ -564,7 +564,7 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf"
# for this model, the SEP token is ""
- (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+ (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
# sample output
# rerank score 0: 0.029
diff --git a/cmake/build-info.cmake b/cmake/build-info.cmake
index 75c78222f2..c7005950c5 100644
--- a/cmake/build-info.cmake
+++ b/cmake/build-info.cmake
@@ -39,26 +39,10 @@ if(Git_FOUND)
endif()
endif()
-if(MSVC)
- set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
- if (CMAKE_VS_PLATFORM_NAME)
- set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
- else()
- set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
- endif()
-else()
- execute_process(
- COMMAND ${CMAKE_C_COMPILER} --version
- OUTPUT_VARIABLE OUT
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
- string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
- set(BUILD_COMPILER ${OUT})
+# Compiler description: use the C compiler ID and version that CMake already holds,
+# for every toolchain. The removed branch did this only for MSVC and ran
+# `${CMAKE_C_COMPILER} --version` for everything else.
+set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
- execute_process(
- COMMAND ${CMAKE_C_COMPILER} -dumpmachine
- OUTPUT_VARIABLE OUT
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
- set(BUILD_TARGET ${OUT})
+# Build target: the Visual Studio platform name when that variable is set,
+# otherwise "<system name> <system processor>" (replaces the `-dumpmachine` query).
+if(CMAKE_VS_PLATFORM_NAME)
+ set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+else()
+ set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
endif()
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index bb168e8358..0182767c2b 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -52,6 +52,8 @@ add_library(${TARGET} STATIC
chat-parser.h
chat-parser-xml-toolcall.h
chat-parser-xml-toolcall.cpp
+ chat-peg-parser.cpp
+ chat-peg-parser.h
chat.cpp
chat.h
common.cpp
@@ -69,12 +71,18 @@ add_library(${TARGET} STATIC
log.h
ngram-cache.cpp
ngram-cache.h
+ peg-parser.cpp
+ peg-parser.h
+ preset.cpp
+ preset.h
regex-partial.cpp
regex-partial.h
sampling.cpp
sampling.h
speculative.cpp
speculative.h
+ unicode.cpp
+ unicode.h
)
if (BUILD_SHARED_LIBS)
diff --git a/common/arg.cpp b/common/arg.cpp
index 430ab45dfe..5528eeb169 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -30,6 +30,7 @@
#include // for hardware_concurrency
#include
+#ifndef __EMSCRIPTEN__
#ifdef __linux__
#include
#elif defined(_WIN32)
@@ -41,13 +42,17 @@
#else
#include
#endif
+#endif
+
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
using json = nlohmann::ordered_json;
+using namespace common_arg_utils;
static std::initializer_list mmproj_examples = {
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_SERVER,
+ LLAMA_EXAMPLE_CLI,
};
static std::string read_file(const std::string & fname) {
@@ -60,6 +65,15 @@ static std::string read_file(const std::string & fname) {
return content;
}
+// Returns the option list that common_params_parser_init() produces for
+// LLAMA_EXAMPLE_SERVER.
+// The list is built once, on the first call, by an immediately-invoked lambda that
+// initialises a function-local static; later calls return a reference to that same object.
+// A throwaway common_params instance only drives the initialiser, and no usage
+// printer is supplied (nullptr).
+// NOTE: the first call runs common_params_parser_init(), which also queries
+// tty_can_use_colors() and calls ggml_backend_load_all().
+static const std::vector & get_common_arg_defs() {
+ static const std::vector options = [] {
+ common_params params;
+ auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
+ return ctx.options;
+ }();
+ return options;
+}
+
common_arg & common_arg::set_examples(std::initializer_list examples) {
this->examples = examples;
return *this;
@@ -130,7 +144,7 @@ static std::vector break_str_into_lines(std::string input, size_t m
return result;
}
-std::string common_arg::to_string() {
+std::string common_arg::to_string() const {
// params for printing to console
const static int n_leading_spaces = 40;
const static int n_char_per_line_help = 70; // TODO: detect this based on current console
@@ -212,13 +226,13 @@ struct handle_model_result {
static handle_model_result common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
- const std::string & model_path_default,
bool offline) {
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
{
if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
model.path = common_docker_resolve_model(model.docker_repo);
+ model.name = model.docker_repo; // set name for consistency
} else if (!model.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (model.hf_file.empty()) {
@@ -227,7 +241,8 @@ static handle_model_result common_params_handle_model(
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1); // built without CURL, error message already printed
}
- model.hf_repo = auto_detected.repo;
+ model.name = model.hf_repo; // repo name with tag
+ model.hf_repo = auto_detected.repo; // repo name without tag
model.hf_file = auto_detected.ggufFile;
if (!auto_detected.mmprojFile.empty()) {
result.found_mmproj = true;
@@ -257,8 +272,6 @@ static handle_model_result common_params_handle_model(
model.path = fs_get_cache_file(string_split(f, '/').back());
}
- } else if (model.path.empty()) {
- model.path = model_path_default;
}
}
@@ -405,7 +418,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// handle model and download
{
- auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
+ auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
if (params.no_mmproj) {
params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -415,12 +428,18 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// only download mmproj if the current example is using it
for (auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
- common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
+ common_params_handle_model(params.mmproj, params.hf_token, params.offline);
break;
}
}
- common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
- common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
+ common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
+ common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
+ }
+
+ // model is required (except for server)
+ // TODO @ngxson : maybe show a list of available models in CLI in this case
+ if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
+ throw std::invalid_argument("error: --model is required\n");
}
if (params.escape) {
@@ -460,6 +479,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
));
}
+ common_log_set_verbosity_thold(params.verbosity);
+
return true;
}
@@ -636,6 +657,53 @@ static void add_rpc_devices(const std::string & servers) {
}
}
+// Parse argv into out_map, keyed by option definition, with the raw string value
+// that followed the option on the command line. This function does not invoke
+// any option handler itself.
+//
+// - the option table is the one common_params_parser_init() builds for `ex`,
+//   using a throwaway common_params
+// - arguments starting with "--" have every '_' replaced by '-' before lookup
+// - an option with a value hint consumes the next argv entry as its value;
+//   an option without one is stored with an empty string
+// - options that declare a second value hint are rejected
+//
+// Throws std::invalid_argument for an unknown argument, a missing value, or a
+// two-value option. Returns true whenever it does not throw.
+bool common_params_parse(int argc, char ** argv, llama_example ex, std::map & out_map) {
+ common_params dummy_params;
+ common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
+
+ // index every spelling of every option; the stored pointers refer to entries of ctx_arg.options
+ std::unordered_map arg_to_options;
+ for (auto & opt : ctx_arg.options) {
+ for (const auto & arg : opt.args) {
+ arg_to_options[arg] = &opt;
+ }
+ }
+
+ // TODO @ngxson : find a way to deduplicate this code
+
+ // handle command line arguments
+ // check_arg(i): throws unless argv[i+1] exists, i.e. the option at index i has a value after it
+ auto check_arg = [&](int i) {
+ if (i+1 >= argc) {
+ throw std::invalid_argument("expected value for argument");
+ }
+ };
+
+ // argv[0] is skipped
+ for (int i = 1; i < argc; i++) {
+ const std::string arg_prefix = "--";
+
+ std::string arg = argv[i];
+ // long options only: '_' and '-' are treated as the same character
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+ std::replace(arg.begin(), arg.end(), '_', '-');
+ }
+ if (arg_to_options.find(arg) == arg_to_options.end()) {
+ throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+ }
+ // copy of the option definition; it is used as the key in out_map below
+ auto opt = *arg_to_options[arg];
+ std::string val;
+ if (opt.value_hint != nullptr) {
+ // arg with single value
+ check_arg(i);
+ val = argv[++i];
+ }
+ if (opt.value_hint_2 != nullptr) {
+ // TODO: support arg with 2 values
+ throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
+ }
+ // a repeated option overwrites the value stored earlier
+ out_map[opt] = val;
+ }
+
+ return true;
+}
+
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -681,19 +749,21 @@ static std::string list_builtin_chat_templates() {
return msg.str();
}
-static bool is_truthy(const std::string & value) {
+// true when value is exactly "on", "enabled" or "1" (plain string equality, so case-sensitive)
+bool common_arg_utils::is_truthy(const std::string & value) {
return value == "on" || value == "enabled" || value == "1";
}
-static bool is_falsey(const std::string & value) {
+// true when value is exactly "off", "disabled" or "0" (plain string equality, so case-sensitive)
+bool common_arg_utils::is_falsey(const std::string & value) {
return value == "off" || value == "disabled" || value == "0";
}
-static bool is_autoy(const std::string & value) {
+// true when value is exactly "auto" or "-1" (plain string equality, so case-sensitive)
+bool common_arg_utils::is_autoy(const std::string & value) {
return value == "auto" || value == "-1";
}
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+ params.use_color = tty_can_use_colors();
+
// load dynamic backends
ggml_backend_load_all();
@@ -774,14 +844,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.display_prompt = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
- {"-co", "--color"},
- string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
- [](common_params & params) {
- params.use_color = true;
+ // --color now takes a value instead of being a bare switch
+ {"-co", "--color"}, "[on|off|auto]",
+ "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
+ "'auto' enables colors when output is to a terminal",
+ [](common_params & params, const std::string & value) {
+ // accepted spellings are whatever is_truthy / is_falsey / is_autoy recognise;
+ // 'auto' calls tty_can_use_colors(), the same call that seeds params.use_color
+ // at the top of common_params_parser_init; anything else is rejected
+ if (is_truthy(value)) {
+ params.use_color = true;
+ } else if (is_falsey(value)) {
+ params.use_color = false;
+ } else if (is_autoy(value)) {
+ params.use_color = tty_can_use_colors();
+ } else {
+ throw std::invalid_argument(
+ string_format("error: unknown value for --color: '%s'\n", value.c_str()));
+ }
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
add_opt(common_arg(
{"-t", "--threads"}, "N",
string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -914,7 +994,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-n", "--predict", "--n-predict"}, "N",
string_format(
- ex == LLAMA_EXAMPLE_MAIN
+ ex == LLAMA_EXAMPLE_COMPLETION
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
: "number of tokens to predict (default: %d, -1 = infinity)",
params.n_predict),
@@ -958,7 +1038,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.n_ctx_checkpoints = value;
}
- ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--cache-ram", "-cram"}, "N",
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
@@ -966,7 +1046,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.cache_ram_mib = value;
}
- ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--kv-unified", "-kvu"},
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -974,21 +1054,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.kv_unified = true;
}
- ).set_env("LLAMA_ARG_KV_SPLIT"));
+ ).set_env("LLAMA_ARG_KV_UNIFIED"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
[](common_params & params) {
params.ctx_shift = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg(
{"--context-shift"},
string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
[](common_params & params) {
params.ctx_shift = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
add_opt(common_arg(
{"--chunks"}, "N",
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1008,7 +1088,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
} else {
throw std::runtime_error(
- string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+ string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
}
}).set_env("LLAMA_ARG_FLASH_ATTN"));
add_opt(common_arg(
@@ -1024,7 +1104,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -1033,6 +1113,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.no_perf = true;
}
).set_env("LLAMA_ARG_NO_PERF"));
+    // --no-show-timings (CLI only): turns off the timing information shown after each response.
+    // The "(default: ...)" suffix reports the state of this negative flag, i.e. the
+    // inverse of params.show_timings, in the same way --no-perf reports params.no_perf.
+    add_opt(common_arg(
+        {"--no-show-timings"},
+        string_format("disable timing information after each response (default: %s)", params.show_timings ? "false" : "true"),
+        [](common_params & params) {
+            params.show_timings = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
add_opt(common_arg(
{"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)",
@@ -1054,7 +1141,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.system_prompt.pop_back();
}
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--in-file"}, "FNAME",
"an input file (repeat to specify multiple files)",
@@ -1102,42 +1189,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.n_print = value;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache"}, "FNAME",
"file to cache prompt state for faster startup (default: none)",
[](common_params & params, const std::string & value) {
params.path_prompt_cache = value;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache-all"},
"if specified, saves user input and generations to cache as well\n",
[](common_params & params) {
params.prompt_cache_all = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache-ro"},
"if specified, uses the prompt cache but does not update it",
[](common_params & params) {
params.prompt_cache_ro = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-r", "--reverse-prompt"}, "PROMPT",
"halt generation at PROMPT, return control in interactive mode\n",
[](common_params & params, const std::string & value) {
params.antiprompt.emplace_back(value);
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-sp", "--special"},
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
[](common_params & params) {
params.special = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-cnv", "--conversation"},
"run in conversation mode:\n"
@@ -1147,14 +1234,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-no-cnv", "--no-conversation"},
"force disable conversation mode (default: false)",
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-st", "--single-turn"},
"run conversation for a single turn only, then exit when done\n"
@@ -1163,28 +1250,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.single_turn = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-i", "--interactive"},
string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
[](common_params & params) {
params.interactive = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-if", "--interactive-first"},
string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
[](common_params & params) {
params.interactive_first = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-mli", "--multiline-input"},
"allows you to write or paste multiple lines without ending each in '\\'",
[](common_params & params) {
params.multiline_input = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--in-prefix-bos"},
"prefix BOS to user inputs, preceding the `--in-prefix` string",
@@ -1192,7 +1279,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_prefix_bos = true;
params.enable_chat_template = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--in-prefix"}, "STRING",
"string to prefix user inputs with (default: empty)",
@@ -1200,7 +1287,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_prefix = value;
params.enable_chat_template = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--in-suffix"}, "STRING",
"string to suffix after user inputs with (default: empty)",
@@ -1208,14 +1295,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_suffix = value;
params.enable_chat_template = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--no-warmup"},
"skip warming up the model with an empty run",
[](common_params & params) {
params.warmup = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@@ -1232,6 +1319,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
const auto sampler_names = string_split(value, ';');
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
}
).set_sparam());
add_opt(common_arg(
@@ -1261,6 +1349,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.sampling.temp = std::stof(value);
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
}
).set_sparam());
add_opt(common_arg(
@@ -1268,6 +1357,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
[](common_params & params, int value) {
params.sampling.top_k = value;
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
}
).set_sparam());
add_opt(common_arg(
@@ -1275,6 +1365,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
[](common_params & params, const std::string & value) {
params.sampling.top_p = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
}
).set_sparam());
add_opt(common_arg(
@@ -1282,6 +1373,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
[](common_params & params, const std::string & value) {
params.sampling.min_p = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
}
).set_sparam());
add_opt(common_arg(
@@ -1296,6 +1388,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
[](common_params & params, const std::string & value) {
params.sampling.xtc_probability = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
}
).set_sparam());
add_opt(common_arg(
@@ -1303,6 +1396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
[](common_params & params, const std::string & value) {
params.sampling.xtc_threshold = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
}
).set_sparam());
add_opt(common_arg(
@@ -1321,6 +1415,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
params.sampling.penalty_last_n = value;
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
}
).set_sparam());
add_opt(common_arg(
@@ -1328,6 +1423,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
[](common_params & params, const std::string & value) {
params.sampling.penalty_repeat = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
}
).set_sparam());
add_opt(common_arg(
@@ -1425,6 +1521,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
[](common_params & params, int value) {
params.sampling.mirostat = value;
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
}
).set_sparam());
add_opt(common_arg(
@@ -1432,6 +1529,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
[](common_params & params, const std::string & value) {
params.sampling.mirostat_eta = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
}
).set_sparam());
add_opt(common_arg(
@@ -1439,6 +1537,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
[](common_params & params, const std::string & value) {
params.sampling.mirostat_tau = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
}
).set_sparam());
add_opt(common_arg(
@@ -1594,14 +1693,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.grp_attn_n = value;
}
- ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
+ ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
add_opt(common_arg(
{"-gaw", "--grp-attn-w"}, "N",
string_format("group-attention width (default: %d)", params.grp_attn_w),
[](common_params & params, int value) {
params.grp_attn_w = value;
}
- ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-nkvo", "--no-kv-offload"},
"disable KV offload",
@@ -1757,7 +1856,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
add_opt(common_arg(
- {"--mmproj"}, "FILE",
+ {"-mm", "--mmproj"}, "FILE",
"path to a multimodal projector file. see tools/mtmd/README.md\n"
"note: if -hf is used, this argument can be omitted",
[](common_params & params, const std::string & value) {
@@ -1765,7 +1864,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
add_opt(common_arg(
- {"--mmproj-url"}, "URL",
+ {"-mmu", "--mmproj-url"}, "URL",
"URL to a multimodal projector file. see tools/mtmd/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.url = value;
@@ -1791,7 +1890,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
}
- ).set_examples({LLAMA_EXAMPLE_MTMD}));
+ ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--image-min-tokens"}, "N",
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@@ -1884,7 +1983,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--cpu-moe", "-cmoe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
@@ -1913,7 +2012,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
add_opt(common_arg(
{"--n-cpu-moe-draft", "-ncmoed"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
@@ -1927,7 +2026,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
@@ -2072,11 +2171,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-m", "--model"}, "FNAME",
ex == LLAMA_EXAMPLE_EXPORT_LORA
- ? std::string("model path from which to load base model")
- : string_format(
- "model path (default: `models/$filename` with filename from `--hf-file` "
- "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
- ),
+ ? "model path from which to load base model"
+ : "model path to load",
[](common_params & params, const std::string & value) {
params.model.path = value;
}
@@ -2409,7 +2505,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
add_opt(common_arg(
{"-to", "--timeout"}, "N",
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -2468,19 +2564,71 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"path to save slot kv cache (default: disabled)",
[](common_params & params, const std::string & value) {
params.slot_save_path = value;
+ if (!fs_is_directory(params.slot_save_path)) {
+ throw std::invalid_argument("not a directory: " + value);
+ }
// if doesn't end with DIRECTORY_SEPARATOR, add it
if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
params.slot_save_path += DIRECTORY_SEPARATOR;
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--media-path"}, "PATH",
+ "directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.media_path = value;
+ if (!fs_is_directory(params.media_path)) {
+ throw std::invalid_argument("not a directory: " + value);
+ }
+ // if doesn't end with DIRECTORY_SEPARATOR, add it
+ if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
+ params.media_path += DIRECTORY_SEPARATOR;
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--models-dir"}, "PATH",
+ "directory containing models for the router server (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.models_dir = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
+ add_opt(common_arg(
+ {"--models-preset"}, "PATH",
+ "path to INI file containing model presets for the router server (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.models_preset = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
+ add_opt(common_arg(
+ {"--models-max"}, "N",
+ string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
+ [](common_params & params, int value) {
+ params.models_max = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+ add_opt(common_arg(
+ {"--no-models-autoload"},
+ "disables automatic loading of models (default: enabled)",
+ [](common_params & params) {
+ params.models_autoload = false;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
add_opt(common_arg(
{"--jinja"},
- "use jinja template for chat (default: disabled)",
+ string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
[](common_params & params) {
params.use_jinja = true;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
+ add_opt(common_arg(
+ {"--no-jinja"},
+ string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
+ [](common_params & params) {
+ params.use_jinja = false;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -2491,7 +2639,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.reasoning_format = common_reasoning_format_from_name(value);
}
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(
{"--reasoning-budget"}, "N",
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
@@ -2499,7 +2647,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
params.reasoning_budget = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
@@ -2511,7 +2659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = value;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(
@@ -2523,7 +2671,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = read_file(value);
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(
{"--no-prefill-assistant"},
string_format(
@@ -2554,7 +2702,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.simple_io = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--positive-file"}, "FNAME",
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2614,7 +2762,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params &, const std::string & value) {
common_log_set_file(common_log_main(), value.c_str());
}
- ));
+ ).set_env("LLAMA_LOG_FILE"));
add_opt(common_arg(
{"--log-colors"}, "[on|off|auto]",
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@@ -2628,7 +2776,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
} else {
throw std::invalid_argument(
- string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+ string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
}
}
).set_env("LLAMA_LOG_COLORS"));
@@ -2637,7 +2785,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
[](common_params & params) {
params.verbosity = INT_MAX;
- common_log_set_verbosity_thold(INT_MAX);
}
));
add_opt(common_arg(
@@ -2649,10 +2796,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_OFFLINE"));
add_opt(common_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
- "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
+ string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
+ " - 0: generic output\n"
+ " - 1: error\n"
+ " - 2: warning\n"
+ " - 3: info\n"
+ " - 4: debug\n"
+ "(default: %d)\n", params.verbosity),
[](common_params & params, int value) {
params.verbosity = value;
- common_log_set_verbosity_thold(value);
}
).set_env("LLAMA_LOG_VERBOSITY"));
add_opt(common_arg(
@@ -2785,14 +2937,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.speculative.n_max = value;
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
add_opt(common_arg(
{"--draft-min", "--draft-n-min"}, "N",
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
[](common_params & params, int value) {
params.speculative.n_min = value;
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
add_opt(common_arg(
{"--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@@ -2806,14 +2958,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.speculative.p_min = std::stof(value);
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
add_opt(common_arg(
{"-cd", "--ctx-size-draft"}, "N",
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
[](common_params & params, int value) {
params.speculative.n_ctx = value;
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
add_opt(common_arg(
{"-devd", "--device-draft"}, "",
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2821,7 +2973,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.speculative.devices = parse_device_list(value);
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
"number of layers to store in VRAM for the draft model",
@@ -2833,21 +2985,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
}
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
add_opt(common_arg(
{"-md", "--model-draft"}, "FNAME",
"draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.model.path = value;
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
add_opt(common_arg(
{"--spec-replace"}, "TARGET", "DRAFT",
"translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
[](common_params & params, const std::string & tgt, const std::string & dft) {
params.speculative.replacements.push_back({ tgt, dft });
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
string_format(
@@ -3111,7 +3263,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_jinja = true;
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--gpt-oss-120b-default"},
@@ -3130,7 +3282,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_jinja = true;
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--vision-gemma-4b-default"},
@@ -3141,7 +3293,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_ctx = 0;
params.use_jinja = true;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--vision-gemma-12b-default"},
@@ -3152,7 +3304,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_ctx = 0;
params.use_jinja = true;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
return ctx_arg;
}
diff --git a/common/arg.h b/common/arg.h
index 7ab7e2cea4..219c115e63 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -3,8 +3,10 @@
#include "common.h"
#include
+#include