diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile
index 9d9fabf887..db221b0b81 100644
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -3,7 +3,8 @@
# ==============================================================================
# Define the CANN base image for easier version updates later
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
+ARG CHIP_TYPE=910b
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
# ==============================================================================
# BUILD STAGE
@@ -11,9 +12,6 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build
-# Define the Ascend chip model for compilation. Default is Ascend910B3
-ARG ASCEND_SOC_TYPE=Ascend910B3
-
# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
yum clean all && \
@@ -36,13 +34,14 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# For brevity, only core variables are listed here. You can paste the original ENV list here.
# -- Build llama.cpp --
-# Use the passed ASCEND_SOC_TYPE argument and add general build options
+# Use the passed CHIP_TYPE argument and add general build options
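+# ARG values declared before FROM are not visible inside a build stage, so CHIP_TYPE is re-declared here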
+ARG CHIP_TYPE
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
&& \
cmake -B build \
-DGGML_CANN=ON \
-DCMAKE_BUILD_TYPE=Release \
- -DSOC_TYPE=${ASCEND_SOC_TYPE} \
+ -DSOC_TYPE=ascend${CHIP_TYPE} \
. && \
cmake --build build --config Release -j$(nproc)
@@ -108,11 +107,11 @@ ENTRYPOINT ["/app/tools.sh"]
# ENTRYPOINT ["/app/llama-server"]
### Target: light
-# Lightweight image containing only llama-cli
+# Lightweight image containing only llama-cli and llama-completion
# ==============================================================================
FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
ENTRYPOINT [ "/app/llama-cli" ]
diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile
index 6e16ecda44..b9e84ab986 100644
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile
index 54f793d0a3..fed5863157 100644
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile
index d1a8fbed4c..adebf08229 100644
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
FROM base AS light
COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile
index ef43d78cd2..6581187f32 100644
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
- cmake --build build --config Release --target llama-cli
+ cmake --build build --config Release --target llama-cli && \
+ cmake --build build --config Release --target llama-completion
# TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
ENV LC_ALL=C.utf8
diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec
index 3bbf4a4def..4d42a906b1 100644
--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -37,6 +37,7 @@ make -j GGML_CUDA=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
@@ -68,6 +69,7 @@ rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-completion
%{_bindir}/llama-cuda-server
%{_bindir}/llama-cuda-simple
/usr/lib/systemd/system/llamacuda.service
diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec
index 45902dcf89..0a4f43058d 100644
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -39,6 +39,7 @@ make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
@@ -70,6 +71,7 @@ rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cli
+%{_bindir}/llama-completion
%{_bindir}/llama-server
%{_bindir}/llama-simple
/usr/lib/systemd/system/llama.service
diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile
index faa3500e61..34d6ad9f40 100644
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile
index d6bf28b105..53c3ed8d88 100644
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -94,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile
index b7c9457680..1e66f061d5 100644
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -105,7 +105,7 @@ WORKDIR /llama.cpp/bin
# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
diff --git a/.devops/tools.sh b/.devops/tools.sh
index 8a3a693400..cc5ee17dfd 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
exec ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
exec ./llama-cli "$@"
+elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
+ exec ./llama-completion "$@"
elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
exec ./llama-bench "$@"
elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
@@ -32,8 +34,10 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
else
echo "Unknown command: $arg1"
echo "Available commands: "
- echo " --run (-r): Run a model previously converted into ggml"
- echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+ echo " --run (-r): Run a model (chat) previously converted into ggml"
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin"
+ echo " --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
echo " --bench (-b): Benchmark the performance of the inference for various parameters."
echo " ex: -m model.gguf"
echo " --perplexity (-p): Measure the perplexity of a model over a given text."
diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile
index b6b802a7c6..b37b4f277d 100644
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,9 +1,7 @@
-ARG UBUNTU_VERSION=25.10
+ARG UBUNTU_VERSION=26.04
FROM ubuntu:$UBUNTU_VERSION AS build
-# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
-
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget xz-utils
@@ -52,6 +50,7 @@ WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
+ build-essential \
git \
python3 \
python3-pip \
@@ -69,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml
index c42a14ff83..b815e70a8d 100644
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -11,7 +11,7 @@ body:
(i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
If you encountered the issue while using an external UI (e.g. ollama),
please reproduce your issue using one of the examples/binaries in this repository.
- The `llama-cli` binary can be used for simple and reproducible model inference.
+ The `llama-completion` binary can be used for simple and reproducible model inference.
- type: textarea
id: version
attributes:
@@ -74,9 +74,12 @@ body:
Please give us a summary of the problem and tell us how to reproduce it.
If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
that information would be very much appreciated by us.
+
+ If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
+ If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
placeholder: >
- e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
- When I use -ngl 0 it works correctly.
+ e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
+ With short prompts or `-fa off` it works correctly.
Here are the exact commands that I used: ...
validations:
required: true
diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
index 1904e31fdc..e1bd08ddd2 100644
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -86,6 +86,7 @@ body:
description: >
If applicable, please copy and paste any relevant log output, including any generated text.
This will be automatically formatted into code, so no need for backticks.
+ If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
render: shell
validations:
required: false
diff --git a/.github/actions/windows-setup-cuda/action.yml b/.github/actions/windows-setup-cuda/action.yml
index 5575caeca3..6ad61582a5 100644
--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@@ -65,3 +65,34 @@ runs:
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+ - name: Install Cuda Toolkit 13.1
+ if: ${{ inputs.cuda_version == '13.1' }}
+ shell: pwsh
+ run: |
+ mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
+ choco install unzip -y
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.2.0.9-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.1.68-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.1.80-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.1.68-archive.zip"
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-13.1.78-archive.zip"
+ unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_crt-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cudart-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvcc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvrtc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libcublas-windows-x86_64-13.2.0.9-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libnvvm-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvtx-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_profiler_api-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\visual_studio_integration-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cccl-windows-x86_64-13.1.78-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+ echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+ echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+ echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml
index 36201281f0..c2c6ea12ae 100644
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -291,6 +291,7 @@ jobs:
-DGGML_RVV=ON \
-DGGML_RV_ZFH=ON \
-DGGML_RV_ZICBOP=ON \
+ -DGGML_RV_ZIHINTPAUSE=ON \
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
diff --git a/.github/workflows/build-riscv-native.yml b/.github/workflows/build-riscv-native.yml
deleted file mode 100644
index a3a0b0d663..0000000000
--- a/.github/workflows/build-riscv-native.yml
+++ /dev/null
@@ -1,120 +0,0 @@
-name: Build on RISCV Linux Machine by Cloud-V
-on:
- pull_request:
- workflow_dispatch:
- workflow_call:
-
-jobs:
- debian-13-riscv64-native: # Bianbu 2.2
- runs-on: [self-hosted, RISCV64]
-
- steps:
- - name: Install prerequisites
- run: |
- sudo apt-get update || true
- sudo apt-get install -y libatomic1
- - uses: actions/checkout@v4
- - name: Setup Riscv
- run: |
- sudo apt-get update || true
- sudo apt-get install -y --no-install-recommends \
- build-essential \
- gcc-14-riscv64-linux-gnu \
- g++-14-riscv64-linux-gnu \
- ccache \
- cmake
-
- - name: Setup ccache
- run: |
- mkdir -p $HOME/.ccache
- ccache -M 5G -d $HOME/.ccache
- export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
- export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
- echo "$GITHUB_WORKSPACE"
- echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
- echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
- echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
- echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
- - name: Build
- run: |
- cmake -B build \
- -DLLAMA_CURL=OFF \
- -DCMAKE_BUILD_TYPE=Release \
- -DGGML_OPENMP=OFF \
- -DLLAMA_BUILD_EXAMPLES=ON \
- -DLLAMA_BUILD_TOOLS=ON \
- -DLLAMA_BUILD_TESTS=OFF \
- -DCMAKE_SYSTEM_NAME=Linux \
- -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
- -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
- -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
- -DCMAKE_C_COMPILER_LAUNCHER=ccache \
- -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
- -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
- -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
- -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
- -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
- -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
- cmake --build build --config Release -j $(nproc)
-
- # debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
- # runs-on: [self-hosted, RISCV64]
-
- # steps:
- # - name: Install prerequisites
- # run: |
- # sudo apt-get update || true
- # sudo apt-get install -y libatomic1
- # - uses: actions/checkout@v4
- # - name: Setup Riscv
- # run: |
- # sudo apt-get update || true
- # sudo apt-get install -y --no-install-recommends \
- # build-essential \
- # gcc-14-riscv64-linux-gnu \
- # g++-14-riscv64-linux-gnu \
- # ccache \
- # cmake
- # sudo apt-get upgrade binutils -y
-
- # - name: Setup ccache
- # run: |
- # mkdir -p $HOME/.ccache
- # ccache -M 5G -d $HOME/.ccache
- # export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
- # export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
- # echo "$GITHUB_WORKSPACE"
- # echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
- # echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
- # echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
- # echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
- # - name: Build
- # run: |
- # cmake -B build \
- # -DLLAMA_CURL=OFF \
- # -DCMAKE_BUILD_TYPE=Release \
- # -DGGML_OPENMP=OFF \
- # -DLLAMA_BUILD_EXAMPLES=ON \
- # -DLLAMA_BUILD_TOOLS=ON \
- # -DLLAMA_BUILD_TESTS=OFF \
- # -DCMAKE_SYSTEM_NAME=Linux \
- # -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
- # -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
- # -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
- # -DCMAKE_C_COMPILER_LAUNCHER=ccache \
- # -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
- # -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
- # -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
- # -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
- # -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
- # -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
- # -DGGML_RVV=ON \
- # -DGGML_RV_ZFH=ON \
- # -DGGML_RV_ZICBOP=ON \
- # -DGGML_CPU_RISCV64_SPACEMIT=ON \
- # -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
-
- # cmake --build build --config Release -j $(nproc)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index e3697ffaaa..de3ad06065 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -20,7 +20,8 @@ on:
'**/*.swift',
'**/*.m',
'**/*.metal',
- '**/*.comp'
+ '**/*.comp',
+ '**/*.glsl'
]
pull_request:
@@ -40,7 +41,8 @@ on:
'**/*.swift',
'**/*.m',
'**/*.metal',
- '**/*.comp'
+ '**/*.comp',
+ '**/*.glsl'
]
concurrency:
@@ -68,13 +70,7 @@ jobs:
with:
key: macOS-latest-cmake-arm64
evict-old-files: 1d
-
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
- brew install curl
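+ # only save the ccache cache on pushes to master; other runs just restore it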
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -83,6 +79,8 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=OFF \
-DGGML_METAL_SHADER_DEBUG=ON \
@@ -109,13 +107,7 @@ jobs:
with:
key: macOS-latest-cmake-x64
evict-old-files: 1d
-
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
- brew install curl
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -126,6 +118,8 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
@@ -150,13 +144,7 @@ jobs:
with:
key: macOS-latest-cmake-arm64-webgpu
evict-old-files: 1d
-
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
- brew install curl
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dawn Dependency
id: dawn-depends
@@ -210,6 +198,7 @@ jobs:
with:
key: ubuntu-cpu-cmake-${{ matrix.build }}
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build Dependencies
id: build_depends
@@ -217,7 +206,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
python3 python3-pip python3-dev \
- libjpeg-dev build-essential libcurl4-openssl-dev \
+ libjpeg-dev build-essential libssl-dev \
git-lfs
- name: Python Dependencies
@@ -238,6 +227,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@@ -258,7 +249,7 @@ jobs:
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
- ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+ ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Test llama2c (s390x)
id: llama2c_test_s390x
@@ -267,7 +258,7 @@ jobs:
cd build
echo "Fetch llama2c big-endian model"
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
- ./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+ ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
@@ -289,18 +280,21 @@ jobs:
with:
key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@@ -311,6 +305,8 @@ jobs:
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
@@ -335,7 +331,7 @@ jobs:
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
@@ -343,6 +339,8 @@ jobs:
mkdir build
cd build
cmake .. \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_LLGUIDANCE=ON
cmake --build . --config Release -j $(nproc)
@@ -373,12 +371,14 @@ jobs:
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential libcurl4-openssl-dev
+ sudo apt-get install build-essential libssl-dev
- name: Build
id: cmake_build
run: |
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@@ -401,16 +401,19 @@ jobs:
with:
key: ubuntu-24-cmake-vulkan-deb
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
- sudo apt-get install -y glslc libvulkan-dev libcurl4-openssl-dev
+ sudo apt-get install -y glslc libvulkan-dev libssl-dev
- name: Configure
id: cmake_configure
run: |
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
@@ -434,13 +437,14 @@ jobs:
with:
key: ubuntu-24-cmake-vulkan
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
- sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
+ sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
@@ -466,6 +470,8 @@ jobs:
run: |
source ./vulkan_sdk/setup-env.sh
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DGGML_VULKAN=ON
cmake --build build --config Release -j $(nproc)
@@ -491,13 +497,14 @@ jobs:
with:
key: ubuntu-24-cmake-webgpu
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
- sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
+ sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
@@ -537,7 +544,10 @@ jobs:
id: cmake_build
run: |
export Dawn_DIR=dawn/lib64/cmake/Dawn
- cmake -B build -DGGML_WEBGPU=ON
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
+ -DGGML_WEBGPU=ON
cmake --build build --config Release -j $(nproc)
- name: Test
@@ -547,6 +557,47 @@ jobs:
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 3600
+ ubuntu-24-wasm-webgpu:
+ runs-on: ubuntu-24.04
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: ccache
+ uses: ggml-org/ccache-action@v1.2.16
+ with:
+ key: ubuntu-latest-wasm-webgpu
+ evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+ - name: Install Emscripten
+ run: |
+ git clone https://github.com/emscripten-core/emsdk.git
+ cd emsdk
+ ./emsdk install latest
+ ./emsdk activate latest
+
+ - name: Fetch emdawnwebgpu
+ run: |
+ DAWN_TAG="v20251027.212519"
+ EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
+ echo "Downloading ${EMDAWN_PKG}"
+ curl -L -o emdawn.zip \
+ "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
+ unzip emdawn.zip
+
+ - name: Build WASM WebGPU
+ run: |
+ source emsdk/emsdk_env.sh
+ emcmake cmake -B build-wasm \
+ -DGGML_WEBGPU=ON \
+ -DLLAMA_CURL=OFF \
+ -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
+
+ cmake --build build-wasm --target test-backend-ops -j $(nproc)
+
ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.1.2
@@ -560,18 +611,21 @@ jobs:
id: depends
run: |
sudo apt-get update
- sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev rocwmma-dev
+ sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-hip
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGGML_HIP=ON
@@ -590,18 +644,21 @@ jobs:
id: depends
run: |
apt-get update
- apt-get install -y build-essential git cmake libcurl4-openssl-dev
+ apt-get install -y build-essential git cmake libssl-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-musa
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with native CMake MUSA support
id: cmake_build
run: |
cmake -B build -S . \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
@@ -626,7 +683,7 @@ jobs:
shell: bash
run: |
sudo apt update
- sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
+ sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
- name: install oneAPI MKL library
shell: bash
@@ -642,12 +699,15 @@ jobs:
with:
key: ubuntu-22-cmake-sycl
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
@@ -674,7 +734,7 @@ jobs:
shell: bash
run: |
sudo apt update
- sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
+ sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
- name: install oneAPI MKL library
shell: bash
@@ -690,12 +750,15 @@ jobs:
with:
key: ubuntu-22-cmake-sycl-fp16
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
@@ -721,12 +784,7 @@ jobs:
with:
key: macOS-latest-cmake-ios
evict-old-files: 1d
-
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -758,12 +816,7 @@ jobs:
with:
key: macOS-latest-cmake-tvos
evict-old-files: 1d
-
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
@@ -790,12 +843,6 @@ jobs:
id: checkout
uses: actions/checkout@v4
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
-
- name: Build
id: cmake_build
run: |
@@ -831,6 +878,7 @@ jobs:
with:
key: macOS-latest-swift
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Download xcframework artifact
uses: actions/download-artifact@v4
@@ -838,12 +886,6 @@ jobs:
name: llama-xcframework
path: build-apple/llama.xcframework/
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
-
- name: Build llama.cpp with CMake
id: cmake_build
run: |
@@ -879,6 +921,7 @@ jobs:
key: windows-msys2
variant: ccache
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Setup ${{ matrix.sys }}
uses: msys2/setup-msys2@v2
@@ -947,6 +990,7 @@ jobs:
key: windows-latest-cmake-${{ matrix.build }}
variant: ccache
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Download OpenBLAS
id: get_openblas
@@ -995,21 +1039,12 @@ jobs:
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build-arm64-release --target install --config release
- - name: libCURL
- id: get_libcurl
- uses: ./.github/actions/windows-setup-curl
- with:
- architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
-
- name: Build
id: cmake_build
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cmake -S . -B build ${{ matrix.defines }} `
- -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+ -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release
- name: Add libopenblas.dll
id: add_libopenblas_dll
@@ -1053,21 +1088,24 @@ jobs:
DEBIAN_FRONTEND: noninteractive
run: |
apt update
- apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev
+ apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-latest-cmake-cuda
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with CMake
run: |
cmake -S . -B build -G Ninja \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
+ -DLLAMA_FATAL_WARNINGS=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES=89-real \
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
- -DLLAMA_FATAL_WARNINGS=ON \
-DGGML_NATIVE=OFF \
-DGGML_CUDA=ON
cmake --build build
@@ -1090,6 +1128,7 @@ jobs:
key: windows-cuda-${{ matrix.cuda }}
variant: ccache
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install Cuda Toolkit
uses: ./.github/actions/windows-setup-cuda
@@ -1101,25 +1140,20 @@ jobs:
run: |
choco install ninja
- - name: libCURL
- id: get_libcurl
- uses: ./.github/actions/windows-setup-curl
-
- name: Build
id: cmake_build
shell: cmd
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
+ -DLLAMA_CURL=OFF ^
+ -DLLAMA_BUILD_BORINGSSL=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
-DGGML_CPU_ALL_VARIANTS=ON ^
-DGGML_CUDA=ON ^
- -DGGML_RPC=ON ^
- -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include"
+ -DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
@@ -1146,12 +1180,13 @@ jobs:
key: windows-latest-cmake-sycl
variant: ccache
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+ # TODO: add SSL support; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
@@ -1207,15 +1242,10 @@ jobs:
with:
key: ${{ github.job }}
evict-old-files: 1d
-
- - name: libCURL
- id: get_libcurl
- uses: ./.github/actions/windows-setup-curl
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
@@ -1224,11 +1254,12 @@ jobs:
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
-DCMAKE_BUILD_TYPE=Release `
+ -DLLAMA_CURL=OFF `
+ -DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `
-DGGML_HIP=ON `
-DGGML_HIP_ROCWMMA_FATTN=ON `
- -DGGML_RPC=ON `
- -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+ -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
ios-xcode-build:
@@ -1390,32 +1421,57 @@ jobs:
strategy:
matrix:
arch: [x86, aarch64]
- cann:
- - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
- device:
- - 'ascend910b3'
- build:
- - 'Release'
+ chip_type: ['910b', '310p']
+ build: ['Release']
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
- container: ascendai/cann:${{ matrix.cann }}
steps:
- name: Checkout
uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
- - name: Dependencies
+ - name: Free up disk space
+ uses: ggml-org/free-disk-space@v1.3.1
+ with:
+ tool-cache: true
+
+ - name: Set container image
+ id: cann-image
run: |
- yum update -y
- yum install -y git gcc gcc-c++ make cmake libcurl-devel
+ image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+ echo "image=${image}" >> "${GITHUB_OUTPUT}"
+
+ - name: Pull container image
+ run: docker pull "${{ steps.cann-image.outputs.image }}"
- name: Build
+ env:
+ BUILD_TYPE: ${{ matrix.build }}
+ SOC_TYPE: ascend${{ matrix.chip_type }}
run: |
- export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+ HOST_UID=$(id -u)
+ HOST_GID=$(id -g)
- cmake -S . -B build \
- -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
- -DGGML_CANN=on \
- -DSOC_TYPE=${{ matrix.device }}
- cmake --build build -j $(nproc)
+ docker run --rm \
+ -v "${PWD}:/workspace" \
+ -w /workspace \
+ -e SOC_TYPE=${SOC_TYPE} \
+ -e BUILD_TYPE=${BUILD_TYPE} \
+ "${{ steps.cann-image.outputs.image }}" \
+ bash -lc '
+ set -e
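+ # skip weak dependencies and docs to save disk space in the build container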
+ yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+ yum clean all && rm -rf /var/cache/yum
+ git config --global --add safe.directory "/workspace"
+ export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+ cmake -S . -B build \
+ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+ -DGGML_CANN=on \
+ -DSOC_TYPE=${SOC_TYPE}
+ cmake --build build -j $(nproc)
+
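+ # The quote-escaping around HOST_UID/HOST_GID lets the runner shell expand the host IDs before the
+ # script runs, so the root-owned build output is handed back to the runner user.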
+ chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+ '
# TODO: simplify the following workflows using a matrix
# TODO: run lighter CI on PRs and the full CI only on master (if needed)
@@ -1432,6 +1488,7 @@ jobs:
with:
key: ggml-ci-x64-cpu-low-perf
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -1457,6 +1514,7 @@ jobs:
with:
key: ggml-ci-arm64-cpu-low-perf
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -1482,6 +1540,7 @@ jobs:
with:
key: ggml-ci-x64-cpu-high-perf
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -1507,6 +1566,7 @@ jobs:
with:
key: ggml-ci-arm64-cpu-high-perf
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -1532,6 +1592,7 @@ jobs:
with:
key: ggml-ci-arm64-cpu-high-perf-sve
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -1599,33 +1660,33 @@ jobs:
run: |
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
- ggml-ci-x64-amd-vulkan:
- runs-on: [self-hosted, Linux, X64, AMD]
+ # ggml-ci-x64-amd-vulkan:
+ # runs-on: [self-hosted, Linux, X64, AMD]
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v4
+ # steps:
+ # - name: Clone
+ # id: checkout
+ # uses: actions/checkout@v4
- - name: Test
- id: ggml-ci
- run: |
- vulkaninfo --summary
- GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+ # - name: Test
+ # id: ggml-ci
+ # run: |
+ # vulkaninfo --summary
+ # GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
- ggml-ci-x64-amd-rocm:
- runs-on: [self-hosted, Linux, X64, AMD]
+ # ggml-ci-x64-amd-rocm:
+ # runs-on: [self-hosted, Linux, X64, AMD]
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v4
+ # steps:
+ # - name: Clone
+ # id: checkout
+ # uses: actions/checkout@v4
- - name: Test
- id: ggml-ci
- run: |
- amd-smi static
- GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+ # - name: Test
+ # id: ggml-ci
+ # run: |
+ # amd-smi static
+ # GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]
@@ -1667,6 +1728,7 @@ jobs:
with:
key: ggml-ci-arm64-cpu-kleidiai
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
@@ -1679,6 +1741,337 @@ jobs:
run: |
GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+ ubuntu-cpu-cmake-riscv64-native:
+ runs-on: RISCV64
+
+ steps:
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+
+ # Install necessary packages
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+
+ # Set gcc-14 and g++-14 as the default compilers
+ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+ sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+ sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+ sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+ # Install Rust stable version
+ rustup install stable
+ rustup default stable
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: Check environment
+ run: |
+ uname -a
+ gcc --version
+ g++ --version
+ ldd --version
+ cmake --version
+ rustc --version
+
+ - name: Setup ccache
+ run: |
+ # Set unique cache directory for this job
+ export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
+ mkdir -p "$CCACHE_DIR"
+
+ # Configure ccache for optimal performance
+ ccache --set-config=max_size=5G
+ ccache --set-config=compression=true
+ ccache --set-config=compression_level=6
+ ccache --set-config=cache_dir="$CCACHE_DIR"
+
+ # Enable more aggressive caching
+ ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+ ccache --set-config=hash_dir=false
+
+ # Export for subsequent steps
+ echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+ echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+ - name: Build
+ id: cmake_build
+ run: |
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DGGML_OPENMP=OFF \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=ON \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DGGML_RPC=ON \
+ -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+ cmake --build build --config Release -j $(nproc)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L 'main|curl' --verbose --timeout 900
+
+ - name: Test llama2c conversion
+ id: llama2c_test
+ run: |
+ cd build
+ echo "Fetch tokenizer"
+ wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+ echo "Fetch llama2c model"
+ wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+ ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+ ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+ ubuntu-cmake-sanitizer-riscv64-native:
+ runs-on: RISCV64
+
+ continue-on-error: true
+
+ strategy:
+ matrix:
+ sanitizer: [ADDRESS, THREAD, UNDEFINED]
+ build_type: [Debug]
+
+ steps:
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+
+ # Install necessary packages
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+
+ # Set gcc-14 and g++-14 as the default compilers
+ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+ sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+ sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+ sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+ # Install Rust stable version
+ rustup install stable
+ rustup default stable
+
+ - name: GCC version check
+ run: |
+ gcc --version
+ g++ --version
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: Setup ccache
+ run: |
+ # Unique cache directory per matrix combination
+ export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
+ mkdir -p "$CCACHE_DIR"
+
+ # Configure ccache
+ ccache --set-config=max_size=5G
+ ccache --set-config=compression=true
+ ccache --set-config=compression_level=6
+ ccache --set-config=cache_dir="$CCACHE_DIR"
+ ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+ ccache --set-config=hash_dir=false
+
+ # Export for subsequent steps
+ echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+ echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+ - name: Build
+ id: cmake_build
+ if: ${{ matrix.sanitizer != 'THREAD' }}
+ run: |
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+ -DGGML_OPENMP=ON \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=OFF \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+ -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+ - name: Build (no OpenMP)
+ id: cmake_build_no_openmp
+ if: ${{ matrix.sanitizer == 'THREAD' }}
+ run: |
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+ -DGGML_OPENMP=OFF \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=OFF \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+ -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose --timeout 900
+
+
+ ubuntu-llguidance-riscv64-native:
+ runs-on: RISCV64
+ steps:
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+
+ # Install necessary packages
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+
+ # Set gcc-14 and g++-14 as the default compilers
+ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+ sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+ sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+ sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+ # Install Rust stable version
+ rustup install stable
+ rustup default stable
+
+ - name: GCC version check
+ run: |
+ gcc --version
+ g++ --version
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: Setup ccache
+ run: |
+ export CCACHE_DIR="$HOME/.ccache/llguidance-riscv64"
+ mkdir -p "$CCACHE_DIR"
+
+ ccache --set-config=max_size=5G
+ ccache --set-config=compression=true
+ ccache --set-config=compression_level=6
+ ccache --set-config=cache_dir="$CCACHE_DIR"
+ ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+ ccache --set-config=hash_dir=false
+
+ echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+ echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+ - name: Build
+ id: cmake_build
+ run: |
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DGGML_OPENMP=OFF \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=OFF \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DLLAMA_LLGUIDANCE=ON \
+ -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+ cmake --build build --config Release -j $(nproc)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose --timeout 900
+
+
+ ubuntu-cmake-rpc-riscv64-native:
+ runs-on: RISCV64
+
+ continue-on-error: true
+
+ steps:
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+
+ # Install necessary packages
+ sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+
+ # Set gcc-14 and g++-14 as the default compilers
+ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+ sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+ sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+ sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+ # Install Rust stable version
+ rustup install stable
+ rustup default stable
+
+ - name: GCC version check
+ run: |
+ gcc --version
+ g++ --version
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: Setup ccache
+ run: |
+ export CCACHE_DIR="$HOME/.ccache/rpc-riscv64"
+ mkdir -p "$CCACHE_DIR"
+
+ ccache --set-config=max_size=5G
+ ccache --set-config=compression=true
+ ccache --set-config=compression_level=6
+ ccache --set-config=cache_dir="$CCACHE_DIR"
+ ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+ ccache --set-config=hash_dir=false
+
+ echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+ echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+ - name: Build
+ id: cmake_build
+ run: |
+ cmake -B build \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DGGML_OPENMP=OFF \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=ON \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+ -DGGML_RPC=ON
+
+ cmake --build build --config Release -j $(nproc)
+
+ - name: Test
+ id: cmake_test
+ run: |
+ cd build
+ ctest -L main --verbose
+
ggml-ci-arm64-graviton4-kleidiai:
runs-on: ah-ubuntu_22_04-c8g_8x
@@ -1719,6 +2112,7 @@ jobs:
with:
key: ggml-ci-arm64-graviton4-kleidiai
evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Test
id: ggml-ci
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index e72caa423b..4cc2f4665c 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -66,13 +66,13 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
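+ # macOS (BSD) tar: -s rewrites the leading "./" so the archive unpacks into a llama-<tag>/ directory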
+ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
- path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
- name: llama-bin-macos-arm64.zip
+ path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
+ name: llama-bin-macos-arm64.tar.gz
macOS-x64:
runs-on: macos-15-intel
@@ -120,13 +120,13 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
- path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
- name: llama-bin-macos-x64.zip
+ path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
+ name: llama-bin-macos-x64.tar.gz
ubuntu-22-cpu:
strategy:
@@ -182,13 +182,13 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
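+ # GNU tar equivalent of the macOS -s flag: --transform prefixes every archived path with llama-<tag>/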
+ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
- path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
- name: llama-bin-ubuntu-${{ matrix.build }}.zip
+ path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
+ name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
ubuntu-22-vulkan:
runs-on: ubuntu-22.04
@@ -235,13 +235,13 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
- zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
+ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
- path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
- name: llama-bin-ubuntu-vulkan-x64.zip
+ path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
+ name: llama-bin-ubuntu-vulkan-x64.tar.gz
windows-cpu:
runs-on: windows-2025
@@ -298,7 +298,7 @@ jobs:
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
- 7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
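+ # -snl stores symbolic links as links instead of packing the files they point to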
+ 7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
@@ -380,7 +380,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
- 7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
+ 7z a -snl llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
- name: Upload artifacts
uses: actions/upload-artifact@v4
@@ -393,7 +393,7 @@ jobs:
strategy:
matrix:
- cuda: ['12.4']
+ cuda: ['12.4', '13.1']
steps:
- name: Clone
@@ -434,7 +434,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
- 7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
+ 7z a -snl llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
- name: Upload artifacts
uses: actions/upload-artifact@v4
@@ -448,6 +448,7 @@ jobs:
$dst='.\build\bin\cudart\'
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+ robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime
@@ -517,6 +518,8 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
@@ -526,7 +529,7 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
- 7z a llama-bin-win-sycl-x64.zip ./build/bin/*
+ 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package
uses: actions/upload-artifact@v4
@@ -632,7 +635,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
- 7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
+ 7z a -snl llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
@@ -685,13 +688,87 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
- zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+ # Zip file is required for Swift Package Manager, which does not support tar.gz for binary targets.
+ # For more details, see https://developer.apple.com/documentation/xcode/distributing-binary-frameworks-as-swift-packages
+ zip -r -y llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
- name: llama-${{ steps.tag.outputs.name }}-xcframework
+ name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
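Since Swift Package Manager requires a zip for binary targets, a consumer of the released XCFramework typically records its checksum in the package manifest; one way to compute it, assuming a local Swift toolchain (the file name shown is an example):

```bash
swift package compute-checksum llama-b1234-xcframework.zip
```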
+
+
+ openEuler-cann:
+ strategy:
+ matrix:
+ arch: [x86, aarch64]
+ chip_type: ['910b', '310p']
+ build: ['Release']
+ runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+
+ - name: Free up disk space
+ uses: ggml-org/free-disk-space@v1.3.1
+ with:
+ tool-cache: true
+
+ - name: Set container image
+ id: cann-image
+ run: |
+ image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+ echo "image=${image}" >> "${GITHUB_OUTPUT}"
+
+ - name: Pull container image
+ run: docker pull "${{ steps.cann-image.outputs.image }}"
+
+ - name: Build
+ env:
+ BUILD_TYPE: ${{ matrix.build }}
+ SOC_TYPE: ascend${{ matrix.chip_type }}
+ run: |
+ HOST_UID=$(id -u)
+ HOST_GID=$(id -g)
+
+ docker run --rm \
+ -v "${PWD}:/workspace" \
+ -w /workspace \
+ -e SOC_TYPE=${SOC_TYPE} \
+ -e BUILD_TYPE=${BUILD_TYPE} \
+ "${{ steps.cann-image.outputs.image }}" \
+ bash -lc '
+ set -e
+ yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
+ yum clean all && rm -rf /var/cache/yum
+ git config --global --add safe.directory "/workspace"
+ export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+ cmake -S . -B build \
+ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+ -DGGML_CANN=on \
+ -DSOC_TYPE=${SOC_TYPE}
+ cmake --build build -j $(nproc)
+
+ chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+ '
+
+ - name: Determine tag name
+ id: tag
+ uses: ./.github/actions/get-tag-name
+
+ - name: Pack artifacts
+ run: |
+ cp LICENSE ./build/bin/
+ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v4
+ with:
+ path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
+ name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
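The openEuler-cann job builds inside the Ascend CANN container; roughly the same build can be reproduced locally with the sketch below, which mirrors the workflow steps above (910b shown as an example chip type, Docker assumed to be available):

```bash
docker run --rm -v "${PWD}:/workspace" -w /workspace \
  ascendai/cann:8.3.rc2-910b-openeuler24.03-py3.11 \
  bash -lc '
    yum install -y git gcc gcc-c++ make cmake libcurl-devel
    cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CANN=on -DSOC_TYPE=ascend910b
    cmake --build build -j $(nproc)
  '
```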
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -714,6 +791,7 @@ jobs:
- macOS-arm64
- macOS-x64
- ios-xcode-build
+ - openEuler-cann
steps:
- name: Clone
@@ -768,6 +846,7 @@ jobs:
echo "Moving other artifacts..."
mv -v artifact/*.zip release
+ mv -v artifact/*.tar.gz release
- name: Create release
id: create_release
@@ -776,6 +855,37 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.tag.outputs.name }}
+ body: |
+
+
+ ${{ github.event.head_commit.message }}
+
+
+
+ **macOS/iOS:**
+ - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
+ - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
+ - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
+
+ **Linux:**
+ - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
+ - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
+ - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
+
+ **Windows:**
+ - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
+ - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
+ - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
+ - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip)
+ - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
+ - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
+ - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
+
+ **openEuler:**
+ - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
+ - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
+ - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
+ - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)
- name: Upload release
id: upload_release
@@ -787,7 +897,7 @@ jobs:
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./release')) {
- if (path.extname(file) === '.zip') {
+ if (path.extname(file) === '.zip' || file.endsWith('.tar.gz')) {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
owner: context.repo.owner,
diff --git a/.github/workflows/server-webui.yml b/.github/workflows/server-webui.yml
new file mode 100644
index 0000000000..544c4ad408
--- /dev/null
+++ b/.github/workflows/server-webui.yml
@@ -0,0 +1,225 @@
+# Server WebUI build and tests
+name: Server WebUI
+
+on:
+ workflow_dispatch: # allows manual triggering
+ inputs:
+ sha:
+ description: 'Commit SHA1 to build'
+ required: false
+ type: string
+ slow_tests:
+ description: 'Run slow tests'
+ required: true
+ type: boolean
+ push:
+ branches:
+ - master
+ paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+ pull_request:
+ types: [opened, synchronize, reopened]
+ paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+
+env:
+ LLAMA_LOG_COLORS: 1
+ LLAMA_LOG_PREFIX: 1
+ LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_LOG_VERBOSITY: 10
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ webui-check:
+ name: WebUI Checks
+ runs-on: ubuntu-latest
+ continue-on-error: true
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+ - name: Setup Node.js
+ id: node
+ uses: actions/setup-node@v4
+ with:
+ node-version: "22"
+ cache: "npm"
+ cache-dependency-path: "tools/server/webui/package-lock.json"
+
+ - name: Install dependencies
+ id: setup
+ if: ${{ steps.node.conclusion == 'success' }}
+ run: npm ci
+ working-directory: tools/server/webui
+
+ - name: Run type checking
+ if: ${{ always() && steps.setup.conclusion == 'success' }}
+ run: npm run check
+ working-directory: tools/server/webui
+
+ - name: Run linting
+ if: ${{ always() && steps.setup.conclusion == 'success' }}
+ run: npm run lint
+ working-directory: tools/server/webui
+
+ - name: Build application
+ if: ${{ always() && steps.setup.conclusion == 'success' }}
+ run: npm run build
+ working-directory: tools/server/webui
+
+ - name: Install Playwright browsers
+ id: playwright
+ if: ${{ always() && steps.setup.conclusion == 'success' }}
+ run: npx playwright install --with-deps
+ working-directory: tools/server/webui
+
+ - name: Build Storybook
+ if: ${{ always() && steps.playwright.conclusion == 'success' }}
+ run: npm run build-storybook
+ working-directory: tools/server/webui
+
+ - name: Run Client tests
+ if: ${{ always() && steps.playwright.conclusion == 'success' }}
+ run: npm run test:client
+ working-directory: tools/server/webui
+
+ - name: Run Unit tests
+ if: ${{ always() && steps.playwright.conclusion == 'success' }}
+ run: npm run test:unit
+ working-directory: tools/server/webui
+
+ - name: Run UI tests
+ if: ${{ always() && steps.playwright.conclusion == 'success' }}
+ run: npm run test:ui -- --testTimeout=60000
+ working-directory: tools/server/webui
+
+ - name: Run E2E tests
+ if: ${{ always() && steps.playwright.conclusion == 'success' }}
+ run: npm run test:e2e
+ working-directory: tools/server/webui
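The WebUI checks in this job can be run locally before pushing, using the same npm scripts the workflow invokes (Node.js 22 assumed):

```bash
cd tools/server/webui
npm ci           # install dependencies from package-lock.json
npm run check    # type checking
npm run lint     # linting
npm run build    # production build
```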
+
+ server-build:
+ runs-on: ubuntu-latest
+
+ strategy:
+ matrix:
+ sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
+ build_type: [RelWithDebInfo]
+ include:
+ - build_type: Release
+ sanitizer: ""
+ fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
+
+ steps:
+ - name: Dependencies
+ id: depends
+ run: |
+ sudo apt-get update
+ sudo apt-get -y install \
+ build-essential \
+ xxd \
+ git \
+ cmake \
+ curl \
+ wget \
+ language-pack-en \
+ libssl-dev
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+ - name: Python setup
+ id: setup_python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+
+ - name: Tests dependencies
+ id: test_dependencies
+ run: |
+ pip install -r tools/server/tests/requirements.txt
+
+ - name: Setup Node.js for WebUI
+ uses: actions/setup-node@v4
+ with:
+ node-version: "22"
+ cache: "npm"
+ cache-dependency-path: "tools/server/webui/package-lock.json"
+
+ - name: Install WebUI dependencies
+ run: npm ci
+ working-directory: tools/server/webui
+
+ - name: Build WebUI
+ run: npm run build
+ working-directory: tools/server/webui
+
+ - name: Build (no OpenMP)
+ id: cmake_build_no_openmp
+ if: ${{ matrix.sanitizer == 'THREAD' }}
+ run: |
+ cmake -B build \
+ -DGGML_NATIVE=OFF \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
+ -DLLAMA_BUILD_SERVER=ON \
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+ -DGGML_OPENMP=OFF ;
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+ - name: Build (sanitizers)
+ id: cmake_build_sanitizers
+ if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
+ run: |
+ cmake -B build \
+ -DGGML_NATIVE=OFF \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
+ -DLLAMA_BUILD_SERVER=ON \
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+ - name: Build (sanitizers)
+ id: cmake_build
+ if: ${{ matrix.sanitizer == '' }}
+ run: |
+ cmake -B build \
+ -DGGML_NATIVE=OFF \
+ -DLLAMA_CURL=OFF \
+ -DLLAMA_OPENSSL=ON \
+ -DLLAMA_BUILD_SERVER=ON \
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+ - name: Tests
+ id: server_integration_tests
+ if: ${{ matrix.sanitizer == '' }}
+ env:
+ GITHUB_ACTIONS: "true"
+ run: |
+ cd tools/server/tests
+ ./tests.sh
+
+ - name: Tests (sanitizers)
+ id: server_integration_tests_sanitizers
+ if: ${{ matrix.sanitizer != '' }}
+ run: |
+ cd tools/server/tests
+ LLAMA_SANITIZE=1 ./tests.sh
+
+ - name: Slow tests
+ id: server_integration_tests_slow
+ if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
+ run: |
+ cd tools/server/tests
+ SLOW_TESTS=1 ./tests.sh
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index ebcd6424bc..f9e2a79af7 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -56,7 +56,7 @@ jobs:
curl \
wget \
language-pack-en \
- libcurl4-openssl-dev
+ libssl-dev
- name: Clone
id: checkout
@@ -76,264 +76,6 @@ jobs:
run: |
pip install -r tools/server/tests/requirements.txt
- webui-setup:
- name: WebUI Setup
- runs-on: ubuntu-latest
- steps:
- - name: Checkout code
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
- - name: Setup Node.js
- uses: actions/setup-node@v4
- with:
- node-version: "22"
- cache: "npm"
- cache-dependency-path: "tools/server/webui/package-lock.json"
-
- - name: Cache node_modules
- uses: actions/cache@v4
- id: cache-node-modules
- with:
- path: tools/server/webui/node_modules
- key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
- restore-keys: |
- ${{ runner.os }}-node-modules-
-
- - name: Install dependencies
- if: steps.cache-node-modules.outputs.cache-hit != 'true'
- run: npm ci
- working-directory: tools/server/webui
-
- webui-check:
- needs: webui-setup
- name: WebUI Check
- runs-on: ubuntu-latest
- steps:
- - name: Checkout code
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
- - name: Setup Node.js
- uses: actions/setup-node@v4
- with:
- node-version: "22"
-
- - name: Restore node_modules cache
- uses: actions/cache@v4
- with:
- path: tools/server/webui/node_modules
- key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
- restore-keys: |
- ${{ runner.os }}-node-modules-
-
- - name: Run type checking
- run: npm run check
- working-directory: tools/server/webui
-
- - name: Run linting
- run: npm run lint
- working-directory: tools/server/webui
-
- webui-build:
- needs: webui-check
- name: WebUI Build
- runs-on: ubuntu-latest
- steps:
- - name: Checkout code
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
- - name: Setup Node.js
- uses: actions/setup-node@v4
- with:
- node-version: "22"
-
- - name: Restore node_modules cache
- uses: actions/cache@v4
- with:
- path: tools/server/webui/node_modules
- key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
- restore-keys: |
- ${{ runner.os }}-node-modules-
-
- - name: Build application
- run: npm run build
- working-directory: tools/server/webui
-
- webui-tests:
- needs: webui-build
- name: Run WebUI tests
- permissions:
- contents: read
-
- runs-on: ubuntu-latest
-
- steps:
- - name: Checkout code
- uses: actions/checkout@v4
-
- - name: Setup Node.js
- uses: actions/setup-node@v4
- with:
- node-version: "22"
-
- - name: Restore node_modules cache
- uses: actions/cache@v4
- with:
- path: tools/server/webui/node_modules
- key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
- restore-keys: |
- ${{ runner.os }}-node-modules-
-
- - name: Install Playwright browsers
- run: npx playwright install --with-deps
- working-directory: tools/server/webui
-
- - name: Build Storybook
- run: npm run build-storybook
- working-directory: tools/server/webui
-
- - name: Run Client tests
- run: npm run test:client
- working-directory: tools/server/webui
-
- - name: Run Server tests
- run: npm run test:server
- working-directory: tools/server/webui
-
- - name: Run UI tests
- run: npm run test:ui -- --testTimeout=60000
- working-directory: tools/server/webui
-
- - name: Run E2E tests
- run: npm run test:e2e
- working-directory: tools/server/webui
-
- server-build:
- needs: [webui-tests]
- runs-on: ubuntu-latest
-
- strategy:
- matrix:
- sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
- build_type: [RelWithDebInfo]
- include:
- - build_type: Release
- sanitizer: ""
- fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
-
- steps:
- - name: Dependencies
- id: depends
- run: |
- sudo apt-get update
- sudo apt-get -y install \
- build-essential \
- xxd \
- git \
- cmake \
- curl \
- wget \
- language-pack-en \
- libcurl4-openssl-dev
-
- - name: Clone
- id: checkout
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
- - name: Python setup
- id: setup_python
- uses: actions/setup-python@v5
- with:
- python-version: '3.11'
-
- - name: Tests dependencies
- id: test_dependencies
- run: |
- pip install -r tools/server/tests/requirements.txt
-
- - name: Setup Node.js for WebUI
- uses: actions/setup-node@v4
- with:
- node-version: "22"
- cache: "npm"
- cache-dependency-path: "tools/server/webui/package-lock.json"
-
- - name: Install WebUI dependencies
- run: npm ci
- working-directory: tools/server/webui
-
- - name: Build WebUI
- run: npm run build
- working-directory: tools/server/webui
-
- - name: Build (no OpenMP)
- id: cmake_build_no_openmp
- if: ${{ matrix.sanitizer == 'THREAD' }}
- run: |
- cmake -B build \
- -DGGML_NATIVE=OFF \
- -DLLAMA_BUILD_SERVER=ON \
- -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
- -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
- -DGGML_OPENMP=OFF ;
- cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
- - name: Build (sanitizers)
- id: cmake_build_sanitizers
- if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
- run: |
- cmake -B build \
- -DGGML_NATIVE=OFF \
- -DLLAMA_BUILD_SERVER=ON \
- -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
- -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
- cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
- - name: Build (sanitizers)
- id: cmake_build
- if: ${{ matrix.sanitizer == '' }}
- run: |
- cmake -B build \
- -DGGML_NATIVE=OFF \
- -DLLAMA_BUILD_SERVER=ON \
- -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
- cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
- - name: Tests
- id: server_integration_tests
- if: ${{ matrix.sanitizer == '' }}
- env:
- GITHUB_ACTIONS: "true"
- run: |
- cd tools/server/tests
- ./tests.sh
-
- - name: Tests (sanitizers)
- id: server_integration_tests_sanitizers
- if: ${{ matrix.sanitizer != '' }}
- run: |
- cd tools/server/tests
- LLAMA_SANITIZE=1 ./tests.sh
-
- - name: Slow tests
- id: server_integration_tests_slow
- if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
- run: |
- cd tools/server/tests
- SLOW_TESTS=1 ./tests.sh
-
-
server-windows:
runs-on: windows-2022
@@ -345,16 +87,10 @@ jobs:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- - name: libCURL
- id: get_libcurl
- uses: ./.github/actions/windows-setup-curl
-
- name: Build
id: cmake_build
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
- cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+ cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
@@ -368,13 +104,6 @@ jobs:
run: |
pip install -r tools/server/tests/requirements.txt
- - name: Copy Libcurl
- id: prepare_libcurl
- env:
- CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
- run: |
- cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
-
- name: Tests
id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
diff --git a/.github/workflows/winget.yml b/.github/workflows/winget.yml
index 5c28615595..d3d9be23ce 100644
--- a/.github/workflows/winget.yml
+++ b/.github/workflows/winget.yml
@@ -9,6 +9,7 @@ jobs:
update:
name: Update Winget Package
runs-on: ubuntu-latest
+ if: github.repository_owner == 'ggml-org'
steps:
- name: Install cargo binstall
diff --git a/.gitignore b/.gitignore
index c7d0009785..05eb578a82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,52 +20,41 @@
*.so
*.swp
*.tmp
+*.DS_Store
# IDE / OS
-.cache/
-.ccls-cache/
-.direnv/
-.DS_Store
-.envrc
-.idea/
-.swiftpm
-.vs/
-.vscode/
-nppBackup
+/.cache/
+/.ccls-cache/
+/.direnv/
+/.envrc
+/.idea/
+/.swiftpm
+/.vs/
+/.vscode/
+/nppBackup
# Coverage
-gcovr-report/
-lcov-report/
+/gcovr-report/
+/lcov-report/
# Build Artifacts
-tags
-.build/
-build*
-release
-debug
-!build-info.cmake
-!build-info.cpp.in
-!build-info.sh
-!build.zig
-!docs/build.md
+/tags
+/.build/
+/build*
+/release
+/debug
/libllama.so
/llama-*
/vulkan-shaders-gen
-android-ndk-*
-arm_neon.h
-cmake-build-*
-CMakeSettings.json
-compile_commands.json
-ggml-metal-embed.metal
-llama-batched-swift
/rpc-server
-out/
-tmp/
-autogen-*.md
+/out/
+/tmp/
+/autogen-*.md
+/common/build-info.cpp
# Deprecated
@@ -74,44 +63,38 @@ autogen-*.md
# CI
-!.github/workflows/*.yml
+!/.github/workflows/*.yml
# Models
-models/*
-models-mnt
-!models/.editorconfig
-!models/ggml-vocab-*.gguf*
-!models/templates
+/models/*
+/models-mnt
+!/models/.editorconfig
+!/models/ggml-vocab-*.gguf*
+!/models/templates
# Zig
-zig-out/
-zig-cache/
-
-# Logs
-
-ppl-*.txt
-qnt-*.txt
-perf-*.txt
+/zig-out/
+/zig-cache/
# Examples
-examples/jeopardy/results.txt
-tools/server/*.css.hpp
-tools/server/*.html.hpp
-tools/server/*.js.hpp
-tools/server/*.mjs.hpp
-tools/server/*.gz.hpp
-!build_64.sh
-!examples/*.bat
-!examples/*/*.kts
-!examples/*/*/*.kts
-!examples/sycl/*.bat
-!examples/sycl/*.sh
+/examples/jeopardy/results.txt
+/tools/server/*.css.hpp
+/tools/server/*.html.hpp
+/tools/server/*.js.hpp
+/tools/server/*.mjs.hpp
+/tools/server/*.gz.hpp
+!/build_64.sh
+!/examples/*.bat
+!/examples/*/*.kts
+!/examples/*/*/*.kts
+!/examples/sycl/*.bat
+!/examples/sycl/*.sh
# Server Web UI temporary files
-node_modules
-tools/server/webui/dist
+/tools/server/webui/node_modules
+/tools/server/webui/dist
# Python
@@ -147,8 +130,10 @@ poetry.toml
# Local scripts
/run-vim.sh
/run-chat.sh
-.ccache/
+/.ccache/
# IDE
-*.code-workspace
-.windsurf/
+/*.code-workspace
+/.windsurf/
+# emscripten
+a.out.*
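The leading `/` added throughout anchors each pattern to the repository root, so a rule like `/build*` no longer ignores `build` directories nested inside subprojects or vendored code. Which rule matches a given path can be verified with git (example paths only):

```bash
git check-ignore -v build                             # matched by the root-anchored /build* rule
git check-ignore -v tools/server/webui/node_modules   # matched by the webui-specific rule
```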
diff --git a/.github/copilot-instructions.md b/AGENTS.md
similarity index 88%
rename from .github/copilot-instructions.md
rename to AGENTS.md
index ad13c6ea8d..e0a65c0ab7 100644
--- a/.github/copilot-instructions.md
+++ b/AGENTS.md
@@ -1,4 +1,4 @@
-# Copilot Instructions for llama.cpp
+# Instructions for llama.cpp
## Repository Overview
@@ -12,6 +12,24 @@ llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model
- **Backends supported**: CPU (AVX/NEON/RVV optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
- **License**: MIT
+## Disclose AI Usage
+
+Remind contributors that the project requires disclosure of any AI usage in pull requests. AI-generated code can contain suboptimal optimizations and hidden bugs while being presented with unwarranted confidence, which is why disclosure is mandatory.
+
+When generating significant portions of code, address this by:
+- Informing the user that AI-generated content may be rejected by maintainers.
+- Clearly marking AI-generated code in commit messages and comments.
+ - Example of commit message: `[AI] Fix a race condition in ...`
+ - Example of code comment: `// [AI] spawn a new thread ...`
+
+These measures apply to:
+- Changes resulting in large portions of code or complex logic.
+- Modifications or additions to public APIs in `llama.h`, `ggml.h`, or `mtmd.h`.
+- Backend-related changes, such as those involving CPU, CUDA, Metal, Vulkan, etc.
+- Modifications to `tools/server`.
+
+Note: These measures can be omitted for small fixes or trivial changes.
+
## Build Instructions
### Prerequisites
@@ -251,6 +269,7 @@ Primary tools:
- **Cross-platform compatibility**: Test on Linux, macOS, Windows when possible
- **Performance focus**: This is a performance-critical inference library
- **API stability**: Changes to `include/llama.h` require careful consideration
+- **Disclose AI Usage**: Refer to the "Disclose AI Usage" section earlier in this document
### Git Workflow
- Always create feature branches from `master`
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e133052ba2..956a39403f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,10 +33,24 @@ endif()
option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
+option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
+
if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
- option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
+ # Use 64-bit memory to support backend_get_memory queries
+ # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
+ if (LLAMA_WASM_MEM64)
+ add_compile_options("-sMEMORY64=1")
+ add_link_options("-sMEMORY64=1")
+ endif()
+ add_link_options("-sALLOW_MEMORY_GROWTH=1")
+
+ option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
+ option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
+ if (LLAMA_BUILD_HTML)
+ set(CMAKE_EXECUTABLE_SUFFIX ".html")
+ endif()
else()
if (MINGW)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
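With these options, a WebAssembly configure-and-build step might look like the sketch below; it assumes the Emscripten SDK is installed and `emcmake` is on the PATH, and everything other than the two options defined above is illustrative:

```bash
# 64-bit memory (LLAMA_WASM_MEM64) is ON by default; pass -DLLAMA_WASM_MEM64=OFF to opt out
emcmake cmake -B build-wasm -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_HTML=ON
cmake --build build-wasm -j
```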
@@ -58,6 +72,12 @@ if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
endif()
+if (LLAMA_STANDALONE)
+ # enable parallel builds for msbuild
+ list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
+ list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
+endif()
+
if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
else()
@@ -180,11 +200,6 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
-if (MINGW)
- # Target Windows 8 for PrefetchVirtualMemory
- add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
#
# build the library
#
diff --git a/CODEOWNERS b/CODEOWNERS
index 908d13a35b..750096d9a1 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -2,23 +2,25 @@
# multiple collaborators per item can be specified
/.devops/*.Dockerfile @ngxson
-/.github/actions/ @slaren @CISC
+/.github/actions/ @CISC
/.github/workflows/ @CISC
-/.github/workflows/release.yml @slaren
-/.github/workflows/winget.yml @slaren
/ci/ @ggerganov
/cmake/ @ggerganov
/common/CMakeLists.txt @ggerganov
-/common/arg.* @ggerganov @ericcurtin
+/common/arg.* @ggerganov
/common/base64.hpp.* @ggerganov
/common/build-info.* @ggerganov
+/common/chat.* @pwilkin
+/common/chat-peg-parser.* @aldehir
/common/common.* @ggerganov
/common/console.* @ggerganov
/common/http.* @angt
/common/llguidance.* @ggerganov
/common/log.* @ggerganov
+/common/peg-parser.* @aldehir
/common/sampling.* @ggerganov
/common/speculative.* @ggerganov
+/common/unicode.* @aldehir
/convert_*.py @CISC
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
@@ -30,7 +32,7 @@
/examples/export-docs/ @ggerganov
/examples/gen-docs/ @ggerganov
/examples/gguf/ @ggerganov
-/examples/llama.android/ @ggerganov
+/examples/llama.android/ @ggerganov @hanyin-arm @naco-siren
/examples/llama.swiftui/ @ggerganov
/examples/llama.vim @ggerganov
/examples/lookahead/ @ggerganov
@@ -40,21 +42,14 @@
/examples/passkey/ @ggerganov
/examples/retrieval/ @ggerganov
/examples/save-load-state/ @ggerganov
-/examples/simple-chat/ @slaren
-/examples/simple/ @slaren
/examples/speculative-simple/ @ggerganov
/examples/speculative/ @ggerganov
/ggml/cmake/ @ggerganov
-/ggml/include/ @ggerganov @slaren
-/ggml/src/ggml-alloc.c @slaren
-/ggml/src/ggml-backend* @slaren
-/ggml/src/ggml-blas/ @slaren
-/ggml/src/ggml-common.h @ggerganov @slaren
-/ggml/src/ggml-cpu/ @ggerganov @slaren
+/ggml/include/ @ggerganov
+/ggml/src/ggml-common.h @ggerganov
+/ggml/src/ggml-cpu/ @ggerganov
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
-/ggml/src/ggml-cuda/common.cuh @slaren
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
-/ggml/src/ggml-cuda/ggml-cuda.cu @slaren
/ggml/src/ggml-cuda/mmf.* @JohannesGaessler @am17an
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvf.* @JohannesGaessler
@@ -62,19 +57,19 @@
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
/ggml/src/ggml-hip/ @IMbackK
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
-/ggml/src/ggml-impl.h @ggerganov @slaren
+/ggml/src/ggml-impl.h @ggerganov
/ggml/src/ggml-metal/ @ggerganov
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
/ggml/src/ggml-hexagon/ @max-krasnyansky @lhez
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @rgerganov
-/ggml/src/ggml-threading.* @ggerganov @slaren
+/ggml/src/ggml-threading.* @ggerganov
/ggml/src/ggml-vulkan/ @0cc4m
/ggml/src/ggml-webgpu/ @reeselevine
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml.c @ggerganov @slaren
-/ggml/src/ggml.cpp @ggerganov @slaren
+/ggml/src/ggml.c @ggerganov
+/ggml/src/ggml.cpp @ggerganov
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
/gguf-py/ @CISC
/media/ @ggerganov
@@ -86,28 +81,23 @@
/src/llama-arch.* @CISC
/src/llama-chat.* @ngxson
/src/llama-graph.* @CISC
-/src/llama-model-loader.* @slaren
/src/llama-model.* @CISC
/src/llama-vocab.* @CISC
/src/models/ @CISC
/tests/ @ggerganov
-/tests/test-backend-ops.cpp @slaren
-/tests/test-thread-safety.cpp @slaren
+/tests/test-chat-.* @pwilkin
/tools/batched-bench/ @ggerganov
-/tools/llama-bench/ @slaren
-/tools/main/ @ggerganov
+/tools/cli/ @ngxson
+/tools/completion/ @ggerganov
/tools/mtmd/ @ngxson
/tools/perplexity/ @ggerganov
/tools/quantize/ @ggerganov
/tools/rpc/ @rgerganov
-/tools/run/ @ericcurtin
-/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
+/tools/server/* @ngxson @ggerganov # no subdir
/tools/server/webui/ @allozaur
/tools/tokenize/ @ggerganov
/tools/tts/ @ggerganov
/vendor/ @ggerganov
-/.clang-format @slaren
-/.clang-tidy @slaren
/AUTHORS @ggerganov
/CMakeLists.txt @ggerganov
/CONTRIBUTING.md @ggerganov
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b808fa31ea..4545ff8f9a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,10 +15,12 @@ The project differentiates between 3 levels of contributors:
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
+- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
-- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
+- If your PR becomes stale, rebase it on top of the latest `master` to get the maintainers' attention
- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
+- Using AI to generate PRs is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before publishing the PR. Note that trivial tab autocompletions do not require disclosure.
# Pull requests (for maintainers)
diff --git a/README.md b/README.md
index 2962783585..ed956bb02e 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ range of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2, AVX512 and AMX support for x86 architectures
-- RVV, ZVFH, ZFH and ZICBOP support for RISC-V architectures
+- RVV, ZVFH, ZFH, ZICBOP and ZIHINTPAUSE support for RISC-V architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
- Vulkan and SYCL backend support
@@ -190,6 +190,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
- Go (no CGo needed): [hybridgroup/yzma](https://github.com/hybridgroup/yzma)
+- Android: [llama.android](/examples/llama.android)
@@ -242,6 +243,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
+- [unslothai/unsloth](https://github.com/unslothai/unsloth) – 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0)
@@ -275,6 +277,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
| [HIP](docs/build.md#hip) | AMD GPU |
+| [ZenDNN](docs/build.md#zendnn) | AMD CPU |
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
@@ -311,7 +314,7 @@ The Hugging Face platform provides a variety of online tools for converting, qua
To learn more about model quantization, [read this documentation](tools/quantize/README.md)
-## [`llama-cli`](tools/main)
+## [`llama-cli`](tools/cli)
#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
@@ -345,19 +348,6 @@ To learn more about model quantization, [read this documentation](tools/quantize
--
- Run simple text completion
-
- To disable conversation mode explicitly, use `-no-cnv`
-
- ```bash
- llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
-
- # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
- ```
-
-
-
-
Constrain the output with a custom grammar
@@ -536,7 +526,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
## Other documentation
-- [main (cli)](tools/main/README.md)
+- [cli](tools/cli/README.md)
+- [completion](tools/completion/README.md)
- [server](tools/server/README.md)
- [GBNF grammars](grammars/README.md)
@@ -612,3 +603,4 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
+- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
diff --git a/SECURITY.md b/SECURITY.md
index 9749e95b71..ae496f4e3d 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -65,4 +65,9 @@ However, If you have discovered a security vulnerability in this project, please
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
+Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
+
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
+
+> [!IMPORTANT]
+> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
diff --git a/ci/run.sh b/ci/run.sh
index 3fec8e9110..0a4a0e41eb 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -45,7 +45,7 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@@ -398,18 +398,20 @@ function gg_run_qwen3_0_6b {
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
- (time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
- (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
- (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
- (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
- (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
- (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
- (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
- (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
- (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
- (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
- (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
- (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+ (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
+ (time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+ (time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
if [ -z ${GG_BUILD_NO_BF16} ]; then
@@ -428,10 +430,10 @@ function gg_run_qwen3_0_6b {
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@@ -523,8 +525,10 @@ function gg_run_embd_bge_small {
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
- (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
- (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
+ (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e
}
@@ -563,8 +567,10 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf"
+ (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
# for this model, the SEP token is "</s>"
- (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+ (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
# sample output
# rerank score 0: 0.029
diff --git a/cmake/build-info.cmake b/cmake/build-info.cmake
index 75c78222f2..c7005950c5 100644
--- a/cmake/build-info.cmake
+++ b/cmake/build-info.cmake
@@ -39,26 +39,10 @@ if(Git_FOUND)
endif()
endif()
-if(MSVC)
- set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
- if (CMAKE_VS_PLATFORM_NAME)
- set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
- else()
- set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
- endif()
-else()
- execute_process(
- COMMAND ${CMAKE_C_COMPILER} --version
- OUTPUT_VARIABLE OUT
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
- string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
- set(BUILD_COMPILER ${OUT})
+set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
- execute_process(
- COMMAND ${CMAKE_C_COMPILER} -dumpmachine
- OUTPUT_VARIABLE OUT
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
- set(BUILD_TARGET ${OUT})
+if(CMAKE_VS_PLATFORM_NAME)
+ set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+else()
+ set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
endif()
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 706fa32eed..f7b99159e3 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -50,6 +50,10 @@ add_library(${TARGET} STATIC
base64.hpp
chat-parser.cpp
chat-parser.h
+ chat-parser-xml-toolcall.h
+ chat-parser-xml-toolcall.cpp
+ chat-peg-parser.cpp
+ chat-peg-parser.h
chat.cpp
chat.h
common.cpp
@@ -67,14 +71,23 @@ add_library(${TARGET} STATIC
log.h
ngram-cache.cpp
ngram-cache.h
+ peg-parser.cpp
+ peg-parser.h
+ preset.cpp
+ preset.h
regex-partial.cpp
regex-partial.h
sampling.cpp
sampling.h
speculative.cpp
speculative.h
+ unicode.cpp
+ unicode.h
)
+target_include_directories(${TARGET} PUBLIC . ../vendor)
+target_compile_features (${TARGET} PUBLIC cxx_std_17)
+
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
@@ -141,9 +154,7 @@ if (LLAMA_LLGUIDANCE)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
endif ()
-target_include_directories(${TARGET} PUBLIC . ../vendor)
-target_compile_features (${TARGET} PUBLIC cxx_std_17)
-target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
#
diff --git a/common/arg.cpp b/common/arg.cpp
index 430ab45dfe..774f8731a9 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -20,6 +20,7 @@
#include
#include
+#include
#include
#include
#include
@@ -30,6 +31,7 @@
#include <thread> // for hardware_concurrency
#include
+#ifndef __EMSCRIPTEN__
#ifdef __linux__
#include
#elif defined(_WIN32)
@@ -41,13 +43,17 @@
#else
#include
#endif
+#endif
+
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
using json = nlohmann::ordered_json;
+using namespace common_arg_utils;
static std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_SERVER,
+ LLAMA_EXAMPLE_CLI,
};
static std::string read_file(const std::string & fname) {
@@ -60,6 +66,15 @@ static std::string read_file(const std::string & fname) {
return content;
}
+static const std::vector<common_arg> & get_common_arg_defs() {
+    static const std::vector<common_arg> options = [] {
+ common_params params;
+ auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
+ return ctx.options;
+ }();
+ return options;
+}
+
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
this->examples = examples;
return *this;
@@ -81,6 +96,11 @@ common_arg & common_arg::set_sparam() {
return *this;
}
+common_arg & common_arg::set_preset_only() {
+ is_preset_only = true;
+ return *this;
+}
+
bool common_arg::in_example(enum llama_example ex) {
return examples.find(ex) != examples.end();
}
@@ -91,6 +111,16 @@ bool common_arg::is_exclude(enum llama_example ex) {
bool common_arg::get_value_from_env(std::string & output) const {
if (env == nullptr) return false;
+ if (!args_neg.empty()) {
+ // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ char * neg_value = std::getenv(neg_env.c_str());
+ if (neg_value) {
+ output = "0"; // falsey
+ return true;
+ }
+ }
char * value = std::getenv(env);
if (value) {
output = value;
@@ -100,6 +130,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
}
bool common_arg::has_value_from_env() const {
+ if (env != nullptr && !args_neg.empty()) {
+ // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ if (std::getenv(neg_env.c_str())) {
+ return true;
+ }
+ }
return env != nullptr && std::getenv(env);
}
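The negated-environment handling above means that for any option whose env name begins with `LLAMA_ARG_`, a matching `LLAMA_ARG_NO_...` variable is accepted as the falsey form. A sketch of the naming scheme only; the option name here is hypothetical:

```bash
# hypothetical option exposing env LLAMA_ARG_FOO with a negated --no-foo flag
export LLAMA_ARG_FOO=1      # truthy: behaves like passing the positive flag
export LLAMA_ARG_NO_FOO=1   # accepted as the negated/falsey form of the same option
```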
@@ -130,16 +168,17 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
return result;
}
-std::string common_arg::to_string() {
+std::string common_arg::to_string() const {
// params for printing to console
const static int n_leading_spaces = 40;
const static int n_char_per_line_help = 70; // TODO: detect this based on current console
std::string leading_spaces(n_leading_spaces, ' ');
std::ostringstream ss;
- for (const auto arg : args) {
- if (arg == args.front()) {
- if (args.size() == 1) {
+ auto all_args = get_args(); // also contains args_neg
+ for (const auto & arg : all_args) {
+ if (arg == all_args.front()) {
+ if (all_args.size() == 1) {
ss << arg;
} else {
// first arg is usually abbreviation, we need padding to make it more beautiful
@@ -148,7 +187,7 @@ std::string common_arg::to_string() {
ss << tmp << spaces;
}
} else {
- ss << arg << (arg != args.back() ? ", " : "");
+ ss << arg << (arg != all_args.back() ? ", " : "");
}
}
if (value_hint) ss << " " << value_hint;
@@ -167,6 +206,31 @@ std::string common_arg::to_string() {
return ss.str();
}
+std::vector<std::string> common_arg::get_args() const {
+    std::vector<std::string> result;
+ for (const auto & arg : args) {
+ result.push_back(std::string(arg));
+ }
+ for (const auto & arg : args_neg) {
+ result.push_back(std::string(arg));
+ }
+ return result;
+}
+
+std::vector<std::string> common_arg::get_env() const {
+    std::vector<std::string> result;
+ if (env) {
+ result.push_back(std::string(env));
+ }
+ if (!args_neg.empty() && env) {
+ // for compatibility, we need to add LLAMA_ARG_NO_ variant
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ result.push_back(neg_env);
+ }
+ return result;
+}
+
//
// utils
//
@@ -212,13 +276,13 @@ struct handle_model_result {
static handle_model_result common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
- const std::string & model_path_default,
bool offline) {
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
{
if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
model.path = common_docker_resolve_model(model.docker_repo);
+ model.name = model.docker_repo; // set name for consistency
} else if (!model.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (model.hf_file.empty()) {
@@ -227,7 +291,8 @@ static handle_model_result common_params_handle_model(
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1); // built without CURL, error message already printed
}
- model.hf_repo = auto_detected.repo;
+ model.name = model.hf_repo; // repo name with tag
+ model.hf_repo = auto_detected.repo; // repo name without tag
model.hf_file = auto_detected.ggufFile;
if (!auto_detected.mmprojFile.empty()) {
result.found_mmproj = true;
@@ -257,8 +322,6 @@ static handle_model_result common_params_handle_model(
model.path = fs_get_cache_file(string_split(f, '/').back());
}
- } else if (model.path.empty()) {
- model.path = model_path_default;
}
}
@@ -303,6 +366,16 @@ static std::string get_all_kv_cache_types() {
return msg.str();
}
+static bool parse_bool_value(const std::string & value) {
+ if (is_truthy(value)) {
+ return true;
+ } else if (is_falsey(value)) {
+ return false;
+ } else {
+ throw std::invalid_argument("invalid boolean value");
+ }
+}
+
//
// CLI argument parsing functions
//
@@ -310,10 +383,13 @@ static std::string get_all_kv_cache_types() {
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
common_params & params = ctx_arg.params;
-    std::unordered_map<std::string, common_arg *> arg_to_options;
+    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
for (auto & opt : ctx_arg.options) {
for (const auto & arg : opt.args) {
- arg_to_options[arg] = &opt;
+ arg_to_options[arg] = {&opt, /* is_positive */ true};
+ }
+ for (const auto & arg : opt.args_neg) {
+ arg_to_options[arg] = {&opt, /* is_positive */ false};
}
}
@@ -322,12 +398,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
std::string value;
if (opt.get_value_from_env(value)) {
try {
- if (opt.handler_void && (value == "1" || value == "true")) {
+ if (opt.handler_void && is_truthy(value)) {
opt.handler_void(params);
}
if (opt.handler_int) {
opt.handler_int(params, std::stoi(value));
}
+ if (opt.handler_bool) {
+ opt.handler_bool(params, parse_bool_value(value));
+ }
if (opt.handler_string) {
opt.handler_string(params, value);
continue;
@@ -346,6 +425,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
}
};
+    std::set<std::string> seen_args;
+
for (int i = 1; i < argc; i++) {
const std::string arg_prefix = "--";
@@ -356,7 +437,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
if (arg_to_options.find(arg) == arg_to_options.end()) {
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
}
- auto opt = *arg_to_options[arg];
+ if (!seen_args.insert(arg).second) {
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+ }
+ auto & tmp = arg_to_options[arg];
+ auto opt = *tmp.first;
+ bool is_positive = tmp.second;
if (opt.has_value_from_env()) {
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
}
@@ -365,6 +451,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
opt.handler_void(params);
continue;
}
+ if (opt.handler_bool) {
+ opt.handler_bool(params, is_positive);
+ continue;
+ }
// arg with single value
check_arg(i);
@@ -389,7 +479,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument(string_format(
"error while handling argument \"%s\": %s\n\n"
"usage:\n%s\n\nto show complete usage, run with -h",
- arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+ arg.c_str(), e.what(), opt.to_string().c_str()));
}
}
@@ -405,7 +495,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// handle model and download
{
- auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
+ auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
if (params.no_mmproj) {
params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -415,12 +505,18 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// only download mmproj if the current example is using it
for (auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
- common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
+ common_params_handle_model(params.mmproj, params.hf_token, params.offline);
break;
}
}
- common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
- common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
+ common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
+ common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
+ }
+
+ // model is required (except for server)
+ // TODO @ngxson : maybe show a list of available models in CLI in this case
+ if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
+ throw std::invalid_argument("error: --model is required\n");
}
if (params.escape) {
@@ -444,7 +540,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
params.kv_overrides.back().key[0] = 0;
}
- if (!params.tensor_buft_overrides.empty()) {
+ // pad tensor_buft_overrides for llama_params_fit:
+ const size_t ntbo = llama_max_tensor_buft_overrides();
+ while (params.tensor_buft_overrides.size() < ntbo) {
params.tensor_buft_overrides.push_back({nullptr, nullptr});
}
@@ -460,6 +558,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
));
}
+ common_log_set_verbosity_thold(params.verbosity);
+
return true;
}
@@ -552,6 +652,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-batched-bench",
"llama-bench",
"llama-cli",
+ "llama-completion",
"llama-convert-llama2c-to-ggml",
"llama-cvector-generator",
"llama-embedding",
@@ -636,6 +737,66 @@ static void add_rpc_devices(const std::string & servers) {
}
}
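+// parse argv into a map of option -> raw string value without touching common_params;
+// boolean flags are recorded as "1" or "0" depending on whether the positive or negated form was used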
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
+ common_params dummy_params;
+ common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
+
+ std::unordered_map<std::string, common_arg *> arg_to_options;
+ for (auto & opt : ctx_arg.options) {
+ for (const auto & arg : opt.args) {
+ arg_to_options[arg] = &opt;
+ }
+ for (const auto & arg : opt.args_neg) {
+ arg_to_options[arg] = &opt;
+ }
+ }
+
+ // TODO @ngxson : find a way to deduplicate this code
+
+ // handle command line arguments
+ auto check_arg = [&](int i) {
+ if (i+1 >= argc) {
+ throw std::invalid_argument("expected value for argument");
+ }
+ };
+
+ std::set<std::string> seen_args;
+
+ for (int i = 1; i < argc; i++) {
+ const std::string arg_prefix = "--";
+
+ std::string arg = argv[i];
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+ std::replace(arg.begin(), arg.end(), '_', '-');
+ }
+ if (arg_to_options.find(arg) == arg_to_options.end()) {
+ throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+ }
+ if (!seen_args.insert(arg).second) {
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+ }
+ auto opt = *arg_to_options[arg];
+ std::string val;
+ if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
+ // bool arg (need to reverse the meaning for negative args)
+ bool is_neg = std::find(opt.args_neg.begin(), opt.args_neg.end(), arg) != opt.args_neg.end();
+ val = is_neg ? "0" : "1";
+ }
+ if (opt.value_hint != nullptr) {
+ // arg with single value
+ check_arg(i);
+ val = argv[++i];
+ }
+ if (opt.value_hint_2 != nullptr) {
+ // TODO: support arg with 2 values
+ throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
+ }
+ out_map[opt] = val;
+ }
+
+ return true;
+}
+
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -681,19 +842,34 @@ static std::string list_builtin_chat_templates() {
return msg.str();
}
-static bool is_truthy(const std::string & value) {
- return value == "on" || value == "enabled" || value == "1";
+bool common_arg_utils::is_truthy(const std::string & value) {
+ return value == "on" || value == "enabled" || value == "true" || value == "1";
}
-static bool is_falsey(const std::string & value) {
- return value == "off" || value == "disabled" || value == "0";
+bool common_arg_utils::is_falsey(const std::string & value) {
+ return value == "off" || value == "disabled" || value == "false" || value == "0";
}
-static bool is_autoy(const std::string & value) {
+bool common_arg_utils::is_autoy(const std::string & value) {
return value == "auto" || value == "-1";
}
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+ // per-example default params
+ // we define here to make sure it's included in llama-gen-docs
+ if (ex == LLAMA_EXAMPLE_COMPLETION) {
+ params.use_jinja = false; // disable jinja by default
+
+ } else if (ex == LLAMA_EXAMPLE_MTMD) {
+ params.use_jinja = false; // disable jinja by default
+ params.sampling.temp = 0.2; // lower temp by default for better quality
+
+ } else if (ex == LLAMA_EXAMPLE_SERVER) {
+ params.n_parallel = -1; // auto by default
+ }
+
+ params.use_color = tty_can_use_colors();
+
// load dynamic backends
ggml_backend_load_all();
@@ -707,7 +883,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
sampler_type_chars += common_sampler_type_to_chr(sampler);
sampler_type_names += common_sampler_type_to_str(sampler) + ";";
}
- sampler_type_names.pop_back();
+ if (!sampler_type_names.empty()) {
+ sampler_type_names.pop_back(); // remove last semicolon
+ }
/**
@@ -769,19 +947,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
));
add_opt(common_arg(
+ {"--display-prompt"},
{"--no-display-prompt"},
- string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
- [](common_params & params) {
- params.display_prompt = false;
+ string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.display_prompt = value;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
- {"-co", "--color"},
- string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
- [](common_params & params) {
- params.use_color = true;
+ {"-co", "--color"}, "[on|off|auto]",
+ "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
+ "'auto' enables colors when output is to a terminal",
+ [](common_params & params, const std::string & value) {
+ if (is_truthy(value)) {
+ params.use_color = true;
+ } else if (is_falsey(value)) {
+ params.use_color = false;
+ } else if (is_autoy(value)) {
+ params.use_color = tty_can_use_colors();
+ } else {
+ throw std::invalid_argument(
+ string_format("error: unknown value for --color: '%s'\n", value.c_str()));
+ }
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
add_opt(common_arg(
{"-t", "--threads"}, "N",
string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -914,7 +1103,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-n", "--predict", "--n-predict"}, "N",
string_format(
- ex == LLAMA_EXAMPLE_MAIN
+ ex == LLAMA_EXAMPLE_COMPLETION
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
: "number of tokens to predict (default: %d, -1 = infinity)",
params.n_predict),
@@ -953,42 +1142,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
- string_format("max number of context checkpoints to create per slot (default: %d)\n"
+ string_format("max number of context checkpoints to create per slot (default: %d)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
[](common_params & params, int value) {
params.n_ctx_checkpoints = value;
}
- ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
- {"--cache-ram", "-cram"}, "N",
- string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+ {"-cram", "--cache-ram"}, "N",
+ string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
[](common_params & params, int value) {
params.cache_ram_mib = value;
}
- ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
- {"--kv-unified", "-kvu"},
- string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
- "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+ {"-kvu", "--kv-unified"},
+ "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
[](common_params & params) {
params.kv_unified = true;
}
- ).set_env("LLAMA_ARG_KV_SPLIT"));
- add_opt(common_arg(
- {"--no-context-shift"},
- string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
- [](common_params & params) {
- params.ctx_shift = false;
- }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+ ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"--context-shift"},
- string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
- [](common_params & params) {
- params.ctx_shift = true;
+ {"--no-context-shift"},
+ string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.ctx_shift = value;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
add_opt(common_arg(
{"--chunks"}, "N",
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1008,7 +1190,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
} else {
throw std::runtime_error(
- string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+ string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
}
}).set_env("LLAMA_ARG_FLASH_ATTN"));
add_opt(common_arg(
@@ -1024,15 +1206,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
add_opt(common_arg(
+ {"--perf"},
{"--no-perf"},
- string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
- [](common_params & params) {
- params.no_perf = true;
- params.sampling.no_perf = true;
+ string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.no_perf = !value;
+ params.sampling.no_perf = !value;
}
- ).set_env("LLAMA_ARG_NO_PERF"));
+ ).set_env("LLAMA_ARG_PERF"));
+ add_opt(common_arg(
+ {"--show-timings"},
+ {"--no-show-timings"},
+ string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.show_timings = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
add_opt(common_arg(
{"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)",
@@ -1054,16 +1245,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.system_prompt.pop_back();
}
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--in-file"}, "FNAME",
- "an input file (repeat to specify multiple files)",
+ "an input file (use comma-separated values to specify multiple files)",
[](common_params & params, const std::string & value) {
- std::ifstream file(value);
- if (!file) {
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ for (const auto & item : string_split(value, ',')) {
+ std::ifstream file(item);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+ }
+ params.in_files.push_back(item);
}
- params.in_files.push_back(value);
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
@@ -1084,16 +1277,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-e", "--escape"},
- string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
- [](common_params & params) {
- params.escape = true;
- }
- ));
- add_opt(common_arg(
{"--no-escape"},
- "do not process escape sequences",
- [](common_params & params) {
- params.escape = false;
+ string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.escape = value;
}
));
add_opt(common_arg(
@@ -1102,59 +1289,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.n_print = value;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache"}, "FNAME",
"file to cache prompt state for faster startup (default: none)",
[](common_params & params, const std::string & value) {
params.path_prompt_cache = value;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache-all"},
"if specified, saves user input and generations to cache as well\n",
[](common_params & params) {
params.prompt_cache_all = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache-ro"},
"if specified, uses the prompt cache but does not update it",
[](common_params & params) {
params.prompt_cache_ro = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-r", "--reverse-prompt"}, "PROMPT",
"halt generation at PROMPT, return control in interactive mode\n",
[](common_params & params, const std::string & value) {
params.antiprompt.emplace_back(value);
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-sp", "--special"},
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
[](common_params & params) {
params.special = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-cnv", "--conversation"},
- "run in conversation mode:\n"
+ {"-no-cnv", "--no-conversation"},
+ "whether to run in conversation mode:\n"
"- does not print special tokens and suffix/prefix\n"
"- interactive mode is also enabled\n"
"(default: auto enabled if chat template is available)",
- [](common_params & params) {
- params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+ [](common_params & params, bool value) {
+ params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
- add_opt(common_arg(
- {"-no-cnv", "--no-conversation"},
- "force disable conversation mode (default: false)",
- [](common_params & params) {
- params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
- }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-st", "--single-turn"},
"run conversation for a single turn only, then exit when done\n"
@@ -1163,28 +1344,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.single_turn = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-i", "--interactive"},
string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
[](common_params & params) {
params.interactive = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-if", "--interactive-first"},
string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
[](common_params & params) {
params.interactive_first = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-mli", "--multiline-input"},
"allows you to write or paste multiple lines without ending each in '\\'",
[](common_params & params) {
params.multiline_input = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--in-prefix-bos"},
"prefix BOS to user inputs, preceding the `--in-prefix` string",
@@ -1192,7 +1373,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_prefix_bos = true;
params.enable_chat_template = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--in-prefix"}, "STRING",
"string to prefix user inputs with (default: empty)",
@@ -1200,7 +1381,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_prefix = value;
params.enable_chat_template = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--in-suffix"}, "STRING",
"string to suffix after user inputs with (default: empty)",
@@ -1208,14 +1389,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_suffix = value;
params.enable_chat_template = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
+ {"--warmup"},
{"--no-warmup"},
- "skip warming up the model with an empty run",
- [](common_params & params) {
- params.warmup = false;
+ string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.warmup = value;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@@ -1232,6 +1414,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
const auto sampler_names = string_split(value, ';');
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
}
).set_sparam());
add_opt(common_arg(
@@ -1242,7 +1425,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
- {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
+ {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
[](common_params & params, const std::string & value) {
params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -1261,6 +1444,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.sampling.temp = std::stof(value);
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
}
).set_sparam());
add_opt(common_arg(
@@ -1268,13 +1452,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
[](common_params & params, int value) {
params.sampling.top_k = value;
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
}
- ).set_sparam());
+ ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
add_opt(common_arg(
{"--top-p"}, "N",
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
[](common_params & params, const std::string & value) {
params.sampling.top_p = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
}
).set_sparam());
add_opt(common_arg(
@@ -1282,6 +1468,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
[](common_params & params, const std::string & value) {
params.sampling.min_p = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
}
).set_sparam());
add_opt(common_arg(
@@ -1296,6 +1483,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
[](common_params & params, const std::string & value) {
params.sampling.xtc_probability = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
}
).set_sparam());
add_opt(common_arg(
@@ -1303,6 +1491,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
[](common_params & params, const std::string & value) {
params.sampling.xtc_threshold = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
}
).set_sparam());
add_opt(common_arg(
@@ -1321,6 +1510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
params.sampling.penalty_last_n = value;
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
}
).set_sparam());
add_opt(common_arg(
@@ -1328,6 +1518,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
[](common_params & params, const std::string & value) {
params.sampling.penalty_repeat = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
}
).set_sparam());
add_opt(common_arg(
@@ -1425,6 +1616,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
[](common_params & params, int value) {
params.sampling.mirostat = value;
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
}
).set_sparam());
add_opt(common_arg(
@@ -1432,6 +1624,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
[](common_params & params, const std::string & value) {
params.sampling.mirostat_eta = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
}
).set_sparam());
add_opt(common_arg(
@@ -1439,6 +1632,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
[](common_params & params, const std::string & value) {
params.sampling.mirostat_tau = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
}
).set_sparam());
add_opt(common_arg(
@@ -1594,28 +1788,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.grp_attn_n = value;
}
- ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
+ ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
add_opt(common_arg(
{"-gaw", "--grp-attn-w"}, "N",
string_format("group-attention width (default: %d)", params.grp_attn_w),
[](common_params & params, int value) {
params.grp_attn_w = value;
}
- ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
+ {"-kvo", "--kv-offload"},
{"-nkvo", "--no-kv-offload"},
- "disable KV offload",
- [](common_params & params) {
- params.no_kv_offload = true;
+ string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_kv_offload = !value;
}
- ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+ ).set_env("LLAMA_ARG_KV_OFFLOAD"));
add_opt(common_arg(
+ {"--repack"},
{"-nr", "--no-repack"},
- "disable weight repacking",
- [](common_params & params) {
- params.no_extra_bufts = true;
+ string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_extra_bufts = !value;
}
- ).set_env("LLAMA_ARG_NO_REPACK"));
+ ).set_env("LLAMA_ARG_REPACK"));
add_opt(common_arg(
{"--no-host"},
"bypass host buffer allowing extra buffers to be used",
@@ -1728,13 +1924,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
}
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
- add_opt(common_arg(
- {"-np", "--parallel"}, "N",
- string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
- [](common_params & params, int value) {
- params.n_parallel = value;
- }
- ).set_env("LLAMA_ARG_N_PARALLEL"));
+ if (ex == LLAMA_EXAMPLE_SERVER) {
+ // this is to make sure this option appears in the server-specific section of the help message
+ add_opt(common_arg(
+ {"-np", "--parallel"}, "N",
+ string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
+ [](common_params & params, int value) {
+ if (value == 0) {
+ throw std::invalid_argument("error: invalid value for n_parallel\n");
+ }
+ params.n_parallel = value;
+ }
+ ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
+ } else {
+ add_opt(common_arg(
+ {"-np", "--parallel"}, "N",
+ string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+ [](common_params & params, int value) {
+ params.n_parallel = value;
+ }
+ ).set_env("LLAMA_ARG_N_PARALLEL"));
+ }
add_opt(common_arg(
{"-ns", "--sequences"}, "N",
string_format("number of sequences to decode (default: %d)", params.n_sequences),
@@ -1744,20 +1954,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg(
{"-cb", "--cont-batching"},
- string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
- [](common_params & params) {
- params.cont_batching = true;
+ {"-nocb", "--no-cont-batching"},
+ string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.cont_batching = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
add_opt(common_arg(
- {"-nocb", "--no-cont-batching"},
- "disable continuous batching",
- [](common_params & params) {
- params.cont_batching = false;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
- add_opt(common_arg(
- {"--mmproj"}, "FILE",
+ {"-mm", "--mmproj"}, "FILE",
"path to a multimodal projector file. see tools/mtmd/README.md\n"
"note: if -hf is used, this argument can be omitted",
[](common_params & params, const std::string & value) {
@@ -1765,33 +1969,37 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
add_opt(common_arg(
- {"--mmproj-url"}, "URL",
+ {"-mmu", "--mmproj-url"}, "URL",
"URL to a multimodal projector file. see tools/mtmd/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.url = value;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
add_opt(common_arg(
- {"--no-mmproj"},
- "explicitly disable multimodal projector, useful when using -hf",
- [](common_params & params) {
- params.no_mmproj = true;
+ {"--mmproj-auto"},
+ {"--no-mmproj", "--no-mmproj-auto"},
+ string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_mmproj = !value;
}
- ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
add_opt(common_arg(
+ {"--mmproj-offload"},
{"--no-mmproj-offload"},
- "do not offload multimodal projector to GPU",
- [](common_params & params) {
- params.mmproj_use_gpu = false;
+ string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.mmproj_use_gpu = value;
}
- ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
add_opt(common_arg(
{"--image", "--audio"}, "FILE",
- "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
+ "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
[](common_params & params, const std::string & value) {
- params.image.emplace_back(value);
+ for (const auto & item : string_split(value, ',')) {
+ params.image.emplace_back(item);
+ }
}
- ).set_examples({LLAMA_EXAMPLE_MTMD}));
+ ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--image-min-tokens"}, "N",
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@@ -1824,12 +2032,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_MLOCK"));
add_opt(common_arg(
+ {"--mmap"},
{"--no-mmap"},
- "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
- [](common_params & params) {
- params.use_mmap = false;
+ string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_mmap = value;
}
- ).set_env("LLAMA_ARG_NO_MMAP"));
+ ).set_env("LLAMA_ARG_MMAP"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
@@ -1874,26 +2083,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
));
add_opt(common_arg(
- {"--override-tensor", "-ot"}, "=,...",
+ {"-ot", "--override-tensor"}, "=,...",
"override tensor buffer type", [](common_params & params, const std::string & value) {
parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
}
- ));
+ ).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
add_opt(common_arg(
- {"--override-tensor-draft", "-otd"}, "=,...",
+ {"-otd", "--override-tensor-draft"}, "=,...",
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
- {"--cpu-moe", "-cmoe"},
+ {"-cmoe", "--cpu-moe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
[](common_params & params) {
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
).set_env("LLAMA_ARG_CPU_MOE"));
add_opt(common_arg(
- {"--n-cpu-moe", "-ncmoe"}, "N",
+ {"-ncmoe", "--n-cpu-moe"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
[](common_params & params, int value) {
if (value < 0) {
@@ -1908,14 +2117,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_N_CPU_MOE"));
add_opt(common_arg(
- {"--cpu-moe-draft", "-cmoed"},
+ {"-cmoed", "--cpu-moe-draft"},
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
[](common_params & params) {
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
add_opt(common_arg(
- {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+ {"-ncmoed", "--n-cpu-moe-draft"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
[](common_params & params, int value) {
if (value < 0) {
@@ -1927,7 +2136,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
@@ -1999,6 +2208,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_env("LLAMA_ARG_MAIN_GPU"));
+ add_opt(common_arg(
+ { "-fit", "--fit" }, "[on|off]",
+ string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
+ [](common_params & params, const std::string & value) {
+ if (is_truthy(value)) {
+ params.fit_params = true;
+ } else if (is_falsey(value)) {
+ params.fit_params = false;
+ } else {
+ throw std::runtime_error(
+ string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
+ }
+ }
+ ).set_env("LLAMA_ARG_FIT"));
+ add_opt(common_arg(
+ { "-fitt", "--fit-target" }, "MiB",
+ string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
+ [](common_params & params, int value) {
+ params.fit_params_target = value * size_t(1024*1024);
+ }
+ ).set_env("LLAMA_ARG_FIT_TARGET"));
+ add_opt(common_arg(
+ { "-fitc", "--fit-ctx" }, "N",
+ string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
+ [](common_params & params, int value) {
+ params.fit_params_min_ctx = value;
+ }
+ ).set_env("LLAMA_ARG_FIT_CTX"));
add_opt(common_arg(
{"--check-tensors"},
string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
@@ -2007,51 +2244,96 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
));
add_opt(common_arg(
- {"--override-kv"}, "KEY=TYPE:VALUE",
- "advanced option to override model metadata by key. may be specified multiple times.\n"
- "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
+ {"--override-kv"}, "KEY=TYPE:VALUE,...",
+ "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
+ "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
[](common_params & params, const std::string & value) {
- if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
- throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
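+ // split the value on commas, treating a backslash as an escape so literal commas
+ // can appear inside a single override (e.g. "some.key=str:a\,b" keeps the comma)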
+ std::vector<std::string> kv_overrides;
+
+ std::string current;
+ bool escaping = false;
+
+ for (const char c : value) {
+ if (escaping) {
+ current.push_back(c);
+ escaping = false;
+ } else if (c == '\\') {
+ escaping = true;
+ } else if (c == ',') {
+ kv_overrides.push_back(current);
+ current.clear();
+ } else {
+ current.push_back(c);
+ }
+ }
+
+ if (escaping) {
+ current.push_back('\\');
+ }
+
+ kv_overrides.push_back(current);
+
+ for (const auto & kv_override : kv_overrides) {
+ if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
+ throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
+ }
}
}
));
add_opt(common_arg(
+ {"--op-offload"},
{"--no-op-offload"},
- string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
- [](common_params & params) {
- params.no_op_offload = true;
+ string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
+ [](common_params & params, bool value) {
+ params.no_op_offload = !value;
}
));
add_opt(common_arg(
{"--lora"}, "FNAME",
- "path to LoRA adapter (can be repeated to use multiple adapters)",
+ "path to LoRA adapter (use comma-separated values to load multiple adapters)",
[](common_params & params, const std::string & value) {
- params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
+ for (const auto & item : string_split(value, ',')) {
+ params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
+ }
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(
- {"--lora-scaled"}, "FNAME", "SCALE",
- "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
- [](common_params & params, const std::string & fname, const std::string & scale) {
- params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
+ {"--lora-scaled"}, "FNAME:SCALE,...",
+ "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
+ "note: use comma-separated values",
+ [](common_params & params, const std::string & value) {
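+ // each comma-separated item has the form FNAME:SCALE, e.g. "lora-a.gguf:0.5,lora-b.gguf:1.0"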
+ for (const auto & item : string_split(value, ',')) {
+ auto parts = string_split(item, ':');
+ if (parts.size() != 2) {
+ throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
+ }
+ params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
+ }
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(
{"--control-vector"}, "FNAME",
- "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
+ "add a control vector\nnote: use comma-separated values to add multiple control vectors",
[](common_params & params, const std::string & value) {
- params.control_vectors.push_back({ 1.0f, value, });
+ for (const auto & item : string_split(value, ',')) {
+ params.control_vectors.push_back({ 1.0f, item, });
+ }
}
));
add_opt(common_arg(
- {"--control-vector-scaled"}, "FNAME", "SCALE",
+ {"--control-vector-scaled"}, "FNAME:SCALE,...",
"add a control vector with user defined scaling SCALE\n"
- "note: this argument can be repeated to add multiple scaled control vectors",
- [](common_params & params, const std::string & fname, const std::string & scale) {
- params.control_vectors.push_back({ std::stof(scale), fname });
+ "note: use comma-separated values (format: FNAME:SCALE,...)",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : string_split(value, ',')) {
+ auto parts = string_split(item, ':');
+ if (parts.size() != 2) {
+ throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
+ }
+ params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
+ }
}
));
add_opt(common_arg(
@@ -2072,11 +2354,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-m", "--model"}, "FNAME",
ex == LLAMA_EXAMPLE_EXPORT_LORA
- ? std::string("model path from which to load base model")
- : string_format(
- "model path (default: `models/$filename` with filename from `--hf-file` "
- "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
- ),
+ ? "model path from which to load base model"
+ : "model path to load",
[](common_params & params, const std::string & value) {
params.model.path = value;
}
@@ -2144,13 +2423,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("HF_TOKEN"));
add_opt(common_arg(
{"--context-file"}, "FNAME",
- "file to load context from (repeat to specify multiple files)",
+ "file to load context from (use comma-separated values to specify multiple files)",
[](common_params & params, const std::string & value) {
- std::ifstream file(value, std::ios::binary);
- if (!file) {
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ for (const auto & item : string_split(value, ',')) {
+ std::ifstream file(item, std::ios::binary);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+ }
+ params.context_files.push_back(item);
}
- params.context_files.push_back(value);
}
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg(
@@ -2219,10 +2500,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
+ {"--ppl"},
{"--no-ppl"},
- string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
- [](common_params & params) {
- params.compute_ppl = false;
+ string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.compute_ppl = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
@@ -2341,12 +2623,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg(
- {"--no-webui"},
- string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
- [](common_params & params) {
- params.webui = false;
+ {"--webui-config"}, "JSON",
+ "JSON that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+ add_opt(common_arg(
+ {"--webui-config-file"}, "PATH",
+ "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = read_file(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
+ add_opt(common_arg(
+ {"--webui"},
+ {"--no-webui"},
+ string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.webui = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
add_opt(common_arg(
{"--embedding", "--embeddings"},
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -2355,7 +2652,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
add_opt(common_arg(
- {"--reranking", "--rerank"},
+ {"--rerank", "--reranking"},
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
[](common_params & params) {
params.embedding = true;
@@ -2409,7 +2706,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
add_opt(common_arg(
{"-to", "--timeout"}, "N",
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -2451,23 +2748,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
add_opt(common_arg(
{"--slots"},
- string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
- [](common_params & params) {
- params.endpoint_slots = true;
+ {"--no-slots"},
+ string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.endpoint_slots = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
- add_opt(common_arg(
- {"--no-slots"},
- "disables slots monitoring endpoint",
- [](common_params & params) {
- params.endpoint_slots = false;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
add_opt(common_arg(
{"--slot-save-path"}, "PATH",
"path to save slot kv cache (default: disabled)",
[](common_params & params, const std::string & value) {
params.slot_save_path = value;
+ if (!fs_is_directory(params.slot_save_path)) {
+ throw std::invalid_argument("not a directory: " + value);
+ }
// if doesn't end with DIRECTORY_SEPARATOR, add it
if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
params.slot_save_path += DIRECTORY_SEPARATOR;
@@ -2475,12 +2769,56 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
- {"--jinja"},
- "use jinja template for chat (default: disabled)",
- [](common_params & params) {
- params.use_jinja = true;
+ {"--media-path"}, "PATH",
+ "directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.media_path = value;
+ if (!fs_is_directory(params.media_path)) {
+ throw std::invalid_argument("not a directory: " + value);
+ }
+ // if doesn't end with DIRECTORY_SEPARATOR, add it
+ if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
+ params.media_path += DIRECTORY_SEPARATOR;
+ }
}
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--models-dir"}, "PATH",
+ "directory containing models for the router server (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.models_dir = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
+ add_opt(common_arg(
+ {"--models-preset"}, "PATH",
+ "path to INI file containing model presets for the router server (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.models_preset = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
+ add_opt(common_arg(
+ {"--models-max"}, "N",
+ string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
+ [](common_params & params, int value) {
+ params.models_max = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+ add_opt(common_arg(
+ {"--models-autoload"},
+ {"--no-models-autoload"},
+ string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.models_autoload = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
+ add_opt(common_arg(
+ {"--jinja"},
+ {"--no-jinja"},
+ string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_jinja = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -2491,7 +2829,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.reasoning_format = common_reasoning_format_from_name(value);
}
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(
{"--reasoning-budget"}, "N",
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
@@ -2499,7 +2837,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
params.reasoning_budget = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
@@ -2511,7 +2849,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = value;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(
@@ -2523,17 +2861,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = read_file(value);
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(
+ {"--prefill-assistant"},
{"--no-prefill-assistant"},
string_format(
"whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
"when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
),
- [](common_params & params) {
- params.prefill_assistant = false;
+ [](common_params & params, bool value) {
+ params.prefill_assistant = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2548,13 +2887,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.lora_init_without_apply = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--sleep-idle-seconds"}, "SECONDS",
+ string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
+ [](common_params & params, int value) {
+ if (value == 0 || value < -1) {
+ throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
+ }
+ params.sleep_idle_seconds = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--simple-io"},
"use basic IO for better compatibility in subprocesses and limited consoles",
[](common_params & params) {
params.simple_io = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--positive-file"}, "FNAME",
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2614,7 +2963,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params &, const std::string & value) {
common_log_set_file(common_log_main(), value.c_str());
}
- ));
+ ).set_env("LLAMA_LOG_FILE"));
add_opt(common_arg(
{"--log-colors"}, "[on|off|auto]",
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@@ -2628,7 +2977,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
} else {
throw std::invalid_argument(
- string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+ string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
}
}
).set_env("LLAMA_LOG_COLORS"));
@@ -2637,7 +2986,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
[](common_params & params) {
params.verbosity = INT_MAX;
- common_log_set_verbosity_thold(INT_MAX);
}
));
add_opt(common_arg(
@@ -2649,10 +2997,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_OFFLINE"));
add_opt(common_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
- "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
+ string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
+ " - 0: generic output\n"
+ " - 1: error\n"
+ " - 2: warning\n"
+ " - 3: info\n"
+ " - 4: debug\n"
+ "(default: %d)\n", params.verbosity),
[](common_params & params, int value) {
params.verbosity = value;
- common_log_set_verbosity_thold(value);
}
).set_env("LLAMA_LOG_VERBOSITY"));
add_opt(common_arg(
@@ -2780,19 +3133,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
add_opt(common_arg(
- {"--draft-max", "--draft", "--draft-n"}, "N",
+ {"--draft", "--draft-n", "--draft-max"}, "N",
string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
[](common_params & params, int value) {
params.speculative.n_max = value;
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
add_opt(common_arg(
{"--draft-min", "--draft-n-min"}, "N",
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
[](common_params & params, int value) {
params.speculative.n_min = value;
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
add_opt(common_arg(
{"--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@@ -2806,14 +3159,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.speculative.p_min = std::stof(value);
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
add_opt(common_arg(
{"-cd", "--ctx-size-draft"}, "N",
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
[](common_params & params, int value) {
params.speculative.n_ctx = value;
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
add_opt(common_arg(
{"-devd", "--device-draft"}, "",
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2821,7 +3174,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.speculative.devices = parse_device_list(value);
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
"number of layers to store in VRAM for the draft model",
@@ -2833,21 +3186,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
}
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
add_opt(common_arg(
{"-md", "--model-draft"}, "FNAME",
"draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.model.path = value;
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
add_opt(common_arg(
{"--spec-replace"}, "TARGET", "DRAFT",
"translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
[](common_params & params, const std::string & tgt, const std::string & dft) {
params.speculative.replacements.push_back({ tgt, dft });
}
- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
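The speculative-decoding options in this hunk gain LLAMA_EXAMPLE_CLI in their set_examples() lists, which is what exposes --draft, --draft-min, -md and the related draft settings to llama-cli in addition to llama-server. A rough sketch of that gating idea, not the parser's actual data structures:

#include <set>

// illustrative stand-in for an option's examples set: an option is offered
// to a tool only if that tool's example id is present in the set
struct sketch_option {
    std::set<int> examples;
    bool available_for(int example_id) const {
        return examples.count(example_id) > 0;
    }
};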
add_opt(common_arg(
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
string_format(
@@ -3111,7 +3464,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_jinja = true;
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--gpt-oss-120b-default"},
@@ -3130,7 +3483,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_jinja = true;
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--vision-gemma-4b-default"},
@@ -3141,7 +3494,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_ctx = 0;
params.use_jinja = true;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--vision-gemma-12b-default"},
@@ -3152,7 +3505,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_ctx = 0;
params.use_jinja = true;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
return ctx_arg;
}
+
+void common_params_add_preset_options(std::vector<common_arg> & args) {
+ // arguments below won't be treated as CLI args, only preset options
+ args.push_back(common_arg(
+ {"load-on-startup"}, "NAME",
+ "in server router mode, autoload this model on startup",
+ [](common_params &, const std::string &) { /* unused */ }
+ ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
+
+ args.push_back(common_arg(
+ {"stop-timeout"}, "SECONDS",
+ "in server router mode, force-kill model instance after this many seconds of graceful shutdown",
+ [](common_params &, int) { /* unused */ }
+ ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
+
+ // args.push_back(common_arg(
+ // {"pin"},
+ // "in server router mode, do not unload this model if models_max is exceeded",
+ // [](common_params &) { /* unused */ }
+ // ).set_preset_only());
+}
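The preset-only arguments registered by common_params_add_preset_options are never parsed as CLI flags; a server router would read them from a model's preset section instead. A hedged sketch of that consumption, where the struct and helper below are assumptions and only the option names load-on-startup and stop-timeout come from the code above:

#include <map>
#include <string>

struct model_preset_sketch {
    bool load_on_startup = false;
    int  stop_timeout    = -1; // seconds before force-kill; -1 = never
};

static model_preset_sketch parse_preset_sketch(const std::map<std::string, std::string> & kv) {
    model_preset_sketch p;
    if (kv.count("load-on-startup")) {
        p.load_on_startup = true;
    }
    if (auto it = kv.find("stop-timeout"); it != kv.end()) {
        p.stop_timeout = std::stoi(it->second);
    }
    return p;
}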
diff --git a/common/arg.h b/common/arg.h
index 7ab7e2cea4..a1b6a14e67 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -3,8 +3,14 @@
#include "common.h"
#include <set>
+#include