diff --git a/.editorconfig b/.editorconfig
index 316448c7e9..c90b171f55 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -49,6 +49,6 @@ charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
-[tools/mtmd/miniaudio.h]
+[vendor/miniaudio/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset
diff --git a/.github/labeler.yml b/.github/labeler.yml
index 278032ef2e..3c2f67707b 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -86,3 +86,10 @@ nix:
embedding:
- changed-files:
- any-glob-to-any-file: examples/embedding/
+
+Ascend NPU:
+ - changed-files:
+ - any-glob-to-any-file:
+ - ggml/include/ggml-cann.h
+ - ggml/src/ggml-cann/**
+ - docs/backend/CANN.md
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml
index dbd31e589b..7cfc82ba4e 100644
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -26,12 +26,12 @@ jobs:
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-riscv64-linux-gnu \
- g++-14-riscv64-linux-gnu \
- libcurl4-openssl-dev:riscv64
+ g++-14-riscv64-linux-gnu
- name: Build
run: |
- cmake -B build -DCMAKE_BUILD_TYPE=Release \
+ cmake -B build -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
@@ -72,12 +72,12 @@ jobs:
glslc \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
- libvulkan-dev:riscv64 \
- libcurl4-openssl-dev:riscv64
+ libvulkan-dev:riscv64
- name: Build
run: |
- cmake -B build -DCMAKE_BUILD_TYPE=Release \
+ cmake -B build -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -118,12 +118,12 @@ jobs:
build-essential \
glslc \
crossbuild-essential-arm64 \
- libvulkan-dev:arm64 \
- libcurl4-openssl-dev:arm64
+ libvulkan-dev:arm64
- name: Build
run: |
- cmake -B build -DCMAKE_BUILD_TYPE=Release \
+ cmake -B build -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -163,12 +163,12 @@ jobs:
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-powerpc64le-linux-gnu \
- g++-14-powerpc64le-linux-gnu \
- libcurl4-openssl-dev:ppc64el
+ g++-14-powerpc64le-linux-gnu
- name: Build
run: |
- cmake -B build -DCMAKE_BUILD_TYPE=Release \
+ cmake -B build -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
@@ -209,12 +209,12 @@ jobs:
glslc \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
- libvulkan-dev:ppc64el \
- libcurl4-openssl-dev:ppc64el
+ libvulkan-dev:ppc64el
- name: Build
run: |
- cmake -B build -DCMAKE_BUILD_TYPE=Release \
+ cmake -B build -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -231,3 +231,116 @@ jobs:
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
+
+ debian-13-loongarch64-cpu-cross:
+ runs-on: ubuntu-24.04
+ container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: Setup LoongArch
+ run: |
+ rm -f /etc/apt/sources.list.d/*
+ cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+ deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+ EOF
+ ( echo 'quiet "true";'; \
+ echo 'APT::Get::Assume-Yes "true";'; \
+ echo 'APT::Install-Recommends "false";'; \
+ echo 'Acquire::Check-Valid-Until "false";'; \
+ echo 'Acquire::Retries "5";'; \
+ ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+ apt-get update
+ apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+ dpkg --add-architecture loong64
+
+ # Add arch-specific repositories for non-amd64 architectures
+ cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+ deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+ EOF
+
+ apt-get update || true ;# Prevent failure due to missing URLs.
+
+ apt-get install -y --no-install-recommends \
+ build-essential \
+ gcc-14-loongarch64-linux-gnu \
+ g++-14-loongarch64-linux-gnu
+
+ - name: Build
+ run: |
+ cmake -B build -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DGGML_OPENMP=OFF \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=OFF \
+ -DCMAKE_SYSTEM_NAME=Linux \
+ -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+ -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+ -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+ -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+ -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+ -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+ cmake --build build --config Release -j $(nproc)
+
+ debian-13-loongarch64-vulkan-cross:
+ runs-on: ubuntu-24.04
+ container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: Setup LoongArch
+ run: |
+ rm -f /etc/apt/sources.list.d/*
+ cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+ deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+ EOF
+ ( echo 'quiet "true";'; \
+ echo 'APT::Get::Assume-Yes "true";'; \
+ echo 'APT::Install-Recommends "false";'; \
+ echo 'Acquire::Check-Valid-Until "false";'; \
+ echo 'Acquire::Retries "5";'; \
+ ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+ apt-get update
+ apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+ dpkg --add-architecture loong64
+
+ # Add arch-specific repositories for non-amd64 architectures
+ cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+ deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+ EOF
+
+ apt-get update || true ;# Prevent failure due to missing URLs.
+
+ apt-get install -y --no-install-recommends \
+ build-essential \
+ glslc \
+ gcc-14-loongarch64-linux-gnu \
+ g++-14-loongarch64-linux-gnu \
+ libvulkan-dev:loong64
+
+ - name: Build
+ run: |
+ cmake -B build -DLLAMA_CURL=OFF \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DGGML_VULKAN=ON \
+ -DGGML_OPENMP=OFF \
+ -DLLAMA_BUILD_EXAMPLES=ON \
+ -DLLAMA_BUILD_TOOLS=ON \
+ -DLLAMA_BUILD_TESTS=OFF \
+ -DCMAKE_SYSTEM_NAME=Linux \
+ -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+ -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+ -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+ -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+ -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+ -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+ -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+ cmake --build build --config Release -j $(nproc)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ee76d1799e..867a589ce1 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -839,12 +839,12 @@ jobs:
-DGGML_CUDA=ON
cmake --build build
- windows-2019-cmake-cuda:
- runs-on: windows-2019
+ windows-2022-cmake-cuda:
+ runs-on: windows-2022
strategy:
matrix:
- cuda: ['12.4', '11.7']
+ cuda: ['12.4']
steps:
- name: Clone
@@ -878,7 +878,7 @@ jobs:
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
- call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+ call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
-DGGML_NATIVE=OFF ^
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 65ed244657..9874736cbd 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -131,8 +131,9 @@ jobs:
include:
- build: 'x64'
os: ubuntu-22.04
- - build: 'arm64'
- os: ubuntu-22.04-arm
+ # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
+ # - build: 'arm64'
+ # os: ubuntu-22.04-arm
runs-on: ${{ matrix.os }}
@@ -159,6 +160,9 @@ jobs:
id: cmake_build
run: |
cmake -B build \
+ -DGGML_BACKEND_DL=ON \
+ -DGGML_NATIVE=OFF \
+ -DGGML_CPU_ALL_VARIANTS=ON \
-DLLAMA_FATAL_WARNINGS=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
@@ -207,6 +211,9 @@ jobs:
id: cmake_build
run: |
cmake -B build \
+ -DGGML_BACKEND_DL=ON \
+ -DGGML_NATIVE=OFF \
+ -DGGML_CPU_ALL_VARIANTS=ON \
-DGGML_VULKAN=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
@@ -373,11 +380,11 @@ jobs:
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
windows-cuda:
- runs-on: windows-2019
+ runs-on: windows-2022
strategy:
matrix:
- cuda: ['12.4', '11.7']
+ cuda: ['12.4']
steps:
- name: Clone
@@ -405,7 +412,7 @@ jobs:
id: cmake_build
shell: cmd
run: |
- call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+ call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
cmake -S . -B build -G "Ninja Multi-Config" ^
-DGGML_BACKEND_DL=ON ^
-DGGML_NATIVE=OFF ^
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 4baf6f6c75..f6da488576 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -180,7 +180,7 @@ jobs:
server-windows:
- runs-on: windows-2019
+ runs-on: windows-2022
steps:
- name: Clone
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e20e966e1b..41027bb898 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -160,6 +160,11 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
+if (MINGW)
+ # Target Windows 8 for PrefetchVirtualMemory
+ add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+endif()
+
#
# build the library
#
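
The `_WIN32_WINNT=${GGML_WIN_VER}` definition above raises the MinGW target to Windows 8 so that `PrefetchVirtualMemory` is declared. A minimal sketch of the call this enables (illustrative only; `prefetch_mapping` is a hypothetical helper, not code from this diff):

```cpp
// Sketch: PrefetchVirtualMemory is only available when targeting Windows 8+
// (_WIN32_WINNT >= 0x0602), which is what the MinGW compile definition ensures.
#include <windows.h>
#include <cstddef>

static void prefetch_mapping(void * addr, std::size_t len) {  // hypothetical helper
#if defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602
    WIN32_MEMORY_RANGE_ENTRY range;
    range.VirtualAddress = addr;
    range.NumberOfBytes  = (SIZE_T) len;
    // Ask the OS to page in the mapped region ahead of first use.
    PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0);
#else
    (void) addr; (void) len;  // no-op on older Windows targets
#endif
}
```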
diff --git a/README.md b/README.md
index 540c29a4f1..385ac04d84 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@

[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
@@ -28,6 +29,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
----
+## Quick start
+
+Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
+
+- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
+- Run with Docker - see our [Docker documentation](docs/docker.md)
+- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
+- Build from source by cloning this repository - check out [our build guide](docs/build.md)
+
+Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
+
+Example command:
+
+```sh
+# Use a local model file
+llama-cli -m my_model.gguf
+
+# Or download and run a model directly from Hugging Face
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+
+# Launch OpenAI-compatible API server
+llama-server -hf ggml-org/gemma-3-1b-it-GGUF
+```
+
## Description
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
@@ -130,6 +155,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
Bindings
+- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
@@ -229,6 +255,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
+
## Supported backends
| Backend | Target devices |
@@ -245,16 +272,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
-## Building the project
-
-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
-The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
-
-- Clone this repository and build locally, see [how to build](docs/build.md)
-- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
-- Use a Docker image, see [documentation for Docker](docs/docker.md)
-- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
-
## Obtaining and quantizing models
The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
@@ -262,7 +279,11 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
+
+```sh
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+```
By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
diff --git a/ci/run.sh b/ci/run.sh
index b49a3a5f82..2968a7dd48 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -46,7 +46,20 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
- CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
+
+ if command -v nvidia-smi >/dev/null 2>&1; then
+ CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
+ if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
+ else
+ echo "Warning: Using fallback CUDA architectures"
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
+ fi
+ else
+ echo "Error: nvidia-smi not found, cannot build with CUDA"
+ exit 1
+ fi
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index dac4cc770e..564af1448f 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -58,23 +58,20 @@ add_library(${TARGET} STATIC
arg.cpp
arg.h
base64.hpp
- chat.cpp
- chat.h
chat-parser.cpp
chat-parser.h
+ chat.cpp
+ chat.h
common.cpp
common.h
console.cpp
console.h
- json-schema-to-grammar.cpp
- json.hpp
- json-partial.h
json-partial.cpp
+ json-partial.h
+ json-schema-to-grammar.cpp
llguidance.cpp
log.cpp
log.h
- minja/chat-template.hpp
- minja/minja.hpp
ngram-cache.cpp
ngram-cache.h
regex-partial.cpp
@@ -147,7 +144,7 @@ if (LLAMA_LLGUIDANCE)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
endif ()
-target_include_directories(${TARGET} PUBLIC .)
+target_include_directories(${TARGET} PUBLIC . ../vendor)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
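
With `../vendor` added to the `common` target's public include directories (and `json.hpp`/`minja` dropped from its source list), code that links against `common` now includes the vendored headers by their vendor-relative names, as the `common/arg.cpp` and `common/chat.cpp` hunks below show. A minimal sketch of the resulting include style, assuming a translation unit built against this target:

```cpp
// Sketch: vendored headers resolve through the exported ../vendor include path
// instead of living next to the common/ sources.
#include <nlohmann/json.hpp>

#include <string>

using ordered_json = nlohmann::ordered_json;  // same ordered alias used throughout common/

static std::string first_key(const ordered_json & j) {
    // ordered_json preserves insertion order, so begin() is the first inserted key
    return (j.is_object() && !j.empty()) ? j.begin().key() : std::string();
}
```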
diff --git a/common/arg.cpp b/common/arg.cpp
index 69a58364f9..0d0daa3610 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1,10 +1,11 @@
-#include "gguf.h" // for reading GGUF splits
#include "arg.h"
+#include "chat.h"
#include "common.h"
+#include "gguf.h" // for reading GGUF splits
+#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
-#include "chat.h"
// fix problem with std::min and std::max
#if defined(_WIN32)
@@ -15,6 +16,9 @@
#include <windows.h>
#endif
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
#include
#include
#include
@@ -34,8 +38,6 @@
#include
#endif
-#include "json-schema-to-grammar.h"
-
using json = nlohmann::ordered_json;
std::initializer_list<enum llama_example> mmproj_examples = {
@@ -1346,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
));
add_opt(common_arg(
{"--prio"}, "N",
- string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+ string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
[](common_params & params, int prio) {
- if (prio < 0 || prio > 3) {
+ if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
throw std::invalid_argument("invalid value");
}
params.cpuparams.priority = (enum ggml_sched_priority) prio;
@@ -2867,6 +2869,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: deepseek)",
[](common_params & params, const std::string & value) {
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+ else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
else { throw std::invalid_argument("invalid value"); }
}
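
The widened `--prio` range above accepts -1 through 3, validated against `GGML_SCHED_PRIO_LOW`..`GGML_SCHED_PRIO_REALTIME`. A minimal sketch of how the stored value is consumed, assuming `set_process_priority` is declared in `common.h` as its definition in the `common/common.cpp` hunk further down suggests:

```cpp
// Sketch: the new low-priority setting flowing from common_params into
// set_process_priority (mapped to BELOW_NORMAL_PRIORITY_CLASS / nice 5 below).
#include "common.h"

static void run_nicely(common_params & params) {
    params.cpuparams.priority = GGML_SCHED_PRIO_LOW;  // corresponds to --prio -1
    if (!set_process_priority(params.cpuparams.priority)) {
        // priority changes are best-effort; continue with the default on failure
    }
}
```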
diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index c314b8b519..65b664cb37 100644
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -154,9 +154,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
if (!rest.empty()) {
handle_reasoning(rest, /* closed */ !is_partial());
}
- if (!syntax_.thinking_forced_open) {
- throw common_chat_msg_partial_exception(end_think);
- }
+ // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
+ // if (!syntax_.thinking_forced_open) {
+ // throw common_chat_msg_partial_exception(end_think);
+ // }
return true;
}
}
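
Relaxing the check above means a partial stream that ends inside a thinking block is surfaced as reasoning content instead of raising `common_chat_msg_partial_exception`. A standalone illustration of that behaviour (not the parser's real API; `split_reasoning` and the literal `<think>` markers are assumptions made for the sketch):

```cpp
// Sketch: an unclosed <think> block in a partial stream still yields reasoning
// content; a closed block yields reasoning plus the remaining message content.
#include <string>
#include <utility>

static std::pair<std::string, std::string> split_reasoning(const std::string & text) {
    const std::string start = "<think>", end = "</think>";
    if (text.rfind(start, 0) != 0) {
        return {"", text};                        // no thinking block at all
    }
    const auto close = text.find(end, start.size());
    if (close == std::string::npos) {
        return {text.substr(start.size()), ""};   // unclosed: keep everything as reasoning
    }
    return {text.substr(start.size(), close - start.size()),
            text.substr(close + end.size())};     // closed: reasoning + content
}
```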
diff --git a/common/chat-parser.h b/common/chat-parser.h
index 5d53f2df1d..7ee355056b 100644
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@@ -2,9 +2,10 @@
#include "chat.h"
#include "json-partial.h"
-#include "json.hpp"
#include "regex-partial.h"
+#include <nlohmann/json.hpp>
+
#include
#include
#include
diff --git a/common/chat.cpp b/common/chat.cpp
index 7584639b07..1d6974a8c5 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1,13 +1,14 @@
#include "chat.h"
#include "chat-parser.h"
#include "common.h"
+#include "json-partial.h"
#include "json-schema-to-grammar.h"
#include "log.h"
-#include "json-partial.h"
-#include "minja/chat-template.hpp"
-#include "minja/minja.hpp"
#include "regex-partial.h"
+#include <minja/chat-template.hpp>
+#include <minja/minja.hpp>
+
#include
#include
#include
@@ -16,7 +17,6 @@
#include
#include
-
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
auto time = std::chrono::system_clock::to_time_t(now);
auto local_time = *std::localtime(&time);
@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const
std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
std::vector<common_chat_msg_diff> diffs;
- // if (previous_msg.reasoning_content != current.reasoning_content) {
- // auto & diff = diffs.emplace_back();
- // diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content);
- // }
+ if (previous_msg.reasoning_content != new_msg.reasoning_content) {
+ auto & diff = diffs.emplace_back();
+ diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
+ }
if (previous_msg.content != new_msg.content) {
auto & diff = diffs.emplace_back();
diff.content_delta = string_diff(previous_msg.content, new_msg.content);
@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
template <> json common_chat_msg_diff_to_json_oaicompat<json>(const common_chat_msg_diff & diff) {
json delta = json::object();
- // if (!diff.reasoning_content_delta.empty()) {
- // delta["reasoning_content"] = msg.reasoning_content;
- // }
+ if (!diff.reasoning_content_delta.empty()) {
+ delta["reasoning_content"] = diff.reasoning_content_delta;
+ }
if (!diff.content_delta.empty()) {
delta["content"] = diff.content_delta;
}
@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
switch (format) {
case COMMON_REASONING_FORMAT_NONE: return "none";
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
+ case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
default:
throw std::runtime_error("Unknown reasoning format");
}
diff --git a/common/chat.h b/common/chat.h
index f6b1d0ffcc..9f59e6b087 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -70,7 +70,7 @@ struct common_chat_msg {
};
struct common_chat_msg_diff {
- // std::string reasoning_content_delta;
+ std::string reasoning_content_delta;
std::string content_delta;
size_t tool_call_index = std::string::npos;
common_chat_tool_call tool_call_delta;
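
With `reasoning_content_delta` now populated by `compute_diffs` and emitted in the OAI-compatible streaming deltas, consumers accumulate it alongside `content_delta`. A minimal sketch using stand-in structs that mirror only the fields visible in this diff:

```cpp
// Sketch: folding streamed deltas back into a full message.
// The *_like structs are stand-ins for common_chat_msg / common_chat_msg_diff.
#include <string>
#include <vector>

struct chat_msg_like {
    std::string reasoning_content;
    std::string content;
};

struct chat_msg_diff_like {
    std::string reasoning_content_delta;
    std::string content_delta;
};

static void apply_diffs(chat_msg_like & msg, const std::vector<chat_msg_diff_like> & diffs) {
    for (const auto & d : diffs) {
        msg.reasoning_content += d.reasoning_content_delta;  // newly populated by this change
        msg.content           += d.content_delta;
    }
}
```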
diff --git a/common/common.cpp b/common/common.cpp
index 2afa9b2d64..218f1e1dc0 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
DWORD p = NORMAL_PRIORITY_CLASS;
switch (prio) {
+ case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
int p = 0;
switch (prio) {
+ case GGML_SCHED_PRIO_LOW: p = 5; break;
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
case GGML_SCHED_PRIO_HIGH: p = -10; break;
@@ -903,13 +905,16 @@ struct common_init_result common_init_from_params(common_params & params) {
ok = false;
}
- if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
- LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
- ok = false;
- }
+ bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+ bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
- if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
- LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+ if (!has_eos && !has_sep) {
+ LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+ ok = false;
+ } else if (!has_eos) {
+ LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+ } else if (!has_sep) {
+ LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
@@ -929,7 +934,7 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
- if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+ if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
}
@@ -1036,7 +1041,7 @@ struct common_init_result common_init_from_params(common_params & params) {
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
}
- llama_kv_self_clear(lctx);
+ llama_memory_clear(llama_get_memory(lctx), true);
llama_synchronize(lctx);
llama_perf_context_reset(lctx);
llama_set_warmup(lctx, false);
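
The two call sites above move from the `llama_kv_self_*` helpers to the memory handle returned by `llama_get_memory`. A minimal sketch of the migrated pattern (the `reset_context` wrapper and the `llama_memory_t` handle type are assumptions; the `llama_memory_*` calls are the ones used in the diff):

```cpp
// Sketch: llama_kv_self_* -> llama_memory_* migration as applied above.
#include "llama.h"

static void reset_context(llama_context * lctx) {
    llama_memory_t mem = llama_get_memory(lctx);

    if (!llama_memory_can_shift(mem)) {       // was: llama_kv_self_can_shift(lctx)
        // callers such as common_init_from_params disable context shifting here
    }

    llama_memory_clear(mem, /*data=*/true);   // was: llama_kv_self_clear(lctx)
    llama_synchronize(lctx);
}
```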
diff --git a/common/common.h b/common/common.h
index cee1e3039c..f26724b6e1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -215,7 +215,8 @@ struct common_params_vocoder {
enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
- COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+ COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
};
struct common_params {
diff --git a/common/json-partial.cpp b/common/json-partial.cpp
index 7591a8e4cf..d9d9169989 100644
--- a/common/json-partial.cpp
+++ b/common/json-partial.cpp
@@ -1,9 +1,10 @@
-#include
-#include "ggml.h"
-#include "log.h"
-#include
+#include "json-partial.h"
-#include
+#include "log.h"
+
+#include <nlohmann/json.hpp>
+
+#include <string>
using json = nlohmann::ordered_json;
diff --git a/common/json-partial.h b/common/json-partial.h
index 854db6a3ae..f63356dc48 100644
--- a/common/json-partial.h
+++ b/common/json-partial.h
@@ -1,5 +1,6 @@
#pragma once
-#include <json.hpp>
+
+#include <nlohmann/json.hpp>
// Healing marker (empty if the JSON was fully parsed / wasn't healed).
struct common_healing_marker {
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 5b3059c2f7..d38a74f95c 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -1,8 +1,9 @@
#include "json-schema-to-grammar.h"
#include "common.h"
+#include <nlohmann/json.hpp>
+
#include
-#include
#include