Merge branch 'master' into pr/19493

This commit is contained in:
Georgi Gerganov 2026-04-01 14:17:41 +03:00
commit 63c66f1512
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
421 changed files with 18925 additions and 6396 deletions

View File

@ -4,7 +4,7 @@
# Define the CANN base image for easier version updates later
ARG CHIP_TYPE=910b
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
# ==============================================================================
# BUILD STAGE

View File

@ -1,11 +1,13 @@
ARG UBUNTU_VERSION=22.04
ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH
RUN apt-get update && \
apt-get install -y build-essential git cmake libssl-dev
apt-get install -y gcc-14 g++-14 build-essential git cmake libssl-dev
ENV CC=gcc-14 CXX=g++-14
WORKDIR /app
@ -34,7 +36,7 @@ RUN mkdir -p /app/full \
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@ -55,8 +57,9 @@ RUN apt-get update \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=13.1.0
ARG CUDA_VERSION=13.1.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
@ -12,7 +12,9 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
WORKDIR /app
@ -39,7 +41,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.4.0
ARG CUDA_VERSION=12.8.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
@ -12,7 +12,9 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
WORKDIR /app
@ -39,7 +41,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@ -60,7 +62,8 @@ RUN apt-get update \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \

View File

@ -1,4 +1,4 @@
ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
## Build Image
@ -33,8 +33,25 @@ RUN mkdir -p /app/full \
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGDGMM_VERSION=22.9.0
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& dpkg --install *.deb
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@ -1,4 +1,4 @@
ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
FROM ascendai/cann:$ASCEND_VERSION AS build

View File

@ -46,7 +46,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_MUSA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@ -41,6 +41,7 @@
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
enableStatic ? effectiveStdenv.hostPlatform.isStatic,
precompileMetalShaders ? false,
useWebUi ? true,
}:
let
@ -164,6 +165,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
cmakeFlags =
[
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "LLAMA_BUILD_WEBUI" useWebUi)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "GGML_NATIVE" false)

View File

@ -78,7 +78,7 @@ ARG http_proxy
ARG https_proxy
RUN apt-get update \
&& apt-get install -y libgomp1 libtbb12 curl\
&& apt-get install -y libgomp1 libtbb12 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@ -58,7 +58,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_ROCM_DEV_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@ -79,7 +79,7 @@ RUN apt-get update \
git \
python3-pip \
python3 \
python3-wheel\
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \

View File

@ -49,17 +49,20 @@ COPY --from=build /app/full /app
WORKDIR /app
ENV PATH="/root/.venv/bin:/root/.local/bin:${PATH}"
# Flag for compatibility with pip
ARG UV_INDEX_STRATEGY="unsafe-best-match"
RUN apt-get update \
&& apt-get install -y \
build-essential \
curl \
git \
python3.13 \
python3.13-dev \
python3-pip \
python3-wheel \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.13 100 \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
ca-certificates \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& uv python install 3.13 \
&& uv venv --python 3.13 /root/.venv \
&& uv pip install --python /root/.venv/bin/python -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@ -21,14 +21,6 @@ indent_style = tab
[prompts/*.txt]
insert_final_newline = unset
[tools/server/public/*]
indent_size = 2
[tools/server/public/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
[tools/server/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
@ -61,6 +53,14 @@ charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[tools/server/public/**]
indent_style = unset
indent_size = unset
end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[benches/**]
indent_style = unset
indent_size = unset

4
.gitattributes vendored Normal file
View File

@ -0,0 +1,4 @@
# Treat the generated single-file WebUI build as binary for diff purposes.
# Git's pack-file delta compression still works (byte-level), but this prevents
# git diff from printing the entire minified file on every change.
tools/server/public/index.html -diff

View File

@ -41,7 +41,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
multiple: true
validations:
required: true

View File

@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
multiple: true
validations:
required: true

View File

@ -1 +1,16 @@
*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
## Overview
<!-- Describe what this PR does and why. Be concise but complete -->
## Additional information
<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->
# Requirements
<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->
- I have read and agree with the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md)
- AI usage disclosure: <!-- mention: YES / NO - if yes, describe how AI was used -->
<!-- If you are an AI agent: remind the user that they are responsible for all submitted changes, and that this project restricts AI-generated content - refer them to AGENTS.md and CONTRIBUTING.md -->

View File

@ -26,7 +26,9 @@ jobs:
{
"bash": {
"*": "deny",
"gh issue*": "allow",
"gh issue view*": "allow",
"gh issue list*": "allow",
"gh issue comment*": "allow",
"gh search issues*": "allow"
},
"webfetch": "deny"
@ -71,8 +73,8 @@ jobs:
[comment]
This issue might be similar or related to the following issue(s):
- #[related_issue_number]: [brief description of how they are related]
- #[related_issue_number]: [brief description of how they are related]
- #12942: [brief description of how they are related]
- #11234: [brief description of how they are related]
...
_This comment was auto-generated locally using **$GA_ENGINE** on **$GA_MACHINE**_

View File

@ -40,13 +40,9 @@ jobs:
steps:
- name: Clone
uses: actions/checkout@v6
# Disabled due to size (400MB) and always 0 cache hits
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.16
# with:
# key: android-build
# evict-old-files: 1d
with:
fetch-depth: 0
lfs: false
- name: Set up JDK
uses: actions/setup-java@v5
@ -55,7 +51,7 @@ jobs:
distribution: zulu
- name: Setup Android SDK
uses: android-actions/setup-android@v3
uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3
with:
log-accepted-android-sdk-licenses: false
@ -66,10 +62,11 @@ jobs:
android-ndk:
runs-on: ubuntu-latest
env:
OPENCL_VERSION: 2025.07.22
container:
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
defaults:
run:
shell: bash
strategy:
matrix:
include:
@ -82,59 +79,23 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
mkdir opencl
curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
tar -xaf opencl/headers.tar.gz -C opencl
tar -xaf opencl/clhpp.tar.gz -C opencl
tar -xaf opencl/icd-loader.tar.gz -C opencl
sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
cmake --build build
sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
rm -rf opencl
- name: Install Hexagon SDK
id: install_hexsdk
if: ${{ matrix.build == 'arm64-snapdragon' }}
env:
HEXSDK_VER: 6.4.0.2
HEXTLS_VER: 19.0.04
run: |
curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
mkdir hex-sdk
tar -xaf hex-sdk.tar.gz -C hex-sdk
ls -l hex-sdk
sudo mv hex-sdk /opt/hexagon
echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV"
echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV"
echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV"
echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV"
echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV"
echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV"
- name: Update CMake presets
id: update_presets
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
cp docs/backend/snapdragon/CMakeUserPresets.json .
- name: Build
id: ndk_build
- name: Build Llama.CPP for Hexagon Android
id: build_llama_cpp_hexagon_android
run: |
if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
cp docs/backend/snapdragon/CMakeUserPresets.json .
fi
cmake ${{ matrix.defines }} -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp
- name: Test
id: cmake_test
run: |
echo "FIXME: test on devices"
- name: Upload Llama.CPP Hexagon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-${{ matrix.build }}
path: pkg-adb/llama.cpp

View File

@ -63,7 +63,7 @@ jobs:
- name: Set container image
id: cann-image
run: |
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
echo "image=${image}" >> "${GITHUB_OUTPUT}"
- name: Pull container image

View File

@ -43,7 +43,7 @@ jobs:
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Setup ${{ matrix.sys }}
uses: msys2/setup-msys2@v2
uses: msys2/setup-msys2@cafece8e6baf9247cf9b1bf95097b0b983cc558d # v2
with:
update: true
msystem: ${{matrix.sys}}

View File

@ -141,60 +141,61 @@ jobs:
# amd-smi static
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-webgpu:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
- name: Test
id: ggml-ci
run: |
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-vulkan:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
# TODO: sandbox Mac runners
# ggml-ci-mac-metal:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Test
# id: ggml-ci
# run: |
# GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
#
# ggml-ci-mac-webgpu:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Dawn Dependency
# id: dawn-depends
# run: |
# DAWN_VERSION="v2.0.0"
# DAWN_OWNER="reeselevine"
# DAWN_REPO="dawn"
# DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
# echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
# curl -L -o artifact.zip \
# "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
# mkdir dawn
# unzip artifact.zip
# tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
#
# - name: Test
# id: ggml-ci
# run: |
# GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
#
# ggml-ci-mac-vulkan:
# runs-on: [self-hosted, macOS, ARM64]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Test
# id: ggml-ci
# run: |
# vulkaninfo --summary
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-linux-intel-vulkan:
runs-on: [self-hosted, Linux, Intel]

View File

@ -87,7 +87,7 @@ jobs:
-DGGML_METAL_EMBED_LIBRARY=OFF \
-DGGML_METAL_SHADER_DEBUG=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
- name: Test
@ -124,7 +124,7 @@ jobs:
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
@ -165,8 +165,8 @@ jobs:
id: cmake_build
run: |
export CMAKE_PREFIX_PATH=dawn
cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
@ -181,7 +181,7 @@ jobs:
- build: 'x64'
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-22.04-arm
os: ubuntu-24.04-arm
- build: 's390x'
os: ubuntu-24.04-s390x
- build: 'ppc64le'
@ -207,14 +207,22 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
python3 python3-pip python3-dev \
python3 python3-pip python3-dev python3-wheel \
libjpeg-dev build-essential libssl-dev \
git-lfs
- name: Toolchain workaround (GCC 14)
if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
run: |
sudo apt-get install -y gcc-14 g++-14
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
- name: Python Dependencies
id: python_depends
run: |
python3 -m pip install --upgrade pip
export PIP_BREAK_SYSTEM_PACKAGES="1"
python3 -m pip install --upgrade pip setuptools
pip3 install ./gguf-py
- name: Swap Endianness
@ -231,7 +239,7 @@ jobs:
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
@ -274,14 +282,16 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libssl-dev
sudo apt-get install build-essential libssl-dev ninja-build
- name: Build
id: cmake_build
run: |
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
@ -290,7 +300,15 @@ jobs:
ctest -L main --verbose
ubuntu-24-vulkan:
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-24.04
- build: 'arm64'
os: ubuntu-24.04-arm
runs-on: ${{ matrix.os }}
steps:
- name: Clone
@ -300,12 +318,16 @@ jobs:
- name: Dependencies
id: depends
run: |
sudo apt-get install -y glslc libvulkan-dev libssl-dev
sudo apt-get update
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
- name: Configure
id: cmake_configure
run: |
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
@ -314,7 +336,7 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake --build build -j $(nproc)
time cmake --build build -j $(nproc)
ubuntu-24-webgpu:
runs-on: ubuntu-24.04
@ -336,7 +358,8 @@ jobs:
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
sudo apt-get install -y build-essential mesa-vulkan-drivers \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
@ -378,7 +401,7 @@ jobs:
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build \
-DGGML_WEBGPU=ON
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
@ -415,11 +438,13 @@ jobs:
run: |
source emsdk/emsdk_env.sh
emcmake cmake -B build-wasm \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_WEBGPU=ON \
-DLLAMA_OPENSSL=OFF \
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
cmake --build build-wasm --target test-backend-ops -j $(nproc)
time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
ubuntu-22-hip:
runs-on: ubuntu-22.04
@ -479,7 +504,7 @@ jobs:
run: |
cmake -B build -S . \
-DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl:
runs-on: ubuntu-22.04
@ -528,7 +553,7 @@ jobs:
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl-fp16:
runs-on: ubuntu-22.04
@ -551,7 +576,7 @@ jobs:
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
- name: install oneAPI MKL library
shell: bash
@ -574,11 +599,13 @@ jobs:
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
@ -648,7 +675,7 @@ jobs:
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --config Release -j $(nproc)
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
@ -1039,7 +1066,7 @@ jobs:
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
cmake --build build --config Release -j $(nproc)
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test

View File

@ -54,4 +54,3 @@ jobs:
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
pip install flake8 pyright pre-commit

View File

@ -25,186 +25,13 @@ permissions:
packages: write
jobs:
push_to_registry:
name: Push Docker image to Docker Hub
runs-on: ${{ matrix.config.runs_on }}
env:
COMMIT_SHA: ${{ github.sha }}
strategy:
fail-fast: false
matrix:
config:
# Multi-stage build
# Note: the arm64 images are failing, which prevents the amd64 images from being built
# https://github.com/ggml-org/llama.cpp/issues/11888
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "12.4.0", ubuntu_version: "22.04" }
- { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "13.1.0", ubuntu_version: "24.04" }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
steps:
- name: Check out the repo
uses: actions/checkout@v6
with:
fetch-depth: 0 # preserve git history, so we can determine the build number
- name: Set up QEMU
if: ${{ matrix.config.tag != 's390x' }}
uses: docker/setup-qemu-action@v3
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Determine source tag name
id: srctag
uses: ./.github/actions/get-tag-name
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
- name: Determine image tag name
id: tag
shell: bash
run: |
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
# list all tags possible
tags="${{ matrix.config.tag }}"
for tag in $tags; do
if [[ "$tag" == "cpu" ]]; then
TYPE=""
else
TYPE="-$tag"
fi
CACHETAGS="${PREFIX}buildcache${TYPE}"
FULLTAGS="${FULLTAGS:+$FULLTAGS,}${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
LIGHTTAGS="${LIGHTTAGS:+$LIGHTTAGS,}${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
SERVERTAGS="${SERVERTAGS:+$SERVERTAGS,}${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
done
echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
echo "cache_output_tags=$CACHETAGS" # print out for debugging
echo "full_output_tags=$FULLTAGS" # print out for debugging
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
echo "server_output_tags=$SERVERTAGS" # print out for debugging
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Free Disk Space (Ubuntu)
if: ${{ matrix.config.free_disk_space == true }}
uses: ggml-org/free-disk-space@v1.3.1
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Build and push Full Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above
tags: ${{ steps.tag.outputs.full_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: full
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
- name: Build and push Light Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above
tags: ${{ steps.tag.outputs.light_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: light
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
- name: Build and push Server Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above
tags: ${{ steps.tag.outputs.server_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: server
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
create_tag:
name: Create and push git tag
runs-on: ubuntu-22.04
runs-on: ubuntu-slim
permissions:
contents: write
outputs:
source_tag: ${{ steps.srctag.outputs.name }}
steps:
- name: Clone
@ -225,3 +52,391 @@ jobs:
run: |
git tag ${{ steps.srctag.outputs.name }} || exit 0
git push origin ${{ steps.srctag.outputs.name }} || exit 0
prepare_matrices:
name: Prepare Docker matrices
runs-on: ubuntu-24.04
outputs:
build_matrix: ${{ steps.matrices.outputs.build_matrix }}
merge_matrix: ${{ steps.matrices.outputs.merge_matrix }}
steps:
- name: Generate build and merge matrices
id: matrices
shell: bash
run: |
set -euo pipefail
# Keep all build targets in one place and derive merge targets from it.
cat > build-matrix.json <<'JSON'
[
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "rocm", "dockerfile": ".devops/rocm.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "openvino", "dockerfile": ".devops/openvino.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }
]
JSON
BUILD_MATRIX="$(jq -c . build-matrix.json)"
MERGE_MATRIX="$(jq -c '
reduce .[] as $entry ({}; .[$entry.tag] |= (
. // {
tag: $entry.tag,
arches: [],
full: false,
light: false,
server: false
}
| .full = (.full or ($entry.full // false))
| .light = (.light or ($entry.light // false))
| .server = (.server or ($entry.server // false))
| .arches += [($entry.platforms | sub("^linux/"; ""))]
))
# Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
| if (has("cpu") and (((.cpu.arches // []) | index("s390x")) != null)) then
. + {
s390x: {
tag: "s390x",
arches: ["s390x"],
full: .cpu.full,
light: .cpu.light,
server: .cpu.server
}
}
else
.
end
| [.[] | .arches = (.arches | unique | sort | join(" "))]
' build-matrix.json)"
echo "build_matrix=$BUILD_MATRIX" >> "$GITHUB_OUTPUT"
echo "merge_matrix=$MERGE_MATRIX" >> "$GITHUB_OUTPUT"
push_to_registry:
name: Push Docker image to Docker Registry
needs: [prepare_matrices, create_tag]
runs-on: ${{ matrix.config.runs_on }}
strategy:
fail-fast: false
matrix:
config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
steps:
- name: Check out the repo
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ needs.create_tag.outputs.source_tag }}
- name: Set up QEMU
if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
with:
image: tonistiigi/binfmt:qemu-v10.2.1
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
- name: Log in to Docker Registry
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Determine image metadata
id: meta
shell: bash
run: |
set -euo pipefail
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
PREFIX="${IMAGE_REPO}:"
PLATFORM="${{ matrix.config.platforms }}"
ARCH_SUFFIX="${PLATFORM#linux/}"
# list all tags possible
tags="${{ matrix.config.tag }}"
for tag in $tags; do
if [[ "$tag" == "cpu" ]]; then
TYPE=""
else
TYPE="-$tag"
fi
CACHETAG="${PREFIX}buildcache${TYPE}-${ARCH_SUFFIX}"
done
SAFE_TAGS="$(echo "$tags" | tr ' ' '_')"
echo "image_repo=$IMAGE_REPO" >> $GITHUB_OUTPUT
echo "arch_suffix=$ARCH_SUFFIX" >> $GITHUB_OUTPUT
echo "cache_output_tag=$CACHETAG" >> $GITHUB_OUTPUT
echo "digest_artifact_suffix=${SAFE_TAGS}-${ARCH_SUFFIX}" >> $GITHUB_OUTPUT
echo "cache_output_tag=$CACHETAG" # print out for debugging
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Free Disk Space (Ubuntu)
if: ${{ matrix.config.free_disk_space == true }}
uses: ggml-org/free-disk-space@v1.3.1
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Build and push Full Docker image by digest
id: build_full
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
with:
context: .
platforms: ${{ matrix.config.platforms }}
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
file: ${{ matrix.config.dockerfile }}
target: full
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
- name: Build and push Light Docker image by digest
id: build_light
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
with:
context: .
platforms: ${{ matrix.config.platforms }}
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
file: ${{ matrix.config.dockerfile }}
target: light
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
- name: Build and push Server Docker image by digest
id: build_server
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
with:
context: .
platforms: ${{ matrix.config.platforms }}
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
file: ${{ matrix.config.dockerfile }}
target: server
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
- name: Export digest metadata
shell: bash
run: |
set -euo pipefail
TAGS="${{ matrix.config.tag }}"
ARCH_SUFFIX="${{ steps.meta.outputs.arch_suffix }}"
DIGEST_FILE="/tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv"
mkdir -p /tmp/digests
add_digest_rows() {
local image_type="$1"
local digest="$2"
if [[ -z "$digest" ]]; then
echo "Missing digest for image_type=${image_type}" >&2
exit 1
fi
for tag in $TAGS; do
printf '%s\t%s\t%s\t%s\n' "$tag" "$ARCH_SUFFIX" "$image_type" "$digest" >> "$DIGEST_FILE"
done
}
if [[ "${{ matrix.config.full }}" == "true" ]]; then
add_digest_rows "full" "${{ steps.build_full.outputs.digest }}"
fi
if [[ "${{ matrix.config.light }}" == "true" ]]; then
add_digest_rows "light" "${{ steps.build_light.outputs.digest }}"
fi
if [[ "${{ matrix.config.server }}" == "true" ]]; then
add_digest_rows "server" "${{ steps.build_server.outputs.digest }}"
fi
- name: Upload digest metadata
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
with:
name: digests-${{ steps.meta.outputs.digest_artifact_suffix }}
path: /tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv
if-no-files-found: error
merge_arch_tags:
name: Create shared tags from digests
needs: [prepare_matrices, push_to_registry, create_tag]
runs-on: ubuntu-24.04
strategy:
fail-fast: false
matrix:
config: ${{ fromJSON(needs.prepare_matrices.outputs.merge_matrix) }}
steps:
- name: Check out the repo
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Download digest metadata
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
with:
pattern: digests-*
path: /tmp/digests
merge-multiple: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
- name: Log in to Docker Registry
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Create tags from digests
shell: bash
run: |
set -euo pipefail
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
PREFIX="${IMAGE_REPO}:"
SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
TAGS="${{ matrix.config.tag }}"
ARCHES="${{ matrix.config.arches }}"
DIGEST_GLOB="/tmp/digests/*.tsv"
if ! ls ${DIGEST_GLOB} >/dev/null 2>&1; then
echo "No digest metadata found in /tmp/digests" >&2
exit 1
fi
if [[ -z "$SRC_TAG" ]]; then
echo "Missing source tag from create_tag" >&2
exit 1
fi
find_digest() {
local tag_name="$1"
local arch="$2"
local image_type="$3"
local digest
digest="$(awk -F '\t' -v t="$tag_name" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
# Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
if [[ -z "$digest" && "$tag_name" == "s390x" && "$arch" == "s390x" ]]; then
digest="$(awk -F '\t' -v t="cpu" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
fi
if [[ -z "$digest" ]]; then
echo "Missing digest for tag=${tag_name} arch=${arch} image_type=${image_type}" >&2
exit 1
fi
echo "$digest"
}
create_manifest_tags() {
local image_type="$1"
local tag_name="$2"
local suffix="$3"
local merged_tag="${PREFIX}${image_type}${suffix}"
local merged_versioned_tag="${merged_tag}-${SRC_TAG}"
local refs=()
for arch in $ARCHES; do
local digest
digest="$(find_digest "$tag_name" "$arch" "$image_type")"
refs+=("${IMAGE_REPO}@${digest}")
done
echo "Creating ${merged_tag} from ${refs[*]}"
docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}"
echo "Creating ${merged_versioned_tag} from ${refs[*]}"
docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}"
}
for tag in $TAGS; do
if [[ "$tag" == "cpu" ]]; then
TYPE=""
else
TYPE="-$tag"
fi
if [[ "${{ matrix.config.full }}" == "true" ]]; then
create_manifest_tags "full" "$tag" "$TYPE"
fi
if [[ "${{ matrix.config.light }}" == "true" ]]; then
create_manifest_tags "light" "$tag" "$TYPE"
fi
if [[ "${{ matrix.config.server }}" == "true" ]]; then
create_manifest_tags "server" "$tag" "$TYPE"
fi
done
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

View File

@ -23,7 +23,7 @@ jobs:
runs-on: ubuntu-slim
steps:
- uses: actions/checkout@v6
- uses: editorconfig-checker/action-editorconfig-checker@v2
- uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
with:
version: v3.0.3
- run: editorconfig-checker

View File

@ -28,17 +28,17 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.9.x'
python-version: '3.11'
- name: Install dependencies
run: |
cd gguf-py
python -m pip install poetry
python -m pip install poetry==2.3.2
poetry install
- name: Build package
run: cd gguf-py && poetry build
- name: Publish package
uses: pypa/gh-action-pypi-publish@release/v1
uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
packages-dir: gguf-py/dist

View File

@ -8,7 +8,8 @@ on:
paths: [
'.github/workflows/hip-quality-check.yml',
'**/*.cu',
'**/*.cuh'
'**/*.cuh',
'scripts/hip/gcn-cdna-vgpr-check.py'
]
pull_request:
@ -16,7 +17,8 @@ on:
paths: [
'.github/workflows/hip-quality-check.yml',
'**/*.cu',
'**/*.cuh'
'**/*.cuh',
'scripts/hip/gcn-cdna-vgpr-check.py'
]
concurrency:

View File

@ -31,6 +31,6 @@ jobs:
with:
python-version: "3.11"
- name: flake8 Lint
uses: py-actions/flake8@v2
uses: py-actions/flake8@84ec6726560b6d5bd68f2a5bed83d62b52bb50ba # v2
with:
plugins: "flake8-no-print"

View File

@ -4,15 +4,17 @@ on:
push:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- 'ty.toml'
- '**.py'
- '**/requirements*.txt'
# - 'pyrightconfig.json'
pull_request:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- 'ty.toml'
- '**.py'
- '**/requirements*.txt'
# - 'pyrightconfig.json'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -20,8 +22,8 @@ concurrency:
jobs:
python-type-check:
runs-on: ubuntu-latest
name: pyright type-check
runs-on: ubuntu-slim
name: python type-check
steps:
- name: Check out source repository
uses: actions/checkout@v6
@ -29,10 +31,13 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: "3.11"
pip-install: -r requirements/requirements-all.txt
- name: Type-check with Pyright
uses: jakebailey/pyright-action@v2
with:
version: 1.1.382
level: warning
warnings: true
pip-install: -r requirements/requirements-all.txt ty==0.0.26
# - name: Type-check with Pyright
# uses: jakebailey/pyright-action@v2
# with:
# version: 1.1.382
# level: warning
# warnings: true
- name: Type-check with ty
run: |
ty check --output-format=github

View File

@ -131,17 +131,16 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
name: llama-bin-macos-x64.tar.gz
ubuntu-22-cpu:
ubuntu-cpu:
strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-24.04-arm
- build: 's390x'
os: ubuntu-24.04-s390x
# GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
# - build: 'arm64'
# os: ubuntu-22.04-arm
runs-on: ${{ matrix.os }}
@ -165,6 +164,13 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential libssl-dev
- name: Toolchain workaround (GCC 14)
if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
run: |
sudo apt-get install -y gcc-14 g++-14
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
- name: Build
id: cmake_build
run: |
@ -194,8 +200,16 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
ubuntu-22-vulkan:
runs-on: ubuntu-22.04
ubuntu-vulkan:
strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-24.04-arm
runs-on: ${{ matrix.os }}
steps:
- name: Clone
@ -207,16 +221,23 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-vulkan
key: ubuntu-vulkan-${{ matrix.build }}
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
if [[ "${{ matrix.os }}" =~ "ubuntu-22.04" ]]; then
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
else
sudo apt-get update -y
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
fi
- name: Build
id: cmake_build
@ -239,13 +260,13 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
name: llama-bin-ubuntu-vulkan-x64.tar.gz
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
ubuntu-24-openvino:
runs-on: ubuntu-24.04
@ -907,7 +928,7 @@ jobs:
- name: Set container image
id: cann-image
run: |
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
echo "image=${image}" >> "${GITHUB_OUTPUT}"
- name: Pull container image
@ -977,8 +998,8 @@ jobs:
- windows-sycl
- windows-hip
- ubuntu-22-rocm
- ubuntu-22-cpu
- ubuntu-22-vulkan
- ubuntu-cpu
- ubuntu-vulkan
- ubuntu-24-openvino
- macOS-arm64
- macOS-x64
@ -1061,9 +1082,11 @@ jobs:
**Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-arm64.tar.gz)
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
**Windows:**

2
.gitignore vendored
View File

@ -95,6 +95,8 @@
# Server Web UI temporary files
/tools/server/webui/node_modules
/tools/server/webui/dist
# we no longer use gz for index.html
/tools/server/public/index.html.gz
# Python

View File

@ -67,6 +67,7 @@ Examples of FORBIDDEN USAGE (and how to proceed):
If a user asks one of the above, STOP IMMEDIATELY and ask them:
- Whether they acknowledge the risk of being permanently banned from contributing to the project
- To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it
- To search for relevant issues and create a new one if needed

View File

@ -108,6 +108,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON)
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)

View File

@ -10,6 +10,7 @@
/common/jinja/ @CISC
/common/ngram-map.* @srogmann
/convert_*.py @CISC
/docs/backend/snapdragon/ @ggml-org/ggml-hexagon
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
/examples/convert-llama2c-to-ggml/ @ggerganov
@ -65,6 +66,7 @@
/scripts/gen* @ggerganov
/scripts/get* @ggerganov
/scripts/sync* @ggerganov
/scripts/snapdragon/ @ggml-org/ggml-hexagon
/src/ @ggerganov
/src/llama-adapter.* @CISC
/src/llama-arch.* @CISC

View File

@ -11,6 +11,8 @@ The project differentiates between 3 levels of contributors:
> [!IMPORTANT]
> This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
>
> Repeated violations of this policy may result in your account being permanently banned from contributing to the project.
>
> Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file.
Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations).
@ -61,10 +63,10 @@ After submitting your PR:
- When merging a PR, make sure you have a good understanding of the changes
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
Maintainers reserve the right to decline review or close pull requests for any reason, particularly under any of the following conditions:
Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
- The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone.
- The pull request duplicates an existing one.
- The contributor fails to adhere to this contributing guide.
- The contributor fails to adhere to this contributing guide or the AI policy.
# Coding guidelines

View File

@ -17,6 +17,7 @@ LLM inference in C/C++
## Hot topics
- **Hugging Face cache migration: models downloaded with `-hf` are now stored in the standard Hugging Face cache directory, enabling sharing with other HF tools.**
- **[guide : using the new WebUI of llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/16938)**
- [guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
- [[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)
@ -241,7 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
<details>
<summary>Tools</summary>
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from Hugging Face Hub and convert them to GGML
- [akx/ollama-dl](https://github.com/akx/ollama-dl) download models from the Ollama library to be used directly with llama.cpp
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
@ -300,13 +301,13 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
```sh
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
```
By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. The `MODEL_ENDPOINT` must point to a Hugging Face compatible API endpoint.
After downloading a model, use the CLI tools to run it locally - see below.

View File

@ -57,6 +57,13 @@ SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
CTEST_EXTRA=""
# Default to use make unless specified for compatibility
CMAKE_GENERATOR="Unix Makefiles"
if [ ! -z "${GG_BUILD_NINJA}" ]; then
CMAKE_GENERATOR="Ninja"
fi
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi
@ -242,13 +249,13 @@ function gg_run_ctest_debug {
set -e
# Check cmake, make and ctest are installed
# Check cmake and ctest are installed
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
}
@ -273,16 +280,16 @@ function gg_run_ctest_release {
set -e
# Check cmake, make and ctest are installed
# Check cmake and ctest are installed
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest -C Release --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
else
(time ctest --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest -C Release --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi
set +e
@ -340,7 +347,7 @@ function gg_run_ctest_with_model_debug {
cd build-ci-debug
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Debug --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
cd ..
@ -353,7 +360,7 @@ function gg_run_ctest_with_model_release {
cd build-ci-release
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Release --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
# test memory leaks
#if [[ ! -z ${GG_BUILD_METAL} ]]; then
@ -407,8 +414,8 @@ function gg_run_qwen3_0_6b {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf --outtype f16
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
@ -556,8 +563,8 @@ function gg_run_embd_bge_small {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -601,8 +608,8 @@ function gg_run_rerank_tiny {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -652,10 +659,6 @@ function gg_check_build_requirements {
gg_printf 'cmake not found, please install'
fi
if ! command -v make &> /dev/null; then
gg_printf 'make not found, please install'
fi
if ! command -v ctest &> /dev/null; then
gg_printf 'ctest not found, please install'
fi

View File

@ -63,6 +63,8 @@ add_library(${TARGET} STATIC
debug.h
download.cpp
download.h
hf-cache.cpp
hf-cache.h
http.h
json-partial.cpp
json-partial.h

View File

@ -3,6 +3,7 @@
#include "chat.h"
#include "common.h"
#include "download.h"
#include "hf-cache.h"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
@ -326,60 +327,48 @@ struct handle_model_result {
common_params_model mmproj;
};
static handle_model_result common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
bool offline) {
static handle_model_result common_params_handle_model(struct common_params_model & model,
const std::string & bearer_token,
bool offline) {
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
{
if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
model.path = common_docker_resolve_model(model.docker_repo);
model.name = model.docker_repo; // set name for consistency
} else if (!model.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (model.hf_file.empty()) {
if (model.path.empty()) {
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1); // error message already printed
}
model.name = model.hf_repo; // repo name with tag
model.hf_repo = auto_detected.repo; // repo name without tag
model.hf_file = auto_detected.ggufFile;
if (!auto_detected.mmprojFile.empty()) {
result.found_mmproj = true;
result.mmproj.hf_repo = model.hf_repo;
result.mmproj.hf_file = auto_detected.mmprojFile;
}
} else {
model.hf_file = model.path;
}
}
std::string model_endpoint = get_model_endpoint();
model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
// make sure model path is present (for caching purposes)
if (model.path.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
model.path = fs_get_cache_file(filename);
}
} else if (!model.url.empty()) {
if (model.path.empty()) {
auto f = string_split<std::string>(model.url, '#').front();
f = string_split<std::string>(f, '?').front();
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
if (!model.docker_repo.empty()) {
model.path = common_docker_resolve_model(model.docker_repo);
model.name = model.docker_repo;
} else if (!model.hf_repo.empty()) {
// If -m was used with -hf, treat the model "path" as the hf_file to download
if (model.hf_file.empty() && !model.path.empty()) {
model.hf_file = model.path;
model.path = "";
}
}
common_download_model_opts opts;
opts.download_mmproj = true;
opts.offline = offline;
auto download_result = common_download_model(model, bearer_token, opts);
// then, download it if needed
if (!model.url.empty()) {
bool ok = common_download_model(model, bearer_token, offline);
if (!ok) {
if (download_result.model_path.empty()) {
LOG_ERR("error: failed to download model from Hugging Face\n");
exit(1);
}
model.name = model.hf_repo;
model.path = download_result.model_path;
if (!download_result.mmproj_path.empty()) {
result.found_mmproj = true;
result.mmproj.path = download_result.mmproj_path;
}
} else if (!model.url.empty()) {
if (model.path.empty()) {
auto f = string_split<std::string>(model.url, '#').front();
f = string_split<std::string>(f, '?').front();
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
common_download_model_opts opts;
opts.offline = offline;
auto download_result = common_download_model(model, bearer_token, opts);
if (download_result.model_path.empty()) {
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
exit(1);
}
@ -434,6 +423,9 @@ static bool parse_bool_value(const std::string & value) {
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
common_params & params = ctx_arg.params;
// setup log directly from params.verbosity: see tools/cli/cli.cpp
common_log_set_verbosity_thold(params.verbosity);
std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
for (auto & opt : ctx_arg.options) {
for (const auto & arg : opt.args) {
@ -539,6 +531,13 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// parse the first time to get -hf option (used for remote preset)
parse_cli_args();
// TODO: Remove later
try {
hf_cache::migrate_old_cache_to_hf_cache(params.hf_token, params.offline);
} catch (const std::exception & e) {
LOG_WRN("HF cache migration failed: %s\n", e.what());
}
// maybe handle remote preset
if (!params.model.hf_repo.empty()) {
std::string cli_hf_repo = params.model.hf_repo;
@ -635,8 +634,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
));
}
common_log_set_verbosity_thold(params.verbosity);
return true;
}
@ -1061,12 +1058,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-cl", "--cache-list"},
"show list of models in cache",
[](common_params &) {
printf("model cache directory: %s\n", fs_get_cache_directory().c_str());
auto models = common_list_cached_models();
printf("number of models in cache: %zu\n", models.size());
for (size_t i = 0; i < models.size(); i++) {
auto & model = models[i];
printf("%4d. %s\n", (int) i + 1, model.to_string().c_str());
printf("%4zu. %s\n", i + 1, models[i].to_string().c_str());
}
exit(0);
}
@ -1084,7 +1079,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.verbose_prompt = true;
}
));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg(
{"--display-prompt"},
{"--no-display-prompt"},
@ -2583,7 +2578,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
"mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n"
"(default: unused)",
[](common_params & params, const std::string & value) {
params.model.hf_repo = value;
@ -2812,6 +2807,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.port = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
add_opt(common_arg(
{"--reuse-port"},
string_format("allow multiple sockets to bind to the same port (default: %s)", params.reuse_port ? "enabled" : "disabled"),
[](common_params & params) {
params.reuse_port = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_REUSE_PORT"));
add_opt(common_arg(
{"--path"}, "PATH",
string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
@ -2848,6 +2850,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.webui_mcp_proxy = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
add_opt(common_arg(
{"--tools"}, "TOOL1,TOOL2,...",
"experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n"
"specify \"all\" to enable all tools\n"
"available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff",
[](common_params & params, const std::string & value) {
params.server_tools = parse_csv_row(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
add_opt(common_arg(
{"--webui"},
{"--no-webui"},
@ -3250,6 +3261,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
[](common_params & params) {
params.verbosity = INT_MAX;
common_log_set_verbosity_thold(INT_MAX);
}
));
add_opt(common_arg(
@ -3270,6 +3282,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: %d)\n", params.verbosity),
[](common_params & params, int value) {
params.verbosity = value;
common_log_set_verbosity_thold(value);
}
).set_env("LLAMA_LOG_VERBOSITY"));
add_opt(common_arg(

View File

@ -65,7 +65,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
auto schema = function.contains("parameters") ? function.at("parameters") : json::object();
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
@ -112,8 +112,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
} else {
parser = content.build_parser(ctx);
}
parser = wrap_for_generation_prompt(p, parser, inputs, reasoning.start);
return parser;
return p.prefix(inputs.generation_prompt, reasoning.start) + parser;
});
}
@ -222,7 +221,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
foreach_function(inputs.tools, [&](const json & tool) {
const auto & func = tool.at("function");
std::string name = func.at("name");
const auto & schema = func.at("parameters");
const auto & schema = func.contains("parameters") ? func.at("parameters") : json::object();
// Build call_id parser based on position (if supported)
common_peg_parser call_id_section = p.eps();
@ -283,19 +282,11 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
common_peg_parser tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & func = tool.at("function");
std::string name = func.at("name");
const auto & params = func.at("parameters");
if (!params.contains("properties") || !params.at("properties").is_object()) {
return;
}
const auto & properties = params.at("properties");
const auto & func = tool.at("function");
std::string name = func.at("name");
const auto & params = func.contains("parameters") ? func.at("parameters") : json::object();
const auto & properties = params.contains("properties") ? params.at("properties") : json::object();
std::set<std::string> required;
if (params.contains("required") && params.at("required").is_array()) {
params.at("required").get_to(required);
}
// Build parser for each argument, separating required and optional
std::vector<common_peg_parser> required_parsers;
@ -312,17 +303,18 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
}
}
auto arg = p.tool_arg(
p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
arguments.name_suffix) +
arguments.value_prefix +
(type == "string" ? p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
"tool-" + name + "-arg-" + param_name + "-schema",
param_schema, true)) :
p.tool_arg_json_value(p.schema(
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
p.space()) +
p.tool_arg_close(p.literal(arguments.value_suffix)));
auto arg =
p.tool_arg(p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
arguments.name_suffix) +
arguments.value_prefix +
(type == "string" ?
p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
"tool-" + name + "-arg-" + param_name + "-schema",
param_schema, true)) :
p.tool_arg_json_value(p.schema(
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
p.space()) +
p.tool_arg_close(p.literal(arguments.value_suffix)));
auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
if (is_required) {

View File

@ -188,6 +188,21 @@ diff_split calculate_diff_split(const std::string & left, const std::string & ri
result.suffix = "";
// pick prefix = all as representation
}
// When left has no unique content (result.left is empty), left is entirely
// shared with right. The simultaneous prefix/suffix segment matching can
// incorrectly consume trailing segments of left as suffix when those same
// segments also appear at the end of right (e.g. "\n" at the end of both
// the shared content and the generation prompt). This rotates the diff.
// Fix: if left is a prefix of right, enforce that directly.
if (result.left.empty() && !result.right.empty() &&
left.size() <= right.size() &&
right.substr(0, left.size()) == left) {
result.prefix = left;
result.suffix = "";
result.right = right.substr(left.size());
}
return result;
}
@ -293,22 +308,6 @@ std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segm
return result;
}
common_peg_parser wrap_for_generation_prompt(common_chat_peg_builder & p,
const common_peg_parser & prs,
const autoparser::generation_params & inputs,
const std::string & reasoning_start) {
auto parser = prs;
if (!inputs.generation_prompt.empty()) {
size_t end_pos = inputs.generation_prompt.size();
if (!reasoning_start.empty() && inputs.generation_prompt.find(reasoning_start) != std::string::npos) {
end_pos = inputs.generation_prompt.find(reasoning_start);
}
std::string cut_genprompt = inputs.generation_prompt.substr(0, end_pos);
parser = p.literal(cut_genprompt) + parser;
}
return parser;
}
namespace autoparser {
std::string apply_template(const common_chat_template & tmpl, const template_params & params) {

View File

@ -58,11 +58,6 @@ std::vector<segment> segmentize_markers(const std::string & text);
// (MARKER, "</function>"), (MARKER, "</tool_call>") ]
std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments);
// Wrap parser with generation prompt parser
common_peg_parser wrap_for_generation_prompt(common_chat_peg_builder & p,
const common_peg_parser & prs,
const autoparser::generation_params & inputs,
const std::string & reasoning_start = {});
namespace autoparser {
// Apply a template with the given parameters, returning the rendered string (empty on failure)

View File

@ -287,7 +287,7 @@ void analyze_reasoning::compare_reasoning_presence() {
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
});
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
});
// try the more aggressive parse first, if it fails, fall back to the delimiter one
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
@ -297,7 +297,7 @@ void analyze_reasoning::compare_reasoning_presence() {
if (result.result.success()) {
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED;
start = trim_whitespace(result.tags["pre"]);
start = trim_leading_whitespace(result.tags["pre"]);
end = trim_trailing_whitespace(result.tags["post"]);
} else if (!result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED;
@ -333,7 +333,7 @@ void analyze_reasoning::compare_thinking_enabled() {
if (left_trimmed.empty() && !diff.right.empty()) {
if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
if (start.empty()) {
start = right_trimmed;
start = trim_leading_whitespace(diff.right);
mode = reasoning_mode::TAG_BASED;
}
}
@ -344,10 +344,46 @@ void analyze_reasoning::compare_thinking_enabled() {
if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
start = seg[seg.size() - 2].value;
}
end = left_trimmed;
end = trim_trailing_whitespace(diff.left);
mode = reasoning_mode::TAG_BASED;
}
}
} else if (!left_trimmed.empty() && !right_trimmed.empty()) {
// Full-output diff is noisy (e.g., SmolLM3 changes the system message when enable_thinking flips).
// Try to find reasoning markers by tail-anchoring:
// one output's generation prompt tail may appear in the other with extra reasoning markers appended.
const auto & output_A = comparison->output_A;
const auto & output_B = comparison->output_B;
const size_t anchor_len = 64;
for (int dir = 0; dir < 2; dir++) {
const auto & base = dir == 0 ? output_B : output_A;
const auto & extended = dir == 0 ? output_A : output_B;
size_t len = std::min(base.size(), anchor_len);
std::string anchor = base.substr(base.size() - len);
auto pos = extended.rfind(anchor);
if (pos == std::string::npos || pos + len >= extended.size()) {
continue;
}
std::string extra = trim_whitespace(extended.substr(pos + len));
if (extra.empty()) {
continue;
}
auto seg = prune_whitespace_segments(segmentize_markers(extra));
if (seg.size() == 2 && seg[0].type == segment_type::MARKER && seg[1].type == segment_type::MARKER) {
if (start.empty()) {
start = seg[0].value;
}
if (end.empty()) {
end = seg[1].value;
}
mode = reasoning_mode::TAG_BASED;
break;
}
}
}
if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) {
@ -395,7 +431,7 @@ void analyze_reasoning::compare_reasoning_scope() {
LOG_DBG(ANSI_ORANGE "%s: Detected TOOLS_ONLY reasoning mode\n" ANSI_RESET, __func__);
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
});
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
if (result.result.success()) {
@ -488,7 +524,7 @@ analyze_content::analyze_content(const common_chat_template & tmpl, const analyz
// Take the more promising diff
std::string pure_content = rdiff.length() > diff_tools.left.length() ? rdiff : diff_tools.left;
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker()) + p.space() + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
return p.tag("pre", p.marker() + p.space()) + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
});
auto result = parser_wrapped.parse_anywhere_and_extract(pure_content);
start = result.tags["pre"];

View File

@ -802,6 +802,16 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
return tool_choices;
}
common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const std::string & delimiter) {
if (s.empty()) {
return eps();
}
if (delimiter.empty()) {
return literal(s);
}
return literal(s.substr(0, s.rfind(delimiter)));
}
common_peg_parser common_chat_peg_builder::standard_json_tools(
const std::string & section_start,
const std::string & section_end,

View File

@ -82,6 +82,10 @@ class common_chat_peg_builder : public common_peg_parser_builder {
common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); }
// Return a parser that parses the prefix of a string, up to a given delimiter.
common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});
// Legacy-compatible helper for building standard JSON tool calls
// Used by tests and manual parsers
// name_key/args_key: JSON key names for function name and arguments

View File

@ -221,7 +221,7 @@ using chat_template_caps = jinja::caps;
struct common_chat_templates {
bool add_bos;
bool add_eos;
bool has_explicit_template; // Model had builtin template or template overridde was specified.
bool has_explicit_template; // Model had builtin template or template overridden was specified.
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
std::unique_ptr<common_chat_template> template_tool_use;
};
@ -872,14 +872,14 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
};
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto generation_prompt = p.prefix(inputs.generation_prompt, "[THINK]");
auto reasoning =
extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
// Response format parser
if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
// Ministral wants to emit json surrounded by code fences
return wrap_for_generation_prompt(p, reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```",
inputs, "[THINK]");
return generation_prompt + (reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```");
}
// Tool call parser
@ -899,13 +899,12 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
return wrap_for_generation_prompt(p, reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls,
inputs, "[THINK]");
return generation_prompt + (reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls);
}
// Content only parser
include_grammar = false;
return wrap_for_generation_prompt(p, reasoning << p.content(p.rest()), inputs, "[THINK]");
return generation_prompt + (reasoning << p.content(p.rest()));
});
data.parser = parser.save();
@ -972,6 +971,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto start = p.rule("start", p.literal("<|start|>assistant"));
@ -980,9 +980,19 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
auto channel = p.literal("<|channel|>") + (p.literal("commentary") | p.literal("analysis"));
auto constrain_type = p.chars("[A-Za-z0-9_-]", 1, -1);
auto analysis = p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end);
if (extract_reasoning) {
p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end);
} else {
p.rule("analysis", p.content(p.literal("<|channel|>analysis<|message|>") + content + end));
}
auto analysis = p.ref("analysis");
auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end);
auto final_msg = p.rule("final", p.literal("<|channel|>final<|message|>") + p.content(content));
// Consume any unsolicited tool calls, e.g. builtin functions
auto unsolicited = p.rule("unsolicited", p.atomic(p.optional(channel) + p.literal(" to=") + content + end));
auto any = p.rule("any", preamble | analysis);
if (has_response_format) {
@ -991,8 +1001,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
p.literal("<|channel|>final") + constraint + p.literal("<|message|>") +
p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
return wrap_for_generation_prompt(p, response_format | (analysis + p.zero_or_more(start + analysis) + start + response_format),
inputs, "<|channel|>");
return p.zero_or_more(start + analysis) + start + response_format;
}
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
@ -1021,15 +1030,13 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
auto tool_call = p.trigger_rule("tool-call", tool_choice);
if (inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED) {
return tool_call | ( any + p.zero_or_more(start + any) + start + tool_call);
return p.zero_or_more(start + any) + start + tool_call;
}
return wrap_for_generation_prompt(p, tool_call | final_msg | (any + p.zero_or_more(start + any) + start + (tool_call | final_msg)),
inputs, "<|channel|>");
return p.zero_or_more(start + any) + start + (tool_call | final_msg);
}
return wrap_for_generation_prompt(p, final_msg | (any + p.zero_or_more(start + any) + start + final_msg),
inputs, "<|channel|>");
return p.zero_or_more(start + any) + start + (final_msg | unsolicited);
});
data.parser = parser.save();
@ -1080,11 +1087,12 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
// When no tools, content goes until end
auto content_until_tool = p.literal("all\n") + p.content(p.until(">>>"));
auto content_until_end = p.literal("all\n") + p.content(p.rest());
auto generation_prompt = p.literal(inputs.generation_prompt);
// If no tools or tool_choice is NONE, just parse content
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
// When no tools, just match the prefix and capture everything after
return wrap_for_generation_prompt(p, content_until_end + p.end(), inputs);
return generation_prompt + content_until_end + p.end();
}
// Build tool call parsers for each available function
@ -1120,7 +1128,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
auto content_and_tool = content_until_tool + tool_choice;
ret = p.choice({ content_and_tool, content_only, tool_choice }) + p.end();
}
return wrap_for_generation_prompt(p, ret, inputs);
return generation_prompt + ret;
});
data.parser = parser.save();
@ -1201,12 +1209,12 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
auto reasoning = extract_reasoning ? p.optional(THINK_START + p.reasoning(
p.until_one_of({ THINK_END, "<|tool_calls_section_begin|>", "<|tool_call_begin|>" })) +
p.optional(p.literal(THINK_END))) : p.eps();
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
// Content only parser (no tools)
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return wrap_for_generation_prompt(p, reasoning + p.content(p.rest()) + end,
inputs, THINK_START);
return generation_prompt + reasoning + p.content(p.rest()) + end;
}
// Build tool call parsers for each available function
@ -1242,8 +1250,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
auto content_before_tools = p.content(p.until_one_of({ SECTION_BEGIN, CALL_BEGIN }));
return wrap_for_generation_prompt(p, reasoning + content_before_tools + tool_calls + end,
inputs, THINK_START);
return generation_prompt + reasoning + content_before_tools + tool_calls + end;
});
data.parser = parser.save();
@ -1301,6 +1308,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
data.thinking_end_tag = THINK_END;
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
auto end = p.end();
auto reasoning = p.eps();
@ -1309,8 +1317,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return wrap_for_generation_prompt(p, reasoning + p.content(p.rest()) + end, inputs,
THINK_START);
return generation_prompt + reasoning + p.content(p.rest()) + end;
}
auto tool_calls = p.rule("tool-calls",
@ -1322,8 +1329,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
auto content = p.content(p.until(TOOL_CALL_START));
return wrap_for_generation_prompt(p, reasoning + content + tool_calls + end, inputs,
THINK_START);
return generation_prompt + reasoning + content + tool_calls + end;
});
data.parser = parser.save();
@ -1396,7 +1402,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
ret = p.content(p.rest());
}
return wrap_for_generation_prompt(p, ret, inputs);
return p.literal(inputs.generation_prompt) + ret;
});
data.parser = parser.save();
@ -1621,7 +1627,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.generation_prompt = params.generation_prompt;
auto parser = build_chat_peg_parser([&params](common_chat_peg_builder &p) {
return wrap_for_generation_prompt(p, p.content(p.rest()), params);
return p.prefix(params.generation_prompt) + p.content(p.rest());
});
data.parser = parser.save();
return data;

View File

@ -359,6 +359,11 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
}
void common_init() {
#if defined(_WIN32)
SetConsoleOutputCP(CP_UTF8);
SetConsoleCP(CP_UTF8);
#endif
llama_log_set(common_log_default_callback, NULL);
#ifdef NDEBUG
@ -367,7 +372,7 @@ void common_init() {
const char * build_type = " (debug)";
#endif
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
LOG_DBG("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}
std::string common_params_get_system_info(const common_params & params) {
@ -656,6 +661,97 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
return true;
}
static inline bool glob_class_match(const char c, const char * pattern, const char * class_end) {
const char * class_start = pattern;
bool negated = false;
if (*class_start == '!') {
negated = true;
class_start++;
}
// If first character after negation is ']' or '-', treat it as literal
if (*class_start == ']' || *class_start == '-') {
if (class_start < class_end && *class_start == c) {
return !negated;
}
class_start++;
}
bool matched = false;
while (class_start < class_end) {
if (class_start + 2 < class_end && class_start[1] == '-' && class_start[2] != ']') {
char start_char = *class_start;
char end_char = class_start[2];
if (c >= start_char && c <= end_char) {
matched = true;
break;
}
class_start += 3;
} else {
if (*class_start == c) {
matched = true;
break;
}
class_start++;
}
}
return negated ? !matched : matched;
}
// simple glob: * matches non-/ chars, ** matches anything including /, [] matches character class
static inline bool glob_match(const char * pattern, const char * str) {
if (*pattern == '\0') {
return *str == '\0';
}
if (pattern[0] == '*' && pattern[1] == '*') {
const char * p = pattern + 2;
if (glob_match(p, str)) return true;
if (*str != '\0') return glob_match(pattern, str + 1);
return false;
}
if (*pattern == '*') {
const char * p = pattern + 1;
for (; *str != '\0' && *str != '/'; str++) {
if (glob_match(p, str)) return true;
}
return glob_match(p, str);
}
if (*pattern == '?' && *str != '\0' && *str != '/') {
return glob_match(pattern + 1, str + 1);
}
if (*pattern == '[') {
const char * class_end = pattern + 1;
// If first character after '[' is ']' or '-', treat it as literal
if (*class_end == ']' || *class_end == '-') {
class_end++;
}
while (*class_end != '\0' && *class_end != ']') {
class_end++;
}
if (*class_end == ']') {
if (*str == '\0') return false;
bool matched = glob_class_match(*str, pattern + 1, class_end);
return matched && glob_match(class_end + 1, str + 1);
} else {
if (*str == '[') {
return glob_match(pattern + 1, str + 1);
}
return false;
}
}
if (*pattern == *str) {
return glob_match(pattern + 1, str + 1);
}
return false;
}
bool glob_match(const std::string & pattern, const std::string & str) {
return glob_match(pattern.c_str(), str.c_str());
}
//
// Filesystem utils
//
@ -1152,6 +1248,9 @@ llama_context * common_init_result::context() {
}
common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
if (seq_id < 0 || seq_id >= (int) pimpl->samplers.size()) {
return nullptr;
}
return pimpl->samplers[seq_id].get();
}

View File

@ -575,6 +575,7 @@ struct common_params {
// server params
int32_t port = 8080; // server listens on this network port
bool reuse_port = false; // allow multiple sockets to bind to the same port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
@ -615,6 +616,9 @@ struct common_params {
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
// enable built-in tools
std::vector<std::string> server_tools;
// router server configs
std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server
@ -792,6 +796,8 @@ std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
bool glob_match(const std::string & pattern, const std::string & str);
//
// Filesystem utils
//

View File

@ -1,9 +1,9 @@
#include "arg.h"
#include "common.h"
#include "gguf.h" // for reading GGUF splits
#include "log.h"
#include "download.h"
#include "hf-cache.h"
#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>
@ -15,6 +15,7 @@
#include <map>
#include <mutex>
#include <regex>
#include <unordered_set>
#include <string>
#include <thread>
#include <vector>
@ -35,8 +36,6 @@
#endif
#endif
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
// isatty
#if defined(_WIN32)
#include <io.h>
@ -51,31 +50,6 @@ using json = nlohmann::ordered_json;
//
// validate repo name format: owner/repo
static bool validate_repo_name(const std::string & repo) {
static const std::regex repo_regex(R"(^[A-Za-z0-9_.\-]+\/[A-Za-z0-9_.\-]+$)");
return std::regex_match(repo, repo_regex);
}
static std::string get_manifest_path(const std::string & repo, const std::string & tag) {
// we use "=" to avoid clashing with other component, while still being allowed on windows
std::string fname = "manifest=" + repo + "=" + tag + ".json";
if (!validate_repo_name(repo)) {
throw std::runtime_error("error: repo name must be in the format 'owner/repo'");
}
string_replace_all(fname, "/", "=");
return fs_get_cache_file(fname);
}
static std::string read_file(const std::string & fname) {
std::ifstream file(fname);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
}
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
file.close();
return content;
}
static void write_file(const std::string & fname, const std::string & content) {
const std::string fname_tmp = fname + ".tmp";
std::ofstream file(fname_tmp);
@ -132,7 +106,7 @@ static bool is_http_status_ok(int status) {
std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string tag = parts.size() > 1 ? parts.back() : "";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
@ -145,6 +119,9 @@ class ProgressBar {
static inline std::map<const ProgressBar *, int> lines;
static inline int max_line = 0;
std::string filename;
size_t len = 0;
static void cleanup(const ProgressBar * line) {
lines.erase(line);
if (lines.empty()) {
@ -161,7 +138,23 @@ class ProgressBar {
}
public:
ProgressBar() = default;
ProgressBar(const std::string & url = "") : filename(url) {
if (auto pos = filename.rfind('/'); pos != std::string::npos) {
filename = filename.substr(pos + 1);
}
if (auto pos = filename.find('?'); pos != std::string::npos) {
filename = filename.substr(0, pos);
}
for (size_t i = 0; i < filename.size(); ++i) {
if ((filename[i] & 0xC0) != 0x80) {
if (len++ == 39) {
filename.resize(i);
filename += "";
break;
}
}
}
}
~ProgressBar() {
std::lock_guard<std::mutex> lock(mutex);
@ -169,11 +162,7 @@ public:
}
void update(size_t current, size_t total) {
if (!is_output_a_tty()) {
return;
}
if (!total) {
if (!total || !is_output_a_tty()) {
return;
}
@ -185,28 +174,27 @@ public:
}
int lines_up = max_line - lines[this];
size_t width = 50;
size_t bar = 55 - len;
size_t pct = (100 * current) / total;
size_t pos = (width * current) / total;
std::cout << "\033[s";
size_t pos = (bar * current) / total;
if (lines_up > 0) {
std::cout << "\033[" << lines_up << "A";
}
std::cout << "\033[2K\r["
<< std::string(pos, '=')
<< (pos < width ? ">" : "")
<< std::string(width - pos, ' ')
<< "] " << std::setw(3) << pct << "% ("
<< current / (1024 * 1024) << " MB / "
<< total / (1024 * 1024) << " MB) "
<< "\033[u";
std::cout << '\r' << "Downloading " << filename << " ";
std::cout.flush();
for (size_t i = 0; i < bar; ++i) {
std::cout << (i < pos ? "" : " ");
}
std::cout << std::setw(4) << pct << "%\033[K";
if (lines_up > 0) {
std::cout << "\033[" << lines_up << "B";
}
std::cout << '\r' << std::flush;
if (current == total) {
cleanup(this);
cleanup(this);
}
}
@ -234,7 +222,7 @@ static bool common_pull_file(httplib::Client & cli,
const char * func = __func__; // avoid __func__ inside a lambda
size_t downloaded = existing_size;
size_t progress_step = 0;
ProgressBar bar;
ProgressBar bar(resolve_path);
auto res = cli.Get(resolve_path, headers,
[&](const httplib::Response &response) {
@ -290,7 +278,8 @@ static bool common_pull_file(httplib::Client & cli,
static int common_download_file_single_online(const std::string & url,
const std::string & path,
const std::string & bearer_token,
const common_header_list & custom_headers) {
const common_header_list & custom_headers,
bool skip_etag = false) {
static const int max_attempts = 3;
static const int retry_delay_seconds = 2;
@ -310,11 +299,16 @@ static int common_download_file_single_online(const std::string & url,
const bool file_exists = std::filesystem::exists(path);
if (file_exists && skip_etag) {
LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
std::string last_etag;
if (file_exists) {
last_etag = read_etag(path);
} else {
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
LOG_DBG("%s: no previous model file found %s\n", __func__, path.c_str());
}
auto head = cli.Head(parts.path);
@ -348,11 +342,11 @@ static int common_download_file_single_online(const std::string & url,
if (file_exists) {
if (etag.empty()) {
LOG_INF("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
LOG_DBG("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
if (!last_etag.empty() && last_etag == etag) {
LOG_INF("%s: using cached file (same etag): %s\n", __func__, path.c_str());
LOG_DBG("%s: using cached file (same etag): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
if (remove(path.c_str()) != 0) {
@ -361,6 +355,12 @@ static int common_download_file_single_online(const std::string & url,
}
}
{ // silent
std::error_code ec;
std::filesystem::path p(path);
std::filesystem::create_directories(p.parent_path(), ec);
}
const std::string path_temporary = path + ".downloadInProgress";
int delay = retry_delay_seconds;
@ -382,7 +382,7 @@ static int common_download_file_single_online(const std::string & url,
}
}
LOG_INF("%s: downloading from %s to %s (etag:%s)...\n",
LOG_DBG("%s: downloading from %s to %s (etag:%s)...\n",
__func__, common_http_show_masked_url(parts).c_str(),
path_temporary.c_str(), etag.c_str());
@ -391,7 +391,7 @@ static int common_download_file_single_online(const std::string & url,
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return -1;
}
if (!etag.empty()) {
if (!etag.empty() && !skip_etag) {
write_etag(path, etag);
}
return head->status;
@ -440,9 +440,10 @@ int common_download_file_single(const std::string & url,
const std::string & path,
const std::string & bearer_token,
bool offline,
const common_header_list & headers) {
const common_header_list & headers,
bool skip_etag) {
if (!offline) {
return common_download_file_single_online(url, path, bearer_token, headers);
return common_download_file_single_online(url, path, bearer_token, headers, skip_etag);
}
if (!std::filesystem::exists(path)) {
@ -450,197 +451,311 @@ int common_download_file_single(const std::string & url,
return -1;
}
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
LOG_DBG("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
return 304; // Not Modified - fake cached response
}
// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
const std::string & bearer_token,
bool offline,
const common_header_list & headers) {
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
futures_download.reserve(urls.size());
struct gguf_split_info {
std::string prefix; // tag included
std::string tag;
int index;
int count;
};
for (auto const & item : urls) {
futures_download.push_back(
std::async(
std::launch::async,
[&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, headers);
return is_http_status_ok(http_status);
},
item
)
);
static gguf_split_info get_gguf_split_info(const std::string & path) {
static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);
static const std::regex re_tag("[-.]([A-Z0-9_]+)$", std::regex::icase);
std::smatch m;
std::string prefix = path;
if (!string_remove_suffix(prefix, ".gguf")) {
return {};
}
// Wait for all downloads to complete
for (auto & f : futures_download) {
if (!f.get()) {
return false;
int index = 1;
int count = 1;
if (std::regex_match(prefix, m, re_split)) {
index = std::stoi(m[2].str());
count = std::stoi(m[3].str());
prefix = m[1].str();
}
std::string tag;
if (std::regex_search(prefix, m, re_tag)) {
tag = m[1].str();
for (char & c : tag) {
c = std::toupper((unsigned char)c);
}
}
return true;
return {std::move(prefix), std::move(tag), index, count};
}
bool common_download_model(const common_params_model & model,
const std::string & bearer_token,
bool offline,
const common_header_list & headers) {
// Basic validation of the model.url
if (model.url.empty()) {
LOG_ERR("%s: invalid model url\n", __func__);
// Q4_0 -> 4, F16 -> 16, NVFP4 -> 4, Q8_K_M -> 8, etc
static int extract_quant_bits(const std::string & filename) {
auto split = get_gguf_split_info(filename);
auto pos = split.tag.find_first_of("0123456789");
if (pos == std::string::npos) {
return 0;
}
return std::stoi(split.tag.substr(pos));
}
static hf_cache::hf_files get_split_files(const hf_cache::hf_files & files,
const hf_cache::hf_file & file) {
auto split = get_gguf_split_info(file.path);
if (split.count <= 1) {
return {file};
}
hf_cache::hf_files result;
for (const auto & f : files) {
auto split_f = get_gguf_split_info(f.path);
if (split_f.count == split.count && split_f.prefix == split.prefix) {
result.push_back(f);
}
}
return result;
}
static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
const std::string & model) {
hf_cache::hf_file best;
size_t best_depth = 0;
int best_diff = 0;
bool found = false;
auto model_bits = extract_quant_bits(model);
auto model_parts = string_split<std::string>(model, '/');
auto model_dir = model_parts.end() - 1;
for (const auto & f : files) {
if (!string_ends_with(f.path, ".gguf") ||
f.path.find("mmproj") == std::string::npos) {
continue;
}
auto mmproj_parts = string_split<std::string>(f.path, '/');
auto mmproj_dir = mmproj_parts.end() - 1;
auto [_, dir] = std::mismatch(model_parts.begin(), model_dir,
mmproj_parts.begin(), mmproj_dir);
if (dir != mmproj_dir) {
continue;
}
size_t depth = dir - mmproj_parts.begin();
auto bits = extract_quant_bits(f.path);
auto diff = std::abs(bits - model_bits);
if (!found || depth > best_depth || (depth == best_depth && diff < best_diff)) {
best = f;
best_depth = depth;
best_diff = diff;
found = true;
}
}
return best;
}
static bool gguf_filename_is_model(const std::string & filepath) {
if (!string_ends_with(filepath, ".gguf")) {
return false;
}
const int http_status = common_download_file_single(model.url, model.path, bearer_token, offline, headers);
if (!is_http_status_ok(http_status)) {
return false;
std::string filename = filepath;
if (auto pos = filename.rfind('/'); pos != std::string::npos) {
filename = filename.substr(pos + 1);
}
// check for additional GGUFs split to download
int n_split = 0;
{
struct gguf_init_params gguf_params = {
/*.no_alloc = */ true,
/*.ctx = */ NULL,
};
auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
if (!ctx_gguf) {
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
return false;
}
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split >= 0) {
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
}
gguf_free(ctx_gguf);
}
if (n_split > 1) {
char split_prefix[PATH_MAX] = {0};
char split_url_prefix[LLAMA_MAX_URL_LENGTH] = {0};
// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
return false;
}
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
return false;
}
}
std::vector<std::pair<std::string, std::string>> urls;
for (int idx = 1; idx < n_split; idx++) {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
char split_url[LLAMA_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
if (std::string(split_path) == model.path) {
continue; // skip the already downloaded file
}
urls.push_back({split_url, split_path});
}
// Download in parallel
common_download_file_multiple(urls, bearer_token, offline, headers);
}
return true;
return filename.find("mmproj") == std::string::npos &&
filename.find("imatrix") == std::string::npos;
}
common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
const std::string & bearer_token,
bool offline,
const common_header_list & custom_headers) {
// the returned hf_repo is without tag
auto [hf_repo, tag] = common_download_split_repo_tag(hf_repo_with_tag);
static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
const std::string & tag) {
std::vector<std::string> tags;
std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
// headers
common_header_list headers = custom_headers;
headers.push_back({"Accept", "application/json"});
if (!bearer_token.empty()) {
headers.push_back({"Authorization", "Bearer " + bearer_token});
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
// User-Agent header is already set in common_remote_get_content, no need to set it here
// make the request
common_remote_params params;
params.headers = headers;
long res_code = 0;
std::string res_str;
bool use_cache = false;
std::string cached_response_path = get_manifest_path(hf_repo, tag);
if (!offline) {
try {
auto res = common_remote_get_content(url, params);
res_code = res.first;
res_str = std::string(res.second.data(), res.second.size());
} catch (const std::exception & e) {
LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
}
}
if (res_code == 0) {
if (std::filesystem::exists(cached_response_path)) {
LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
res_str = read_file(cached_response_path);
res_code = 200;
use_cache = true;
} else {
throw std::runtime_error(
offline ? "error: failed to get manifest (offline mode)"
: "error: failed to get manifest (check your internet connection)");
}
}
std::string ggufFile;
std::string mmprojFile;
if (res_code == 200 || res_code == 304) {
try {
auto j = json::parse(res_str);
if (j.contains("ggufFile") && j["ggufFile"].contains("rfilename")) {
ggufFile = j["ggufFile"]["rfilename"].get<std::string>();
}
if (j.contains("mmprojFile") && j["mmprojFile"].contains("rfilename")) {
mmprojFile = j["mmprojFile"]["rfilename"].get<std::string>();
}
} catch (const std::exception & e) {
throw std::runtime_error(std::string("error parsing manifest JSON: ") + e.what());
}
if (!use_cache) {
// if not using cached response, update the cache file
write_file(cached_response_path, res_str);
}
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
if (!tag.empty()) {
tags.push_back(tag);
} else {
throw std::runtime_error(string_format("error from HF API (%s), response code: %ld, data: %s", url.c_str(), res_code, res_str.c_str()));
tags = {"Q4_K_M", "Q4_0"};
}
// check response
if (ggufFile.empty()) {
throw std::runtime_error("error: model does not have ggufFile");
for (const auto & t : tags) {
std::regex pattern(t + "[.-]", std::regex::icase);
for (const auto & f : files) {
if (gguf_filename_is_model(f.path) &&
std::regex_search(f.path, pattern)) {
return f;
}
}
}
return { hf_repo, ggufFile, mmprojFile };
for (const auto & f : files) {
if (gguf_filename_is_model(f.path)) {
return f;
}
}
return {};
}
static void list_available_gguf_files(const hf_cache::hf_files & files) {
LOG_INF("Available GGUF files:\n");
for (const auto & f : files) {
if (string_ends_with(f.path, ".gguf")) {
LOG_INF(" - %s\n", f.path.c_str());
}
}
}
struct hf_plan {
hf_cache::hf_files model_files;
hf_cache::hf_file mmproj;
};
static hf_plan get_hf_plan(const common_params_model & model,
const std::string & token,
const common_download_model_opts & opts) {
hf_plan plan;
hf_cache::hf_files all;
auto [repo, tag] = common_download_split_repo_tag(model.hf_repo);
if (!opts.offline) {
all = hf_cache::get_repo_files(repo, token);
}
if (all.empty()) {
all = hf_cache::get_cached_files(repo);
}
if (all.empty()) {
return plan;
}
hf_cache::hf_file primary;
if (!model.hf_file.empty()) {
for (const auto & f : all) {
if (f.path == model.hf_file) {
primary = f;
break;
}
}
if (primary.path.empty()) {
LOG_ERR("%s: file '%s' not found in repository\n", __func__, model.hf_file.c_str());
list_available_gguf_files(all);
return plan;
}
} else {
primary = find_best_model(all, tag);
if (primary.path.empty()) {
LOG_ERR("%s: no GGUF files found in repository %s\n", __func__, repo.c_str());
list_available_gguf_files(all);
return plan;
}
}
plan.model_files = get_split_files(all, primary);
if (opts.download_mmproj) {
plan.mmproj = find_best_mmproj(all, primary.path);
}
return plan;
}
struct download_task {
std::string url;
std::string path;
};
static std::vector<download_task> get_url_tasks(const common_params_model & model) {
auto split = get_gguf_split_info(model.url);
if (split.count <= 1) {
return {{model.url, model.path}};
}
auto filename = split.prefix;
if (auto pos = split.prefix.rfind('/'); pos != std::string::npos) {
filename = split.prefix.substr(pos + 1);
}
auto parent_path = std::filesystem::path(model.path).parent_path();
auto prefix_path = (parent_path / filename).string();
std::vector<download_task> tasks;
for (int i = 1; i <= split.count; i++) {
auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
tasks.push_back({split.prefix + suffix, prefix_path + suffix});
}
return tasks;
}
common_download_model_result common_download_model(const common_params_model & model,
const std::string & bearer_token,
const common_download_model_opts & opts,
const common_header_list & headers) {
common_download_model_result result;
std::vector<download_task> tasks;
hf_plan hf;
bool is_hf = !model.hf_repo.empty();
if (is_hf) {
hf = get_hf_plan(model, bearer_token, opts);
for (const auto & f : hf.model_files) {
tasks.push_back({f.url, f.local_path});
}
if (!hf.mmproj.path.empty()) {
tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
}
} else if (!model.url.empty()) {
tasks = get_url_tasks(model);
} else {
result.model_path = model.path;
return result;
}
if (tasks.empty()) {
return result;
}
std::vector<std::future<bool>> futures;
for (const auto & task : tasks) {
futures.push_back(std::async(std::launch::async,
[&task, &bearer_token, offline = opts.offline, &headers, is_hf]() {
int status = common_download_file_single(task.url, task.path, bearer_token, offline, headers, is_hf);
return is_http_status_ok(status);
}
));
}
for (auto & f : futures) {
if (!f.get()) {
return {};
}
}
if (is_hf) {
for (const auto & f : hf.model_files) {
hf_cache::finalize_file(f);
}
result.model_path = hf.model_files[0].final_path;
if (!hf.mmproj.path.empty()) {
result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
}
} else {
result.model_path = model.path;
}
return result;
}
//
@ -765,28 +880,21 @@ std::string common_docker_resolve_model(const std::string & docker) {
}
std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;
const std::string cache_dir = fs_get_cache_directory();
const std::vector<common_file_info> files = fs_list(cache_dir, false);
for (const auto & file : files) {
if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
common_cached_model_info model_info;
model_info.manifest_path = file.path;
std::string fname = file.name;
string_replace_all(fname, ".json", ""); // remove extension
auto parts = string_split<std::string>(fname, '=');
if (parts.size() == 4) {
// expect format: manifest=<user>=<model>=<tag>=<other>
model_info.user = parts[1];
model_info.model = parts[2];
model_info.tag = parts[3];
} else {
// invalid format
continue;
}
model_info.size = 0; // TODO: get GGUF size, not manifest size
models.push_back(model_info);
std::unordered_set<std::string> seen;
std::vector<common_cached_model_info> result;
auto files = hf_cache::get_cached_files();
for (const auto & f : files) {
auto split = get_gguf_split_info(f.path);
if (split.index != 1 || split.tag.empty() ||
split.prefix.find("mmproj") != std::string::npos) {
continue;
}
if (seen.insert(f.repo_id + ":" + split.tag).second) {
result.push_back({f.repo_id, split.tag});
}
}
return models;
return result;
}

View File

@ -17,54 +17,60 @@ struct common_remote_params {
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
// split HF repo with tag into <repo, tag>
// for example: "user/model:tag" -> <"user/model", "tag">
// if tag is not present, default to "latest"
// example: "user/model" -> <"user/model", "latest">
// split HF repo with tag into <repo, tag>, for example:
// - "ggml-org/models:F16" -> <"ggml-org/models", "F16">
// tag is optional and can be empty
std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag);
// Result of common_list_cached_models
struct common_cached_model_info {
std::string manifest_path;
std::string user;
std::string model;
std::string repo;
std::string tag;
size_t size = 0; // GGUF size in bytes
// return string representation like "user/model:tag"
// if tag is "latest", it will be omitted
std::string to_string() const {
return user + "/" + model + (tag == "latest" ? "" : ":" + tag);
return repo + ":" + tag;
}
};
struct common_hf_file_res {
std::string repo; // repo name with ":tag" removed
std::string ggufFile;
std::string mmprojFile;
// Options for common_download_model
struct common_download_model_opts {
bool download_mmproj = false;
bool offline = false;
};
/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
common_hf_file_res common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & bearer_token,
bool offline,
const common_header_list & headers = {}
);
// Result of common_download_model
struct common_download_model_result {
std::string model_path;
std::string mmproj_path;
};
// returns true if download succeeded
bool common_download_model(
// Download model from HuggingFace repo or URL
//
// input (via model struct):
// - model.hf_repo: HF repo with optional tag, see common_download_split_repo_tag
// - model.hf_file: specific file in the repo (requires hf_repo)
// - model.url: simple download (used if hf_repo is empty)
// - model.path: local file path
//
// tag matching (for HF repos without model.hf_file):
// - if tag is specified, searches for GGUF matching that quantization
// - if no tag, searches for Q4_K_M, then Q4_0, then first available GGUF
//
// split GGUF: multi-part files like "model-00001-of-00003.gguf" are automatically
// detected and all parts are downloaded
//
// caching:
// - HF repos: uses HuggingFace cache
// - URLs: uses ETag-based caching
//
// when opts.offline=true, no network requests are made
// when download_mmproj=true, searches for mmproj in same directory as model or any parent directory
// then with the closest quantization bits
//
// returns result with model_path and mmproj_path (empty on failure)
common_download_model_result common_download_model(
const common_params_model & model,
const std::string & bearer_token,
bool offline,
const common_download_model_opts & opts = {},
const common_header_list & headers = {}
);
@ -73,11 +79,13 @@ std::vector<common_cached_model_info> common_list_cached_models();
// download single file from url to local path
// returns status code or -1 on error
// skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
int common_download_file_single(const std::string & url,
const std::string & path,
const std::string & bearer_token,
bool offline,
const common_header_list & headers = {});
const common_header_list & headers = {},
bool skip_etag = false);
// resolve and download model from Docker registry
// return local path to downloaded model file

771
common/hf-cache.cpp Normal file
View File

@ -0,0 +1,771 @@
#include "hf-cache.h"
#include "common.h"
#include "log.h"
#include "http.h"
#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>
#include <filesystem>
#include <fstream>
#include <atomic>
#include <regex> // migration only
#include <string>
#include <string_view>
#include <stdexcept>
namespace nl = nlohmann;
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#define HOME_DIR "USERPROFILE"
#include <windows.h>
#else
#define HOME_DIR "HOME"
#include <unistd.h>
#include <pwd.h>
#endif
namespace hf_cache {
namespace fs = std::filesystem;
static fs::path get_cache_directory() {
static const fs::path cache = []() {
struct {
const char * var;
fs::path path;
} entries[] = {
{"LLAMA_CACHE", fs::path()},
{"HF_HUB_CACHE", fs::path()},
{"HUGGINGFACE_HUB_CACHE", fs::path()},
{"HF_HOME", fs::path("hub")},
{"XDG_CACHE_HOME", fs::path("huggingface") / "hub"},
{HOME_DIR, fs::path(".cache") / "huggingface" / "hub"}
};
for (const auto & entry : entries) {
if (auto * p = std::getenv(entry.var); p && *p) {
fs::path base(p);
return entry.path.empty() ? base : base / entry.path;
}
}
#ifndef _WIN32
const struct passwd * pw = getpwuid(getuid());
if (pw->pw_dir && *pw->pw_dir) {
return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
}
#endif
throw std::runtime_error("Failed to determine HF cache directory");
}();
return cache;
}
static std::string folder_name_to_repo(const std::string & folder) {
constexpr std::string_view prefix = "models--";
if (folder.rfind(prefix, 0)) {
return {};
}
std::string result = folder.substr(prefix.length());
string_replace_all(result, "--", "/");
return result;
}
static std::string repo_to_folder_name(const std::string & repo_id) {
constexpr std::string_view prefix = "models--";
std::string result = std::string(prefix) + repo_id;
string_replace_all(result, "/", "--");
return result;
}
static fs::path get_repo_path(const std::string & repo_id) {
return get_cache_directory() / repo_to_folder_name(repo_id);
}
static bool is_hex_char(const char c) {
return (c >= 'A' && c <= 'F') ||
(c >= 'a' && c <= 'f') ||
(c >= '0' && c <= '9');
}
static bool is_hex_string(const std::string & s, size_t expected_len) {
if (s.length() != expected_len) {
return false;
}
for (const char c : s) {
if (!is_hex_char(c)) {
return false;
}
}
return true;
}
static bool is_alphanum(const char c) {
return (c >= 'A' && c <= 'Z') ||
(c >= 'a' && c <= 'z') ||
(c >= '0' && c <= '9');
}
static bool is_special_char(char c) {
return c == '/' || c == '.' || c == '-';
}
// base chars [A-Za-z0-9_] are always valid
// special chars [/.-] must be surrounded by base chars
// exactly one '/' required
static bool is_valid_repo_id(const std::string & repo_id) {
if (repo_id.empty() || repo_id.length() > 256) {
return false;
}
int slash = 0;
bool special = true;
for (const char c : repo_id) {
if (is_alphanum(c) || c == '_') {
special = false;
} else if (is_special_char(c)) {
if (special) {
return false;
}
slash += (c == '/');
special = true;
} else {
return false;
}
}
return !special && slash == 1;
}
static bool is_valid_hf_token(const std::string & token) {
if (token.length() < 37 || token.length() > 256 ||
!string_starts_with(token, "hf_")) {
return false;
}
for (size_t i = 3; i < token.length(); ++i) {
if (!is_alphanum(token[i])) {
return false;
}
}
return true;
}
static bool is_valid_commit(const std::string & hash) {
return is_hex_string(hash, 40);
}
static bool is_valid_oid(const std::string & oid) {
return is_hex_string(oid, 40) || is_hex_string(oid, 64);
}
static bool is_valid_subpath(const fs::path & path, const fs::path & subpath) {
if (subpath.is_absolute()) {
return false; // never do a / b with b absolute
}
auto b = fs::absolute(path).lexically_normal();
auto t = (b / subpath).lexically_normal();
auto [b_end, _] = std::mismatch(b.begin(), b.end(), t.begin(), t.end());
return b_end == b.end();
}
static void safe_write_file(const fs::path & path, const std::string & data) {
fs::path path_tmp = path.string() + ".tmp";
if (path.has_parent_path()) {
fs::create_directories(path.parent_path());
}
std::ofstream file(path_tmp);
file << data;
file.close();
std::error_code ec;
if (!file.fail()) {
fs::rename(path_tmp, path, ec);
}
if (file.fail() || ec) {
fs::remove(path_tmp, ec);
throw std::runtime_error("failed to write file: " + path.string());
}
}
static nl::json api_get(const std::string & url,
const std::string & token) {
auto [cli, parts] = common_http_client(url);
httplib::Headers headers = {
{"User-Agent", "llama-cpp/" + build_info},
{"Accept", "application/json"}
};
if (is_valid_hf_token(token)) {
headers.emplace("Authorization", "Bearer " + token);
} else if (!token.empty()) {
LOG_WRN("%s: invalid token, authentication disabled\n", __func__);
}
if (auto res = cli.Get(parts.path, headers)) {
auto body = res->body;
if (res->status == 200) {
return nl::json::parse(res->body);
}
try {
body = nl::json::parse(res->body)["error"].get<std::string>();
} catch (...) { }
throw std::runtime_error("GET failed (" + std::to_string(res->status) + "): " + body);
} else {
throw std::runtime_error("HTTPLIB failed: " + httplib::to_string(res.error()));
}
}
static std::string get_repo_commit(const std::string & repo_id,
const std::string & token) {
try {
auto endpoint = get_model_endpoint();
auto json = api_get(endpoint + "api/models/" + repo_id + "/refs", token);
if (!json.is_object() ||
!json.contains("branches") || !json["branches"].is_array()) {
LOG_WRN("%s: missing 'branches' for '%s'\n", __func__, repo_id.c_str());
return {};
}
fs::path refs_path = get_repo_path(repo_id) / "refs";
std::string name;
std::string commit;
for (const auto & branch : json["branches"]) {
if (!branch.is_object() ||
!branch.contains("name") || !branch["name"].is_string() ||
!branch.contains("targetCommit") || !branch["targetCommit"].is_string()) {
continue;
}
std::string _name = branch["name"].get<std::string>();
std::string _commit = branch["targetCommit"].get<std::string>();
if (!is_valid_subpath(refs_path, _name)) {
LOG_WRN("%s: skip invalid branch: %s\n", __func__, _name.c_str());
continue;
}
if (!is_valid_commit(_commit)) {
LOG_WRN("%s: skip invalid commit: %s\n", __func__, _commit.c_str());
continue;
}
if (_name == "main") {
name = _name;
commit = _commit;
break;
}
if (name.empty() || commit.empty()) {
name = _name;
commit = _commit;
}
}
if (name.empty() || commit.empty()) {
LOG_WRN("%s: no valid branch for '%s'\n", __func__, repo_id.c_str());
return {};
}
safe_write_file(refs_path / name, commit);
return commit;
} catch (const nl::json::exception & e) {
LOG_ERR("%s: JSON error: %s\n", __func__, e.what());
} catch (const std::exception & e) {
LOG_ERR("%s: error: %s\n", __func__, e.what());
}
return {};
}
hf_files get_repo_files(const std::string & repo_id,
const std::string & token) {
if (!is_valid_repo_id(repo_id)) {
LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
return {};
}
std::string commit = get_repo_commit(repo_id, token);
if (commit.empty()) {
LOG_WRN("%s: failed to resolve commit for %s\n", __func__, repo_id.c_str());
return {};
}
fs::path blobs_path = get_repo_path(repo_id) / "blobs";
fs::path commit_path = get_repo_path(repo_id) / "snapshots" / commit;
hf_files files;
try {
auto endpoint = get_model_endpoint();
auto json = api_get(endpoint + "api/models/" + repo_id + "/tree/" + commit + "?recursive=true", token);
if (!json.is_array()) {
LOG_WRN("%s: response is not an array for '%s'\n", __func__, repo_id.c_str());
return {};
}
for (const auto & item : json) {
if (!item.is_object() ||
!item.contains("type") || !item["type"].is_string() || item["type"] != "file" ||
!item.contains("path") || !item["path"].is_string()) {
continue;
}
hf_file file;
file.repo_id = repo_id;
file.path = item["path"].get<std::string>();
if (!is_valid_subpath(commit_path, file.path)) {
LOG_WRN("%s: skip invalid path: %s\n", __func__, file.path.c_str());
continue;
}
if (item.contains("lfs") && item["lfs"].is_object()) {
if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) {
file.oid = item["lfs"]["oid"].get<std::string>();
}
if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) {
file.size = item["lfs"]["size"].get<size_t>();
}
} else if (item.contains("oid") && item["oid"].is_string()) {
file.oid = item["oid"].get<std::string>();
}
if (file.size == 0 && item.contains("size") && item["size"].is_number()) {
file.size = item["size"].get<size_t>();
}
if (!file.oid.empty() && !is_valid_oid(file.oid)) {
LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str());
continue;
}
file.url = endpoint + repo_id + "/resolve/" + commit + "/" + file.path;
fs::path final_path = commit_path / file.path;
file.final_path = final_path.string();
if (!file.oid.empty() && !fs::exists(final_path)) {
fs::path local_path = blobs_path / file.oid;
file.local_path = local_path.string();
} else {
file.local_path = file.final_path;
}
files.push_back(file);
}
} catch (const nl::json::exception & e) {
LOG_ERR("%s: JSON error: %s\n", __func__, e.what());
} catch (const std::exception & e) {
LOG_ERR("%s: error: %s\n", __func__, e.what());
}
return files;
}
static std::string get_cached_ref(const fs::path & repo_path) {
fs::path refs_path = repo_path / "refs";
if (!fs::is_directory(refs_path)) {
return {};
}
std::string fallback;
for (const auto & entry : fs::directory_iterator(refs_path)) {
if (!entry.is_regular_file()) {
continue;
}
std::ifstream f(entry.path());
std::string commit;
if (!f || !std::getline(f, commit) || commit.empty()) {
continue;
}
if (!is_valid_commit(commit)) {
LOG_WRN("%s: skip invalid commit: %s\n", __func__, commit.c_str());
continue;
}
if (entry.path().filename() == "main") {
return commit;
}
if (fallback.empty()) {
fallback = commit;
}
}
return fallback;
}
hf_files get_cached_files(const std::string & repo_id) {
fs::path cache_dir = get_cache_directory();
if (!fs::exists(cache_dir)) {
return {};
}
if (!repo_id.empty() && !is_valid_repo_id(repo_id)) {
LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
return {};
}
hf_files files;
for (const auto & repo : fs::directory_iterator(cache_dir)) {
if (!repo.is_directory()) {
continue;
}
fs::path snapshots_path = repo.path() / "snapshots";
if (!fs::exists(snapshots_path)) {
continue;
}
std::string _repo_id = folder_name_to_repo(repo.path().filename().string());
if (!is_valid_repo_id(_repo_id)) {
continue;
}
if (!repo_id.empty() && _repo_id != repo_id) {
continue;
}
std::string commit = get_cached_ref(repo.path());
fs::path commit_path = snapshots_path / commit;
if (commit.empty() || !fs::is_directory(commit_path)) {
continue;
}
for (const auto & entry : fs::recursive_directory_iterator(commit_path)) {
if (!entry.is_regular_file() && !entry.is_symlink()) {
continue;
}
fs::path path = entry.path().lexically_relative(commit_path);
if (!path.empty()) {
hf_file file;
file.repo_id = _repo_id;
file.path = path.generic_string();
file.local_path = entry.path().string();
file.final_path = file.local_path;
files.push_back(std::move(file));
}
}
}
return files;
}
std::string finalize_file(const hf_file & file) {
static std::atomic<bool> symlinks_disabled{false};
std::error_code ec;
fs::path local_path(file.local_path);
fs::path final_path(file.final_path);
if (local_path == final_path || fs::exists(final_path, ec)) {
return file.final_path;
}
if (!fs::exists(local_path, ec)) {
return file.final_path;
}
fs::create_directories(final_path.parent_path(), ec);
if (!symlinks_disabled) {
fs::path target = fs::relative(local_path, final_path.parent_path(), ec);
if (!ec) {
fs::create_symlink(target, final_path, ec);
}
if (!ec) {
return file.final_path;
}
}
if (!symlinks_disabled.exchange(true)) {
LOG_WRN("%s: failed to create symlink: %s\n", __func__, ec.message().c_str());
LOG_WRN("%s: switching to degraded mode\n", __func__);
}
fs::rename(local_path, final_path, ec);
if (ec) {
LOG_WRN("%s: failed to move file to snapshots: %s\n", __func__, ec.message().c_str());
fs::copy(local_path, final_path, ec);
if (ec) {
LOG_ERR("%s: failed to copy file to snapshots: %s\n", __func__, ec.message().c_str());
}
}
return file.final_path;
}
// delete everything after this line, one day
// copied from download.cpp without the tag part
struct gguf_split_info {
std::string prefix; // tag included
int index;
int count;
};
static gguf_split_info get_gguf_split_info(const std::string & path) {
static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);
std::smatch m;
std::string prefix = path;
if (!string_remove_suffix(prefix, ".gguf")) {
return {};
}
int index = 1;
int count = 1;
if (std::regex_match(prefix, m, re_split)) {
index = std::stoi(m[2].str());
count = std::stoi(m[3].str());
prefix = m[1].str();
}
return {std::move(prefix), index, count};
}
static std::pair<std::string, std::string> parse_manifest_name(std::string & filename) {
static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)");
std::smatch match;
if (std::regex_match(filename, match, re)) {
return {match[1].str(), match[2].str()};
}
return {};
}
static std::string make_old_cache_filename(const std::string & owner,
const std::string & repo,
const std::string & filename) {
auto result = owner + "_" + repo + "_" + filename;
string_replace_all(result, "/", "_");
return result;
}
struct migrate_file {
std::string path;
std::string sha256;
size_t size;
fs::path old_path;
fs::path etag_path;
const hf_file * file;
};
using migrate_files = std::vector<migrate_file>;
static bool collect_file(const fs::path & old_cache,
const std::string & owner,
const std::string & repo,
const std::string & path,
const std::string & sha256,
const hf_files & files,
migrate_files & to_migrate) {
const hf_file * file = nullptr;
for (const auto & f : files) {
if (f.path == path) {
file = &f;
break;
}
}
std::string old_filename = make_old_cache_filename(owner, repo, path);
fs::path old_path = old_cache / old_filename;
fs::path etag_path = old_path.string() + ".etag";
if (!fs::exists(old_path)) {
if (file && fs::exists(file->final_path)) {
return true;
}
LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str());
return false;
}
if (!file) {
LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str());
return false;
}
if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) {
LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str());
return false;
}
if (file->size > 0) {
size_t size = fs::file_size(old_path);
if (size != file->size) {
LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size);
return false;
}
}
to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file});
return true;
}
static bool collect_files(const fs::path & old_cache,
const std::string & owner,
const std::string & repo,
const nl::json & node,
const hf_files & files,
migrate_files & to_migrate) {
if (!node.contains("rfilename") ||
!node.contains("lfs") ||
!node["lfs"].contains("sha256")) {
return true;
}
std::string path = node["rfilename"];
std::string sha256 = node["lfs"]["sha256"];
auto split = get_gguf_split_info(path);
if (split.count <= 1) {
return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate);
}
std::vector<std::pair<std::string, std::string>> splits;
for (const auto & f : files) {
auto split_f = get_gguf_split_info(f.path);
if (split_f.count == split.count && split_f.prefix == split.prefix) {
// sadly the manifest only provides the sha256 of the first file (index == 1)
// the rest will be verified using the size...
std::string f_sha256 = (split_f.index == 1) ? sha256 : "";
splits.emplace_back(f.path, f_sha256);
}
}
if ((int)splits.size() != split.count) {
LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size());
return false;
}
for (const auto & [f_path, f_sha256] : splits) {
if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) {
return false;
}
}
return true;
}
static bool migrate_file(const migrate_file & file) {
std::error_code ec;
fs::path new_path(file.file->local_path);
fs::create_directories(new_path.parent_path(), ec);
if (!fs::exists(new_path, ec)) {
fs::rename(file.old_path, new_path, ec);
if (ec) {
fs::copy_file(file.old_path, new_path, ec);
if (ec) {
LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str());
return false;
}
}
fs::remove(file.old_path, ec);
}
fs::remove(file.etag_path, ec);
std::string filename = finalize_file(*file.file);
LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str());
return true;
}
void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) {
fs::path old_cache = fs_get_cache_directory();
if (!fs::exists(old_cache)) {
return;
}
if (offline) {
LOG_WRN("%s: skipping migration in offline mode (will run when online)\n", __func__);
return; // -hf is not going to work
}
bool warned = false;
for (const auto & entry : fs::directory_iterator(old_cache)) {
if (!entry.is_regular_file()) {
continue;
}
auto filename = entry.path().filename().string();
auto [owner, repo] = parse_manifest_name(filename);
if (owner.empty() || repo.empty()) {
continue;
}
if (!warned) {
warned = true;
LOG_WRN("================================================================================\n"
"WARNING: Migrating cache to HuggingFace cache directory\n"
" Old cache: %s\n"
" New cache: %s\n"
"This one-time migration moves models previously downloaded with -hf\n"
"from the legacy llama.cpp cache to the standard HuggingFace cache.\n"
"Models downloaded with --model-url are not affected.\n"
"================================================================================\n",
old_cache.string().c_str(), get_cache_directory().string().c_str());
}
auto repo_id = owner + "/" + repo;
auto files = get_repo_files(repo_id, token);
if (files.empty()) {
LOG_WRN("%s: could not get repo files for %s, skipping\n", __func__, repo_id.c_str());
continue;
}
migrate_files to_migrate;
bool ok = true;
try {
std::ifstream manifest(entry.path());
auto json = nl::json::parse(manifest);
for (const char * key : {"ggufFile", "mmprojFile"}) {
if (json.contains(key)) {
if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) {
ok = false;
break;
}
}
}
} catch (const std::exception & e) {
LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what());
continue;
}
if (!ok) {
LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__);
continue;
}
for (const auto & file : to_migrate) {
if (!migrate_file(file)) {
ok = false;
break;
}
}
if (!ok) {
LOG_WRN("%s: migration failed: could not migrate all files\n", __func__);
continue;
}
LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str());
fs::remove(entry.path());
}
}
} // namespace hf_cache

36
common/hf-cache.h Normal file
View File

@ -0,0 +1,36 @@
#pragma once
#include <string>
#include <vector>
// Ref: https://huggingface.co/docs/hub/local-cache.md
namespace hf_cache {
struct hf_file {
std::string path;
std::string url;
std::string local_path;
std::string final_path;
std::string oid;
std::string repo_id;
size_t size = 0; // only for the migration
};
using hf_files = std::vector<hf_file>;
// Get files from HF API
hf_files get_repo_files(
const std::string & repo_id,
const std::string & token
);
hf_files get_cached_files(const std::string & repo_id = {});
// Create snapshot path (link or move/copy) and return it
std::string finalize_file(const hf_file & file);
// TODO: Remove later
void migrate_old_cache_to_hf_cache(const std::string & token, bool offline = false);
} // namespace hf_cache

View File

@ -53,6 +53,13 @@ private:
return tokens[current + offset];
}
const token & next() {
if (current >= tokens.size()) {
throw parser_exception("Parser Error: Unexpected EOF", source, tokens.empty() ? 0 : tokens.back().pos);
}
return tokens[current++];
}
token expect(token::type type, const std::string& error) {
const auto & t = peek();
if (t.t != type) {
@ -90,9 +97,9 @@ private:
size_t start_pos = current;
switch (peek().t) {
case token::comment:
return mk_stmt<comment_statement>(start_pos, tokens[current++].value);
return mk_stmt<comment_statement>(start_pos, next().value);
case token::text:
return mk_stmt<string_literal>(start_pos, tokens[current++].value);
return mk_stmt<string_literal>(start_pos, next().value);
case token::open_statement:
return parse_jinja_statement();
case token::open_expression:
@ -119,8 +126,7 @@ private:
}
size_t start_pos = current;
std::string name = peek().value;
current++; // consume identifier
std::string name = next().value;
statement_ptr result;
if (name == "set") {
@ -202,7 +208,7 @@ private:
// Ignore generation blocks (transformers-specific)
// See https://github.com/huggingface/transformers/pull/30650 for more information.
result = mk_stmt<noop_statement>(start_pos);
current++;
++current;
} else {
throw std::runtime_error("Unknown statement: " + name);
@ -217,7 +223,7 @@ private:
statements body;
if (is(token::equals)) {
current++;
++current;
value = parse_expression_sequence();
} else {
// parsing multiline set here
@ -280,7 +286,7 @@ private:
exprs.push_back(primary ? parse_primary_expression() : parse_expression());
bool is_tuple = is(token::comma);
while (is(token::comma)) {
current++; // consume comma
++current; // consume comma
exprs.push_back(primary ? parse_primary_expression() : parse_expression());
}
return is_tuple ? mk_stmt<tuple_literal>(start_pos, std::move(exprs)) : std::move(exprs[0]);
@ -290,7 +296,7 @@ private:
// e.g., `message` in `for message in messages`
auto loop_var = parse_expression_sequence(true); // should be an identifier/tuple
if (!is_identifier("in")) throw std::runtime_error("Expected 'in'");
current++;
++current; // consume 'in'
// `messages` in `for message in messages`
auto iterable = parse_expression();
@ -305,7 +311,8 @@ private:
}
if (is_statement({"else"})) {
current += 2;
++current; // consume {%
++current; // consume 'else'
expect(token::close_statement, "Expected %}");
while (!is_statement({"endfor"})) {
alternate.push_back(parse_any());
@ -347,7 +354,7 @@ private:
auto left = parse_logical_and_expression();
while (is_identifier("or")) {
size_t start_pos = current;
token op = tokens[current++];
token op = next();
left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_logical_and_expression());
}
return left;
@ -357,7 +364,7 @@ private:
auto left = parse_logical_negation_expression();
while (is_identifier("and")) {
size_t start_pos = current;
auto op = tokens[current++];
auto op = next();
left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_logical_negation_expression());
}
return left;
@ -367,7 +374,7 @@ private:
// Try parse unary operators
if (is_identifier("not")) {
size_t start_pos = current;
auto op = tokens[current++];
auto op = next();
return mk_stmt<unary_expression>(start_pos, op, parse_logical_negation_expression());
}
return parse_comparison_expression();
@ -382,11 +389,12 @@ private:
size_t start_pos = current;
if (is_identifier("not") && peek(1).t == token::identifier && peek(1).value == "in") {
op = {token::identifier, "not in", tokens[current].pos};
current += 2;
++current; // consume 'not'
++current; // consume 'in'
} else if (is_identifier("in")) {
op = tokens[current++];
op = next();
} else if (is(token::comparison_binary_operator)) {
op = tokens[current++];
op = next();
} else break;
left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_additive_expression());
}
@ -397,7 +405,7 @@ private:
auto left = parse_multiplicative_expression();
while (is(token::additive_binary_operator)) {
size_t start_pos = current;
auto op = tokens[current++];
auto op = next();
left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_multiplicative_expression());
}
return left;
@ -407,7 +415,7 @@ private:
auto left = parse_test_expression();
while (is(token::multiplicative_binary_operator)) {
size_t start_pos = current;
auto op = tokens[current++];
auto op = next();
left = mk_stmt<binary_expression>(start_pos, op, std::move(left), parse_test_expression());
}
return left;
@ -417,9 +425,9 @@ private:
auto operand = parse_filter_expression();
while (is_identifier("is")) {
size_t start_pos = current;
current++;
++current; // consume 'is'
bool negate = false;
if (is_identifier("not")) { current++; negate = true; }
if (is_identifier("not")) { ++current; negate = true; }
auto test_id = parse_primary_expression();
// FIXME: tests can also be expressed like this: if x is eq 3
if (is(token::open_paren)) test_id = parse_call_expression(std::move(test_id));
@ -432,7 +440,7 @@ private:
auto operand = parse_call_member_expression();
while (is(token::pipe)) {
size_t start_pos = current;
current++;
++current; // consume pipe
auto filter = parse_primary_expression();
if (is(token::open_paren)) filter = parse_call_expression(std::move(filter));
operand = mk_stmt<filter_expression>(start_pos, std::move(operand), std::move(filter));
@ -490,7 +498,7 @@ private:
statement_ptr parse_member_expression(statement_ptr object) {
size_t start_pos = current;
while (is(token::dot) || is(token::open_square_bracket)) {
auto op = tokens[current++];
auto op = next();
bool computed = op.t == token::open_square_bracket;
statement_ptr prop;
if (computed) {
@ -531,12 +539,15 @@ private:
statement_ptr step = slices.size() > 2 ? std::move(slices[2]) : nullptr;
return mk_stmt<slice_expression>(start_pos, std::move(start), std::move(stop), std::move(step));
}
if (slices.empty()) {
return mk_stmt<blank_expression>(start_pos);
}
return std::move(slices[0]);
}
statement_ptr parse_primary_expression() {
size_t start_pos = current;
auto t = tokens[current++];
auto t = next();
switch (t.t) {
case token::numeric_literal:
if (t.value.find('.') != std::string::npos) {
@ -547,7 +558,7 @@ private:
case token::string_literal: {
std::string val = t.value;
while (is(token::string_literal)) {
val += tokens[current++].value;
val += next().value;
}
return mk_stmt<string_literal>(start_pos, val);
}
@ -562,9 +573,9 @@ private:
statements vals;
while (!is(token::close_square_bracket)) {
vals.push_back(parse_expression());
if (is(token::comma)) current++;
if (is(token::comma)) ++current;
}
current++;
++current;
return mk_stmt<array_literal>(start_pos, std::move(vals));
}
case token::open_curly_bracket: {
@ -573,9 +584,9 @@ private:
auto key = parse_expression();
expect(token::colon, "Expected :");
pairs.push_back({std::move(key), parse_expression()});
if (is(token::comma)) current++;
if (is(token::comma)) ++current;
}
current++;
++current;
return mk_stmt<object_literal>(start_pos, std::move(pairs));
}
default:

View File

@ -667,8 +667,9 @@ value macro_statement::execute_impl(context & ctx) {
if (is_stmt<identifier>(this->args[i])) {
// normal parameter
std::string param_name = cast_stmt<identifier>(this->args[i])->val;
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), args.get_pos(i)->type().c_str());
macro_ctx.set_val(param_name, args.get_pos(i));
value param_value = args.get_kwarg_or_pos(param_name, i);
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
macro_ctx.set_val(param_name, param_value);
} else if (is_stmt<keyword_argument_expression>(this->args[i])) {
// default argument used as normal parameter
auto kwarg = cast_stmt<keyword_argument_expression>(this->args[i]);
@ -676,8 +677,9 @@ value macro_statement::execute_impl(context & ctx) {
throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
}
std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), args.get_pos(i)->type().c_str());
macro_ctx.set_val(param_name, args.get_pos(i));
value param_value = args.get_kwarg_or_pos(param_name, i);
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
macro_ctx.set_val(param_name, param_value);
} else {
throw std::runtime_error("Invalid parameter type in macro '" + name + "'");
}
@ -769,10 +771,15 @@ value member_expression::execute_impl(context & ctx) {
}
JJ_DEBUG("Member expression on object type %s, property type %s", object->type().c_str(), property->type().c_str());
ensure_key_type_allowed(property);
value val = mk_val<value_undefined>("object_property");
if (property->is_undefined()) {
JJ_DEBUG("%s", "Member expression property is undefined, returning undefined");
return val;
}
ensure_key_type_allowed(property);
if (is_val<value_undefined>(object)) {
JJ_DEBUG("%s", "Accessing property on undefined object, returning undefined");
return val;

View File

@ -263,6 +263,14 @@ struct comment_statement : public statement {
// Expressions
// Represents an omitted expression in a computed member, e.g. `a[]`.
struct blank_expression : public expression {
std::string type() const override { return "BlankExpression"; }
value execute_impl(context &) override {
return mk_val<value_undefined>();
}
};
struct member_expression : public expression {
statement_ptr object;
statement_ptr property;

View File

@ -416,15 +416,30 @@ private:
i++;
} else if (c == '(') {
i++;
if (i < length) {
if (sub_pattern[i] == '?') {
if (i < length && sub_pattern[i] == '?') {
if (i + 1 < length && sub_pattern[i + 1] == ':') {
i += 2; // skip "?:" for non-capturing group, treat as regular group
} else {
// lookahead/lookbehind (?=, ?!, ?<=, ?<!) - not supported
_warnings.push_back("Unsupported pattern syntax");
// skip to matching ')' to avoid UB on empty seq
int depth = 1;
while (i < length && depth > 0) {
if (sub_pattern[i] == '\\' && i + 1 < length) {
i += 2; // skip escaped character
} else {
if (sub_pattern[i] == '(') depth++;
else if (sub_pattern[i] == ')') depth--;
i++;
}
}
continue;
}
}
seq.emplace_back("(" + to_rule(transform()) + ")", false);
} else if (c == ')') {
i++;
if (start > 0 && sub_pattern[start - 1] != '(') {
if (start > 0 && sub_pattern[start - 1] != '(' && (start < 2 || sub_pattern[start - 2] != '?' || sub_pattern[start - 1] != ':')) {
_errors.push_back("Unbalanced parentheses");
}
return join_seq();

View File

@ -51,7 +51,7 @@ struct common_ngram_map_value {
// statistics of a n-gram
struct common_ngram_map_key {
size_t key_idx; // index of key n-gram in token-history
size_t stat_idx; // index of last token of stastistics computation (key_num, values)
size_t stat_idx; // index of last token of statistics computation (key_num, values)
uint16_t key_num; // number of occurrences of this key n-gram in token-history
common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key

View File

@ -115,9 +115,11 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
break;
}
case REASONING_BUDGET_FORCING:
// force_pos is advanced in apply(), not here.
// This ensures the first forced token isn't skipped when the sampler
// is initialized directly in FORCING state (e.g. COUNTING + budget=0)
ctx->force_pos++;
if (ctx->force_pos >= ctx->forced_tokens.size()) {
ctx->state = REASONING_BUDGET_DONE;
LOG_INF("reasoning-budget: forced sequence complete, done\n");
}
break;
case REASONING_BUDGET_DONE:
break;
@ -144,14 +146,6 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
cur_p->data[i].logit = -INFINITY;
}
}
// advance to next forced token (done here rather than in accept so that
// the first forced token isn't skipped when starting in FORCING state)
ctx->force_pos++;
if (ctx->force_pos >= ctx->forced_tokens.size()) {
ctx->state = REASONING_BUDGET_DONE;
LOG_INF("reasoning-budget: forced sequence complete, done\n");
}
}
static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
@ -261,3 +255,10 @@ struct llama_sampler * common_reasoning_budget_init(
common_reasoning_budget_state initial_state) {
return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
}
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl) {
if (!smpl) {
return REASONING_BUDGET_IDLE;
}
return ((const common_reasoning_budget_ctx *)smpl->ctx)->state;
}

View File

@ -51,3 +51,5 @@ struct llama_sampler * common_reasoning_budget_init(
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state);
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);

View File

@ -7,6 +7,7 @@
#include <algorithm>
#include <cctype>
#include <climits>
#include <cmath>
#include <cstring>
#include <unordered_map>
@ -109,6 +110,7 @@ struct common_sampler {
common_params_sampling params;
struct llama_sampler * grmr;
struct llama_sampler * rbudget;
struct llama_sampler * chain;
ring_buffer<llama_token> prev;
@ -188,6 +190,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
lparams.no_perf = params.no_perf;
llama_sampler * grmr = nullptr;
llama_sampler * rbudget = nullptr;
llama_sampler * chain = llama_sampler_chain_init(lparams);
std::vector<llama_sampler *> samplers;
@ -270,7 +273,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}
if (grmr) {
if (grmr && !params.grammar_lazy) {
try {
for (const auto & token : prefill_tokens) {
llama_sampler_accept(grmr, token);
@ -284,15 +287,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}
// reasoning budget sampler — added first so it can force tokens before other samplers
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
samplers.push_back(common_reasoning_budget_init(
// reasoning budget sampler
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty()) {
rbudget = common_reasoning_budget_init(
vocab,
params.reasoning_budget_start,
params.reasoning_budget_end,
params.reasoning_budget_forced,
params.reasoning_budget_tokens,
prefill_tokens));
params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
prefill_tokens);
}
if (params.has_logit_bias()) {
@ -380,9 +383,16 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
params.backend_sampling = false;
}
if (rbudget && params.backend_sampling) {
LOG_WRN("%s: backend sampling is not compatible with reasoning budget, disabling\n", __func__);
params.backend_sampling = false;
}
auto * result = new common_sampler {
/* .params = */ params,
/* .grmr = */ grmr,
/* .rbudget = */ rbudget,
/* .chain = */ chain,
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
@ -398,11 +408,27 @@ void common_sampler_free(struct common_sampler * gsmpl) {
}
llama_sampler_free(gsmpl->grmr);
llama_sampler_free(gsmpl->rbudget);
llama_sampler_free(gsmpl->chain);
delete gsmpl;
}
static bool grammar_should_apply(struct common_sampler * gsmpl) {
if (!gsmpl->grmr) {
return false;
}
if (!gsmpl->rbudget) {
return true;
}
if (gsmpl->params.grammar_lazy) {
// if grammar is lazy, only apply when reasoning budget is not active
const auto state = common_reasoning_budget_get_state(gsmpl->rbudget);
return state == REASONING_BUDGET_IDLE || state == REASONING_BUDGET_DONE;
}
return true;
}
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
if (!gsmpl) {
return;
@ -410,6 +436,11 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
const auto tm = gsmpl->tm();
// grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
llama_sampler_accept(gsmpl->rbudget, token);
if (gsmpl->grmr && accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
@ -431,6 +462,7 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler {
/* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .rbudget = */ llama_sampler_clone(gsmpl->rbudget),
/* .chain = */ llama_sampler_clone(gsmpl->chain),
/* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur,
@ -500,6 +532,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
llama_token id = LLAMA_TOKEN_NULL;
auto & grmr = gsmpl->grmr;
auto & rbudget = gsmpl->rbudget;
auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
@ -511,7 +544,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
if (id != LLAMA_TOKEN_NULL) {
LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");
// TODO: simplify
gsmpl->cur.resize(1);
@ -524,7 +558,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
gsmpl->set_logits(ctx, idx);
if (grammar_first) {
// apply reasoning budget first
llama_sampler_apply(rbudget, &cur_p);
if (grammar_first && grammar_should_apply(gsmpl)) {
llama_sampler_apply(grmr, &cur_p);
}
@ -532,7 +569,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
id = cur_p.data[cur_p.selected].id;
if (grammar_first) {
if (grammar_first || !grammar_should_apply(gsmpl)) {
return id;
}
@ -553,7 +590,12 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx);
llama_sampler_apply(grmr, &cur_p);
llama_sampler_apply(rbudget, &cur_p);
if (grammar_should_apply(gsmpl)) {
llama_sampler_apply(grmr, &cur_p);
}
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

View File

@ -31,10 +31,10 @@ import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab
try:
from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import]
SentencePieceTokenizer,
)
@ -45,9 +45,9 @@ except ImportError:
_MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
_mistral_common_installed = False
TokenizerVersion = None
Tekkenizer = None
SentencePieceTokenizer = None
TokenizerVersion: Any = None
Tekkenizer: Any = None
SentencePieceTokenizer: Any = None
_mistral_import_error_msg = (
"Mistral format requires `mistral-common` to be installed. Please run "
"`pip install mistral-common[image,audio]` to install it."
@ -145,6 +145,7 @@ class ModelBase:
self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
self._is_nvfp4 = False
self._is_mxfp4 = False
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
@ -220,7 +221,7 @@ class ModelBase:
if weight_map is None or not isinstance(weight_map, dict):
raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
tensor_names_from_index.update(weight_map.keys())
part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None)
part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment]
part_names = sorted(part_dict.keys())
else:
weight_map = {}
@ -485,7 +486,7 @@ class ModelBase:
elif quant_method == "modelopt":
# Mixed-precision ModelOpt models: NVFP4 tensors are handled by
# _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and
# are dequantized here. input_scale tensors are unused.
# are dequantized here. k/v scale tensors are unused.
for name in self.model_tensors.keys():
if name.endswith(".weight_scale"):
weight_name = name.removesuffix("_scale")
@ -493,7 +494,7 @@ class ModelBase:
s = self.model_tensors[name]
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
tensors_to_remove.append(name)
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
if name.endswith((".k_scale", ".v_scale")):
tensors_to_remove.append(name)
elif quant_method is not None:
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@ -541,7 +542,6 @@ class ModelBase:
raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
new_name = self.map_tensor_name(name)
# Handle gate/up expert tensor fusion if enabled
@ -606,7 +606,12 @@ class ModelBase:
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
def _repack_nvfp4(self, new_name: str, weight: Tensor, scale: Tensor, scale2: Tensor):
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
if "language_model." in name:
name = name.replace("language_model.", "")
new_name = self.map_tensor_name(name)
raw, shape = self._nvfp4_pack(weight, scale)
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
@ -618,10 +623,18 @@ class ModelBase:
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
self.gguf_writer.add_tensor(scale_name, scale2_f32)
# Emit per-tensor input_scale as a separate F32 tensor when non-trivial
if not self._nvfp4_scale2_is_trivial(input_scale):
input_scale_f32 = input_scale.float().numpy().flatten()
input_scale_name = new_name.replace(".weight", ".input_scale")
logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
def _generate_nvfp4_tensors(self):
# Per-layer expert merging to avoid holding all experts in memory
expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {}
expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
expert_input_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
expert_shapes: dict[tuple[int, str], list[int]] = {}
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
consumed: list[str] = []
@ -631,6 +644,7 @@ class ModelBase:
continue
scale_name = name.replace(".weight", ".weight_scale")
scale2_name = name.replace(".weight", ".weight_scale_2")
input_scale_name = name.replace(".weight", ".input_scale")
if scale_name not in self.model_tensors:
continue
# Force eager materialization of lazy tensors
@ -642,11 +656,14 @@ class ModelBase:
continue
scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
input_scale = LazyTorchTensor.to_eager(self.model_tensors.get(input_scale_name, lambda: torch.tensor(1.0))())
# Mark tensors for removal from model_tensors (already written to gguf)
consumed.extend([name, scale_name])
if scale2_name in self.model_tensors:
consumed.append(scale2_name)
if input_scale_name in self.model_tensors:
consumed.append(input_scale_name)
# Check if this is a per-expert tensor
m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
@ -662,34 +679,37 @@ class ModelBase:
if key not in expert_blocks:
expert_blocks[key] = []
expert_scales[key] = []
expert_input_scales[key] = []
expert_shapes[key] = shape
expert_blocks[key].append((expert_id, raw.copy()))
# Collect per-expert scale2 (scalar per expert)
expert_scales[key].append((expert_id, float(scale2.float().sum())))
# Collect per-expert input_scale (scalar per expert)
expert_input_scales[key].append((expert_id, float(input_scale.float().sum())))
# Flush when all experts for this (layer, proj) are collected
if n_experts > 0 and len(expert_blocks[key]) >= n_experts:
self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_shapes, bid, proj_type)
self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)
else:
new_name = self.map_tensor_name(name)
self._repack_nvfp4(new_name, weight, scale, scale2)
self._repack_nvfp4(name, weight, scale, scale2, input_scale)
# Flush any remaining experts (fallback if n_experts was unknown)
for (bid, proj_type) in list(expert_blocks.keys()):
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)
# Remove consumed tensors so get_tensors/modify_tensors won't see them
for name in consumed:
self.model_tensors.pop(name, None)
# Remove unused auxiliary tensors (input_scale, k_scale, v_scale)
# Remove any remaining unused auxiliary tensors
for name in list(self.model_tensors.keys()):
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
if name.endswith((".k_scale", ".v_scale")):
del self.model_tensors[name]
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type):
experts = expert_blocks.pop(key)
scales = expert_scales.pop(key)
input_scales = expert_input_scales.pop(key)
shape = expert_shapes.pop(key)
experts.sort(key=lambda x: x[0])
@ -707,11 +727,20 @@ class ModelBase:
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
self.gguf_writer.add_tensor(scale_name, scale_vals)
# Emit per-expert input_scale tensor if any expert has non-trivial input_scale
input_scales.sort(key=lambda x: x[0])
input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
input_scale_name = new_name.replace(".weight", ".input_scale")
logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
del experts, merged
def prepare_tensors(self):
# detect NVFP4 quantization (ModelOpt format)
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method")
quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
quant_config_file = self.dir_model / "hf_quant_config.json"
@ -728,6 +757,7 @@ class ModelBase:
quant_algo = "NVFP4"
self._is_nvfp4 = quant_algo == "NVFP4"
self._is_mxfp4 = quant_method == "mxfp4"
# NVFP4 weights are repacked and written directly to gguf_writer.
# This must run before dequant_model so NVFP4 tensors are removed
@ -876,6 +906,12 @@ class ModelBase:
if self.metadata.name is None:
self.metadata.name = self.dir_model.name
if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16):
if self._is_nvfp4:
self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4
elif self._is_mxfp4:
self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE
# Generate parameter weight class (useful for leader boards) if not yet determined
if self.metadata.size_label is None and total_params > 0:
self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
@ -938,6 +974,9 @@ class ModelBase:
if "thinker_config" in config:
# rename for Qwen2.5-Omni
config["text_config"] = config["thinker_config"]["text_config"]
if "language_config" in config:
# rename for DeepSeekOCR
config["text_config"] = config["language_config"]
if "lfm" in config:
# rename for LFM2-Audio
config["text_config"] = config["lfm"]
@ -1299,6 +1338,9 @@ class TextModel(ModelBase):
if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
# ref: https://huggingface.co/aari1995/German_Semantic_V3
res = "jina-v2-de"
if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4":
# ref: https://huggingface.co/evilfreelancer/ruGPT3XL
res = "gpt-2"
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
@ -1494,6 +1536,9 @@ class TextModel(ModelBase):
if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869":
# ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601
res = "kanana2"
if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
# ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
res = "f2llmv2"
if res is None:
logger.warning("\n")
@ -2062,7 +2107,7 @@ class MmprojModel(ModelBase):
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers", "vt_num_hidden_layers"]
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers", "encoder_layers", "vt_num_hidden_layers"]
has_vision_encoder: bool = True # by default
has_audio_encoder: bool = False
@ -4264,6 +4309,16 @@ class Qwen25OmniModel(Qwen2VLVisionModel):
@ModelBase.register("InternVisionModel")
class InternVisionModel(MmprojModel):
min_dynamic_tiles: int = 0
max_dynamic_tiles: int = 0
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0)
self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0)
def set_gguf_parameters(self):
assert self.hparams_vision is not None
if isinstance(self.hparams_vision['image_size'], list):
@ -4286,6 +4341,11 @@ class InternVisionModel(MmprojModel):
downsample_ratio = self.global_config.get("downsample_ratio")
assert downsample_ratio is not None
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
# older models may not have min/max_dynamic_patch in config
if self.min_dynamic_tiles > 0:
self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles)
if self.max_dynamic_tiles > 0:
self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles)
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".position_embd." in new_name:
@ -4548,7 +4608,7 @@ class Qwen2MoeModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("Qwen3ForCausalLM")
@ModelBase.register("Qwen3ForCausalLM", "Qwen3Model")
class Qwen3Model(Qwen2Model):
model_arch = gguf.MODEL_ARCH.QWEN3
@ -4981,6 +5041,97 @@ class _LinearAttentionVReorderBase(Qwen3NextModel):
perm[dim], perm[dim + 1] = perm[dim + 1], perm[dim]
return tensor.permute(*perm).contiguous().reshape(*shape)
def _transform_nvfp4_weight(self, name: str, weight: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]:
if not name.endswith((
".linear_attn.in_proj_qkv.weight",
".linear_attn.in_proj_z.weight",
".linear_attn.in_proj_a.weight",
".linear_attn.in_proj_b.weight",
".linear_attn.out_proj.weight",
)):
return weight, scale
num_k_heads = self.hparams["linear_num_key_heads"]
num_v_heads = self.hparams["linear_num_value_heads"]
head_k_dim = self.hparams["linear_key_head_dim"]
head_v_dim = self.hparams["linear_value_head_dim"]
num_v_per_k = num_v_heads // num_k_heads
def unpack_nibbles(qs: Tensor) -> Tensor:
lo = torch.bitwise_and(qs, 0x0F)
hi = torch.bitwise_right_shift(qs, 4)
return torch.stack((lo, hi), dim=-1).reshape(*qs.shape[:-1], qs.shape[-1] * 2)
def pack_nibbles(codes: Tensor) -> Tensor:
codes = codes.reshape(*codes.shape[:-1], codes.shape[-1] // 2, 2)
lo = torch.bitwise_and(codes[..., 0], 0x0F)
hi = torch.bitwise_left_shift(torch.bitwise_and(codes[..., 1], 0x0F), 4)
return torch.bitwise_or(lo, hi).contiguous()
def apply_col_perm(qs: Tensor, scales: Tensor, col_perm: Tensor) -> tuple[Tensor, Tensor]:
assert qs.ndim >= 2
assert scales.ndim >= 2
k = qs.shape[-1] * 2
assert col_perm.numel() == k
assert k % 16 == 0
group_cols = col_perm.reshape(-1, 16)
group_starts = group_cols[:, 0]
expected = group_starts.unsqueeze(1) + torch.arange(16, dtype=col_perm.dtype)
assert torch.equal(group_cols, expected)
assert torch.all(group_starts % 16 == 0)
group_perm = (group_starts // 16).to(dtype=torch.long)
expected_groups = torch.arange(scales.shape[-1], dtype=torch.long)
assert group_perm.numel() == scales.shape[-1]
assert torch.equal(torch.sort(group_perm).values, expected_groups)
codes = unpack_nibbles(qs)
codes = codes.index_select(-1, col_perm.to(device=qs.device, dtype=torch.long))
qs = pack_nibbles(codes)
scales = scales.index_select(-1, group_perm.to(device=scales.device))
return qs, scales
def reorder_rows(qs: Tensor, scales: Tensor, head_dim: int) -> tuple[Tensor, Tensor]:
row_perm = self._reorder_v_heads(
torch.arange(num_v_heads * head_dim, dtype=torch.long).unsqueeze(-1),
0, num_k_heads, num_v_per_k, head_dim,
).squeeze(-1)
return (
qs.index_select(0, row_perm.to(device=qs.device)),
scales.index_select(0, row_perm.to(device=scales.device)),
)
if name.endswith(".linear_attn.in_proj_qkv.weight"):
q_dim = head_k_dim * num_k_heads
k_dim = head_k_dim * num_k_heads
q = weight[:q_dim]
k = weight[q_dim:q_dim + k_dim]
v = weight[q_dim + k_dim:]
q_scale = scale[:q_dim]
k_scale = scale[q_dim:q_dim + k_dim]
v_scale = scale[q_dim + k_dim:]
v, v_scale = reorder_rows(v, v_scale, head_v_dim)
return torch.cat([q, k, v], dim=0), torch.cat([q_scale, k_scale, v_scale], dim=0)
if name.endswith(".linear_attn.in_proj_z.weight"):
weight, scale = reorder_rows(weight, scale, head_v_dim)
elif name.endswith((".linear_attn.in_proj_a.weight", ".linear_attn.in_proj_b.weight")):
weight, scale = reorder_rows(weight, scale, 1)
elif name.endswith(".linear_attn.out_proj.weight"):
col_perm = self._reorder_v_heads(
torch.arange(num_v_heads * head_v_dim, dtype=torch.long).unsqueeze(0),
1, num_k_heads, num_v_per_k, head_v_dim,
).squeeze(0)
weight, scale = apply_col_perm(weight, scale, col_perm)
return weight, scale
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
weight, scale = self._transform_nvfp4_weight(name, weight, scale)
super()._repack_nvfp4(name, weight, scale, scale2, input_scale)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
num_k_heads = self.hparams.get("linear_num_key_heads", 0)
num_v_heads = self.hparams.get("linear_num_value_heads", 0)
@ -5070,6 +5221,47 @@ class GPT2Model(TextModel):
yield from super().modify_tensors(data_torch, new_name, bid)
@ModelBase.register("RuGPT3XLForCausalLM")
class RuGPT3XLModel(TextModel):
model_arch = gguf.MODEL_ARCH.GPT2
_qkv_parts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Fuse separate Q, K, V projections into a single QKV tensor
if ".self_attn.q_proj." in name or ".self_attn.k_proj." in name or ".self_attn.v_proj." in name:
suffix = "weight" if name.endswith(".weight") else "bias"
part = "q" if ".q_proj." in name else ("k" if ".k_proj." in name else "v")
key = f"{part}.{suffix}"
assert bid is not None
if self._qkv_parts is None:
self._qkv_parts = [{} for _ in range(self.block_count)]
self._qkv_parts[bid][key] = data_torch
q_key, k_key, v_key = f"q.{suffix}", f"k.{suffix}", f"v.{suffix}"
if all(k in self._qkv_parts[bid] for k in [q_key, k_key, v_key]):
q = self._qkv_parts[bid].pop(q_key)
k = self._qkv_parts[bid].pop(k_key)
v = self._qkv_parts[bid].pop(v_key)
data_torch = torch.cat([q, k, v], dim=0)
name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, f".{suffix}")
logger.debug(f"Fused Q/K/V {suffix} for layer {bid} -> {name}")
else:
return
yield from super().modify_tensors(data_torch, name, bid)
def prepare_tensors(self):
super().prepare_tensors()
if self._qkv_parts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
parts = [f"({i}){k}" for i, d in enumerate(self._qkv_parts) for k in d.keys()]
if len(parts) > 0:
raise ValueError(f"Unprocessed Q/K/V parts: {parts}")
@ModelBase.register("PhiForCausalLM")
class Phi2Model(TextModel):
model_arch = gguf.MODEL_ARCH.PHI2
@ -5882,7 +6074,7 @@ class InternLM2Model(TextModel):
logger.error(f'Error: Missing {tokenizer_path}')
sys.exit(1)
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
@ -6203,7 +6395,7 @@ class BertModel(TextModel):
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
else:
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
@ -6911,6 +7103,70 @@ class ConformerAudioModel(MmprojModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("DeepseekOCRForCausalLM")
class DeepseekOCRVisionModel(MmprojModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
# default values below are taken from HF tranformers code
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)
# calculate proj_scale_factor (used by tinygemma3 test model)
image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
n_per_side = int(image_seq_length ** 0.5)
image_size = self.hparams["image_size"]
patch_size = self.hparams["patch_size"]
proj_scale_factor = (image_size // patch_size) // n_per_side
if proj_scale_factor > 0 and proj_scale_factor != 4:
# we only need to write this if it's not the default value
# in this case, we are converting a test model
self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
# @bluebread: there's no window_size in config but just add it here anyway
self.gguf_writer.add_vision_window_size(self.hparams.get("window_size", 14))
# SAM configuration
sam_hparams = hparams['sam']
self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers'])
self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width'])
self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads'])
def get_vision_config(self) -> dict[str, Any]:
vision_config: dict[str, Any] | None = self.global_config.get("vision_config")
if not vision_config:
raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")
vision_config['sam'] = vision_config['width']['sam_vit_b']
vision_config.update(vision_config['width']['clip-l-14-224'])
vision_config['hidden_size'] = vision_config['width']
vision_config['num_heads'] = vision_config['heads']
vision_config['intermediate_size'] = vision_config['heads'] * 4
return vision_config
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".embeddings." in name or 'pos_embed' in name:
return gguf.GGMLQuantizationType.F32
if ".rel_pos_h" in name or '.rel_pos_w' in name:
return gguf.GGMLQuantizationType.F32
if ".neck." in name or ".net_" in name:
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Only process vision-related tensors, skip language model tensors
# Vision components: sam_model, vision_model, projector, image_newline, view_seperator
# Language model components to skip: lm_head, embed_tokens, layers, norm
if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")):
return
if name.endswith("pos_embed") or name.endswith("rel_pos_h") or name.endswith("rel_pos_w"):
name += ".weight"
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Gemma3nForConditionalGeneration")
class Gemma3nVisionAudioModel(ConformerAudioModel):
has_audio_encoder = True
@ -8256,6 +8512,19 @@ class DeepseekV2Model(TextModel):
merge_expert = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
hparams: dict = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
self.origin_hf_arch = hparams.get('architectures', [None])[0]
# special handling for Deepseek OCR
if self.origin_hf_arch == "DeepseekOCRForCausalLM":
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
# default jinja template
self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")
def set_vocab(self):
try:
self._set_vocab_gpt2()
@ -8311,9 +8580,15 @@ class DeepseekV2Model(TextModel):
raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
def set_gguf_parameters(self):
is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR)
# note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
self.hparams["num_key_value_heads"] = 1
if is_ocr:
self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0)
else:
# note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
self.hparams["num_key_value_heads"] = 1
self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6)
super().set_gguf_parameters()
hparams = self.hparams
@ -8327,16 +8602,18 @@ class DeepseekV2Model(TextModel):
# Default: if no MoE, all layers are dense; if MoE, none are dense
first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
kv_lora_rank = hparams.get("kv_lora_rank", 512)
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
# note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
if not is_ocr:
self.gguf_writer.add_kv_lora_rank(kv_lora_rank)
self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(kv_lora_rank)
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
# MoE parameters (required by C++ code for DEEPSEEK2 arch)
# For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
@ -8368,8 +8645,15 @@ class DeepseekV2Model(TextModel):
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5
if "vision_tower" in name or "multi_modal_projector" in name or "mm_projector" in name:
# skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5, and DeepSeek-OCR
if ("vision_tower" in name
or "multi_modal_projector" in name
or "mm_projector" in name
or "vision_model" in name
or "image_newline" in name
or "model.projector" in name
or "sam_model" in name
or "view_seperator" in name):
return
if name.startswith("siglip2.") or name.startswith("merger."):
return
@ -8880,7 +9164,7 @@ class T5Model(TextModel):
if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
@ -9017,7 +9301,7 @@ class T5EncoderModel(TextModel):
if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
@ -11125,8 +11409,7 @@ class GptOssModel(TextModel):
# TODO: remove once MXFP4 is supported more generally
def dequant_model(self):
quant_config = self.hparams.get("quantization_config")
if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
if self._is_mxfp4:
return
return super().dequant_model()
@ -12279,6 +12562,7 @@ class LazyTorchTensor(gguf.LazyBase):
kwargs = {}
if func is torch.Tensor.numpy:
assert len(args)
return args[0].numpy()
return cls._wrap_fn(func)(*args, **kwargs)

View File

@ -154,6 +154,7 @@ models = [
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", },
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@ -177,6 +178,7 @@ pre_computed_hashes = [
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
# jina-v2-de variants
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/evilfreelancer/ruGPT3XL", "chkhsh": "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4"},
]

View File

@ -112,11 +112,11 @@ class Tensor:
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
assert name_len < 4096, 'Absurd tensor name length'
quant = gguf.GGML_QUANT_SIZES.get(dtype)
self.dtype = gguf.GGMLQuantizationType(dtype)
quant = gguf.GGML_QUANT_SIZES.get(self.dtype)
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype= gguf.GGMLQuantizationType(dtype)
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])

View File

@ -199,10 +199,13 @@ class LoraTorchTensor:
kwargs = {}
if func is torch.permute:
assert len(args)
return type(args[0]).permute(*args, **kwargs)
elif func is torch.reshape:
assert len(args)
return type(args[0]).reshape(*args, **kwargs)
elif func is torch.stack:
assert len(args)
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
@ -211,6 +214,7 @@ class LoraTorchTensor:
torch.stack([b._lora_B for b in args[0]], dim),
)
elif func is torch.cat:
assert len(args)
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
@ -362,7 +366,7 @@ if __name__ == '__main__':
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
class LoraModel(model_class):
class LoraModel(model_class): # ty: ignore[unsupported-base]
model_arch = model_class.model_arch
lora_alpha: float

View File

@ -42,12 +42,22 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
### Ascend NPU
**Verified devices**
You can retrieve your Ascend device IDs using the following command:
| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
| Atlas 300I Duo | Support |
```sh
lspci -n | grep -Eo '19e5:d[0-9a-f]{3}' | cut -d: -f2
```
**Devices**
| Device Id | Product Series | Product Models | Chip Model | Verified Status |
|:---------:|----------------|----------------|:----------:|:---------------:|
| d803 | Atlas A3 Train | | 910C | |
| d803 | Atlas A3 Infer | | 910C | |
| d802 | Atlas A2 Train | | 910B | |
| d802 | Atlas A2 Infer | Atlas 300I A2 | 910B | Support |
| d801 | Atlas Train | | 910 | |
| d500 | Atlas Infer | Atlas 300I Duo | 310P | Support |
*Notes:*
@ -57,6 +67,9 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
## Model Supports
<details>
<summary>Text-only</summary>
| Model Name | FP16 | Q4_0 | Q8_0 |
|:----------------------------|:-----:|:----:|:----:|
| Llama-2 | √ | √ | √ |
@ -118,8 +131,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
| Trillion-7B-preview | √ | √ | √ |
| Ling models | √ | √ | √ |
</details>
<details>
<summary>Multimodal</summary>
**Multimodal**
| Model Name | FP16 | Q4_0 | Q8_0 |
|:----------------------------|:-----:|:----:|:----:|
| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
@ -134,15 +150,22 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
| GLM-EDGE | √ | √ | √ |
| Qwen2-VL | √ | √ | √ |
</details>
## DataType Supports
| DataType | Status |
|:----------------------:|:-------:|
| FP16 | Support |
| Q8_0 | Support |
| Q4_0 | Support |
| DataType | 910B | 310P |
|:----------------------:|:-------:|:-------:|
| FP16 | Support | Support |
| Q8_0 | Support | Partial |
| Q4_0 | Support | Partial |
| BF16 | Support | |
> **310P note**
> - `Q8_0`: data transform / buffer path is implemented, and `GET_ROWS` is supported, but quantized `MUL_MAT` / `MUL_MAT_ID` are not supported.
> - `Q4_0`: data transform / buffer path is implemented, but quantized `MUL_MAT` / `MUL_MAT_ID` are not supported.
## Docker
@ -160,7 +183,20 @@ npu-smi info
# Select the cards that you want to use, make sure these cards are not used by someone.
# Following using cards of device0.
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
docker run --name llamacpp \
--device /dev/davinci0 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /PATH_TO_YOUR_MODELS/:/app/models \
-it llama-cpp-cann \
-m /app/models/MODEL_PATH \
-ngl 32 \
-p "Building a website can be done in 10 simple steps:"
```
*Notes:*
@ -171,69 +207,57 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager
### I. Setup Environment
1. **Install Ascend Driver and firmware**
1. **Configure Ascend user and group**
```sh
# create driver running user.
sudo groupadd -g HwHiAiUser
sudo groupadd HwHiAiUser
sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
sudo usermod -aG HwHiAiUser $USER
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
```
Once installed, run `npu-smi info` to check whether driver is installed successfully.
2. **Install dependencies**
**Ubuntu/Debian:**
```sh
+-------------------------------------------------------------------------------------------+
| npu-smi 24.1.rc2 Version: 24.1.rc2 |
+----------------------+---------------+----------------------------------------------------+
| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)|
| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) |
+======================+===============+====================================================+
| 2 xxx | OK | 64.4 51 15 / 15 |
| 0 | 0000:01:00.0 | 0 1873 / 15077 0 / 32768 |
+======================+===============+====================================================+
| 5 xxx | OK | 64.0 52 15 / 15 |
| 0 | 0000:81:00.0 | 0 1874 / 15077 0 / 32768 |
+======================+===============+====================================================+
| No running processes found in NPU 2 |
+======================+===============+====================================================+
| No running processes found in NPU 5 |
+======================+===============+====================================================+
sudo apt-get update
sudo apt-get install -y gcc python3 python3-pip linux-headers-$(uname -r)
```
2. **Install Ascend Firmware**
**RHEL/CentOS:**
```sh
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
sudo yum makecache
sudo yum install -y gcc python3 python3-pip kernel-headers-$(uname -r) kernel-devel-$(uname -r)
```
If the following message appears, firmware is installed successfully.
3. **Install CANN (driver + toolkit)**
> The `Ascend-cann` package includes both the driver and toolkit.
> `$ARCH` can be `x86_64` or `aarch64`, `$CHIP` can be `910b` or `310p`.
```sh
Firmware package installed successfully!
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.5.T63/Ascend-cann_8.5.0_linux-$ARCH.run
sudo bash ./Ascend-cann_8.5.0_linux-$ARCH.run --install
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.5.T63/Ascend-cann-$CHIP-ops_8.5.0_linux-$ARCH.run
sudo bash ./Ascend-cann-$CHIP-ops_8.5.0_linux-$ARCH.run --install
```
4. **Verify installation**
3. **Install CANN toolkit and kernels**
CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
Please download the corresponding version that satified your system. The minimum version required is 8.0.RC2.alpha002 and here is the install command.
```sh
pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
npu-smi info
```
Set Ascend Variables:
If device information is displayed correctly, the driver is functioning properly.
```sh
echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
source ~/.bashrc
# Set environment variables (adjust path if needed)
source /usr/local/Ascend/cann/set_env.sh
python3 -c "import acl; print(acl.get_soc_name())"
```
Upon a successful installation, CANN is enabled for the available ascend devices.
If the command outputs the chip model, the installation was successful.
### II. Build llama.cpp

View File

@ -1,6 +1,9 @@
# OpenVINO Backend for llama.cpp
[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge.
This document describes the [OpenVINO backend for llama.cpp](../../src/ggml-openvino), which enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.
> [!NOTE]
> Performance and memory optimizations, accuracy validation, broader quantization coverage, broader operator and model support are work in progress.
[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. [OpenVINO backend for llama.cpp](../../src/ggml-openvino) enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.
The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a translation layer for core GGML operations. The OpenVINO backend replaces the standard GGML graph execution path with Intel's OpenVINO inference engine. This approach allows the same GGUF model file to run on Intel CPUs, Intel GPUs (integrated and discrete), and Intel NPUs without changes to the model or the rest of the llama.cpp stack. When a `ggml_cgraph` is dispatched to OpenVINO backend, it:
@ -179,31 +182,73 @@ curl -L https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/L
When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.
> [!NOTE]
> Default context size is set to the model training context, which may be very large. For example, 131072 for Llama 3.2 1B, which may result in lower performance, especially on edge/laptop devices. Use `-c` to limit context size in supported llama.cpp tools for better performance. For example, `-c 512`.
```bash
# If device is unset or unavailable, defaults to CPU.
# If the system has multiple GPUs, use GPU.0 or GPU.1 to explicitly target a specific GPU.
# Linux
export GGML_OPENVINO_DEVICE=GPU
# Enable stateful execution with GPU device to avoid known stateless execution failures.
export GGML_OPENVINO_STATEFUL_EXECUTION=1
# To run llama-simple:
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
# To run in chat mode:
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
# To run llama-bench, -fa 1 is needed
GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -fa 1
# NPU: keep context small to avoid failures from very large model context windows.
export GGML_OPENVINO_DEVICE=NPU
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 512
# Windows Command Line
set GGML_OPENVINO_DEVICE=GPU
# Enable stateful execution with GPU device to avoid known stateless execution failures.
set GGML_OPENVINO_STATEFUL_EXECUTION=1
# Windows PowerShell
$env:GGML_OPENVINO_DEVICE = "GPU"
$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"
# To run llama-simple
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
# To run in chat mode:
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 1024
# To run llama-bench, -fa 1 is needed
build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -fa 1
# NPU: keep context small to avoid failures from very large model context windows.
# Windows Command Line
set GGML_OPENVINO_DEVICE=NPU
# Windows PowerShell
$env:GGML_OPENVINO_DEVICE = "NPU"
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 512
```
> [!NOTE]
> On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) for more details.
### Known Issues and Current Workarounds
- GPU stateless execution is currently affected by a known issue.
- Workaround: set `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device.
- NPU failures can happen when context size is too large. Recent llama.cpp behavior may resolve context size to the model training context (for example, 131072 for Llama 3.2 1B), which is too large for current NPU usage and can also stress laptop CPU/GPU on larger models. To inspect the selected context size, run `llama-cli` or `llama-server` with `-lv 3`.
- Workaround: explicitly set context size, for ex. `-c 1024` for NPU runs. Performance will be better with lower context size.
- Additional NPU limitations:
- Model caching is not yet supported.
- `llama-server -np > 1` (multiple parallel sequences) is not supported.
- `llama-perplexity` is only supported with `-b 512` or smaller.
- `--context-shift` with `llama-cli` is currently not supported with OpenVINO backend across CPU, GPU, and NPU devices.
- Encoder models (embedding, reranking) are not supported with the current OpenVINO backend implementation.
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
- `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
- For Intel GPU, NPU detection in containers, GPU, NPU user-space drivers/libraries must be present inside the image. We will include in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)
> [!NOTE]
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
### Docker Build
@ -229,31 +274,42 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
Run llama.cpp with OpenVINO backend Docker container.
Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.
> [!NOTE]
> Intel GPU, NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).
```bash
# Run Docker container
docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
# With Intel GPU access (iGPU or dGPU)
docker run --rm -it -v ~/models:/models \
--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
--env=GGML_OPENVINO_DEVICE=GPU --env=GGML_OPENVINO_STATEFUL_EXECUTION=1 \
llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
# With Intel NPU access
docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
docker run --rm -it -v ~/models:/models \
--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
--env=GGML_OPENVINO_DEVICE=NPU \
llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
```
Run Llama.cpp Server with OpenVINO Backend:
Run Llama.cpp Server with OpenVINO Backend.
> [!NOTE]
> `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
```bash
# Run the Server Docker container
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
# In a NEW terminal, test the server with curl
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
# Or Using llama-server executable
./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf --port 8080 -c 1024
# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
export NO_PROXY=localhost,127.0.0.1
# Option 1: Open your browser to http://localhost:8080 to access the web UI for the llama.cpp server.
# Option 2: In a NEW terminal, test the server with curl
# Test health endpoint
curl -f http://localhost:8080/health
@ -295,6 +351,7 @@ The OpenVINO backend can be configured using the following environment variables
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
export GGML_OPENVINO_PROFILING=1
export GGML_OPENVINO_DEVICE=GPU
export GGML_OPENVINO_STATEFUL_EXECUTION=1
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
@ -302,38 +359,27 @@ export GGML_OPENVINO_DEVICE=GPU
set GGML_OPENVINO_CACHE_DIR=C:\tmp\ov_cache
set GGML_OPENVINO_PROFILING=1
set GGML_OPENVINO_DEVICE=GPU
set GGML_OPENVINO_STATEFUL_EXECUTION=1
# Windows PowerShell
$env:GGML_OPENVINO_CACHE_DIR = "C:\tmp\ov_cache"
$env:GGML_OPENVINO_PROFILING = "1"
$env:GGML_OPENVINO_DEVICE = "GPU"
$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
```
#### llama-bench
```bash
# -fa 1 is required when running llama-bench with the OpenVINO backend.
GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1
```
### NPU Notes
- Model caching is not yet supported
- Does not support llama-server -np > 1 (multiple parallel sequences)
- Only supports llama-perplexity -b 512 or smaller
## Llama.cpp Tools
The following tools work with the OpenVINO backend on CPU, GPU, NPU:
- llama-simple
- llama-run
- llama-cli
- llama-server
- llama-bench
- llama-cli
- llama-completion
- llama-perplexity
- llama-server
- llama-simple
## Work in Progress

View File

@ -13,24 +13,30 @@ We have three Docker images available for this project:
Additionally, there the following images, similar to the above:
- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-cuda13`: Same as `full` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-cuda13`: Same as `light` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-cuda13`: Same as `server` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-openvino`: Same as `full` but compiled with OpenVino support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-openvino`: Same as `light` but compiled with OpenVino support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-openvino`: Same as `server` but compiled with OpenVino support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-s390x`: Identical to `full`, an alias for the `s390x` platform. (platforms: `linux/s390x`)
- `ghcr.io/ggml-org/llama.cpp:light-s390x`: Identical to `light`, an alias for the `s390x` platform. (platforms: `linux/s390x`)
- `ghcr.io/ggml-org/llama.cpp:server-s390x`: Identical to `server`, an alias for the `s390x` platform. (platforms: `linux/s390x`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).
@ -82,7 +88,7 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment
The defaults are:
- `CUDA_VERSION` set to `12.4.0`
- `CUDA_VERSION` set to `12.8.1`
- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures
The resulting images, are essentially the same as the non-CUDA images:

View File

@ -31,6 +31,13 @@ llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.g
llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
```
> [!IMPORTANT]
>
> OCR models are trained with specific prompt and input structure, please refer to these discussions for more info:
> - PaddleOCR-VL: https://github.com/ggml-org/llama.cpp/pull/18825
> - GLM-OCR: https://github.com/ggml-org/llama.cpp/pull/19677
> - Deepseek-OCR: https://github.com/ggml-org/llama.cpp/pull/17400
## Pre-quantized models
These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc

View File

@ -24,12 +24,12 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
params.n_predict = 32;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BATCHED, print_usage)) {
return 1;
}
common_init();
// number of parallel batches
int n_parallel = params.n_parallel;

View File

@ -213,12 +213,12 @@ static bool run(llama_context * ctx, const common_params & params) {
int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DEBUG, print_usage)) {
return 1;
}
common_init();
llama_backend_init();
llama_numa_init(params.numa);

View File

@ -545,11 +545,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) {
return 1;
}
common_init();
llama_backend_init();
llama_model_params model_params = llama_model_default_params();

View File

@ -99,12 +99,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
return 1;
}
common_init();
params.embedding = true;
// get max number of sequences per batch

View File

@ -37,12 +37,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
llama_backend_init();
llama_numa_init(params.numa);

View File

@ -19,12 +19,12 @@ static void print_usage(int /*argc*/, char ** argv) {
int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1;
}
common_init();
// init LLM
llama_backend_init();

View File

@ -28,9 +28,6 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
return f'({result})?' if min_items == 0 else result
def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
has_min = min_value != None
has_max = max_value != None
def digit_range(from_char: str, to_char: str):
out.append("[")
if from_char == to_char:
@ -106,7 +103,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
out.append(to_str[i])
out.append("]")
if has_min and has_max:
if min_value is not None and max_value is not None:
if min_value < 0 and max_value < 0:
out.append("\"-\" (")
_generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
@ -133,7 +130,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
less_decimals = max(decimals_left - 1, 1)
if has_min:
if min_value is not None:
if min_value < 0:
out.append("\"-\" (")
_generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
@ -177,7 +174,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
more_digits(length - 1, less_decimals)
return
if has_max:
if max_value is not None:
if max_value >= 0:
if top_level:
out.append("\"-\" [1-9] ")

View File

@ -365,13 +365,13 @@ Java_com_arm_aichat_internal_InferenceEngineImpl_processSystemPrompt(
const auto *system_prompt = env->GetStringUTFChars(jsystem_prompt, nullptr);
LOGd("%s: System prompt received: \n%s", __func__, system_prompt);
std::string formatted_system_prompt(system_prompt);
env->ReleaseStringUTFChars(jsystem_prompt, system_prompt);
// Format system prompt if applicable
const bool has_chat_template = common_chat_templates_was_explicit(g_chat_templates.get());
if (has_chat_template) {
formatted_system_prompt = chat_add_and_format(ROLE_SYSTEM, system_prompt);
}
env->ReleaseStringUTFChars(jsystem_prompt, system_prompt);
// Tokenize system prompt
const auto system_tokens = common_tokenize(g_context, formatted_system_prompt,
@ -414,13 +414,13 @@ Java_com_arm_aichat_internal_InferenceEngineImpl_processUserPrompt(
const auto *const user_prompt = env->GetStringUTFChars(juser_prompt, nullptr);
LOGd("%s: User prompt received: \n%s", __func__, user_prompt);
std::string formatted_user_prompt(user_prompt);
env->ReleaseStringUTFChars(juser_prompt, user_prompt);
// Format user prompt if applicable
const bool has_chat_template = common_chat_templates_was_explicit(g_chat_templates.get());
if (has_chat_template) {
formatted_user_prompt = chat_add_and_format(ROLE_USER, user_prompt);
}
env->ReleaseStringUTFChars(juser_prompt, user_prompt);
// Decode formatted user prompts
auto user_tokens = common_tokenize(g_context, formatted_user_prompt, has_chat_template, has_chat_template);

View File

@ -43,12 +43,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
const int W = 15; // lookahead window
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams

View File

@ -12,6 +12,8 @@ int main(int argc, char ** argv){
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}

View File

@ -18,12 +18,12 @@ int main(int argc, char ** argv){
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
common_init();
const int n_draft = params.speculative.n_max;
// init llama.cpp

View File

@ -18,12 +18,12 @@ int main(int argc, char ** argv){
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
common_init();
// max. number of additional tokens to draft if match is found
const int n_draft = params.speculative.n_max;

View File

@ -7,7 +7,7 @@ import os
# Add utils directory to path for direct script execution
sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found]
from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
def quick_logits_check(pytorch_file, llamacpp_file):
"""Lightweight sanity check before NMSE"""

View File

@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
print("Using SentenceTransformer to apply all numbered layers")
model = SentenceTransformer(model_path)
tokenizer = model.tokenizer
config = model[0].auto_model.config # type: ignore
config = model[0].auto_model.config
else:
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
@ -108,8 +108,8 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
print(f"Model file: {type(model).__module__}")
# Verify the model is using the correct sliding window
if hasattr(model.config, 'sliding_window'): # type: ignore
print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore
if hasattr(model.config, 'sliding_window'):
print(f"Model's sliding_window: {model.config.sliding_window}")
else:
print("Model config does not have sliding_window attribute")
@ -152,7 +152,7 @@ def main():
device = next(model.parameters()).device
else:
# For SentenceTransformer, get device from the underlying model
device = next(model[0].auto_model.parameters()).device # type: ignore
device = next(model[0].auto_model.parameters()).device
model_name = os.path.basename(model_path)
@ -177,7 +177,7 @@ def main():
print(f"{token_id:6d} -> '{token_str}'")
print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore
print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")
else:
# Standard approach: use base model output only
encoded = tokenizer(
@ -205,12 +205,12 @@ def main():
print(f"Embedding dimension: {all_embeddings.shape[1]}")
if len(all_embeddings.shape) == 1:
n_embd = all_embeddings.shape[0] # type: ignore
n_embd = all_embeddings.shape[0]
n_embd_count = 1
all_embeddings = all_embeddings.reshape(1, -1)
else:
n_embd = all_embeddings.shape[1] # type: ignore
n_embd_count = all_embeddings.shape[0] # type: ignore
n_embd = all_embeddings.shape[1]
n_embd_count = all_embeddings.shape[0]
print()

View File

@ -5,7 +5,7 @@ import sys
import os
import argparse
from pathlib import Path
from common import get_model_name_from_env_path # type: ignore[import-not-found]
from common import get_model_name_from_env_path # type: ignore[import-not-found, ty:unresolved-import]
def calculate_nmse(reference, test):
mse = np.mean((test - reference) ** 2)

View File

@ -2,7 +2,7 @@
import argparse
import sys
from common import compare_tokens # type: ignore
from common import compare_tokens # type: ignore[import-not-found, ty:unresolved-import]
def parse_arguments():

View File

@ -7,7 +7,7 @@ import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

View File

@ -163,12 +163,12 @@ int main(int argc, char ** argv) {
params.n_predict = 128;
params.n_junk = 1;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1;
}
common_init();
// number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel;

View File

@ -25,12 +25,12 @@ int main(int argc, char ** argv) {
params.n_keep = 32;
params.i_pos = -1;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
return 1;
}
common_init();
int n_junk = params.n_junk;
int n_keep = params.n_keep;
int n_grp = params.grp_attn_n;

View File

@ -6,7 +6,7 @@ import re
from copy import copy
from enum import Enum
from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin, get_type_hints
from docstring_parser import parse
from pydantic import BaseModel, create_model
@ -1158,7 +1158,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
# Assert that the parameter has a type annotation
if param.annotation == inspect.Parameter.empty:
raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation")
raise TypeError(f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a type annotation""")
# Find the parameter's description in the docstring
param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
@ -1166,7 +1166,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
# Assert that the parameter has a description
if not param_doc or not param_doc.description:
raise ValueError(
f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a description in the docstring""")
# Add parameter details to the schema
param_docs.append((param.name, param_doc))
@ -1177,7 +1177,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
dynamic_fields[param.name] = (
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
# Creating the dynamic model
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
dynamic_model = create_model(f"{getattr(func, '__name__')}", **dynamic_fields)
for name, param_doc in param_docs:
dynamic_model.model_fields[name].description = param_doc.description
@ -1285,7 +1285,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
if items != {}:
array = {"properties": items}
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
fields[field_name] = (List[array_type], ...)
fields[field_name] = (list[array_type], ...) # ty: ignore[invalid-type-form]
else:
fields[field_name] = (list, ...)
elif field_type == "object":

View File

@ -117,12 +117,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
return 1;
}
common_init();
// For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
params.embedding = true;

View File

@ -17,6 +17,8 @@ int main(int argc, char ** argv) {
const std::string_view state_file = "dump_state.bin";
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
@ -27,8 +29,6 @@ int main(int argc, char ** argv) {
params.kv_unified = true;
}
common_init();
if (params.n_predict < 0) {
params.n_predict = 16;
}

View File

@ -16,6 +16,8 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
return 1;
}
@ -25,8 +27,6 @@ int main(int argc, char ** argv) {
return 1;
}
common_init();
if (params.speculative.mparams_dft.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;

View File

@ -38,6 +38,8 @@ int main(int argc, char ** argv) {
// needed to get candidate probs even for temp <= 0.0
params.sampling.n_probs = 128;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
return 1;
}
@ -47,8 +49,6 @@ int main(int argc, char ** argv) {
return 1;
}
common_init();
if (params.speculative.mparams_dft.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;

View File

@ -20,4 +20,4 @@ cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA
#cmake --build . --config Release --target llama-bench
#build all binary
cmake --build . --config Release -j -v
cmake --build . --config Release -j$((($(nproc)+1)/2)) -v

View File

@ -23,9 +23,9 @@ if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1
echo "use $GGML_SYCL_DEVICE as main GPU"
#use signle GPU only
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
fi

View File

@ -20,6 +20,8 @@ int main(int argc, char ** argv) {
common_params params;
params.escape = false;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
return 1;
}
@ -38,7 +40,6 @@ int main(int argc, char ** argv) {
params.cache_type_v = GGML_TYPE_F32;
}
common_init();
llama_backend_init();
llama_numa_init(params.numa);
// load the model and apply lora adapter, if any

View File

@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 8)
set(GGML_VERSION_PATCH 9)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@ -166,15 +166,16 @@ if (NOT MSVC)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
option(GGML_RV_ZIHINTPAUSE "ggml: enable riscv zihintpause " ON)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
option(GGML_RV_ZIHINTPAUSE "ggml: enable riscv zihintpause" ON)
option(GGML_RV_ZVFBFWMA "ggml: enable riscv zvfbfwma" OFF)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")

View File

@ -77,6 +77,7 @@ extern "C" {
};
GGML_API struct gguf_context * gguf_init_empty(void);
GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
@ -189,6 +190,7 @@ extern "C" {
//
// write the entire context to a binary file
GGML_API bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta);
GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding

View File

@ -434,6 +434,9 @@ void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src = dst->src[0];
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
@ -456,6 +459,13 @@ void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
float p_value = 2.0f;
acl_scalar_ptr p_scalar = ggml_cann_create_scalar(&p_value, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_div.get());
// Clamp norm to at least eps: scale = 1/fmaxf(norm, eps)
acl_scalar_ptr acl_min = ggml_cann_create_scalar(&eps, aclDataType::ACL_FLOAT);
float flt_max = FLT_MAX;
acl_scalar_ptr acl_max = ggml_cann_create_scalar(&flt_max, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_div.get(), acl_min.get(), acl_max.get(), acl_div.get());
GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div.get(), acl_dst.get());
}
@ -3011,6 +3021,58 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
}
}
void ggml_cann_rope_cache_preload(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
int sections[4];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
GGML_TENSOR_UNARY_OP_LOCALS
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int) * 4);
const float theta_scale = powf(freq_base, -2.0f / n_dims);
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (is_imrope || mrope_used) {
is_neox = true;
}
int64_t rope_dims = n_dims;
if (is_vision) {
rope_dims = src0->ne[0];
}
// Run the full cache init on the non-captured stream. This performs all
// host-to-device memcpy, aclrtMalloc/Free, and on-device computations
// so that the memory pool is warmed up and cache metadata is populated.
aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections,
mrope_used, is_imrope, is_vision, rope_dims);
// Reset `cached` so that during graph capture the on-device computations
// (sin/cos, position multiply, repeat, etc.) still execute and get recorded
// into the captured graph. The cache metadata (theta_scale_length,
// theta_scale, sections, position_length, etc.) remains set, which causes
// all host-to-device copy and malloc/free branches to be skipped.
ctx.rope_cache.cached = false;
}
void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];

View File

@ -543,6 +543,21 @@ void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
*/
void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst);
/**
* @brief Pre-load the RoPE cache before ACL graph capture.
*
* This function must be called outside of graph capture to perform
* host-to-device memory copies and device memory allocations that are
* not allowed on a captured stream. After pre-loading, the rope cache
* metadata is updated so that the subsequent call to
* aclnn_rope_cache_init (inside graph capture) skips these operations
* and only records the on-device computations into the captured graph.
*
* @param ctx CANN backend context.
* @param dst A ROPE destination tensor from the computation graph.
*/
void ggml_cann_rope_cache_preload(ggml_backend_cann_context & ctx, ggml_tensor * dst);
/**
* @brief Computes the index of the maximum value along the specified dimension
* of a ggml tensor using the CANN backend.

View File

@ -216,14 +216,16 @@ struct ggml_cann_pool_alloc {
#ifdef USE_ACL_GRAPH
struct ggml_graph_node_properties {
// dst tensor
void * node_address;
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];
void * node_address;
ggml_type node_type;
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];
// src tensor
void * src_address[GGML_MAX_SRC];
int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
size_t src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];
void * src_address[GGML_MAX_SRC];
ggml_type src_type[GGML_MAX_SRC];
int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
size_t src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];
// op
ggml_op node_op;
@ -247,6 +249,10 @@ struct ggml_graph_node_properties {
return false;
}
if (node->type != this->node_type) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != this->ne[i]) {
return false;
@ -262,6 +268,10 @@ struct ggml_graph_node_properties {
return false;
}
if (node->src[i]->type != this->src_type[i]) {
return false;
}
for (int d = 0; d < GGML_MAX_DIMS; d++) {
if (node->src[i]->ne[d] != this->src_ne[i][d]) {
return false;
@ -277,10 +287,7 @@ struct ggml_graph_node_properties {
}
}
if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) {
return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
}
return true;
return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
}
};
@ -322,6 +329,7 @@ struct ggml_cann_graph {
prop.node_address = node->data;
prop.node_op = node->op;
prop.node_type = node->type;
std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
@ -329,10 +337,12 @@ struct ggml_cann_graph {
for (int src = 0; src < GGML_MAX_SRC; ++src) {
if (node->src[src]) {
prop.src_address[src] = node->src[src]->data;
prop.src_type[src] = node->src[src]->type;
std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]);
std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]);
} else {
prop.src_address[src] = nullptr;
prop.src_type[src] = GGML_TYPE_COUNT;
std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0);
std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0);
}

Some files were not shown because too many files have changed in this diff Show More