Compare commits
27 Commits
2bfdbf230b
...
98d5b8fa51
| Author | SHA1 | Date |
|---|---|---|
|
|
98d5b8fa51 | |
|
|
d006858316 | |
|
|
e439700992 | |
|
|
50e0ad08fb | |
|
|
f1f793ad06 | |
|
|
af5c13841f | |
|
|
277ff5fff7 | |
|
|
384c0076bc | |
|
|
1f34806c44 | |
|
|
887535c33f | |
|
|
d3416a4aa9 | |
|
|
43a4ee4a2c | |
|
|
f851fa5ab0 | |
|
|
f1ac84119c | |
|
|
b069b10ab4 | |
|
|
0c58ba3365 | |
|
|
57ace0d612 | |
|
|
39b27f0da0 | |
|
|
f49e917876 | |
|
|
7c7d6ce5c7 | |
|
|
5208e2d5ba | |
|
|
7992aa7c8e | |
|
|
a1cfb64530 | |
|
|
5803c8d115 | |
|
|
63f8fe0ef4 | |
|
|
223373742b | |
|
|
c82e486fdb |
|
|
@ -1,97 +0,0 @@
|
|||
ARG UBUNTU_VERSION=24.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=13.1.1
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
|
||||
|
||||
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base image
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
|
||||
### Full
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-wheel \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
||||
|
|
@ -16,7 +16,7 @@
|
|||
rocmPackages,
|
||||
vulkan-headers,
|
||||
vulkan-loader,
|
||||
curl,
|
||||
openssl,
|
||||
shaderc,
|
||||
useBlas ?
|
||||
builtins.all (x: !x) [
|
||||
|
|
@ -160,7 +160,8 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
|||
++ optionals useMpi [ mpi ]
|
||||
++ optionals useRocm rocmBuildInputs
|
||||
++ optionals useBlas [ blas ]
|
||||
++ optionals useVulkan vulkanBuildInputs;
|
||||
++ optionals useVulkan vulkanBuildInputs
|
||||
++ [ openssl ];
|
||||
|
||||
cmakeFlags =
|
||||
[
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=7.2
|
||||
ARG AMDGPU_VERSION=7.2
|
||||
ARG ROCM_VERSION=7.2.1
|
||||
ARG AMDGPU_VERSION=7.2.1
|
||||
|
||||
# Target the ROCm build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
|
@ -12,11 +12,11 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
|||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.0/reference/system-requirements.html
|
||||
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.1/reference/system-requirements.html
|
||||
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
|
||||
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html
|
||||
|
||||
ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201'
|
||||
ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201'
|
||||
|
||||
# Set ROCm architectures
|
||||
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
|
|
|
|||
|
|
@ -27,6 +27,11 @@ IBM zDNN:
|
|||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-zdnn.h
|
||||
- ggml/src/ggml-zdnn/**
|
||||
AMD ZenDNN:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-zendnn.h
|
||||
- ggml/src/ggml-zendnn/**
|
||||
documentation:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
|
|
|||
|
|
@ -213,6 +213,27 @@ jobs:
|
|||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-win-intel-vulkan:
|
||||
runs-on: [self-hosted, Windows, X64, Intel]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
shell: C:\msys64\usr\bin\bash.exe --noprofile --norc -eo pipefail "{0}"
|
||||
env:
|
||||
MSYSTEM: UCRT64
|
||||
CHERE_INVOKING: 1
|
||||
PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
# Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
|
||||
# a valid python environment for testing
|
||||
LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
|
||||
|
||||
ggml-ci-intel-openvino-gpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
|
|
|
|||
|
|
@ -472,6 +472,7 @@ jobs:
|
|||
cmake -B build -S . \
|
||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DGPU_TARGETS="gfx1030" \
|
||||
-DGGML_HIP=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
|
|
@ -941,7 +942,7 @@ jobs:
|
|||
- name: Grab rocWMMA package
|
||||
id: grab_rocwmma
|
||||
run: |
|
||||
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
|
||||
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
|
||||
7z x rocwmma.deb
|
||||
7z x data.tar
|
||||
|
||||
|
|
@ -984,12 +985,13 @@ jobs:
|
|||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DLLAMA_BUILD_BORINGSSL=ON `
|
||||
-DROCM_DIR="${env:HIP_PATH}" `
|
||||
-DGGML_HIP=ON `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGPU_TARGETS="gfx1100" `
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
|
|
|
|||
|
|
@ -73,10 +73,10 @@ jobs:
|
|||
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.9.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.9.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ env:
|
|||
jobs:
|
||||
ubuntu-22-hip-quality-check:
|
||||
runs-on: ubuntu-22.04
|
||||
container: rocm/dev-ubuntu-22.04:7.2
|
||||
container: rocm/dev-ubuntu-22.04:7.2.1
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
|
|
@ -59,7 +59,7 @@ jobs:
|
|||
run: |
|
||||
cmake -B build -S . \
|
||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||
-DGPU_TARGETS=gfx908 \
|
||||
-DGPU_TARGETS=gfx942 \
|
||||
-DGGML_HIP=ON \
|
||||
-DGGML_HIP_EXPORT_METRICS=Off \
|
||||
-DCMAKE_HIP_FLAGS="-Werror -Wno-tautological-compare" \
|
||||
|
|
|
|||
|
|
@ -639,8 +639,8 @@ jobs:
|
|||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- ROCM_VERSION: "7.2"
|
||||
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201"
|
||||
- ROCM_VERSION: "7.2.1"
|
||||
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
|
||||
build: 'x64'
|
||||
|
||||
steps:
|
||||
|
|
@ -662,7 +662,7 @@ jobs:
|
|||
sudo apt install -y build-essential git cmake wget
|
||||
|
||||
- name: Setup Legacy ROCm
|
||||
if: matrix.ROCM_VERSION == '7.2'
|
||||
if: matrix.ROCM_VERSION == '7.2.1'
|
||||
id: legacy_env
|
||||
run: |
|
||||
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
|
||||
|
|
@ -683,7 +683,7 @@ jobs:
|
|||
sudo apt-get install -y libssl-dev rocm-hip-sdk
|
||||
|
||||
- name: Setup TheRock
|
||||
if: matrix.ROCM_VERSION != '7.2'
|
||||
if: matrix.ROCM_VERSION != '7.2.1'
|
||||
id: therock_env
|
||||
run: |
|
||||
wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
|
||||
|
|
@ -699,7 +699,6 @@ jobs:
|
|||
run: |
|
||||
cmake -B build -S . \
|
||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||
-DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
|
|
@ -717,17 +716,20 @@ jobs:
|
|||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Get ROCm short version
|
||||
run: echo "ROCM_VERSION_SHORT=$(echo '${{ matrix.ROCM_VERSION }}' | cut -d '.' -f 1,2)" >> $GITHUB_ENV
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
|
||||
name: llama-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
|
||||
name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
|
||||
|
||||
windows-hip:
|
||||
runs-on: windows-2022
|
||||
|
|
@ -749,7 +751,7 @@ jobs:
|
|||
- name: Grab rocWMMA package
|
||||
id: grab_rocwmma
|
||||
run: |
|
||||
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
|
||||
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
|
||||
7z x rocwmma.deb
|
||||
7z x data.tar
|
||||
|
||||
|
|
@ -806,7 +808,7 @@ jobs:
|
|||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DGGML_BACKEND_DL=ON `
|
||||
-DGGML_NATIVE=OFF `
|
||||
|
|
|
|||
37
ci/run.sh
37
ci/run.sh
|
|
@ -119,6 +119,11 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
|||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
fi
|
||||
|
||||
# Build shared libs on Windows
|
||||
# to reduce binary size and avoid errors in library loading unit tests
|
||||
if uname -s | grep -qi nt; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DBUILD_SHARED_LIBS=ON"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
|
||||
|
|
@ -221,7 +226,7 @@ function gg_run_ctest_debug {
|
|||
|
||||
set -e
|
||||
|
||||
# Check cmake and ctest are installed
|
||||
# Check required binaries are installed
|
||||
gg_check_build_requirements
|
||||
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
|
|
@ -252,7 +257,7 @@ function gg_run_ctest_release {
|
|||
|
||||
set -e
|
||||
|
||||
# Check cmake and ctest are installed
|
||||
# Check required binaries are installed
|
||||
gg_check_build_requirements
|
||||
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
|
|
@ -627,10 +632,38 @@ function gg_sum_rerank_tiny {
|
|||
}
|
||||
|
||||
function gg_check_build_requirements {
|
||||
if ! command -v git &> /dev/null; then
|
||||
gg_printf 'git not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v git-lfs &> /dev/null; then
|
||||
gg_printf 'git-lfs not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v wget &> /dev/null; then
|
||||
gg_printf 'wget not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v python3 &> /dev/null; then
|
||||
gg_printf 'python3 not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v pip3 &> /dev/null; then
|
||||
gg_printf 'pip3 not found, please install'
|
||||
fi
|
||||
|
||||
if ! python3 -m ensurepip --help &> /dev/null; then
|
||||
gg_printf 'ensurepip not found, please install python3-venv package'
|
||||
fi
|
||||
|
||||
if ! command -v cmake &> /dev/null; then
|
||||
gg_printf 'cmake not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v ccache &> /dev/null; then
|
||||
gg_printf 'ccache not found, please consider installing for faster builds'
|
||||
fi
|
||||
|
||||
if ! command -v ctest &> /dev/null; then
|
||||
gg_printf 'ctest not found, please install'
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -537,9 +537,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|||
} catch (const std::exception & e) {
|
||||
LOG_WRN("HF cache migration failed: %s\n", e.what());
|
||||
}
|
||||
// export_graph_ops loads only metadata
|
||||
const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
|
||||
|
||||
// maybe handle remote preset
|
||||
if (!params.model.hf_repo.empty()) {
|
||||
if (!params.model.hf_repo.empty() && !skip_model_download) {
|
||||
std::string cli_hf_repo = params.model.hf_repo;
|
||||
bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
|
||||
|
||||
|
|
@ -570,7 +572,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|||
}
|
||||
|
||||
// handle model and download
|
||||
{
|
||||
if (!skip_model_download) {
|
||||
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
|
||||
if (params.no_mmproj) {
|
||||
params.mmproj = {};
|
||||
|
|
@ -591,7 +593,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|||
|
||||
// model is required (except for server)
|
||||
// TODO @ngxson : maybe show a list of available models in CLI in this case
|
||||
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
|
||||
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
|
||||
throw std::invalid_argument("error: --model is required\n");
|
||||
}
|
||||
|
||||
|
|
@ -1309,6 +1311,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.kv_unified = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
|
||||
add_opt(common_arg(
|
||||
{"--clear-idle"},
|
||||
{"--no-clear-idle"},
|
||||
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
|
||||
[](common_params & params, bool value) {
|
||||
params.clear_idle = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--context-shift"},
|
||||
{"--no-context-shift"},
|
||||
|
|
|
|||
|
|
@ -6,12 +6,111 @@
|
|||
#include "json-schema-to-grammar.h"
|
||||
#include "log.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
namespace {
|
||||
|
||||
// Gemma4-specific PEG builder extending the standard chat builder.
|
||||
// Adds value type parsers that use <|\"|> as string delimiters
|
||||
// instead of JSON's double quotes, and disables json-to-schema
|
||||
// conversion for these types.
|
||||
class common_peg_gemma4_builder {
|
||||
common_chat_peg_builder & p_;
|
||||
static constexpr const char * QUOTE = "<|\"|>";
|
||||
|
||||
public:
|
||||
explicit common_peg_gemma4_builder(common_chat_peg_builder & p) : p_(p) {}
|
||||
|
||||
common_peg_parser gemma4_string() {
|
||||
return p_.rule("gemma4-string", [&]() {
|
||||
return p_.literal(QUOTE) + p_.until(QUOTE) + p_.literal(QUOTE);
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_number() {
|
||||
return p_.rule("gemma4-number", [&]() {
|
||||
auto digit1_9 = p_.chars("[1-9]", 1, 1);
|
||||
auto digits = p_.chars("[0-9]");
|
||||
auto int_part = p_.choice({p_.literal("0"), p_.sequence({digit1_9, p_.chars("[0-9]", 0, -1)})});
|
||||
auto frac = p_.sequence({p_.literal("."), digits});
|
||||
auto exp = p_.sequence({p_.choice({p_.literal("e"), p_.literal("E")}),
|
||||
p_.optional(p_.chars("[+-]", 1, 1)), digits});
|
||||
auto not_number_continuation = p_.negate(p_.chars("[0-9.eE+-]", 1, 1));
|
||||
return p_.sequence({p_.optional(p_.literal("-")), int_part, p_.optional(frac),
|
||||
p_.optional(exp), not_number_continuation});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_bool() {
|
||||
return p_.rule("gemma4-bool", [&]() {
|
||||
return p_.choice({p_.literal("true"), p_.literal("false")});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_null() {
|
||||
return p_.rule("gemma4-null", [&]() {
|
||||
return p_.literal("null");
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_dict() {
|
||||
return p_.rule("gemma4-dict", [&]() {
|
||||
auto ws = p_.space();
|
||||
auto key = p_.until(":");
|
||||
auto member = p_.sequence({key, p_.literal(":"), ws, gemma4_value()});
|
||||
auto members = p_.sequence({member, p_.zero_or_more(p_.sequence({p_.literal(","), ws, member}))});
|
||||
return p_.sequence({
|
||||
p_.literal("{"), ws,
|
||||
p_.choice({p_.literal("}"), p_.sequence({members, ws, p_.literal("}")})})
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_array() {
|
||||
return p_.rule("gemma4-array", [&]() {
|
||||
auto ws = p_.space();
|
||||
auto elements = p_.sequence({gemma4_value(), p_.zero_or_more(p_.sequence({p_.literal(","), ws, gemma4_value()}))});
|
||||
return p_.sequence({
|
||||
p_.literal("["), ws,
|
||||
p_.choice({p_.literal("]"), p_.sequence({elements, ws, p_.literal("]")})})
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_value() {
|
||||
return p_.rule("gemma4-value", [&]() {
|
||||
return p_.choice({gemma4_string(), gemma4_dict(), gemma4_array(),
|
||||
gemma4_number(), gemma4_bool(), gemma4_null()});
|
||||
});
|
||||
}
|
||||
|
||||
// Select the appropriate value parser based on JSON schema type.
|
||||
// Does NOT use schema() - the gemma4 types are pure PEG without
|
||||
// JSON schema metadata, so GBNF is generated directly from the
|
||||
// PEG structure.
|
||||
common_peg_parser gemma4_value_for_type(const json & schema) {
|
||||
if (!schema.contains("type") || !schema.at("type").is_string()) {
|
||||
return gemma4_value();
|
||||
}
|
||||
std::string type = schema.at("type").get<std::string>();
|
||||
if (type == "string") { return gemma4_string(); }
|
||||
if (type == "number") { return gemma4_number(); }
|
||||
if (type == "integer") { return gemma4_number(); }
|
||||
if (type == "boolean") { return gemma4_bool(); }
|
||||
if (type == "object") { return gemma4_dict(); }
|
||||
if (type == "array") { return gemma4_array(); }
|
||||
return gemma4_value();
|
||||
}
|
||||
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
// Helper to iterate over tools/functions
|
||||
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
|
||||
for (const auto & tool : tools) {
|
||||
|
|
@ -43,7 +142,9 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
|
|||
// Create the result structure
|
||||
common_chat_params data;
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.format = (autoparser.tools.format.mode == tool_format::TAG_WITH_GEMMA4_DICT)
|
||||
? COMMON_CHAT_FORMAT_PEG_GEMMA4
|
||||
: COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = autoparser.preserved_tokens;
|
||||
|
||||
auto parser = autoparser.build_parser(inputs);
|
||||
|
|
@ -92,6 +193,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
|
|||
|
||||
ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
|
||||
ctx.content = &content;
|
||||
ctx.reasoning = &reasoning;
|
||||
|
||||
// Build reasoning parser
|
||||
ctx.reasoning_parser = reasoning.build_parser(ctx);
|
||||
|
|
@ -169,6 +271,8 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
|
|||
return build_tool_parser_tag_json(ctx);
|
||||
case tool_format::TAG_WITH_TAGGED:
|
||||
return build_tool_parser_tag_tagged(ctx);
|
||||
case tool_format::TAG_WITH_GEMMA4_DICT:
|
||||
return build_tool_parser_tag_gemma4_dict(ctx);
|
||||
default:
|
||||
LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
|
||||
"Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
|
||||
|
|
@ -214,6 +318,44 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
|
|||
p.end();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
|
||||
const common_peg_parser & call_id_section, bool have_call_id,
|
||||
const common_peg_parser & args,
|
||||
std::optional<common_peg_parser> atomic_peek) const {
|
||||
auto open = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix);
|
||||
bool matched_atomic = false;
|
||||
common_peg_parser func_parser = p.eps();
|
||||
|
||||
if (!function.name_suffix.empty()) {
|
||||
func_parser = open + call_id_section + p.space() + args;
|
||||
matched_atomic = true;
|
||||
} else if (have_call_id) {
|
||||
func_parser = p.atomic(open + call_id_section) + p.space() + args;
|
||||
matched_atomic = true;
|
||||
} else if (atomic_peek.has_value()) {
|
||||
func_parser = p.atomic(open + call_id_section + p.space() + *atomic_peek) + args;
|
||||
matched_atomic = true;
|
||||
} else {
|
||||
func_parser = open + call_id_section + p.space() + args;
|
||||
}
|
||||
|
||||
if (!function.close.empty()) {
|
||||
func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
|
||||
} else if (!format.per_call_end.empty()) {
|
||||
// When there's no func_close but there is a per_call_end marker, use peek() to ensure
|
||||
// we only emit tool_close when we can actually see the closing marker. This prevents
|
||||
// premature closing during partial parsing when we've seen e.g. "</" which could be
|
||||
// either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
|
||||
func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
|
||||
} else {
|
||||
func_parser = func_parser + p.tool_close(p.space()); // force this to process tool closing callbacks in mapper
|
||||
}
|
||||
if (!matched_atomic) {
|
||||
func_parser = p.atomic(func_parser);
|
||||
}
|
||||
return func_parser;
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
|
|
@ -227,17 +369,27 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
|
|||
const auto & schema = func.contains("parameters") ? func.at("parameters") : json::object();
|
||||
|
||||
// Build call_id parser based on position (if supported)
|
||||
bool have_call_id = false;
|
||||
common_peg_parser call_id_section = p.eps();
|
||||
if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
|
||||
!call_id.suffix.empty()) {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
|
||||
(!call_id.suffix.empty() || !arguments.start.empty())) {
|
||||
if (!call_id.suffix.empty()) {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
|
||||
} else {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
|
||||
}
|
||||
have_call_id = true;
|
||||
}
|
||||
auto args_parser = p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
|
||||
if (!arguments.start.empty()) {
|
||||
args_parser = p.literal(arguments.start) + args_parser;
|
||||
}
|
||||
if (!arguments.end.empty()) {
|
||||
args_parser = args_parser + p.literal(arguments.end);
|
||||
}
|
||||
|
||||
auto func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
|
||||
call_id_section + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
|
||||
if (!function.close.empty()) {
|
||||
func_parser = func_parser + function.close;
|
||||
}
|
||||
auto atomic_peek = !arguments.start.empty() ? std::optional(p.peek(p.literal(arguments.start))) : std::nullopt;
|
||||
auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_parser, atomic_peek);
|
||||
tool_choice |= p.rule("tool-" + name, func_parser);
|
||||
});
|
||||
|
||||
|
|
@ -297,12 +449,34 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
|||
for (const auto & [param_name, param_schema] : properties.items()) {
|
||||
bool is_required = required.find(param_name) != required.end();
|
||||
std::string type = "object";
|
||||
auto type_obj = param_schema.contains("type") ? param_schema.at("type") : json::object();
|
||||
if (type_obj.is_string()) {
|
||||
type_obj.get_to(type);
|
||||
} else if (type_obj.is_object()) {
|
||||
if (type_obj.contains("type") && type_obj.at("type").is_string()) {
|
||||
type_obj.at("type").get_to(type);
|
||||
if (param_schema.contains("type")) {
|
||||
const auto & type_obj = param_schema.at("type");
|
||||
if (type_obj.is_string()) {
|
||||
type_obj.get_to(type);
|
||||
} else if (type_obj.is_array()) {
|
||||
// Handle nullable types like ["string", "null"]
|
||||
for (const auto & t : type_obj) {
|
||||
if (t.is_string() && t.get<std::string>() != "null") {
|
||||
type = t.get<std::string>();
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (type_obj.is_object()) {
|
||||
if (type_obj.contains("type") && type_obj.at("type").is_string()) {
|
||||
type_obj.at("type").get_to(type);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Infer string type from enum values when type is unspecified
|
||||
if (type == "object" && param_schema.contains("enum")) {
|
||||
const auto & enum_vals = param_schema.at("enum");
|
||||
if (enum_vals.is_array()) {
|
||||
for (const auto & v : enum_vals) {
|
||||
if (v.is_string()) {
|
||||
type = "string";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -345,52 +519,31 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
|||
args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
|
||||
}
|
||||
|
||||
if (!arguments.start.empty()) {
|
||||
args_seq = p.literal(arguments.start) + args_seq;
|
||||
}
|
||||
if (!arguments.end.empty()) {
|
||||
args_seq = args_seq + p.literal(arguments.end);
|
||||
}
|
||||
|
||||
// Build call_id parser based on position (if supported)
|
||||
common_peg_parser call_id_section = p.eps();
|
||||
bool have_call_id = false;
|
||||
if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
|
||||
!call_id.suffix.empty()) {
|
||||
(!call_id.suffix.empty() || !arguments.start.empty())) {
|
||||
have_call_id = true;
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
|
||||
}
|
||||
|
||||
bool matched_atomic = false;
|
||||
common_peg_parser func_parser = p.eps();
|
||||
if (!function.name_suffix.empty()) {
|
||||
func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
|
||||
call_id_section + p.space() + args_seq;
|
||||
matched_atomic = true;
|
||||
} else if (have_call_id) {
|
||||
func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
|
||||
call_id_section) + p.space() + args_seq;
|
||||
matched_atomic = true;
|
||||
} else if (!arguments.name_prefix.empty() && !required_parsers.empty()) {
|
||||
// Only peek for an arg tag when there are required args that must follow.
|
||||
// When all args are optional, the model may emit no arg tags at all (#20650).
|
||||
func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
|
||||
call_id_section + p.space() + p.peek(p.literal(arguments.name_prefix))) + args_seq;
|
||||
matched_atomic = true;
|
||||
} else {
|
||||
func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
|
||||
call_id_section + p.space() + args_seq;
|
||||
}
|
||||
|
||||
if (!function.close.empty()) {
|
||||
func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
|
||||
} else if (!format.per_call_end.empty()) {
|
||||
// When there's no func_close but there is a per_call_end marker, use peek() to ensure
|
||||
// we only emit tool_close when we can actually see the closing marker. This prevents
|
||||
// premature closing during partial parsing when we've seen e.g. "</" which could be
|
||||
// either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
|
||||
func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
|
||||
} else {
|
||||
func_parser =
|
||||
func_parser + p.tool_close(p.space()); // force this to process tool closing callbacks in mapper
|
||||
}
|
||||
if (!matched_atomic) {
|
||||
func_parser = p.atomic(func_parser);
|
||||
if (!call_id.suffix.empty()) {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
|
||||
} else {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
|
||||
}
|
||||
}
|
||||
|
||||
// Only peek for an arg tag when there are required args that must follow.
|
||||
// When all args are optional, the model may emit no arg tags at all (#20650).
|
||||
auto atomic_peek = (!arguments.name_prefix.empty() && !required_parsers.empty()) ?
|
||||
std::optional(p.peek(p.literal(arguments.name_prefix))) : std::nullopt;
|
||||
auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_seq, atomic_peek);
|
||||
tool_choice |= p.rule("tool-" + name, func_parser);
|
||||
});
|
||||
|
||||
|
|
@ -433,4 +586,145 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
|||
p.end();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
common_peg_gemma4_builder g4(p);
|
||||
static const std::string QUOTE = "<|\"|>";
|
||||
|
||||
common_peg_parser tool_choice = p.choice();
|
||||
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & func = tool.at("function");
|
||||
std::string name = func.at("name");
|
||||
const auto & params = func.at("parameters");
|
||||
|
||||
if (!params.contains("properties") || !params.at("properties").is_object()) {
|
||||
auto func_parser = p.atomic(
|
||||
p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
|
||||
p.tool_args(p.eps()) +
|
||||
p.tool_close(p.literal("}")));
|
||||
tool_choice |= p.rule("tool-" + name, func_parser);
|
||||
return;
|
||||
}
|
||||
|
||||
const auto & properties = params.at("properties");
|
||||
std::set<std::string> required;
|
||||
if (params.contains("required") && params.at("required").is_array()) {
|
||||
params.at("required").get_to(required);
|
||||
}
|
||||
|
||||
// Build per-argument parsers, sorted alphabetically (matching template's dictsort)
|
||||
struct arg_entry {
|
||||
std::string param_name;
|
||||
common_peg_parser parser;
|
||||
};
|
||||
std::vector<arg_entry> arg_entries;
|
||||
|
||||
for (const auto & [param_name, param_schema] : properties.items()) {
|
||||
std::string type = "object";
|
||||
if (param_schema.contains("type")) {
|
||||
const auto & type_v = param_schema.at("type");
|
||||
if (type_v.is_string()) {
|
||||
type_v.get_to(type);
|
||||
} else if (type_v.is_array()) {
|
||||
// Handle nullable types like ["string", "null"]
|
||||
for (const auto & t : type_v) {
|
||||
if (t.is_string() && t.get<std::string>() != "null") {
|
||||
type = t.get<std::string>();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Infer string type from enum values when type is unspecified
|
||||
if (type == "object" && param_schema.contains("enum")) {
|
||||
const auto & enum_vals = param_schema.at("enum");
|
||||
if (enum_vals.is_array()) {
|
||||
for (const auto & v : enum_vals) {
|
||||
if (v.is_string()) {
|
||||
type = "string";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
common_peg_parser value_parser = p.eps();
|
||||
if (type == "string") {
|
||||
// String values are delimited by <|"|>...<|"|>
|
||||
value_parser =
|
||||
p.literal(QUOTE) +
|
||||
p.tool_arg_string_value(p.schema(p.until(QUOTE),
|
||||
"tool-" + name + "-arg-" + param_name + "-schema", param_schema, true)) +
|
||||
p.literal(QUOTE);
|
||||
} else if (type == "number" || type == "integer") {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_number());
|
||||
} else if (type == "boolean") {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_bool());
|
||||
} else if (type == "null") {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_null());
|
||||
} else if (type == "object") {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_dict());
|
||||
} else if (type == "array") {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_array());
|
||||
} else {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_value());
|
||||
}
|
||||
|
||||
auto arg = p.tool_arg(
|
||||
p.tool_arg_open(p.tool_arg_name(p.literal(param_name)) + p.literal(":")) +
|
||||
value_parser +
|
||||
p.tool_arg_close(p.eps()));
|
||||
|
||||
arg_entries.push_back({param_name, p.rule("tool-" + name + "-arg-" + param_name, arg)});
|
||||
}
|
||||
|
||||
// Sort alphabetically to match Jinja's dictsort
|
||||
std::sort(arg_entries.begin(), arg_entries.end(), [](const auto & a, const auto & b) {
|
||||
return a.param_name < b.param_name;
|
||||
});
|
||||
|
||||
// Build arg sequence: any arg, then zero-or-more comma-separated additional args
|
||||
common_peg_parser args_seq = p.eps();
|
||||
if (!arg_entries.empty()) {
|
||||
common_peg_parser any_arg = p.choice();
|
||||
for (auto & entry : arg_entries) {
|
||||
any_arg |= entry.parser;
|
||||
}
|
||||
args_seq = p.optional(
|
||||
any_arg + p.repeat(p.literal(",") + any_arg, 0, (int) arg_entries.size() - 1));
|
||||
}
|
||||
|
||||
// Full parser: call:name{args}
|
||||
auto func_parser = p.atomic(
|
||||
p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
|
||||
p.tool_args(args_seq) +
|
||||
p.tool_close(p.literal("}")));
|
||||
|
||||
tool_choice |= p.rule("tool-" + name, func_parser);
|
||||
});
|
||||
|
||||
// Wrap each call in <|tool_call>...</tool_call|>
|
||||
auto wrapped_call = p.literal(format.per_call_start) + tool_choice + p.literal(format.per_call_end);
|
||||
|
||||
common_peg_parser tool_calls = p.eps();
|
||||
if (inputs.parallel_tool_calls) {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
|
||||
} else {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call);
|
||||
}
|
||||
|
||||
if (!force_tools) {
|
||||
tool_calls = p.optional(tool_calls);
|
||||
}
|
||||
|
||||
auto content_before_tools = p.until_one_of({ format.per_call_start, ctx.reasoning->start });
|
||||
return ctx.reasoning_parser +
|
||||
(force_tools ? p.eps() : p.optional(p.content(content_before_tools) + p.optional(ctx.reasoning_parser))) +
|
||||
tool_calls + p.end();
|
||||
}
|
||||
|
||||
} // namespace autoparser
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "chat-auto-parser.h"
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
#include "common.h"
|
||||
#include "jinja/caps.h"
|
||||
#include "peg-parser.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
|
||||
#include <chrono>
|
||||
#include <optional>
|
||||
|
|
@ -144,6 +145,7 @@ enum class tool_format {
|
|||
JSON_NATIVE, // Pure JSON: {"name": "X", "arguments": {...}}
|
||||
TAG_WITH_JSON, // Tag-based with JSON args: <function=X>{...}</function>
|
||||
TAG_WITH_TAGGED, // Tag-based with tagged args: <param=key>value</param>
|
||||
TAG_WITH_GEMMA4_DICT, // Gemma4 custom dict: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
|
||||
};
|
||||
|
||||
inline std::ostream & operator<<(std::ostream & os, const tool_format & format) {
|
||||
|
|
@ -156,6 +158,8 @@ inline std::ostream & operator<<(std::ostream & os, const tool_format & format)
|
|||
return os << "TAG_WITH_JSON";
|
||||
case tool_format::TAG_WITH_TAGGED:
|
||||
return os << "TAG_WITH_TAGGED";
|
||||
case tool_format::TAG_WITH_GEMMA4_DICT:
|
||||
return os << "TAG_WITH_GEMMA4_DICT";
|
||||
default:
|
||||
return os << "UNKNOWN";
|
||||
}
|
||||
|
|
@ -212,12 +216,14 @@ struct tool_id_analysis {
|
|||
// ============================================================================
|
||||
|
||||
struct analyze_content;
|
||||
struct analyze_reasoning;
|
||||
|
||||
struct parser_build_context {
|
||||
common_chat_peg_builder & p;
|
||||
const generation_params & inputs;
|
||||
const generation_params & inputs;
|
||||
common_peg_parser reasoning_parser;
|
||||
bool extracting_reasoning = false;
|
||||
const analyze_reasoning * reasoning = nullptr;
|
||||
const analyze_content * content = nullptr;
|
||||
|
||||
parser_build_context(common_chat_peg_builder & p, const generation_params & inputs);
|
||||
|
|
@ -350,6 +356,14 @@ struct analyze_tools : analyze_base {
|
|||
common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
|
||||
common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
|
||||
common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
|
||||
|
||||
// Shared helper: builds func_parser from open+call_id+args, handling atomic wrapping and close.
|
||||
// atomic_peek: if present, used as the peek expression in the third atomicity branch.
|
||||
common_peg_parser build_func_parser(common_chat_peg_builder & p, const std::string & name,
|
||||
const common_peg_parser & call_id_section, bool have_call_id,
|
||||
const common_peg_parser & args,
|
||||
std::optional<common_peg_parser> atomic_peek) const;
|
||||
common_peg_parser build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const;
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
|
|
|
|||
|
|
@ -25,6 +25,9 @@ static const std::string ARG_SECOND = "BB_ARG_SND_BB";
|
|||
static const std::string USER_MSG = "U_USER_MSG Hello END_U";
|
||||
static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A";
|
||||
static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R";
|
||||
static const std::string CALL_ID_001 = "call00001";
|
||||
static const std::string CALL_ID_002 = "call00002";
|
||||
static const std::string CALL_ID_999 = "call99999";
|
||||
|
||||
static std::vector<std::function<void(const common_chat_template & tmpl, autoparser &)>> workarounds(
|
||||
{ // Old reasoning Qwen templates - they don't really display reasoning content, but we still want to
|
||||
|
|
@ -92,6 +95,34 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
|||
LOG_DBG(ANSI_ORANGE "[Patch: Functionary 3.1]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// Gemma4 - custom dict format: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("'<|tool_call>call:'") != std::string::npos) {
|
||||
analysis.tools.format.mode = tool_format::TAG_WITH_GEMMA4_DICT;
|
||||
analysis.tools.format.per_call_start = "<|tool_call>";
|
||||
analysis.tools.format.per_call_end = "<tool_call|>";
|
||||
analysis.tools.format.section_start = "";
|
||||
analysis.tools.format.section_end = "";
|
||||
analysis.tools.function.name_prefix = "call:";
|
||||
analysis.tools.function.name_suffix = "";
|
||||
analysis.tools.arguments.start = "{";
|
||||
analysis.tools.arguments.end = "}";
|
||||
analysis.tools.arguments.name_prefix = "";
|
||||
analysis.tools.arguments.name_suffix = ":";
|
||||
analysis.tools.arguments.separator = ",";
|
||||
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
|
||||
analysis.reasoning.start = "<|channel>thought";
|
||||
analysis.reasoning.end = "<channel|>";
|
||||
analysis.preserved_tokens.clear();
|
||||
analysis.preserved_tokens.push_back("<|tool_call>");
|
||||
analysis.preserved_tokens.push_back("<tool_call|>");
|
||||
analysis.preserved_tokens.push_back("<|tool_response>");
|
||||
analysis.preserved_tokens.push_back("<tool_response|>");
|
||||
analysis.preserved_tokens.push_back("<|\"|>");
|
||||
analysis.preserved_tokens.push_back("<|turn>");
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Gemma4]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// DeepSeek-R1-Distill-Qwen
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find(
|
||||
|
|
@ -103,6 +134,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
|||
analysis.tools.function.name_prefix = "<|tool▁sep|>";
|
||||
analysis.tools.format.per_call_end = "<|tool▁call▁end|>";
|
||||
analysis.tools.function.close = "```";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: DeepSeek-R1-Distill-Qwen]\n" ANSI_RESET);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
|
@ -130,7 +162,7 @@ static json user_msg = json{
|
|||
{ "content", USER_MSG }
|
||||
};
|
||||
|
||||
static json build_tool_call(const std::string & name, const json & args, const std::string & id = "call00001") {
|
||||
static json build_tool_call(const std::string & name, const json & args, const std::string & id = CALL_ID_001) {
|
||||
return json{
|
||||
{ "id", id },
|
||||
{ "type", "function" },
|
||||
|
|
@ -138,17 +170,17 @@ static json build_tool_call(const std::string & name, const json & args, const s
|
|||
};
|
||||
}
|
||||
|
||||
static json first_tool_call_zero_args = build_tool_call(FUN_FIRST, json::object(), "call00001");
|
||||
static json first_tool_call_one_arg = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "XXXX" }}, "call00001");
|
||||
static json first_tool_call_one_arg_other_val = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "YYYY" }}, "call00001");
|
||||
static json first_tool_call_other_arg = build_tool_call(FUN_FIRST, {{ ARG_SECOND, "YYYY" }}, "call00001");
|
||||
static json first_tool_call_zero_args = build_tool_call(FUN_FIRST, json::object(), CALL_ID_001);
|
||||
static json first_tool_call_one_arg = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "XXXX" }}, CALL_ID_001);
|
||||
static json first_tool_call_one_arg_other_val = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "YYYY" }}, CALL_ID_001);
|
||||
static json first_tool_call_other_arg = build_tool_call(FUN_FIRST, {{ ARG_SECOND, "YYYY" }}, CALL_ID_001);
|
||||
|
||||
static json first_tool_call =
|
||||
build_tool_call(FUN_FIRST, json{{ ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, "call00001");
|
||||
build_tool_call(FUN_FIRST, json{{ ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_001);
|
||||
static json second_tool_call =
|
||||
build_tool_call(FUN_SECOND, json{ { ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, "call00002");
|
||||
build_tool_call(FUN_SECOND, json{ { ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_002);
|
||||
static json first_tool_call_alt_id =
|
||||
build_tool_call(FUN_FIRST, json{{ ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, "call99999");
|
||||
build_tool_call(FUN_FIRST, json{{ ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_999);
|
||||
|
||||
template <typename T>
|
||||
static std::string mode_to_str(T mode) {
|
||||
|
|
@ -187,6 +219,11 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
|
|||
LOG_DBG("func_name_prefix: '%s'\n", tools.function.name_prefix.c_str());
|
||||
LOG_DBG("func_name_suffix: '%s'\n", tools.function.name_suffix.c_str());
|
||||
LOG_DBG("func_close: '%s'\n", tools.function.close.c_str());
|
||||
LOG_DBG("call_id_prefix: '%s'\n", tools.call_id.prefix.c_str());
|
||||
LOG_DBG("call_id_suffix: '%s'\n", tools.call_id.suffix.c_str());
|
||||
LOG_DBG("call_id_pos: '%s'\n", mode_to_str(tools.call_id.pos).c_str());
|
||||
LOG_DBG("args_start: '%s'\n", tools.arguments.start.c_str());
|
||||
LOG_DBG("args_end: '%s'\n", tools.arguments.end.c_str());
|
||||
LOG_DBG("arg_name_prefix: '%s'\n", tools.arguments.name_prefix.c_str());
|
||||
LOG_DBG("arg_name_suffix: '%s'\n", tools.arguments.name_suffix.c_str());
|
||||
LOG_DBG("arg_value_prefix: '%s'\n", tools.arguments.value_prefix.c_str());
|
||||
|
|
@ -555,12 +592,15 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
|
|||
if (caps.supports_parallel_tool_calls) {
|
||||
check_per_call_markers();
|
||||
}
|
||||
LOG_DBG(ANSI_ORANGE "Phase 3a: Function call analysis\n" ANSI_RESET);
|
||||
extract_function_markers();
|
||||
LOG_DBG(ANSI_ORANGE "Phase 3b: Argument analysis\n" ANSI_RESET);
|
||||
if (format.mode == tool_format::TAG_WITH_TAGGED) {
|
||||
analyze_arguments();
|
||||
}
|
||||
extract_argument_separator();
|
||||
extract_args_markers();
|
||||
LOG_DBG(ANSI_ORANGE "Phase 3c: Call id analysis\n" ANSI_RESET);
|
||||
extract_call_id_markers();
|
||||
}
|
||||
}
|
||||
|
|
@ -951,8 +991,6 @@ void analyze_tools::extract_function_markers() {
|
|||
}
|
||||
|
||||
void analyze_tools::analyze_arguments() {
|
||||
LOG_DBG(ANSI_ORANGE "Phase 4: Argument analysis\n" ANSI_RESET);
|
||||
|
||||
extract_argument_name_markers();
|
||||
extract_argument_value_markers();
|
||||
}
|
||||
|
|
@ -1161,7 +1199,7 @@ void analyze_tools::extract_args_markers() {
|
|||
|
||||
const auto & diff = comparison->diff;
|
||||
|
||||
if (format.mode != tool_format::JSON_NATIVE) {
|
||||
if (format.mode == tool_format::JSON_NATIVE) {
|
||||
std::string prefix_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
|
||||
std::string suffix_marker = !format.section_end.empty() ? format.section_end : format.per_call_end;
|
||||
// these might happen earlier in the tools section as an example or somewhere else, so we need to find the closest ones
|
||||
|
|
@ -1183,6 +1221,10 @@ void analyze_tools::extract_args_markers() {
|
|||
if (find_fun != std::string::npos) {
|
||||
args_start = args_start.substr(find_fun + FUN_FIRST.size(), args_start.size() - find_fun - FUN_FIRST.size());
|
||||
}
|
||||
size_t find_call_id = args_start.find(CALL_ID_001);
|
||||
if (find_call_id != std::string::npos) {
|
||||
args_start = args_start.substr(find_call_id + CALL_ID_001.size(), args_start.size() - find_call_id - CALL_ID_001.size());
|
||||
}
|
||||
arguments.start = args_start;
|
||||
arguments.end = args_end;
|
||||
}
|
||||
|
|
@ -1222,8 +1264,8 @@ void analyze_tools::extract_call_id_markers() {
|
|||
return;
|
||||
}
|
||||
|
||||
std::string id_value_1 = "call00001";
|
||||
std::string id_value_2 = "call99999";
|
||||
std::string id_value_1 = CALL_ID_001;
|
||||
std::string id_value_2 = CALL_ID_999;
|
||||
|
||||
size_t common_id_prefix_len = 0;
|
||||
for (size_t i = 0; i < std::min(id_value_1.length(), id_value_2.length()); i++) {
|
||||
|
|
@ -1322,6 +1364,14 @@ void analyze_tools::extract_call_id_markers() {
|
|||
call_id.suffix = find_first_marker(before_func);
|
||||
}
|
||||
|
||||
if (call_id.prefix == arguments.end) {
|
||||
call_id.prefix = "";
|
||||
}
|
||||
|
||||
if (call_id.suffix == arguments.start) {
|
||||
call_id.suffix = "";
|
||||
}
|
||||
|
||||
// When call_id is detected, per_call_end may have been incorrectly set to include
|
||||
// the call_id_suffix and sample args. Clear it if it starts with call_id_suffix.
|
||||
if (call_id.pos != call_id_position::NONE && !call_id.suffix.empty() &&
|
||||
|
|
|
|||
|
|
@ -75,6 +75,84 @@ static std::string escape_json_string_inner(const std::string & s) {
|
|||
return escaped;
|
||||
}
|
||||
|
||||
static const std::string GEMMA4_QUOTE = "<|\"|>";
|
||||
|
||||
static std::string normalize_gemma4_to_json(const std::string & input) {
|
||||
std::string result;
|
||||
result.reserve(input.size() * 2);
|
||||
|
||||
enum Ctx { DICT, ARRAY };
|
||||
std::vector<Ctx> ctx;
|
||||
|
||||
auto is_ws = [](char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; };
|
||||
auto skip_ws = [&](size_t & pos) {
|
||||
while (pos < input.size() && is_ws(input[pos])) {
|
||||
result += input[pos++];
|
||||
}
|
||||
};
|
||||
|
||||
auto quote_unquoted_key = [&](size_t & pos) {
|
||||
if (pos < input.size() && input[pos] != '"' && input[pos] != '}') {
|
||||
result += '"';
|
||||
while (pos < input.size() && input[pos] != ':' && !is_ws(input[pos])) {
|
||||
result += input[pos++];
|
||||
}
|
||||
result += '"';
|
||||
skip_ws(pos);
|
||||
}
|
||||
};
|
||||
|
||||
size_t i = 0;
|
||||
while (i < input.size()) {
|
||||
if (i + GEMMA4_QUOTE.size() <= input.size() &&
|
||||
input.compare(i, GEMMA4_QUOTE.size(), GEMMA4_QUOTE) == 0) {
|
||||
result += '"';
|
||||
i += GEMMA4_QUOTE.size();
|
||||
continue;
|
||||
}
|
||||
|
||||
char c = input[i];
|
||||
|
||||
if (c == '{') {
|
||||
result += c;
|
||||
ctx.push_back(DICT);
|
||||
++i;
|
||||
skip_ws(i);
|
||||
quote_unquoted_key(i);
|
||||
continue;
|
||||
}
|
||||
if (c == '}') {
|
||||
result += c;
|
||||
if (!ctx.empty()) ctx.pop_back();
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
if (c == '[') {
|
||||
result += c;
|
||||
ctx.push_back(ARRAY);
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
if (c == ']') {
|
||||
result += c;
|
||||
if (!ctx.empty()) ctx.pop_back();
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
if (c == ',' && !ctx.empty() && ctx.back() == DICT) {
|
||||
result += c;
|
||||
++i;
|
||||
skip_ws(i);
|
||||
quote_unquoted_key(i);
|
||||
continue;
|
||||
}
|
||||
|
||||
result += c;
|
||||
++i;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Convert Python-style single-quoted strings to JSON double-quoted strings
|
||||
// Only converts outer string delimiters, properly handling escape sequences:
|
||||
// - {'key': 'value'} -> {"key": "value"}
|
||||
|
|
@ -214,6 +292,14 @@ std::string & common_chat_peg_mapper::args_target() {
|
|||
return (current_tool && !current_tool->name.empty()) ? current_tool->arguments : args_buffer;
|
||||
}
|
||||
|
||||
std::string common_chat_peg_mapper::normalize_container_value(const std::string & input) {
|
||||
return normalize_quotes_to_json(input);
|
||||
}
|
||||
|
||||
std::string common_chat_peg_gemma4_mapper::normalize_container_value(const std::string & input) {
|
||||
return normalize_quotes_to_json(normalize_gemma4_to_json(input));
|
||||
}
|
||||
|
||||
void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena,
|
||||
const common_peg_parse_result & parse_result_arg) {
|
||||
arena.visit(parse_result_arg, [this](const common_peg_ast_node & node) { map(node); });
|
||||
|
|
@ -352,7 +438,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
|
|||
// For potential containers, normalize Python-style single quotes to JSON double quotes
|
||||
bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
|
||||
if (is_potential_container) {
|
||||
value_content = normalize_quotes_to_json(value_content);
|
||||
value_content = normalize_container_value(value_content);
|
||||
}
|
||||
|
||||
// Try to parse as JSON value (number, bool, null, object, array)
|
||||
|
|
|
|||
|
|
@ -17,7 +17,9 @@ class common_chat_peg_mapper {
|
|||
|
||||
virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
|
||||
virtual void map(const common_peg_ast_node & node);
|
||||
private:
|
||||
protected:
|
||||
virtual std::string normalize_container_value(const std::string & input);
|
||||
private:
|
||||
// Tool call handling state
|
||||
std::optional<common_chat_tool_call> pending_tool_call; // Tool call waiting for name
|
||||
common_chat_tool_call * current_tool = nullptr;
|
||||
|
|
@ -30,6 +32,13 @@ class common_chat_peg_mapper {
|
|||
std::string & args_target();
|
||||
};
|
||||
|
||||
class common_chat_peg_gemma4_mapper : public common_chat_peg_mapper {
|
||||
public:
|
||||
common_chat_peg_gemma4_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
|
||||
protected:
|
||||
std::string normalize_container_value(const std::string & input) override;
|
||||
};
|
||||
|
||||
struct content_structure;
|
||||
struct tool_call_structure;
|
||||
|
||||
|
|
|
|||
123
common/chat.cpp
123
common/chat.cpp
|
|
@ -13,6 +13,8 @@
|
|||
#include "jinja/caps.h"
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include "nlohmann/json.hpp"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
|
|
@ -694,6 +696,8 @@ const char * common_chat_format_name(common_chat_format format) {
|
|||
return "peg-simple";
|
||||
case COMMON_CHAT_FORMAT_PEG_NATIVE:
|
||||
return "peg-native";
|
||||
case COMMON_CHAT_FORMAT_PEG_GEMMA4:
|
||||
return "peg-gemma4";
|
||||
default:
|
||||
throw std::runtime_error("Unknown chat format");
|
||||
}
|
||||
|
|
@ -760,12 +764,12 @@ static void foreach_parameter(const json &
|
|||
}
|
||||
}
|
||||
|
||||
std::string common_chat_template_direct_apply(
|
||||
static std::string common_chat_template_direct_apply_impl(
|
||||
const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs,
|
||||
const std::optional<json> & messages_override,
|
||||
const std::optional<json> & tools_override,
|
||||
const std::optional<json> & additional_context) {
|
||||
const std::optional<json> & messages_override = std::nullopt,
|
||||
const std::optional<json> & tools_override = std::nullopt,
|
||||
const std::optional<json> & additional_context = std::nullopt) {
|
||||
jinja::context ctx(tmpl.source());
|
||||
|
||||
nlohmann::ordered_json inp = nlohmann::ordered_json{
|
||||
|
|
@ -812,6 +816,12 @@ std::string common_chat_template_direct_apply(
|
|||
return result;
|
||||
}
|
||||
|
||||
std::string common_chat_template_direct_apply(
|
||||
const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs) {
|
||||
return common_chat_template_direct_apply_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt);
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
|
@ -862,7 +872,7 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
|
|||
data.supports_thinking = true;
|
||||
data.thinking_start_tag = "[THINK]";
|
||||
data.thinking_end_tag = "[/THINK]";
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override = */ adjusted_messages);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = {
|
||||
"[THINK]",
|
||||
|
|
@ -945,7 +955,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
|||
adjusted_messages.push_back(msg);
|
||||
}
|
||||
|
||||
auto prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
|
||||
auto prompt = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
|
||||
|
||||
// Check if we need to replace the return token with end token during
|
||||
// inference and without generation prompt. For more details see:
|
||||
|
|
@ -980,15 +990,19 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
|||
auto channel = p.literal("<|channel|>") + (p.literal("commentary") | p.literal("analysis"));
|
||||
auto constrain_type = p.chars("[A-Za-z0-9_-]", 1, -1);
|
||||
|
||||
// Occasionally, gpt-oss-20b will prefix channels with this commentary
|
||||
auto stray_commentary = p.optional(p.literal("<|channel|>commentary") + p.optional(p.literal(" to=assistant")));
|
||||
auto start_analysis = stray_commentary + p.literal("<|channel|>analysis<|message|>");
|
||||
|
||||
if (extract_reasoning) {
|
||||
p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end);
|
||||
p.rule("analysis", start_analysis + p.reasoning(content) + end);
|
||||
} else {
|
||||
p.rule("analysis", p.content(p.literal("<|channel|>analysis<|message|>") + content + end));
|
||||
p.rule("analysis", p.content(start_analysis + content + end));
|
||||
}
|
||||
|
||||
auto analysis = p.ref("analysis");
|
||||
auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end);
|
||||
auto final_msg = p.rule("final", p.literal("<|channel|>final<|message|>") + p.content(content));
|
||||
auto final_msg = p.rule("final", stray_commentary + p.literal("<|channel|>final<|message|>") + p.content(content));
|
||||
|
||||
// Consume any unsolicited tool calls, e.g. builtin functions
|
||||
auto unsolicited = p.rule("unsolicited", p.atomic(p.optional(channel) + p.literal(" to=") + content + end));
|
||||
|
|
@ -996,7 +1010,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
|||
auto any = p.rule("any", preamble | analysis);
|
||||
|
||||
if (has_response_format) {
|
||||
auto constraint = p.optional(p.space() + p.literal("<|constrain|>") + constrain_type);
|
||||
auto constraint = p.optional(p.space() + p.optional(p.literal("<|constrain|>")) + constrain_type);
|
||||
auto response_format = p.rule("response-format",
|
||||
p.literal("<|channel|>final") + constraint + p.literal("<|message|>") +
|
||||
p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
|
||||
|
|
@ -1013,7 +1027,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
|||
const auto & params = function.at("parameters");
|
||||
|
||||
auto func_name = p.literal(" to=functions.") + p.tool_name(p.literal(name));
|
||||
auto constraint = p.optional(p.space() + p.literal("<|constrain|>") + constrain_type);
|
||||
auto constraint = p.optional(p.space() + p.optional(p.literal("<|constrain|>")) + constrain_type);
|
||||
auto args = p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", params));
|
||||
|
||||
// recipient in role header
|
||||
|
|
@ -1054,6 +1068,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
|||
|
||||
data.grammar_triggers = {
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "^\\s+to$" },
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "^<\\|channel\\|>(?:commentary|analysis)\\s+to=functions$" },
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "<\\|start\\|>assistant(\\s+to)" },
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "<\\|start\\|>assistant(<\\|channel\\|>(?:commentary|analysis)\\s+to)" }
|
||||
};
|
||||
|
|
@ -1067,7 +1082,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
|
|||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = {
|
||||
">>>all",
|
||||
|
|
@ -1161,7 +1176,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
|
|||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.preserved_tokens = {
|
||||
|
|
@ -1284,7 +1299,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
|
|||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.preserved_tokens = {
|
||||
|
|
@ -1363,7 +1378,7 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ
|
|||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.preserved_tokens = {
|
||||
|
|
@ -1434,7 +1449,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
|
|||
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = false;
|
||||
data.preserved_tokens = {
|
||||
|
|
@ -1540,6 +1555,50 @@ static void requires_non_null_content(json & messages) {
|
|||
}
|
||||
}
|
||||
|
||||
// Gemma4 uses a custom tool_responses field instead of role:tool messages.
|
||||
// Convert consecutive role:tool messages into a single user message with tool_responses.
|
||||
static void convert_tool_responses_gemma4(json & messages) {
|
||||
json result = json::array();
|
||||
size_t i = 0;
|
||||
while (i < messages.size()) {
|
||||
if (messages[i].contains("role") && messages[i].at("role") == "tool") {
|
||||
json tool_responses = json::array();
|
||||
while (i < messages.size() &&
|
||||
messages[i].contains("role") &&
|
||||
messages[i].at("role") == "tool") {
|
||||
const auto & tool_msg = messages[i];
|
||||
std::string name;
|
||||
if (tool_msg.contains("tool_call_id") && tool_msg.at("tool_call_id").is_string()) {
|
||||
name = tool_msg.at("tool_call_id");
|
||||
} else if (tool_msg.contains("name") && tool_msg.at("name").is_string()) {
|
||||
name = tool_msg.at("name");
|
||||
}
|
||||
json response;
|
||||
if (tool_msg.contains("content")) {
|
||||
const auto & content = tool_msg.at("content");
|
||||
if (content.is_string()) {
|
||||
// Try to parse the content as JSON; fall back to raw string
|
||||
try {
|
||||
response = json::parse(content.get<std::string>());
|
||||
} catch (...) {
|
||||
response = content;
|
||||
}
|
||||
} else {
|
||||
response = content;
|
||||
}
|
||||
}
|
||||
tool_responses.push_back({{"name", name}, {"response", response}});
|
||||
i++;
|
||||
}
|
||||
result.push_back({{"role", "user"}, {"tool_responses", tool_responses}});
|
||||
} else {
|
||||
result.push_back(messages[i]);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
messages = result;
|
||||
}
|
||||
|
||||
static void func_args_not_string(json & messages) {
|
||||
GGML_ASSERT(messages.is_array());
|
||||
for (auto & message : messages) {
|
||||
|
|
@ -1572,7 +1631,7 @@ static json common_chat_extra_context() {
|
|||
return ctx;
|
||||
}
|
||||
|
||||
static std::optional<common_chat_params> try_specialized_template(
|
||||
std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
const common_chat_template & tmpl,
|
||||
const std::string & src,
|
||||
const autoparser::generation_params & params) {
|
||||
|
|
@ -1668,10 +1727,14 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
|||
workaround::func_args_not_string(params.messages);
|
||||
}
|
||||
|
||||
if (src.find("'<|tool_call>call:'") != std::string::npos) {
|
||||
workaround::convert_tool_responses_gemma4(params.messages);
|
||||
}
|
||||
|
||||
params.add_generation_prompt = false;
|
||||
std::string no_gen_prompt = common_chat_template_direct_apply(tmpl, params);
|
||||
std::string no_gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
|
||||
params.add_generation_prompt = true;
|
||||
std::string gen_prompt = common_chat_template_direct_apply(tmpl, params);
|
||||
std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
|
||||
auto diff = calculate_diff_split(no_gen_prompt, gen_prompt);
|
||||
params.generation_prompt = diff.right;
|
||||
|
||||
|
|
@ -1705,7 +1768,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
|||
common_chat_params data;
|
||||
auto params_copy = params;
|
||||
params_copy.reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, params_copy);
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, params_copy);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.generation_prompt = params.generation_prompt;
|
||||
auto parser = build_chat_peg_parser([¶ms](common_chat_peg_builder &p) {
|
||||
|
|
@ -1715,7 +1778,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
|||
return data;
|
||||
}
|
||||
|
||||
if (auto result = try_specialized_template(tmpl, src, params)) {
|
||||
if (auto result = common_chat_try_specialized_template(tmpl, src, params)) {
|
||||
result->generation_prompt = params.generation_prompt;
|
||||
return *result;
|
||||
}
|
||||
|
|
@ -1852,8 +1915,13 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
|||
// Try to extract any partial results from what was successfully parsed
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
auto mapper = common_chat_peg_mapper(msg);
|
||||
mapper.from_ast(ctx.ast, result);
|
||||
std::unique_ptr<common_chat_peg_mapper> mapper;
|
||||
if (params.format == COMMON_CHAT_FORMAT_PEG_GEMMA4) {
|
||||
mapper = std::make_unique<common_chat_peg_gemma4_mapper>(msg);
|
||||
} else {
|
||||
mapper = std::make_unique<common_chat_peg_mapper>(msg);
|
||||
}
|
||||
mapper->from_ast(ctx.ast, result);
|
||||
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "\nAST for partial parse (fail):\n%s\n", ctx.ast.dump().c_str());
|
||||
|
|
@ -1868,8 +1936,13 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
|||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
|
||||
auto mapper = common_chat_peg_mapper(msg);
|
||||
mapper.from_ast(ctx.ast, result);
|
||||
std::unique_ptr<common_chat_peg_mapper> mapper;
|
||||
if (params.format == COMMON_CHAT_FORMAT_PEG_GEMMA4) {
|
||||
mapper = std::make_unique<common_chat_peg_gemma4_mapper>(msg);
|
||||
} else {
|
||||
mapper = std::make_unique<common_chat_peg_mapper>(msg);
|
||||
}
|
||||
mapper->from_ast(ctx.ast, result);
|
||||
|
||||
if (ctx.is_debug()) {
|
||||
fprintf(stderr, "\nAST for %s parse:\n%s\n", is_partial ? "partial" : "full", ctx.ast.dump().c_str());
|
||||
|
|
|
|||
|
|
@ -3,12 +3,12 @@
|
|||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include "jinja/parser.h"
|
||||
#include "nlohmann/json_fwd.hpp"
|
||||
#include "peg-parser.h"
|
||||
#include "jinja/parser.h"
|
||||
#include "jinja/runtime.h"
|
||||
#include "jinja/caps.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
|
||||
#include "nlohmann/json_fwd.hpp"
|
||||
|
||||
#include <chrono>
|
||||
#include <functional>
|
||||
|
|
@ -19,8 +19,6 @@
|
|||
using chat_template_caps = jinja::caps;
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
#include <nlohmann/json_fwd.hpp>
|
||||
|
||||
struct common_chat_templates;
|
||||
|
||||
namespace autoparser {
|
||||
|
|
@ -75,41 +73,9 @@ struct common_chat_template {
|
|||
const std::string & bos_token() const { return bos_tok; }
|
||||
const std::string & eos_token() const { return eos_tok; }
|
||||
|
||||
// TODO: this is ugly, refactor it somehow
|
||||
json add_system(const json & messages, const std::string & system_prompt) const {
|
||||
GGML_ASSERT(messages.is_array());
|
||||
auto msgs_copy = messages;
|
||||
if (!caps.supports_system_role) {
|
||||
if (msgs_copy.empty()) {
|
||||
msgs_copy.insert(msgs_copy.begin(), json{
|
||||
{"role", "user"},
|
||||
{"content", system_prompt}
|
||||
});
|
||||
} else {
|
||||
auto & first_msg = msgs_copy[0];
|
||||
if (!first_msg.contains("content")) {
|
||||
first_msg["content"] = "";
|
||||
}
|
||||
first_msg["content"] = system_prompt + "\n\n"
|
||||
+ first_msg["content"].get<std::string>();
|
||||
}
|
||||
} else {
|
||||
if (msgs_copy.empty() || msgs_copy[0].at("role") != "system") {
|
||||
msgs_copy.insert(msgs_copy.begin(), json{
|
||||
{"role", "system"},
|
||||
{"content", system_prompt}
|
||||
});
|
||||
} else if (msgs_copy[0].at("role") == "system") {
|
||||
msgs_copy[0]["content"] = system_prompt;
|
||||
}
|
||||
}
|
||||
return msgs_copy;
|
||||
}
|
||||
|
||||
chat_template_caps original_caps() const {
|
||||
return caps;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct common_chat_msg {
|
||||
|
|
@ -184,6 +150,7 @@ enum common_chat_format {
|
|||
// These are intended to be parsed by the PEG parser
|
||||
COMMON_CHAT_FORMAT_PEG_SIMPLE,
|
||||
COMMON_CHAT_FORMAT_PEG_NATIVE,
|
||||
COMMON_CHAT_FORMAT_PEG_GEMMA4,
|
||||
|
||||
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
|
||||
};
|
||||
|
|
@ -256,8 +223,8 @@ common_chat_templates_ptr common_chat_templates_init(const struct llama_model *
|
|||
const std::string & bos_token_override = "",
|
||||
const std::string & eos_token_override = "");
|
||||
|
||||
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
|
||||
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
|
||||
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
|
||||
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
|
||||
|
||||
struct common_chat_params common_chat_templates_apply(const struct common_chat_templates * tmpls,
|
||||
const struct common_chat_templates_inputs & inputs);
|
||||
|
|
@ -274,9 +241,9 @@ std::string common_chat_format_example(const struct common_chat_templates *
|
|||
bool use_jinja,
|
||||
const std::map<std::string, std::string> & chat_template_kwargs);
|
||||
|
||||
const char * common_chat_format_name(common_chat_format format);
|
||||
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
|
||||
common_chat_msg common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);
|
||||
const char * common_chat_format_name(common_chat_format format);
|
||||
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
|
||||
common_chat_msg common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);
|
||||
|
||||
// used by arg and server
|
||||
const char * common_reasoning_format_name(common_reasoning_format format);
|
||||
|
|
@ -302,7 +269,9 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
|
|||
|
||||
std::string common_chat_template_direct_apply(
|
||||
const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs,
|
||||
const std::optional<json> & messages_override = std::nullopt,
|
||||
const std::optional<json> & tools_override = std::nullopt,
|
||||
const std::optional<json> & additional_context = std::nullopt);
|
||||
const autoparser::generation_params & inputs);
|
||||
|
||||
std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
const common_chat_template & tmpl,
|
||||
const std::string & src,
|
||||
const autoparser::generation_params & params);
|
||||
|
|
|
|||
|
|
@ -1442,6 +1442,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
|||
|
||||
mparams.progress_callback = params.load_progress_callback;
|
||||
mparams.progress_callback_user_data = params.load_progress_callback_user_data;
|
||||
mparams.no_alloc = params.no_alloc;
|
||||
|
||||
return mparams;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -579,8 +579,9 @@ struct common_params {
|
|||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
bool cache_prompt = true; // whether to enable prompt caching
|
||||
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
|
||||
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
|
||||
bool clear_idle = true; // save and clear idle slots upon starting a new task
|
||||
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
|
||||
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
|
||||
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
|
|
@ -679,6 +680,7 @@ struct common_params {
|
|||
// return false from callback to abort model loading or true to continue
|
||||
llama_progress_callback load_progress_callback = NULL;
|
||||
void * load_progress_callback_user_data = NULL;
|
||||
bool no_alloc = false; // Don't allocate model buffers
|
||||
};
|
||||
|
||||
// call once at the start of a program if it uses libcommon
|
||||
|
|
|
|||
|
|
@ -306,6 +306,19 @@ value filter_expression::execute_impl(context & ctx) {
|
|||
filter_id = "strip"; // alias
|
||||
}
|
||||
JJ_DEBUG("Applying filter '%s' to %s", filter_id.c_str(), input->type().c_str());
|
||||
// TODO: Refactor filters so this coercion can be done automatically
|
||||
if (!input->is_undefined() && !is_val<value_string>(input) && (
|
||||
filter_id == "capitalize" ||
|
||||
filter_id == "lower" ||
|
||||
filter_id == "replace" ||
|
||||
filter_id == "strip" ||
|
||||
filter_id == "title" ||
|
||||
filter_id == "upper" ||
|
||||
filter_id == "wordcount"
|
||||
)) {
|
||||
JJ_DEBUG("Coercing %s to String for '%s' filter", input->type().c_str(), filter_id.c_str());
|
||||
input = mk_val<value_string>(input->as_string());
|
||||
}
|
||||
return try_builtin_func(ctx, filter_id, input)->invoke(func_args(ctx));
|
||||
|
||||
} else if (is_stmt<call_expression>(filter)) {
|
||||
|
|
|
|||
|
|
@ -465,8 +465,9 @@ const func_builtins & value_int_t::get_builtins() const {
|
|||
double val = static_cast<double>(args.get_pos(0)->as_int());
|
||||
return mk_val<value_float>(val);
|
||||
}},
|
||||
{"tojson", tojson},
|
||||
{"safe", tojson},
|
||||
{"string", tojson},
|
||||
{"tojson", tojson},
|
||||
};
|
||||
return builtins;
|
||||
}
|
||||
|
|
@ -485,8 +486,9 @@ const func_builtins & value_float_t::get_builtins() const {
|
|||
int64_t val = static_cast<int64_t>(args.get_pos(0)->as_float());
|
||||
return mk_val<value_int>(val);
|
||||
}},
|
||||
{"tojson", tojson},
|
||||
{"safe", tojson},
|
||||
{"string", tojson},
|
||||
{"tojson", tojson},
|
||||
};
|
||||
return builtins;
|
||||
}
|
||||
|
|
@ -771,6 +773,11 @@ const func_builtins & value_string_t::get_builtins() const {
|
|||
|
||||
|
||||
const func_builtins & value_bool_t::get_builtins() const {
|
||||
static const func_handler tostring = [](const func_args & args) -> value {
|
||||
args.ensure_vals<value_bool>();
|
||||
bool val = args.get_pos(0)->as_bool();
|
||||
return mk_val<value_string>(val ? "True" : "False");
|
||||
};
|
||||
static const func_builtins builtins = {
|
||||
{"default", default_value},
|
||||
{"int", [](const func_args & args) -> value {
|
||||
|
|
@ -783,11 +790,8 @@ const func_builtins & value_bool_t::get_builtins() const {
|
|||
bool val = args.get_pos(0)->as_bool();
|
||||
return mk_val<value_float>(val ? 1.0 : 0.0);
|
||||
}},
|
||||
{"string", [](const func_args & args) -> value {
|
||||
args.ensure_vals<value_bool>();
|
||||
bool val = args.get_pos(0)->as_bool();
|
||||
return mk_val<value_string>(val ? "True" : "False");
|
||||
}},
|
||||
{"safe", tostring},
|
||||
{"string", tostring},
|
||||
{"tojson", tojson},
|
||||
};
|
||||
return builtins;
|
||||
|
|
@ -1100,18 +1104,14 @@ const func_builtins & value_object_t::get_builtins() const {
|
|||
}
|
||||
|
||||
const func_builtins & value_none_t::get_builtins() const {
|
||||
static const func_handler tostring = [](const func_args &) -> value {
|
||||
return mk_val<value_string>("None");
|
||||
};
|
||||
static const func_builtins builtins = {
|
||||
{"default", default_value},
|
||||
{"tojson", tojson},
|
||||
{"string", [](const func_args &) -> value {
|
||||
return mk_val<value_string>("None");
|
||||
}},
|
||||
{"safe", [](const func_args &) -> value {
|
||||
return mk_val<value_string>("None");
|
||||
}},
|
||||
{"strip", [](const func_args &) -> value {
|
||||
return mk_val<value_string>("None");
|
||||
}},
|
||||
{"string", tostring},
|
||||
{"safe", tostring},
|
||||
{"items", empty_value_fn<value_array>},
|
||||
{"map", empty_value_fn<value_array>},
|
||||
{"reject", empty_value_fn<value_array>},
|
||||
|
|
|
|||
|
|
@ -1557,6 +1557,52 @@ static std::unordered_set<std::string> collect_reachable_rules(
|
|||
|
||||
// GBNF generation implementation
|
||||
void common_peg_arena::build_grammar(const common_grammar_builder & builder, bool lazy) const {
|
||||
auto schema_delegates = [](const common_peg_schema_parser & s) -> bool {
|
||||
if (!s.schema) {
|
||||
return true;
|
||||
}
|
||||
if (s.raw && s.schema->contains("type")) {
|
||||
const auto & type_val = s.schema->at("type");
|
||||
if (type_val.is_string() && type_val == "string") {
|
||||
return true;
|
||||
}
|
||||
// Handle nullable types like ["string", "null"] - delegate when the
|
||||
// non-null type is string, since the tagged format uses raw text
|
||||
if (type_val.is_array()) {
|
||||
for (const auto & t : type_val) {
|
||||
if (t.is_string() && t.get<std::string>() != "null") {
|
||||
return t.get<std::string>() == "string";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Delegate for enum schemas in raw mode - enum values are literal strings
|
||||
if (s.raw && !s.schema->contains("type") && s.schema->contains("enum")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
// Unwrap the parser so we can properly check if it's a sequence or choice
|
||||
auto effective_parser = [&](common_peg_parser_id id) -> const common_peg_parser_variant & {
|
||||
while (true) {
|
||||
const auto & p = parsers_.at(id);
|
||||
if (const auto * tag = std::get_if<common_peg_tag_parser>(&p)) {
|
||||
id = tag->child;
|
||||
} else if (const auto * atomic = std::get_if<common_peg_atomic_parser>(&p)) {
|
||||
id = atomic->child;
|
||||
} else if (const auto * schema = std::get_if<common_peg_schema_parser>(&p)) {
|
||||
if (schema_delegates(*schema)) {
|
||||
id = schema->child;
|
||||
} else {
|
||||
return p;
|
||||
}
|
||||
} else {
|
||||
return p;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Generate GBNF for a parser
|
||||
std::function<std::string(common_peg_parser_id)> to_gbnf = [&](common_peg_parser_id id) -> std::string {
|
||||
const auto & parser = parsers_.at(id);
|
||||
|
|
@ -1577,7 +1623,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
|
|||
s += " ";
|
||||
}
|
||||
auto child_gbnf = to_gbnf(child);
|
||||
const auto & child_parser = parsers_.at(child);
|
||||
const auto & child_parser = effective_parser(child);
|
||||
if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
|
||||
std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
|
||||
s += "(" + child_gbnf + ")";
|
||||
|
|
@ -1593,7 +1639,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
|
|||
s += " | ";
|
||||
}
|
||||
auto child_gbnf = to_gbnf(child);
|
||||
const auto & child_parser = parsers_.at(child);
|
||||
const auto & child_parser = effective_parser(child);
|
||||
if (std::holds_alternative<common_peg_choice_parser>(child_parser)) {
|
||||
s += "(" + child_gbnf + ")";
|
||||
} else {
|
||||
|
|
@ -1603,7 +1649,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
|
|||
return s;
|
||||
} else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
|
||||
auto child_gbnf = to_gbnf(p.child);
|
||||
const auto & child_parser = parsers_.at(p.child);
|
||||
const auto & child_parser = effective_parser(p.child);
|
||||
if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
|
||||
std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
|
||||
child_gbnf = "(" + child_gbnf + ")";
|
||||
|
|
@ -1663,15 +1709,10 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
|
|||
}
|
||||
return gbnf_excluding_pattern(p.delimiters);
|
||||
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
|
||||
if (p.schema) {
|
||||
if (p.raw && p.schema->contains("type") && p.schema->at("type").is_string() && p.schema->at("type") == "string") {
|
||||
// TODO: Implement more comprehensive grammar generation for raw strings.
|
||||
// For now, use the grammar emitted from the underlying parser.
|
||||
return to_gbnf(p.child);
|
||||
}
|
||||
return builder.add_schema(p.name, *p.schema);
|
||||
if (schema_delegates(p)) {
|
||||
return to_gbnf(p.child);
|
||||
}
|
||||
return to_gbnf(p.child);
|
||||
return builder.add_schema(p.name, *p.schema);
|
||||
} else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
|
||||
return p.name;
|
||||
} else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
|
||||
|
|
|
|||
|
|
@ -1164,7 +1164,7 @@ class TextModel(ModelBase):
|
|||
if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
|
||||
self.gguf_writer.add_expert_count(n_experts)
|
||||
logger.info(f"gguf: expert count = {n_experts}")
|
||||
if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None:
|
||||
if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None:
|
||||
self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||
logger.info(f"gguf: experts used count = {n_experts_used}")
|
||||
if (n_expert_groups := self.hparams.get("n_group")) is not None:
|
||||
|
|
@ -6878,7 +6878,9 @@ class Gemma2Model(TextModel):
|
|||
@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
|
||||
class Gemma3Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA3
|
||||
norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
|
||||
|
||||
def norm_shift(self, name: str) -> float:
|
||||
return 1.0 if name.endswith("norm.weight") else 0.0 # Gemma3RMSNorm adds 1.0 to the norm value
|
||||
|
||||
def set_vocab(self):
|
||||
if (self.dir_model / "tokenizer.model").is_file():
|
||||
|
|
@ -6916,17 +6918,22 @@ class Gemma3Model(TextModel):
|
|||
|
||||
# remove OOV (out-of-vocabulary) rows in token_embd
|
||||
if "embed_tokens.weight" in name:
|
||||
n_vocab_real = -1
|
||||
if (self.dir_model / "tokenizer.model").is_file():
|
||||
tokens = self._create_vocab_sentencepiece()[0]
|
||||
n_vocab_real = len(tokens)
|
||||
else:
|
||||
tokens = self.get_vocab_base()[0]
|
||||
data_torch = data_torch[:len(tokens)]
|
||||
with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
n_vocab_real = len(tokenizer_json["model"]["vocab"]) + len(tokenizer_json["added_tokens"])
|
||||
data_torch = data_torch[:n_vocab_real]
|
||||
|
||||
# ref code in Gemma3RMSNorm
|
||||
# output = output * (1.0 + self.weight.float())
|
||||
# note: this is not the case on gemma3n
|
||||
if name.endswith("norm.weight"):
|
||||
data_torch = data_torch + self.norm_shift
|
||||
f_shift = self.norm_shift(name)
|
||||
if f_shift != 0.0:
|
||||
data_torch = data_torch + f_shift
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
|
@ -7100,7 +7107,8 @@ class ConformerAudioModel(MmprojModel):
|
|||
assert data_torch.shape[2] == 1
|
||||
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
|
||||
yield (mapped_name, data_torch)
|
||||
|
||||
|
||||
@ModelBase.register("DeepseekOCRForCausalLM")
|
||||
|
|
@ -7289,7 +7297,6 @@ class Gemma3nVisionAudioModel(ConformerAudioModel):
|
|||
@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
|
||||
class Gemma3NModel(Gemma3Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA3N
|
||||
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
|
||||
|
||||
_altup_proj: list[Tensor] = []
|
||||
_altup_unembd: list[Tensor] = []
|
||||
|
|
@ -7308,6 +7315,10 @@ class Gemma3NModel(Gemma3Model):
|
|||
torch.Tensor(), # to be replaced
|
||||
]
|
||||
|
||||
def norm_shift(self, name: str) -> float:
|
||||
del name
|
||||
return 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
|
||||
|
||||
def set_vocab(self):
|
||||
# For Gemma3n multimodal models, we need the FULL vocab_size (262400)
|
||||
# which includes special tokens from 262144-262399 for vision/audio.
|
||||
|
|
@ -7425,6 +7436,209 @@ class Gemma3NModel(Gemma3Model):
|
|||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration")
|
||||
class Gemma4Model(Gemma3Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA4
|
||||
|
||||
def norm_shift(self, name: str) -> float:
|
||||
del name # unused
|
||||
return 0.0
|
||||
|
||||
def set_vocab(self):
|
||||
vocab = gguf.LlamaHfVocab(self.dir_model)
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
visible_tokens = {"<|channel>", "<channel|>", "<|tool_call>", "<tool_call|>", "<|tool_response>", "<tool_response|>", "<|\"|>"}
|
||||
|
||||
for text, score, toktype in vocab.all_tokens():
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
text_str = text.decode()
|
||||
if text_str in visible_tokens:
|
||||
# always render these tokens, so that the chat parser can read them
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
logger.info(f"Token '{text_str}' is set to USER_DEFINED")
|
||||
else:
|
||||
toktypes.append(toktype)
|
||||
|
||||
assert len(tokens) == vocab.vocab_size
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("gemma4")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
self.gguf_writer.add_add_bos_token(False) # already added via the chat template
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
num_kv_shared_layers = self.hparams["num_kv_shared_layers"]
|
||||
self.gguf_writer.add_shared_kv_layers(num_kv_shared_layers)
|
||||
|
||||
# per-layer embedding is optional
|
||||
n_pl_embd = self.hparams.get("hidden_size_per_layer_input") or 0
|
||||
self.gguf_writer.add_embedding_length_per_layer_input(n_pl_embd)
|
||||
|
||||
swa_layers = [t == "sliding_attention" for t in self.hparams["layer_types"]]
|
||||
self.gguf_writer.add_sliding_window_pattern(swa_layers)
|
||||
|
||||
head_dim_full = self.hparams["global_head_dim"]
|
||||
head_dim_swa = self.hparams["head_dim"]
|
||||
# correct the head dim for global/swa layers
|
||||
self.gguf_writer.add_key_length(head_dim_full)
|
||||
self.gguf_writer.add_value_length(head_dim_full)
|
||||
self.gguf_writer.add_key_length_swa(head_dim_swa)
|
||||
self.gguf_writer.add_value_length_swa(head_dim_swa)
|
||||
|
||||
expert_intermediate_size = self.find_hparam(["expert_intermediate_size", "moe_intermediate_size"])
|
||||
if expert_intermediate_size is not None:
|
||||
self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
|
||||
|
||||
# if use_double_wide_mlp is set, we need to adjust the value for kv shared layers
|
||||
use_double_wide_mlp = self.hparams.get("use_double_wide_mlp", False)
|
||||
first_kv_shared_layer_idx = self.block_count - num_kv_shared_layers
|
||||
if use_double_wide_mlp:
|
||||
n_ff = self.hparams["intermediate_size"]
|
||||
n_ff_arr = [n_ff if il < first_kv_shared_layer_idx else n_ff * 2 for il in range(self.block_count)]
|
||||
self.gguf_writer.add_feed_forward_length(n_ff_arr)
|
||||
|
||||
# handle num_global_key_value_heads
|
||||
num_key_value_heads_full = self.hparams.get("num_global_key_value_heads")
|
||||
num_key_value_heads_swa = self.hparams.get("num_key_value_heads")
|
||||
if num_key_value_heads_full is not None and num_key_value_heads_swa is not None:
|
||||
value_arr = [num_key_value_heads_swa if is_swa else num_key_value_heads_full for is_swa in swa_layers]
|
||||
self.gguf_writer.add_head_count_kv(value_arr)
|
||||
|
||||
# handle n_rot differently for global vs swa layers
|
||||
partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
|
||||
n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
|
||||
n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
|
||||
self.gguf_writer.add_rope_dimension_count(n_rot_full)
|
||||
self.gguf_writer.add_rope_dimension_count_swa(n_rot_swa)
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
# full layer uses "proportional" rope with partial_rotary_factor=0.25
|
||||
# the expected ordering is cc000000ss000000 (c = cos, s = sin, 0 = unrotated),
|
||||
# but ggml neox only supports ccss000000000000, and we cannot rearrange the head because that will break use_alternative_attention
|
||||
# solution is to set specific freq_factors for the unrotated dims
|
||||
|
||||
# IMPORTANT: this ROPE_FREQS tensor is ONLY used by the full_attention layers
|
||||
rope_params_full = self.hparams["rope_parameters"]["full_attention"]
|
||||
assert rope_params_full["rope_type"] == "proportional"
|
||||
head_dim_full = (self.hparams["global_head_dim"])
|
||||
partial_rotary_factor_full = rope_params_full["partial_rotary_factor"]
|
||||
n_rot_full = int(head_dim_full * partial_rotary_factor_full / 2)
|
||||
n_unrot_full = int(head_dim_full / 2) - n_rot_full
|
||||
values = [1.0] * n_rot_full + [1e30] * n_unrot_full
|
||||
rope_freqs_full = torch.tensor(values, dtype=torch.float32)
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.endswith("per_dim_scale") or name.endswith("layer_scalar"):
|
||||
name = name + ".weight"
|
||||
|
||||
if "language_model." not in name and "rope_freqs" not in name:
|
||||
return # skip non-language model tensors
|
||||
|
||||
name = name.replace("language_model.", "")
|
||||
if name.endswith("router.scale"):
|
||||
name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, ".scale")
|
||||
yield (name, data_torch)
|
||||
return
|
||||
if ".per_expert_scale" in name:
|
||||
# convert per-expert scale to FFN down scale
|
||||
name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN_EXP, bid, ".scale")
|
||||
yield (name, data_torch)
|
||||
return
|
||||
if ".experts." in name and not name.endswith(".weight"):
|
||||
name += ".weight"
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration")
|
||||
class Gemma4VisionAudioModel(MmprojModel):
|
||||
has_audio_encoder = True
|
||||
has_vision_encoder = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
self.hparams_vision["image_size"] = 224 # unused, but set to avoid error
|
||||
|
||||
# remap audio hparams
|
||||
if self.hparams_audio:
|
||||
self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128)
|
||||
self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4
|
||||
else:
|
||||
self.has_audio_encoder = False
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# vision params
|
||||
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
|
||||
|
||||
# audio params
|
||||
if self.hparams_audio:
|
||||
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
|
||||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
|
||||
|
||||
def is_audio_tensor(self, name: str) -> bool:
|
||||
return "audio_tower" in name or "embed_audio" in name
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
if self.is_audio_tensor(name):
|
||||
if ".conv" in name or "_conv" in name and ".weight" in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
if "position_embedding_table" in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
if name.startswith("model.language_model."):
|
||||
return # skip
|
||||
|
||||
if len(data_torch.shape) == 0:
|
||||
# convert scalar tensors (input/output_mix/max) to 1D tensors
|
||||
data_torch = data_torch.unsqueeze(0)
|
||||
|
||||
if self.is_audio_tensor(name):
|
||||
assert self.hparams_audio is not None
|
||||
name = name.replace("model.audio_tower.", "conformer.")
|
||||
name = name.replace(".linear.", ".")
|
||||
if name.endswith("per_dim_key_scale") or name.endswith("per_dim_scale"):
|
||||
name = name + ".weight"
|
||||
data_torch = torch.nn.functional.softplus(data_torch)
|
||||
if "lconv1d.depthwise_conv1d" in name and name.endswith(".weight"):
|
||||
assert data_torch.shape[1] == 1
|
||||
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
|
||||
mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
|
||||
yield (mapped_name, data_torch)
|
||||
|
||||
else:
|
||||
name = name.replace("model.vision_tower.encoder.", "vision_model.model.")
|
||||
name = name.replace(".linear.weight", ".weight")
|
||||
if name.endswith("layer_scalar") or name.endswith("position_embedding_table"):
|
||||
name = name + ".weight"
|
||||
if name.endswith("patch_embedder.input_proj.weight"):
|
||||
n_embd, ksize_sq_c = data_torch.shape
|
||||
patch_size = int((ksize_sq_c // 3) ** 0.5)
|
||||
data_torch = data_torch.reshape(n_embd, patch_size, patch_size, 3)
|
||||
data_torch = data_torch.permute(0, 3, 1, 2).contiguous()
|
||||
mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
|
||||
yield (mapped_name, data_torch)
|
||||
|
||||
|
||||
@ModelBase.register("Starcoder2ForCausalLM")
|
||||
class StarCoder2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.STARCODER2
|
||||
|
|
|
|||
|
|
@ -57,13 +57,14 @@ ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based
|
|||
|
||||
## Supported Operations
|
||||
|
||||
The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** operations only. Other operations are handled by the standard CPU backend.
|
||||
The ZenDNN backend accelerates **matrix multiplication (MUL_MAT)** and **expert-based matrix multiplication (MUL_MAT_ID)** operations. Other operations are handled by the standard CPU backend.
|
||||
|
||||
| Operation | Status | Notes |
|
||||
|:-------------|:-------:|:----------------------------------------------:|
|
||||
| MUL_MAT | Support | Accelerated via ZenDNN LowOHA MatMul |
|
||||
| MUL_MAT_ID | Support | Accelerated via ZenDNN LowOHA MatMul (MoE) |
|
||||
|
||||
*Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).
|
||||
*Note:* Since MUL_MAT and MUL_MAT_ID are accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs and Mixture-of-Experts models).
|
||||
|
||||
## DataType Supports
|
||||
|
||||
|
|
@ -181,7 +182,7 @@ For detailed profiling and logging options, refer to the [ZenDNN Logging Documen
|
|||
|
||||
## Known Issues
|
||||
|
||||
- **Limited operation support**: Currently only matrix multiplication (MUL_MAT) is accelerated via ZenDNN. Other operations fall back to the standard CPU backend.
|
||||
- **Limited operation support**: Currently matrix multiplication (MUL_MAT) and expert-based matrix multiplication (MUL_MAT_ID) are accelerated via ZenDNN. Other operations fall back to the standard CPU backend. Future updates may expand supported operations.
|
||||
- **BF16 support**: BF16 operations require AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32.
|
||||
- **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance.
|
||||
|
||||
|
|
@ -216,4 +217,4 @@ Please add the **[ZenDNN]** prefix/tag in issues/PRs titles to help the ZenDNN-t
|
|||
|
||||
## TODO
|
||||
|
||||
- Expand operation support beyond MUL_MAT (attention operations, activations, etc.)
|
||||
- Expand operation support beyond MUL_MAT and MUL_MAT_ID (attention operations, activations, etc.)
|
||||
|
|
|
|||
|
|
@ -389,7 +389,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
|||
|
||||
|
||||
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
|
||||
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
|
||||
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. Note that [`HSA_OVERRIDE_GFX_VERSION`] is [not supported on Windows](https://github.com/ROCm/ROCm/issues/2654)
|
||||
|
||||
### Unified Memory
|
||||
|
||||
|
|
|
|||
|
|
@ -68,7 +68,7 @@ Legend:
|
|||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | 🟡 | ❌ |
|
||||
| NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
|
|
|
|||
9986
docs/ops/ZenDNN.csv
9986
docs/ops/ZenDNN.csv
File diff suppressed because it is too large
Load Diff
|
|
@ -15,13 +15,18 @@ static bool run(llama_context * ctx, const common_params & params) {
|
|||
|
||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos, true);
|
||||
|
||||
if (tokens.empty()) {
|
||||
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_INF("number of input tokens = %zu\n", tokens.size());
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
LOG_INF(" %d\n", tokens[i]);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -1009,8 +1009,8 @@ public:
|
|||
bool get_device_memory(const rpc_msg_get_device_memory_req & request, rpc_msg_get_device_memory_rsp & response);
|
||||
|
||||
struct stored_graph {
|
||||
ggml_context_ptr ctx_ptr;
|
||||
ggml_cgraph * graph;
|
||||
std::vector<uint8_t> buffer;
|
||||
ggml_cgraph * graph;
|
||||
};
|
||||
|
||||
private:
|
||||
|
|
@ -1518,10 +1518,12 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
|
|||
LOG_DBG("[%s] device: %u, n_nodes: %u, n_tensors: %u\n", __func__, device, n_nodes, n_tensors);
|
||||
|
||||
size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
|
||||
|
||||
if (stored_graphs[device].buffer.size() < buf_size) {
|
||||
stored_graphs[device].buffer.resize(buf_size);
|
||||
}
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_size,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.mem_buffer =*/ stored_graphs[device].buffer.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_context_ptr ctx_ptr { ggml_init(params) };
|
||||
|
|
@ -1551,7 +1553,6 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
|
|||
}
|
||||
ggml_status status = ggml_backend_graph_compute(backends[device], graph);
|
||||
GGML_ASSERT(status == GGML_STATUS_SUCCESS && "Unsuccessful graph computations are not supported with RPC");
|
||||
stored_graphs[device].ctx_ptr.swap(ctx_ptr);
|
||||
stored_graphs[device].graph = graph;
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -95,6 +95,12 @@ struct ggml_webgpu_generic_shader_decisions {
|
|||
uint32_t wg_size = 0;
|
||||
};
|
||||
|
||||
struct ggml_webgpu_processed_shader {
|
||||
std::string wgsl;
|
||||
std::string variant;
|
||||
std::shared_ptr<void> decisions;
|
||||
};
|
||||
|
||||
struct ggml_webgpu_ssm_conv_shader_decisions {
|
||||
uint32_t block_size;
|
||||
uint32_t tokens_per_wg;
|
||||
|
|
@ -384,11 +390,12 @@ struct ggml_webgpu_flash_attn_pipeline_key {
|
|||
bool has_mask;
|
||||
bool has_sinks;
|
||||
bool uses_logit_softcap;
|
||||
bool use_vec;
|
||||
|
||||
bool operator==(const ggml_webgpu_flash_attn_pipeline_key & other) const {
|
||||
return kv_type == other.kv_type && head_dim_qk == other.head_dim_qk && head_dim_v == other.head_dim_v &&
|
||||
kv_direct == other.kv_direct && has_mask == other.has_mask && has_sinks == other.has_sinks &&
|
||||
uses_logit_softcap == other.uses_logit_softcap;
|
||||
uses_logit_softcap == other.uses_logit_softcap && use_vec == other.use_vec;
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -402,6 +409,7 @@ struct ggml_webgpu_flash_attn_pipeline_key_hash {
|
|||
ggml_webgpu_hash_combine(seed, key.has_mask);
|
||||
ggml_webgpu_hash_combine(seed, key.has_sinks);
|
||||
ggml_webgpu_hash_combine(seed, key.uses_logit_softcap);
|
||||
ggml_webgpu_hash_combine(seed, key.use_vec);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
|
@ -421,6 +429,121 @@ struct ggml_webgpu_flash_attn_shader_decisions {
|
|||
uint32_t wg_size = 0;
|
||||
};
|
||||
|
||||
inline uint32_t ggml_webgpu_flash_attn_pick_vec_ne(const ggml_webgpu_flash_attn_pipeline_key & key) {
|
||||
// Keep conservative defaults unless this is the f16 vec-split shape family.
|
||||
if (key.kv_type != GGML_TYPE_F16 || key.head_dim_qk != key.head_dim_v) {
|
||||
return 1u;
|
||||
}
|
||||
|
||||
// Head-dim specializations used by the tuned vec f16 path.
|
||||
switch (key.head_dim_qk) {
|
||||
case 64:
|
||||
return 2u;
|
||||
case 96:
|
||||
return 4u;
|
||||
case 128:
|
||||
return 1u;
|
||||
case 192:
|
||||
return 2u;
|
||||
case 576:
|
||||
return 2u;
|
||||
default:
|
||||
return 1u;
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_webgpu_flash_attn_vec_reduce_pipeline_key {
|
||||
uint32_t head_dim_v;
|
||||
uint32_t wg_size;
|
||||
};
|
||||
|
||||
struct ggml_webgpu_flash_attn_vec_reduce_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_flash_attn_vec_reduce_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.head_dim_v);
|
||||
ggml_webgpu_hash_combine(seed, key.wg_size);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
inline bool operator==(const ggml_webgpu_flash_attn_vec_reduce_pipeline_key & lhs,
|
||||
const ggml_webgpu_flash_attn_vec_reduce_pipeline_key & rhs) {
|
||||
return lhs.head_dim_v == rhs.head_dim_v && lhs.wg_size == rhs.wg_size;
|
||||
}
|
||||
|
||||
struct ggml_webgpu_flash_attn_vec_reduce_shader_lib_context {
|
||||
ggml_webgpu_flash_attn_vec_reduce_pipeline_key key;
|
||||
uint32_t max_wg_size;
|
||||
};
|
||||
|
||||
inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_flash_attn_vec_reduce_shader(
|
||||
pre_wgsl::Preprocessor & preprocessor,
|
||||
const char * shader_src,
|
||||
const ggml_webgpu_flash_attn_vec_reduce_shader_lib_context & context) {
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "flash_attn_vec_reduce";
|
||||
|
||||
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(context.key.head_dim_v));
|
||||
variant += std::string("_hsv") + std::to_string(context.key.head_dim_v);
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
|
||||
variant += std::string("_wg") + std::to_string(context.max_wg_size);
|
||||
|
||||
ggml_webgpu_processed_shader result;
|
||||
result.wgsl = preprocessor.preprocess(shader_src, defines);
|
||||
result.variant = variant;
|
||||
return result;
|
||||
}
|
||||
|
||||
struct ggml_webgpu_flash_attn_blk_pipeline_key {
|
||||
uint32_t q_tile;
|
||||
uint32_t kv_tile;
|
||||
|
||||
bool operator==(const ggml_webgpu_flash_attn_blk_pipeline_key & other) const {
|
||||
return q_tile == other.q_tile && kv_tile == other.kv_tile;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_flash_attn_blk_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_flash_attn_blk_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.q_tile);
|
||||
ggml_webgpu_hash_combine(seed, key.kv_tile);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_flash_attn_blk_shader_lib_context {
|
||||
ggml_webgpu_flash_attn_blk_pipeline_key key;
|
||||
uint32_t max_wg_size;
|
||||
};
|
||||
|
||||
inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_flash_attn_blk_shader(
|
||||
pre_wgsl::Preprocessor & preprocessor,
|
||||
const char * shader_src,
|
||||
const ggml_webgpu_flash_attn_blk_shader_lib_context & context) {
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "flash_attn_vec_blk";
|
||||
|
||||
defines.push_back(std::string("Q_TILE=") + std::to_string(context.key.q_tile));
|
||||
variant += std::string("_qt") + std::to_string(context.key.q_tile);
|
||||
|
||||
defines.push_back(std::string("KV_TILE=") + std::to_string(context.key.kv_tile));
|
||||
variant += std::string("_kvt") + std::to_string(context.key.kv_tile);
|
||||
|
||||
uint32_t wg_size = 1;
|
||||
while ((wg_size << 1) <= context.max_wg_size) {
|
||||
wg_size <<= 1;
|
||||
}
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
|
||||
variant += std::string("_wg") + std::to_string(wg_size);
|
||||
|
||||
ggml_webgpu_processed_shader result;
|
||||
result.wgsl = preprocessor.preprocess(shader_src, defines);
|
||||
result.variant = variant;
|
||||
return result;
|
||||
}
|
||||
|
||||
// This is exposed because it's necessary in supports_op
|
||||
inline size_t ggml_webgpu_flash_attn_wg_mem_bytes(uint32_t q_tile,
|
||||
uint32_t kv_tile,
|
||||
|
|
@ -659,6 +782,14 @@ class ggml_webgpu_shader_lib {
|
|||
repeat_pipelines; // type
|
||||
std::unordered_map<ggml_webgpu_flash_attn_pipeline_key, webgpu_pipeline, ggml_webgpu_flash_attn_pipeline_key_hash>
|
||||
flash_attn_pipelines;
|
||||
std::unordered_map<ggml_webgpu_flash_attn_vec_reduce_pipeline_key,
|
||||
webgpu_pipeline,
|
||||
ggml_webgpu_flash_attn_vec_reduce_pipeline_key_hash>
|
||||
flash_attn_vec_reduce_pipelines;
|
||||
std::unordered_map<ggml_webgpu_flash_attn_blk_pipeline_key,
|
||||
webgpu_pipeline,
|
||||
ggml_webgpu_flash_attn_blk_pipeline_key_hash>
|
||||
flash_attn_blk_pipelines;
|
||||
std::unordered_map<ggml_webgpu_legacy_mul_mat_pipeline_key,
|
||||
webgpu_pipeline,
|
||||
ggml_webgpu_legacy_mul_mat_pipeline_key_hash>
|
||||
|
|
@ -1673,24 +1804,8 @@ class ggml_webgpu_shader_lib {
|
|||
return repeat_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
const bool has_mask = context.src3 != nullptr;
|
||||
const bool has_sinks = context.src4 != nullptr;
|
||||
|
||||
bool kv_direct = (context.src1->type == GGML_TYPE_F16) && (context.src0->ne[0] % context.sg_mat_k == 0) &&
|
||||
(context.src1->ne[1] % context.sg_mat_n == 0);
|
||||
|
||||
ggml_webgpu_flash_attn_pipeline_key key = {
|
||||
.kv_type = context.src1->type,
|
||||
.head_dim_qk = (uint32_t) context.src0->ne[0],
|
||||
.head_dim_v = (uint32_t) context.src2->ne[0],
|
||||
.kv_direct = kv_direct,
|
||||
.has_mask = has_mask,
|
||||
.has_sinks = has_sinks,
|
||||
.uses_logit_softcap = (*(float *) &context.dst->op_params[2]) != 0.0f,
|
||||
};
|
||||
|
||||
auto it = flash_attn_pipelines.find(key);
|
||||
webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_flash_attn_shader_lib_context & context) {
|
||||
auto it = flash_attn_pipelines.find(context.key);
|
||||
if (it != flash_attn_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
|
@ -1698,7 +1813,7 @@ class ggml_webgpu_shader_lib {
|
|||
std::vector<std::string> defines;
|
||||
std::string variant = "flash_attn";
|
||||
|
||||
switch (key.kv_type) {
|
||||
switch (context.key.kv_type) {
|
||||
case GGML_TYPE_F32:
|
||||
defines.push_back("KV_F32");
|
||||
break;
|
||||
|
|
@ -1714,41 +1829,51 @@ class ggml_webgpu_shader_lib {
|
|||
default:
|
||||
GGML_ABORT("Unsupported KV type for flash attention shader");
|
||||
}
|
||||
variant += std::string("_") + ggml_type_name(key.kv_type);
|
||||
variant += std::string("_") + ggml_type_name(context.key.kv_type);
|
||||
|
||||
if (key.has_mask) {
|
||||
if (context.key.has_mask) {
|
||||
defines.push_back("MASK");
|
||||
variant += "_mask";
|
||||
}
|
||||
if (key.has_sinks) {
|
||||
if (context.key.has_sinks) {
|
||||
defines.push_back("SINKS");
|
||||
variant += "_sinks";
|
||||
}
|
||||
if (key.uses_logit_softcap) {
|
||||
if (context.key.uses_logit_softcap) {
|
||||
defines.push_back("LOGIT_SOFTCAP");
|
||||
variant += "_lgsc";
|
||||
}
|
||||
if (key.kv_direct) {
|
||||
if (context.key.kv_direct) {
|
||||
defines.push_back("KV_DIRECT");
|
||||
variant += "_kvdirect";
|
||||
}
|
||||
if (context.key.has_mask && context.key.use_vec) {
|
||||
defines.push_back("BLK");
|
||||
variant += "_blk";
|
||||
}
|
||||
|
||||
defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(key.head_dim_qk));
|
||||
variant += std::string("_hsqk") + std::to_string(key.head_dim_qk);
|
||||
defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(context.key.head_dim_qk));
|
||||
variant += std::string("_hsqk") + std::to_string(context.key.head_dim_qk);
|
||||
|
||||
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v));
|
||||
variant += std::string("_hsv") + std::to_string(key.head_dim_v);
|
||||
defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(context.key.head_dim_v));
|
||||
variant += std::string("_hsv") + std::to_string(context.key.head_dim_v);
|
||||
|
||||
defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m));
|
||||
defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n));
|
||||
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
|
||||
|
||||
uint32_t q_tile = context.sg_mat_m;
|
||||
uint32_t kv_tile =
|
||||
std::min(ggml_webgpu_flash_attn_max_kv_tile({ key, context.sg_mat_m, context.sg_mat_n, context.sg_mat_k,
|
||||
context.wg_mem_limit_bytes, context.max_subgroup_size }),
|
||||
context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
|
||||
if (key.kv_direct) {
|
||||
uint32_t q_tile = context.sg_mat_m;
|
||||
uint32_t kv_tile = std::min(ggml_webgpu_flash_attn_max_kv_tile(context),
|
||||
context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
|
||||
if (context.key.use_vec) {
|
||||
q_tile = 1;
|
||||
kv_tile = std::max(context.sg_mat_n, std::min(32u, ggml_webgpu_flash_attn_max_kv_tile(context)));
|
||||
kv_tile = (kv_tile / context.sg_mat_n) * context.sg_mat_n;
|
||||
const uint32_t vec_ne = ggml_webgpu_flash_attn_pick_vec_ne(context.key);
|
||||
defines.push_back(std::string("VEC_NE=") + std::to_string(vec_ne) + "u");
|
||||
}
|
||||
if (context.key.kv_direct) {
|
||||
GGML_ASSERT(kv_tile <= GGML_WEBGPU_KV_SEQ_PAD);
|
||||
while (GGML_WEBGPU_KV_SEQ_PAD % kv_tile != 0) {
|
||||
kv_tile -= context.sg_mat_n;
|
||||
}
|
||||
|
|
@ -1757,19 +1882,51 @@ class ggml_webgpu_shader_lib {
|
|||
defines.push_back(std::string("Q_TILE=") + std::to_string(q_tile));
|
||||
defines.push_back(std::string("KV_TILE=") + std::to_string(kv_tile));
|
||||
|
||||
uint32_t wg_size = std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE);
|
||||
uint32_t wg_size = 0;
|
||||
if (context.key.use_vec) {
|
||||
wg_size = std::max(1u, std::min<uint32_t>(32u, context.max_subgroup_size));
|
||||
} else {
|
||||
wg_size = std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE);
|
||||
}
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
|
||||
|
||||
auto processed = preprocessor.preprocess(wgsl_flash_attn, defines);
|
||||
auto decisions = std::make_shared<ggml_webgpu_flash_attn_shader_decisions>();
|
||||
decisions->q_tile = q_tile;
|
||||
decisions->kv_tile = kv_tile;
|
||||
decisions->wg_size = wg_size;
|
||||
const char * shader_src = context.key.use_vec ? wgsl_flash_attn_vec_split : wgsl_flash_attn;
|
||||
webgpu_pipeline pipeline =
|
||||
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(shader_src, defines), variant);
|
||||
auto decisions = std::make_shared<ggml_webgpu_flash_attn_shader_decisions>();
|
||||
decisions->q_tile = q_tile;
|
||||
decisions->kv_tile = kv_tile;
|
||||
decisions->wg_size = wg_size;
|
||||
pipeline.context = decisions;
|
||||
flash_attn_pipelines[context.key] = pipeline;
|
||||
return flash_attn_pipelines[context.key];
|
||||
}
|
||||
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
|
||||
pipeline.context = decisions;
|
||||
flash_attn_pipelines[key] = pipeline;
|
||||
return flash_attn_pipelines[key];
|
||||
webgpu_pipeline get_flash_attn_blk_pipeline(const ggml_webgpu_flash_attn_blk_shader_lib_context & context) {
|
||||
auto it = flash_attn_blk_pipelines.find(context.key);
|
||||
if (it != flash_attn_blk_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_flash_attn_blk_shader(preprocessor, wgsl_flash_attn_vec_blk, context);
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed.wgsl, processed.variant);
|
||||
flash_attn_blk_pipelines[context.key] = pipeline;
|
||||
return flash_attn_blk_pipelines[context.key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_flash_attn_vec_reduce_pipeline(
|
||||
const ggml_webgpu_flash_attn_vec_reduce_shader_lib_context & context) {
|
||||
auto it = flash_attn_vec_reduce_pipelines.find(context.key);
|
||||
if (it != flash_attn_vec_reduce_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_flash_attn_vec_reduce_shader(preprocessor, wgsl_flash_attn_vec_reduce, context);
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed.wgsl, processed.variant);
|
||||
flash_attn_vec_reduce_pipelines[context.key] = pipeline;
|
||||
return flash_attn_vec_reduce_pipelines[context.key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_cpy_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,105 @@
|
|||
diagnostic(off, subgroup_uniformity);
|
||||
enable f16;
|
||||
|
||||
#define Q_TILE 1
|
||||
#define KV_TILE 32
|
||||
#define WG_SIZE 32
|
||||
|
||||
struct Params {
|
||||
offset_mask: u32,
|
||||
seq_len_q: u32,
|
||||
seq_len_kv: u32,
|
||||
stride_mask3: u32,
|
||||
// Number of KV blocks and Q blocks per batch.
|
||||
// nblk0 = ceil(seq_len_kv / KV_TILE), nblk1 = ceil(seq_len_q / Q_TILE).
|
||||
nblk0: u32,
|
||||
nblk1: u32,
|
||||
};
|
||||
|
||||
@group(0) @binding(0) var<storage, read> mask: array<f16>;
|
||||
@group(0) @binding(1) var<storage, read_write> blk: array<u32>;
|
||||
@group(0) @binding(2) var<uniform> params: Params;
|
||||
|
||||
const MASK_MIN: f32 = -65504.0;
|
||||
const MASK_MAX: f32 = 65504.0;
|
||||
var<workgroup> wg_min: array<f32, WG_SIZE>;
|
||||
var<workgroup> wg_max: array<f32, WG_SIZE>;
|
||||
var<workgroup> wg_any: array<u32, WG_SIZE>;
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
@builtin(local_invocation_id) local_id: vec3<u32>) {
|
||||
// Dispatch mapping:
|
||||
// - x indexes KV blocks
|
||||
// - y flattens (batch_idx, q_blk) as y = batch_idx * nblk1 + q_blk
|
||||
let kv_blk = wg_id.x;
|
||||
let y = wg_id.y;
|
||||
let q_blk = y % params.nblk1;
|
||||
let batch_idx = y / params.nblk1;
|
||||
if (kv_blk >= params.nblk0) {
|
||||
return;
|
||||
}
|
||||
|
||||
let q_start = q_blk * Q_TILE;
|
||||
let k_start = kv_blk * KV_TILE;
|
||||
|
||||
let mask_batch = select(0u, batch_idx, params.stride_mask3 > 0u);
|
||||
let mask_batch_base = params.offset_mask + mask_batch * params.stride_mask3;
|
||||
|
||||
// We keep min/max to classify:
|
||||
// - fully masked (max <= MASK_MIN)
|
||||
// - all-zero mask (min == 0 && max == 0)
|
||||
// - mixed/general mask
|
||||
var local_min = MASK_MAX;
|
||||
var local_max = -MASK_MAX;
|
||||
var local_any = 0u;
|
||||
|
||||
for (var q_rel = 0u; q_rel < Q_TILE; q_rel += 1u) {
|
||||
let q_row = q_start + q_rel;
|
||||
if (q_row >= params.seq_len_q) {
|
||||
continue;
|
||||
}
|
||||
let row_base = mask_batch_base + q_row * params.seq_len_kv;
|
||||
for (var k_rel = local_id.x; k_rel < KV_TILE; k_rel += WG_SIZE) {
|
||||
let k_col = k_start + k_rel;
|
||||
if (k_col >= params.seq_len_kv) {
|
||||
continue;
|
||||
}
|
||||
let mv = f32(mask[row_base + k_col]);
|
||||
local_min = min(local_min, mv);
|
||||
local_max = max(local_max, mv);
|
||||
local_any = 1u;
|
||||
}
|
||||
}
|
||||
|
||||
wg_min[local_id.x] = local_min;
|
||||
wg_max[local_id.x] = local_max;
|
||||
wg_any[local_id.x] = local_any;
|
||||
workgroupBarrier();
|
||||
|
||||
// Thread 0 writes one state per block.
|
||||
if (local_id.x == 0u) {
|
||||
var mmin = wg_min[0];
|
||||
var mmax = wg_max[0];
|
||||
var many = wg_any[0];
|
||||
for (var i = 1u; i < WG_SIZE; i += 1u) {
|
||||
mmin = min(mmin, wg_min[i]);
|
||||
mmax = max(mmax, wg_max[i]);
|
||||
many = max(many, wg_any[i]);
|
||||
}
|
||||
|
||||
var state = 0u;
|
||||
if (many != 0u) {
|
||||
if (mmax <= MASK_MIN) {
|
||||
state = 0u;
|
||||
} else if (mmin == 0.0 && mmax == 0.0) {
|
||||
state = 2u;
|
||||
} else {
|
||||
state = 1u;
|
||||
}
|
||||
}
|
||||
|
||||
let blk_idx = (batch_idx * params.nblk1 + q_blk) * params.nblk0 + kv_blk;
|
||||
blk[blk_idx] = state;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
diagnostic(off, subgroup_uniformity);
|
||||
enable f16;
|
||||
enable subgroups;
|
||||
|
||||
// Default values
|
||||
#define HEAD_DIM_V 64
|
||||
#define WG_SIZE 128
|
||||
|
||||
struct Params {
|
||||
nrows: u32,
|
||||
seq_len_q: u32,
|
||||
n_heads: u32,
|
||||
offset_dst: u32,
|
||||
nwg: u32,
|
||||
tmp_data_base: u32,
|
||||
tmp_stats_base: u32,
|
||||
};
|
||||
|
||||
@group(0) @binding(0) var<storage, read_write> tmp: array<f32>;
|
||||
@group(0) @binding(1) var<storage, read_write> dst: array<vec4<f32>>;
|
||||
@group(0) @binding(2) var<uniform> params: Params;
|
||||
|
||||
const FLOAT_MIN: f32 = -1.0e9;
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
@builtin(subgroup_id) subgroup_id: u32,
|
||||
@builtin(num_subgroups) num_subgroups: u32,
|
||||
@builtin(subgroup_size) subgroup_size: u32,
|
||||
@builtin(subgroup_invocation_id) sg_inv_id: u32) {
|
||||
let rid = wg_id.x;
|
||||
if (rid >= params.nrows) {
|
||||
return;
|
||||
}
|
||||
|
||||
let rows_per_batch = params.n_heads * params.seq_len_q;
|
||||
let batch_idx = rid / rows_per_batch;
|
||||
let rem = rid % rows_per_batch;
|
||||
let head_idx = rem / params.seq_len_q;
|
||||
let q_row = rem % params.seq_len_q;
|
||||
|
||||
let dst2_stride = HEAD_DIM_V * params.n_heads;
|
||||
let dst3_stride = dst2_stride * params.seq_len_q;
|
||||
let row_base = params.offset_dst + batch_idx * dst3_stride + q_row * dst2_stride + head_idx * HEAD_DIM_V;
|
||||
|
||||
let thread = sg_inv_id;
|
||||
if (params.nwg > subgroup_size) {
|
||||
return;
|
||||
}
|
||||
|
||||
let stats_base = params.tmp_stats_base + rid * (2u * params.nwg);
|
||||
let active_thread = thread < params.nwg;
|
||||
let si = select(0.0, tmp[stats_base + 2u * thread + 0u], active_thread);
|
||||
let mi = select(FLOAT_MIN, tmp[stats_base + 2u * thread + 1u], active_thread);
|
||||
let m = subgroupMax(mi);
|
||||
let ms = select(0.0, exp(mi - m), active_thread);
|
||||
let s = subgroupAdd(si * ms);
|
||||
let inv_s = select(0.0, 1.0 / s, s != 0.0);
|
||||
|
||||
let row_tmp_base = params.tmp_data_base + rid * (HEAD_DIM_V * params.nwg);
|
||||
for (var elem_base = subgroup_id * 4u; elem_base < HEAD_DIM_V; elem_base += num_subgroups * 4u) {
|
||||
var weighted = vec4<f32>(0.0, 0.0, 0.0, 0.0);
|
||||
if (active_thread) {
|
||||
let src = row_tmp_base + thread * HEAD_DIM_V + elem_base;
|
||||
weighted = vec4<f32>(tmp[src + 0u], tmp[src + 1u], tmp[src + 2u], tmp[src + 3u]) * ms;
|
||||
}
|
||||
|
||||
let sum_x = subgroupAdd(weighted.x);
|
||||
let sum_y = subgroupAdd(weighted.y);
|
||||
let sum_z = subgroupAdd(weighted.z);
|
||||
let sum_w = subgroupAdd(weighted.w);
|
||||
|
||||
if (thread == 0u) {
|
||||
let dst_vec_index = (row_base + elem_base) >> 2u;
|
||||
dst[dst_vec_index] = vec4<f32>(sum_x, sum_y, sum_z, sum_w) * inv_s;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,729 @@
|
|||
diagnostic(off, chromium.subgroup_matrix_uniformity);
|
||||
diagnostic(off, subgroup_uniformity);
|
||||
enable f16;
|
||||
enable subgroups;
|
||||
enable chromium_experimental_subgroup_matrix;
|
||||
|
||||
#ifdef KV_F32
|
||||
#define KV_TYPE f32
|
||||
#else
|
||||
#define KV_TYPE f16
|
||||
#endif
|
||||
|
||||
#define HEAD_DIM_QK 64
|
||||
#define HEAD_DIM_V 64
|
||||
|
||||
|
||||
#define SG_MAT_M 8
|
||||
#define SG_MAT_N 8
|
||||
#define SG_MAT_K 8
|
||||
|
||||
#define Q_TILE SG_MAT_M
|
||||
#define KV_TILE 16
|
||||
#define WG_SIZE 64
|
||||
#ifndef VEC_NE
|
||||
#define VEC_NE 4u
|
||||
#endif
|
||||
|
||||
#define KV_BLOCKS (KV_TILE / SG_MAT_N)
|
||||
|
||||
#define BLOCK_SIZE 32
|
||||
#define BLOCKS_K ((HEAD_DIM_QK + BLOCK_SIZE - 1) / BLOCK_SIZE)
|
||||
#define BLOCKS_V ((HEAD_DIM_V + BLOCK_SIZE - 1) / BLOCK_SIZE)
|
||||
#if defined(KV_Q4_0)
|
||||
#define NQ 16
|
||||
#define F16_PER_BLOCK 9
|
||||
#define WEIGHTS_PER_F16 4
|
||||
#elif defined(KV_Q8_0)
|
||||
#define NQ 8
|
||||
#define F16_PER_BLOCK 17
|
||||
#define WEIGHTS_PER_F16 2
|
||||
#endif
|
||||
#define F16_PER_THREAD (NQ / WEIGHTS_PER_F16)
|
||||
|
||||
fn get_byte(value: u32, index: u32) -> u32 {
|
||||
return (value >> (index * 8)) & 0xFF;
|
||||
}
|
||||
|
||||
fn get_byte_i32(value: u32, index: u32) -> i32 {
|
||||
return bitcast<i32>(((value >> (index * 8)) & 0xFF) << 24) >> 24;
|
||||
}
|
||||
|
||||
struct Params {
|
||||
offset_q: u32,
|
||||
offset_k: u32,
|
||||
offset_v: u32,
|
||||
offset_mask: u32,
|
||||
offset_sinks: u32,
|
||||
offset_dst: u32,
|
||||
|
||||
// shapes of Q/K/V
|
||||
n_heads: u32,
|
||||
seq_len_q: u32,
|
||||
seq_len_kv: u32,
|
||||
|
||||
// strides (in elements)
|
||||
stride_q1: u32,
|
||||
stride_q2: u32,
|
||||
stride_q3: u32,
|
||||
stride_k1: u32,
|
||||
stride_k2: u32,
|
||||
stride_k3: u32,
|
||||
stride_v1: u32,
|
||||
stride_v2: u32,
|
||||
stride_v3: u32,
|
||||
stride_mask3: u32,
|
||||
|
||||
// repeat factors for K/V, e.g., MHA vs. MQA vs. GQA
|
||||
q_per_kv: u32,
|
||||
|
||||
// softmax params
|
||||
scale: f32,
|
||||
max_bias: f32,
|
||||
logit_softcap: f32,
|
||||
n_head_log2: f32,
|
||||
m0: f32,
|
||||
m1: f32,
|
||||
|
||||
#ifdef BLK
|
||||
blk_base: u32,
|
||||
blk_nblk0: u32,
|
||||
blk_nblk1: u32,
|
||||
#endif
|
||||
|
||||
tmp_data_base: u32,
|
||||
tmp_stats_base: u32,
|
||||
nwg: u32,
|
||||
};
|
||||
|
||||
@group(0) @binding(0) var<storage, read_write> Q: array<f32>;
|
||||
#if defined(KV_Q4_0) || defined(KV_Q8_0)
|
||||
@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>;
|
||||
#else
|
||||
@group(0) @binding(1) var<storage, read_write> K: array<vec4<KV_TYPE>>;
|
||||
#endif
|
||||
#if defined(KV_Q4_0) || defined(KV_Q8_0)
|
||||
@group(0) @binding(2) var<storage, read_write> V: array<KV_TYPE>;
|
||||
#else
|
||||
@group(0) @binding(2) var<storage, read_write> V: array<vec4<KV_TYPE>>;
|
||||
#endif
|
||||
#if defined(MASK) && defined(SINKS)
|
||||
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
|
||||
@group(0) @binding(4) var<storage, read_write> sinks: array<f32>;
|
||||
#ifdef BLK
|
||||
#define BLK_BINDING 5
|
||||
#define TMP_BINDING 6
|
||||
#define DST_BINDING 7
|
||||
#define PARAMS_BINDING 8
|
||||
#else
|
||||
#define TMP_BINDING 5
|
||||
#define DST_BINDING 6
|
||||
#define PARAMS_BINDING 7
|
||||
#endif
|
||||
#elif defined(MASK)
|
||||
@group(0) @binding(3) var<storage, read_write> mask: array<f16>;
|
||||
#ifdef BLK
|
||||
#define BLK_BINDING 4
|
||||
#define TMP_BINDING 5
|
||||
#define DST_BINDING 6
|
||||
#define PARAMS_BINDING 7
|
||||
#else
|
||||
#define TMP_BINDING 4
|
||||
#define DST_BINDING 5
|
||||
#define PARAMS_BINDING 6
|
||||
#endif
|
||||
#elif defined(SINKS)
|
||||
@group(0) @binding(3) var<storage, read_write> sinks: array<f32>;
|
||||
#define TMP_BINDING 4
|
||||
#define DST_BINDING 5
|
||||
#define PARAMS_BINDING 6
|
||||
#else
|
||||
#define TMP_BINDING 3
|
||||
#define DST_BINDING 4
|
||||
#define PARAMS_BINDING 5
|
||||
#endif
|
||||
|
||||
#ifdef BLK
|
||||
@group(0) @binding(BLK_BINDING) var<storage, read_write> blk: array<u32>;
|
||||
#endif
|
||||
@group(0) @binding(TMP_BINDING) var<storage, read_write> tmp: array<f32>;
|
||||
@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<vec4<f32>>;
|
||||
@group(0) @binding(PARAMS_BINDING) var<uniform> params: Params;
|
||||
|
||||
// Just a very small float value.
|
||||
const FLOAT_MIN: f32 = -1.0e9;
|
||||
|
||||
var<workgroup> q_shmem: array<f16, Q_TILE * HEAD_DIM_QK>;
|
||||
|
||||
#ifndef KV_DIRECT
|
||||
const kv_shmem_size = KV_TILE * max(HEAD_DIM_QK, HEAD_DIM_V);
|
||||
// we can reuse the same shmem for K and V since we only need one at a time
|
||||
var<workgroup> kv_shmem: array<f16, kv_shmem_size>;
|
||||
#endif
|
||||
|
||||
var<workgroup> o_shmem: array<f16, Q_TILE * HEAD_DIM_V>;
|
||||
|
||||
#ifdef MASK
|
||||
// storage for mask values
|
||||
var<workgroup> mask_shmem: array<f16, Q_TILE * KV_TILE>;
|
||||
#endif
|
||||
|
||||
// note that we reuse the same storage for both since we only need one at a time
|
||||
var<workgroup> inter_shmem: array<f16, Q_TILE * KV_TILE>;
|
||||
|
||||
// Storage for row max and exp sum during online softmax
|
||||
var<workgroup> row_max_shmem: array<f32, Q_TILE>;
|
||||
var<workgroup> exp_sum_shmem: array<f32, Q_TILE>;
|
||||
var<workgroup> blk_state_wg: u32;
|
||||
|
||||
fn calc_softmax_term(kv_idx: u32, q_tile_row: u32, slope: f32, has_bias: bool, apply_mask: bool) -> f32 {
|
||||
var v = select(FLOAT_MIN,
|
||||
f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]) * params.scale,
|
||||
kv_idx < KV_TILE);
|
||||
#ifdef LOGIT_SOFTCAP
|
||||
v = params.logit_softcap * tanh(v);
|
||||
#endif
|
||||
#ifdef MASK
|
||||
if (apply_mask) {
|
||||
var mask_val = select(0.0,f32(mask_shmem[q_tile_row * KV_TILE + kv_idx]), kv_idx < KV_TILE);
|
||||
v += select(mask_val, slope * mask_val, has_bias);
|
||||
}
|
||||
#endif
|
||||
return v;
|
||||
}
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
@builtin(local_invocation_id) local_id: vec3<u32>,
|
||||
@builtin(subgroup_id) subgroup_id: u32,
|
||||
@builtin(subgroup_size) subgroup_size: u32,
|
||||
@builtin(num_subgroups) num_subgroups: u32,
|
||||
@builtin(subgroup_invocation_id) sg_inv_id: u32) {
|
||||
|
||||
// initialize row max for online softmax
|
||||
for (var i = local_id.x; i < Q_TILE; i += WG_SIZE) {
|
||||
row_max_shmem[i] = FLOAT_MIN;
|
||||
exp_sum_shmem[i] = 0.0;
|
||||
}
|
||||
|
||||
for (var i = local_id.x; i < Q_TILE * HEAD_DIM_V; i += WG_SIZE) {
|
||||
o_shmem[i] = 0.0;
|
||||
}
|
||||
|
||||
// workgroups per head/batch
|
||||
let wg_per_head = (params.seq_len_q + Q_TILE - 1u) / Q_TILE;
|
||||
let wg_per_batch = wg_per_head * params.n_heads;
|
||||
|
||||
let dst2_stride = HEAD_DIM_V * params.n_heads;
|
||||
let dst3_stride = dst2_stride * params.seq_len_q;
|
||||
|
||||
let iwg = wg_id.x % params.nwg;
|
||||
let base_wg_id = wg_id.x / params.nwg;
|
||||
|
||||
// batch index
|
||||
let batch_idx = base_wg_id / wg_per_batch;
|
||||
let q_batch_offset = params.offset_q + batch_idx * params.stride_q3;
|
||||
let k_batch_offset = params.offset_k + batch_idx * params.stride_k3;
|
||||
let v_batch_offset = params.offset_v + batch_idx * params.stride_v3;
|
||||
let wg_in_batch = base_wg_id % wg_per_batch;
|
||||
|
||||
// head index
|
||||
let head_idx = wg_in_batch / wg_per_head;
|
||||
let q_head_offset = q_batch_offset + head_idx * params.stride_q2;
|
||||
let k_head_idx = head_idx / params.q_per_kv;
|
||||
let v_head_idx = k_head_idx;
|
||||
let k_head_offset = k_batch_offset + k_head_idx * params.stride_k2;
|
||||
let v_head_offset = v_batch_offset + v_head_idx * params.stride_v2;
|
||||
|
||||
// starting Q row for this workgroup
|
||||
let wg_in_head = wg_in_batch % wg_per_head;
|
||||
let q_row_start = wg_in_head * Q_TILE;
|
||||
|
||||
#ifdef MASK
|
||||
// mask offset
|
||||
let mask_global_offset = params.offset_mask + batch_idx * params.stride_mask3 + q_row_start * params.seq_len_kv;
|
||||
#endif
|
||||
|
||||
let head = f32(head_idx);
|
||||
let has_bias = params.max_bias > 0.0;
|
||||
let slope = select(1.0, select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), pow(params.m0, head + 1.0), head < params.n_head_log2), has_bias);
|
||||
|
||||
// load q tile into shared memory
|
||||
for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
|
||||
let q_row = elem_idx / HEAD_DIM_QK;
|
||||
let q_col = elem_idx % HEAD_DIM_QK;
|
||||
let head_q_row = q_row_start + q_row;
|
||||
let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
|
||||
q_shmem[elem_idx] = f16(select(
|
||||
0.0,
|
||||
Q[global_q_row_offset + q_col],
|
||||
head_q_row < params.seq_len_q && q_col < HEAD_DIM_QK));
|
||||
}
|
||||
|
||||
for (var kv_tile = iwg * KV_TILE; kv_tile < params.seq_len_kv; kv_tile += KV_TILE * params.nwg) {
|
||||
#ifdef BLK
|
||||
let q_blk = q_row_start / Q_TILE;
|
||||
let kv_blk = kv_tile / KV_TILE;
|
||||
let blk_batch = select(0u, batch_idx, params.stride_mask3 > 0u);
|
||||
let blk_idx = params.blk_base + (blk_batch * params.blk_nblk1 + q_blk) * params.blk_nblk0 + kv_blk;
|
||||
let blk_state_local = blk[blk_idx];
|
||||
#else
|
||||
let blk_state_local = 1u;
|
||||
#endif
|
||||
if (local_id.x == 0u) {
|
||||
blk_state_wg = blk_state_local;
|
||||
}
|
||||
workgroupBarrier();
|
||||
let blk_state = blk_state_wg;
|
||||
let skip_tile = blk_state == 0u;
|
||||
for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
|
||||
inter_shmem[elem_idx] = f16(0.0);
|
||||
}
|
||||
|
||||
// load k tile into shared memory
|
||||
#if defined(KV_Q4_0)
|
||||
for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE * NQ) {
|
||||
let blck_idx = elem_idx / BLOCK_SIZE;
|
||||
let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16;
|
||||
let k_row = blck_idx / BLOCKS_K;
|
||||
let global_k_row = kv_tile + k_row;
|
||||
let block_k = blck_idx % BLOCKS_K;
|
||||
let row_offset = k_row * HEAD_DIM_QK;
|
||||
|
||||
if (global_k_row < params.seq_len_kv) {
|
||||
let global_block_idx = k_head_offset + global_k_row * params.stride_k1 + block_k;
|
||||
let base_idx = global_block_idx * F16_PER_BLOCK;
|
||||
let d = K[base_idx];
|
||||
for (var j = 0u; j < F16_PER_THREAD; j += 2) {
|
||||
let q_0 = K[base_idx + 1u + block_offset + j];
|
||||
let q_1 = K[base_idx + 1u + block_offset + j + 1];
|
||||
let q_packed = bitcast<u32>(vec2(q_0, q_1));
|
||||
for (var k = 0u; k < 4u; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let q_hi = (f16((q_byte >> 4) & 0xF) - 8.0) * d;
|
||||
let q_lo = (f16(q_byte & 0xF) - 8.0) * d;
|
||||
let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k;
|
||||
kv_shmem[row_offset + idx] = q_lo;
|
||||
kv_shmem[row_offset + idx + 16u] = q_hi;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(KV_Q8_0)
|
||||
for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE * NQ) {
|
||||
let blck_idx = elem_idx / BLOCK_SIZE;
|
||||
let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16;
|
||||
let k_row = blck_idx / BLOCKS_K;
|
||||
let global_k_row = kv_tile + k_row;
|
||||
let block_k = blck_idx % BLOCKS_K;
|
||||
let row_offset = k_row * HEAD_DIM_QK;
|
||||
|
||||
if (global_k_row < params.seq_len_kv) {
|
||||
let global_block_idx = k_head_offset + global_k_row * params.stride_k1 + block_k;
|
||||
let base_idx = global_block_idx * F16_PER_BLOCK;
|
||||
let d = K[base_idx];
|
||||
for (var j = 0u; j < F16_PER_THREAD; j += 2) {
|
||||
let q_0 = K[base_idx + 1u + block_offset + j];
|
||||
let q_1 = K[base_idx + 1u + block_offset + j + 1];
|
||||
let q_packed = bitcast<u32>(vec2(q_0, q_1));
|
||||
for (var k = 0u; k < 4u; k++) {
|
||||
let q_byte = get_byte_i32(q_packed, k);
|
||||
let q_val = f16(q_byte) * d;
|
||||
let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k;
|
||||
kv_shmem[row_offset + idx] = q_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(KV_DIRECT)
|
||||
// Direct global loads for KV
|
||||
#else
|
||||
for (var elem_idx = local_id.x * 4u; elem_idx < KV_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE * 4u) {
|
||||
let k_row = elem_idx / HEAD_DIM_QK;
|
||||
let k_col = elem_idx % HEAD_DIM_QK;
|
||||
let global_k_row = kv_tile + k_row;
|
||||
let global_k_row_offset = k_head_offset + global_k_row * params.stride_k1;
|
||||
let in_bounds = global_k_row < params.seq_len_kv && (k_col + 3u) < HEAD_DIM_QK;
|
||||
let vec_idx = (global_k_row_offset + k_col) >> 2u;
|
||||
let k4 = select(vec4<KV_TYPE>(0.0), K[vec_idx], in_bounds);
|
||||
kv_shmem[elem_idx + 0u] = f16(k4.x);
|
||||
kv_shmem[elem_idx + 1u] = f16(k4.y);
|
||||
kv_shmem[elem_idx + 2u] = f16(k4.z);
|
||||
kv_shmem[elem_idx + 3u] = f16(k4.w);
|
||||
}
|
||||
#endif
|
||||
|
||||
workgroupBarrier();
|
||||
|
||||
// accumulate q block * k block into registers across the entire KV tile
|
||||
if (!skip_tile) {
|
||||
let num_of_threads = subgroup_size / VEC_NE;
|
||||
let tx = sg_inv_id % num_of_threads;
|
||||
let ty = sg_inv_id / num_of_threads;
|
||||
for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) {
|
||||
let global_q_row = q_row_start + q_tile_row;
|
||||
if (global_q_row >= params.seq_len_q) {
|
||||
continue;
|
||||
}
|
||||
let local_q_row_offset = q_tile_row * HEAD_DIM_QK;
|
||||
|
||||
for (var kv_base : u32 = 0u; kv_base < KV_TILE; kv_base += VEC_NE) {
|
||||
let kv_idx = kv_base + ty;
|
||||
var partial_sum: f32 = 0.0;
|
||||
let kv_valid = kv_idx < KV_TILE && (kv_tile + kv_idx) < params.seq_len_kv;
|
||||
if (kv_valid) {
|
||||
for (var i = tx; i < (HEAD_DIM_QK / 4u); i += num_of_threads) {
|
||||
let q_off = local_q_row_offset + i * 4u;
|
||||
|
||||
let qv = vec4<f32>(
|
||||
f32(q_shmem[q_off + 0u]),
|
||||
f32(q_shmem[q_off + 1u]),
|
||||
f32(q_shmem[q_off + 2u]),
|
||||
f32(q_shmem[q_off + 3u]));
|
||||
#ifdef KV_DIRECT
|
||||
let idx = k_head_offset + (kv_tile + kv_idx) * params.stride_k1 + (i * 4u);
|
||||
let kv = vec4<f32>(K[idx >> 2u]);
|
||||
#else
|
||||
let idx = kv_idx * HEAD_DIM_QK + (i * 4u);
|
||||
let kv = vec4<f32>(
|
||||
f32(kv_shmem[idx + 0u]),
|
||||
f32(kv_shmem[idx + 1u]),
|
||||
f32(kv_shmem[idx + 2u]),
|
||||
f32(kv_shmem[idx + 3u]));
|
||||
#endif
|
||||
partial_sum += dot(qv, kv);
|
||||
}
|
||||
}
|
||||
var sum = partial_sum;
|
||||
// Reduce over tx threads (NL) for this ty stripe.
|
||||
var tx_delta = num_of_threads >> 1u;
|
||||
loop {
|
||||
if (tx_delta == 0u) {
|
||||
break;
|
||||
}
|
||||
let sh = subgroupShuffleDown(sum, tx_delta);
|
||||
if (tx < tx_delta) {
|
||||
sum += sh;
|
||||
}
|
||||
tx_delta >>= 1u;
|
||||
}
|
||||
|
||||
let sum_bcast = subgroupShuffle(sum, num_of_threads * ty);
|
||||
if (tx == 0u && kv_valid) {
|
||||
let dst_idx = q_tile_row * KV_TILE + kv_idx;
|
||||
inter_shmem[dst_idx] = f16(sum_bcast);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifdef MASK
|
||||
let apply_mask = !skip_tile && (blk_state != 2u);
|
||||
if (apply_mask) {
|
||||
// load mask tile into shared memory for this KV block
|
||||
for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
|
||||
let mask_row = elem_idx / KV_TILE;
|
||||
let mask_col = elem_idx % KV_TILE;
|
||||
let global_q_row = q_row_start + mask_row;
|
||||
let global_k_col = kv_tile + mask_col;
|
||||
let mask_in_bounds = global_q_row < params.seq_len_q && global_k_col < params.seq_len_kv;
|
||||
let mask_idx = mask_global_offset + mask_row * params.seq_len_kv + global_k_col;
|
||||
mask_shmem[elem_idx] = select(0.0, mask[mask_idx], mask_in_bounds);
|
||||
}
|
||||
}
|
||||
#else
|
||||
let apply_mask = false;
|
||||
#endif
|
||||
|
||||
workgroupBarrier();
|
||||
|
||||
// online softmax
|
||||
if (!skip_tile) {
|
||||
for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) {
|
||||
let global_q_row = q_row_start + q_tile_row;
|
||||
if (global_q_row >= params.seq_len_q) {
|
||||
break;
|
||||
}
|
||||
|
||||
var prev_max = row_max_shmem[q_tile_row];
|
||||
var final_max = prev_max;
|
||||
// pass 1: compute final max across the full KV tile in chunks
|
||||
for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
|
||||
let kv_idx = kv_offset + sg_inv_id;
|
||||
let kv_valid = kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE;
|
||||
let softmax_term = select(FLOAT_MIN,
|
||||
calc_softmax_term(kv_idx, q_tile_row, slope, has_bias, apply_mask),
|
||||
kv_valid);
|
||||
final_max = subgroupMax(max(final_max, softmax_term));
|
||||
}
|
||||
|
||||
var total_exp_term: f32 = 0.0;
|
||||
// pass 2: compute exp sum and write P using final_max
|
||||
for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) {
|
||||
let kv_idx = kv_offset + sg_inv_id;
|
||||
let softmax_term = calc_softmax_term(kv_idx, q_tile_row, slope, has_bias, apply_mask);
|
||||
let cur_p = select(0.0,
|
||||
exp(softmax_term - final_max),
|
||||
kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE);
|
||||
total_exp_term += subgroupAdd(cur_p);
|
||||
if (kv_idx < KV_TILE) {
|
||||
inter_shmem[kv_idx + q_tile_row * KV_TILE] = f16(cur_p);
|
||||
}
|
||||
}
|
||||
|
||||
let cur_exp = exp(prev_max - final_max);
|
||||
|
||||
if (sg_inv_id == 0) {
|
||||
row_max_shmem[q_tile_row] = final_max;
|
||||
exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * cur_exp + total_exp_term;
|
||||
}
|
||||
|
||||
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
|
||||
let idx = q_tile_row * HEAD_DIM_V + elem_idx;
|
||||
o_shmem[idx] = f16(f32(o_shmem[idx]) * cur_exp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// load v tile into shared memory
|
||||
#if defined(KV_Q4_0)
|
||||
for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_V; elem_idx += WG_SIZE * NQ) {
|
||||
let blck_idx = elem_idx / BLOCK_SIZE;
|
||||
let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16;
|
||||
let v_row = blck_idx / BLOCKS_V;
|
||||
let global_v_row = kv_tile + v_row;
|
||||
let block_k = blck_idx % BLOCKS_V;
|
||||
let row_offset = v_row * HEAD_DIM_V;
|
||||
|
||||
if (global_v_row < params.seq_len_kv) {
|
||||
let global_block_idx = v_head_offset + global_v_row * params.stride_v1 + block_k;
|
||||
let base_idx = global_block_idx * F16_PER_BLOCK;
|
||||
let d = V[base_idx];
|
||||
for (var j = 0u; j < F16_PER_THREAD; j += 2) {
|
||||
let q_0 = V[base_idx + 1u + block_offset + j];
|
||||
let q_1 = V[base_idx + 1u + block_offset + j + 1];
|
||||
let q_packed = bitcast<u32>(vec2(q_0, q_1));
|
||||
for (var k = 0u; k < 4u; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let q_hi = (f16((q_byte >> 4) & 0xF) - 8.0) * d;
|
||||
let q_lo = (f16(q_byte & 0xF) - 8.0) * d;
|
||||
let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k;
|
||||
kv_shmem[row_offset + idx] = q_lo;
|
||||
kv_shmem[row_offset + idx + 16u] = q_hi;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(KV_Q8_0)
|
||||
for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_V; elem_idx += WG_SIZE * NQ) {
|
||||
let blck_idx = elem_idx / BLOCK_SIZE;
|
||||
let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16;
|
||||
let v_row = blck_idx / BLOCKS_V;
|
||||
let global_v_row = kv_tile + v_row;
|
||||
let block_k = blck_idx % BLOCKS_V;
|
||||
let row_offset = v_row * HEAD_DIM_V;
|
||||
|
||||
if (global_v_row < params.seq_len_kv) {
|
||||
let global_block_idx = v_head_offset + global_v_row * params.stride_v1 + block_k;
|
||||
let base_idx = global_block_idx * F16_PER_BLOCK;
|
||||
let d = V[base_idx];
|
||||
for (var j = 0u; j < F16_PER_THREAD; j += 2) {
|
||||
let q_0 = V[base_idx + 1u + block_offset + j];
|
||||
let q_1 = V[base_idx + 1u + block_offset + j + 1];
|
||||
let q_packed = bitcast<u32>(vec2(q_0, q_1));
|
||||
for (var k = 0u; k < 4u; k++) {
|
||||
let q_byte = get_byte_i32(q_packed, k);
|
||||
let q_val = f16(q_byte) * d;
|
||||
let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k;
|
||||
kv_shmem[row_offset + idx] = q_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(KV_DIRECT)
|
||||
// Direct global loads for KV
|
||||
#else
|
||||
for (var elem_idx = local_id.x * 4u; elem_idx < KV_TILE * HEAD_DIM_V; elem_idx += WG_SIZE * 4u) {
|
||||
let v_row = elem_idx / HEAD_DIM_V;
|
||||
let v_col = elem_idx % HEAD_DIM_V;
|
||||
let global_v_row = kv_tile + v_row;
|
||||
let global_v_row_offset = v_head_offset + global_v_row * params.stride_v1;
|
||||
let in_bounds = global_v_row < params.seq_len_kv && (v_col + 3u) < HEAD_DIM_V;
|
||||
let vec_idx = (global_v_row_offset + v_col) >> 2u;
|
||||
let v4 = select(vec4<KV_TYPE>(0.0), V[vec_idx], in_bounds);
|
||||
kv_shmem[elem_idx + 0u] = f16(v4.x);
|
||||
kv_shmem[elem_idx + 1u] = f16(v4.y);
|
||||
kv_shmem[elem_idx + 2u] = f16(v4.z);
|
||||
kv_shmem[elem_idx + 3u] = f16(v4.w);
|
||||
}
|
||||
#endif
|
||||
|
||||
workgroupBarrier();
|
||||
|
||||
if (!skip_tile) {
|
||||
// we have P (Q_TILE x KV_TILE) in inter_shmem and V (KV_TILE x head_dim_v) in kv_shmem
|
||||
// we want to compute O += P * V across the full KV tile
|
||||
let ne_threads : u32 = VEC_NE;
|
||||
let nl_threads = max(1u, subgroup_size / ne_threads);
|
||||
let tx_pv = sg_inv_id % nl_threads;
|
||||
let ty_pv = sg_inv_id / nl_threads;
|
||||
for (var q_tile_row = subgroup_id;
|
||||
q_tile_row < Q_TILE;
|
||||
q_tile_row += num_subgroups) {
|
||||
for (var vec_col = tx_pv; vec_col < (HEAD_DIM_V / 4u); vec_col += nl_threads) {
|
||||
var lo = vec4<f32>(0.0, 0.0, 0.0, 0.0);
|
||||
for (var cc = 0u; cc < KV_TILE / ne_threads; cc += 1u) {
|
||||
let kv_idx = cc * ne_threads + ty_pv;
|
||||
let v_row = kv_tile + kv_idx;
|
||||
if (v_row >= params.seq_len_kv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let p = f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]);
|
||||
#ifdef KV_DIRECT
|
||||
let v_idx = v_head_offset + v_row * params.stride_v1 + vec_col * 4u;
|
||||
let v4 = vec4<f32>(V[v_idx >> 2u]);
|
||||
#else
|
||||
let v_idx = kv_idx * HEAD_DIM_V + vec_col * 4u;
|
||||
let v4 = vec4<f32>(
|
||||
f32(kv_shmem[v_idx + 0u]),
|
||||
f32(kv_shmem[v_idx + 1u]),
|
||||
f32(kv_shmem[v_idx + 2u]),
|
||||
f32(kv_shmem[v_idx + 3u]));
|
||||
#endif
|
||||
lo += p * v4;
|
||||
}
|
||||
|
||||
var lo_x = lo.x;
|
||||
var lo_y = lo.y;
|
||||
var lo_z = lo.z;
|
||||
var lo_w = lo.w;
|
||||
// Reduce over ty threads (NE) for this tx thread.
|
||||
var ty_delta = ne_threads >> 1u;
|
||||
loop {
|
||||
if (ty_delta == 0u) {
|
||||
break;
|
||||
}
|
||||
let thread_delta = ty_delta * nl_threads;
|
||||
let shx = subgroupShuffleDown(lo_x, thread_delta);
|
||||
let shy = subgroupShuffleDown(lo_y, thread_delta);
|
||||
let shz = subgroupShuffleDown(lo_z, thread_delta);
|
||||
let shw = subgroupShuffleDown(lo_w, thread_delta);
|
||||
if (ty_pv < ty_delta) {
|
||||
lo_x += shx;
|
||||
lo_y += shy;
|
||||
lo_z += shz;
|
||||
lo_w += shw;
|
||||
}
|
||||
ty_delta >>= 1u;
|
||||
}
|
||||
|
||||
if (ty_pv == 0u) {
|
||||
let elem_base = vec_col * 4u;
|
||||
let o_base_idx = q_tile_row * HEAD_DIM_V + elem_base;
|
||||
o_shmem[o_base_idx + 0u] = f16(f32(o_shmem[o_base_idx + 0u]) + lo_x);
|
||||
o_shmem[o_base_idx + 1u] = f16(f32(o_shmem[o_base_idx + 1u]) + lo_y);
|
||||
o_shmem[o_base_idx + 2u] = f16(f32(o_shmem[o_base_idx + 2u]) + lo_z);
|
||||
o_shmem[o_base_idx + 3u] = f16(f32(o_shmem[o_base_idx + 3u]) + lo_w);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
workgroupBarrier();
|
||||
}
|
||||
|
||||
|
||||
#ifdef SINKS
|
||||
// Sinks are global terms and must be applied exactly once across split workgroups.
|
||||
if (iwg == 0u) {
|
||||
for (var q_tile_row = subgroup_id;
|
||||
q_tile_row < Q_TILE;
|
||||
q_tile_row += num_subgroups) {
|
||||
let global_q_row = q_row_start + q_tile_row;
|
||||
if (global_q_row >= params.seq_len_q) {
|
||||
break;
|
||||
}
|
||||
|
||||
var prev_max = row_max_shmem[q_tile_row];
|
||||
|
||||
// for non-sink threads, exp(FLOAT_MIN) effectively zeroes out their contribution to the sum
|
||||
let sink_val = select(FLOAT_MIN, sinks[params.offset_sinks + head_idx], sg_inv_id == 0);
|
||||
let new_max = subgroupMax(max(prev_max, sink_val));
|
||||
let max_exp = exp(prev_max - new_max);
|
||||
let sink_exp = exp(sink_val - new_max);
|
||||
|
||||
let sink_exp_sum = subgroupAdd(sink_exp);
|
||||
|
||||
if (sg_inv_id == 0) {
|
||||
row_max_shmem[q_tile_row] = new_max;
|
||||
exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * max_exp + sink_exp_sum;
|
||||
}
|
||||
|
||||
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
|
||||
let idx = q_tile_row * HEAD_DIM_V + elem_idx;
|
||||
o_shmem[idx] = f16(f32(o_shmem[idx]) * max_exp);
|
||||
}
|
||||
}
|
||||
workgroupBarrier();
|
||||
}
|
||||
#endif
|
||||
let rows_per_batch = params.n_heads * params.seq_len_q;
|
||||
for (var q_tile_row = subgroup_id;
|
||||
q_tile_row < Q_TILE;
|
||||
q_tile_row += num_subgroups) {
|
||||
|
||||
let global_q_row = q_row_start + q_tile_row;
|
||||
if (global_q_row >= params.seq_len_q) { break; }
|
||||
|
||||
if (params.nwg == 1u) {
|
||||
let exp_sum = exp_sum_shmem[q_tile_row];
|
||||
let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0.0);
|
||||
let row_base: u32 =
|
||||
params.offset_dst + batch_idx * dst3_stride + global_q_row * dst2_stride + head_idx * HEAD_DIM_V;
|
||||
|
||||
for (var elem_base = sg_inv_id * 4u; elem_base < HEAD_DIM_V; elem_base += subgroup_size * 4u) {
|
||||
let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u);
|
||||
let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u);
|
||||
let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u);
|
||||
let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u);
|
||||
|
||||
let v = vec4<f32>(
|
||||
f32(o_shmem[i0]) * scale,
|
||||
f32(o_shmem[i1]) * scale,
|
||||
f32(o_shmem[i2]) * scale,
|
||||
f32(o_shmem[i3]) * scale
|
||||
);
|
||||
|
||||
let dst_vec_index: u32 = (row_base + elem_base) >> 2u;
|
||||
dst[dst_vec_index] = v;
|
||||
}
|
||||
} else {
|
||||
let rid = batch_idx * rows_per_batch + head_idx * params.seq_len_q + global_q_row;
|
||||
let tmp_row_data_base = params.tmp_data_base + rid * (HEAD_DIM_V * params.nwg) + iwg * HEAD_DIM_V;
|
||||
let tmp_row_stats_base = params.tmp_stats_base + rid * (2u * params.nwg) + 2u * iwg;
|
||||
|
||||
for (var elem_base = sg_inv_id * 4u;
|
||||
elem_base < HEAD_DIM_V;
|
||||
elem_base += subgroup_size * 4u) {
|
||||
|
||||
let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u);
|
||||
let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u);
|
||||
let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u);
|
||||
let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u);
|
||||
|
||||
let tbase = tmp_row_data_base + elem_base;
|
||||
tmp[tbase + 0u] = f32(o_shmem[i0]);
|
||||
tmp[tbase + 1u] = f32(o_shmem[i1]);
|
||||
tmp[tbase + 2u] = f32(o_shmem[i2]);
|
||||
tmp[tbase + 3u] = f32(o_shmem[i3]);
|
||||
}
|
||||
|
||||
if (sg_inv_id == 0u) {
|
||||
tmp[tmp_row_stats_base + 0u] = exp_sum_shmem[q_tile_row];
|
||||
tmp[tmp_row_stats_base + 1u] = row_max_shmem[q_tile_row];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -28,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
|
|||
ExternalProject_Add(
|
||||
zendnn
|
||||
GIT_REPOSITORY https://github.com/amd/ZenDNN.git
|
||||
GIT_TAG a18adf8c605fb5f5e52cefd7eda08a7b18febbaf # ZenDNN-2026-WW08
|
||||
GIT_TAG f79f7321a1add65ced6397a6bfab7edba6e3e14e # ZenDNN-2026-WW13
|
||||
PREFIX ${ZENDNN_PREFIX}
|
||||
SOURCE_DIR ${ZENDNN_SOURCE_DIR}
|
||||
BINARY_DIR ${ZENDNN_BUILD_DIR}
|
||||
|
|
|
|||
|
|
@ -190,6 +190,170 @@ static void ggml_zendnn_compute_forward_mul_mat(
|
|||
}
|
||||
}
|
||||
|
||||
struct mmid_row_mapping {
|
||||
int32_t i1;
|
||||
int32_t i2;
|
||||
};
|
||||
|
||||
static void ggml_zendnn_compute_forward_mul_mat_id(
|
||||
ggml_backend_zendnn_context * ctx,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0]; // expert weights
|
||||
const ggml_tensor * src1 = dst->src[1]; // inputs
|
||||
const ggml_tensor * ids = dst->src[2]; // expert ids
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
// exit for no tokens to process
|
||||
if (ne2 == 0 || ne11 == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_type const vec_dot_type = src0->type;
|
||||
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float_ref;
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == ggml_type_size(src0->type));
|
||||
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb0 <= nb1);
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
GGML_ASSERT(ne03 == 1);
|
||||
GGML_ASSERT(ne13 == 1);
|
||||
GGML_ASSERT(ne3 == 1);
|
||||
|
||||
// row groups
|
||||
const int n_ids = ids->ne[0]; // n_expert_used
|
||||
const int n_as = ne02; // n_experts
|
||||
|
||||
std::vector<int64_t> matrix_row_counts(n_as, 0);
|
||||
std::vector<std::vector<mmid_row_mapping>> matrix_rows(n_as);
|
||||
|
||||
int64_t max_rows = 0;
|
||||
// group rows by expert (preprocessing step)
|
||||
for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
|
||||
for (int id = 0; id < n_ids; ++id) {
|
||||
const int32_t i02 = *(const int32_t *)((const char *)ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
|
||||
|
||||
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
||||
|
||||
matrix_rows[i02].push_back({id, iid1});
|
||||
matrix_row_counts[i02]++;
|
||||
if (matrix_row_counts[i02] > max_rows) {
|
||||
max_rows = matrix_row_counts[i02];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (max_rows == 0) {
|
||||
return; // no rows to process
|
||||
}
|
||||
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
|
||||
// size for converting src1 rows to vec_dot_type if needed
|
||||
const size_t nbw1 = row_size;
|
||||
const size_t nbw2 = nbw1 * ne11;
|
||||
const size_t nbw3 = nbw2 * ne12;
|
||||
const size_t src1_conv_size = (src1->type != vec_dot_type) ? ne13 * nbw3 : 0;
|
||||
|
||||
// size for MoE gather/scatter buffers
|
||||
const size_t wdata_cur_size = max_rows * row_size;
|
||||
const size_t dst_cur_size = max_rows * ggml_row_size(dst->type, ne01);
|
||||
|
||||
// allocate single buffer for all needs
|
||||
const size_t total_size = src1_conv_size + wdata_cur_size + dst_cur_size;
|
||||
if (ctx->work_size < total_size) {
|
||||
ctx->work_data.reset(new char[total_size]);
|
||||
ctx->work_size = total_size;
|
||||
}
|
||||
|
||||
// partition the buffer
|
||||
char * work_data = ctx->work_data.get();
|
||||
char * wdata_cur = work_data + src1_conv_size;
|
||||
char * dst_cur = wdata_cur + wdata_cur_size;
|
||||
|
||||
if (src1->type != vec_dot_type) {
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
#pragma omp parallel for collapse(3) num_threads(ctx->n_threads) schedule(static)
|
||||
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
||||
const float * src1_f32 = (float *)((char *)src1->data + i11*nb11 + i12*nb12 + i13*nb13);
|
||||
void * src1_conv = (char *)work_data + i11*nbw1 + i12*nbw2 + i13*nbw3;
|
||||
from_float(src1_f32, src1_conv, ne10);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const void * wdata = src1->type == vec_dot_type ? src1->data : work_data;
|
||||
|
||||
// process each expert with gather -> gemm -> scatter pattern
|
||||
for (int64_t cur_a = 0; cur_a < n_as; ++cur_a) {
|
||||
const int64_t cne1 = matrix_row_counts[cur_a];
|
||||
|
||||
if (cne1 == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const char * src0_cur = (const char *) src0->data + cur_a*nb02;
|
||||
|
||||
// gather input rows for this expert
|
||||
#pragma omp parallel for num_threads(ctx->n_threads) schedule(static)
|
||||
for (int64_t ir1 = 0; ir1 < cne1; ++ir1) {
|
||||
const mmid_row_mapping & row_mapping = matrix_rows[cur_a][ir1];
|
||||
const int64_t id = row_mapping.i1;
|
||||
const int64_t i11 = id % ne11;
|
||||
const int64_t i12 = row_mapping.i2;
|
||||
|
||||
std::memcpy(
|
||||
wdata_cur + ir1 * row_size,
|
||||
(const char *) wdata + (i11 + i12*ne11) * row_size,
|
||||
row_size
|
||||
);
|
||||
}
|
||||
|
||||
// batched gemm for all tokens in this expert
|
||||
if (!ggml_zendnn_sgemm(ctx,
|
||||
ne01, // m
|
||||
cne1, // n
|
||||
ne10, // k
|
||||
src0_cur,
|
||||
ne00, // lda
|
||||
wdata_cur,
|
||||
ne10, // ldb
|
||||
dst_cur,
|
||||
ne01, // ldc
|
||||
src0->type,
|
||||
vec_dot_type,
|
||||
dst->type)) {
|
||||
GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
|
||||
}
|
||||
|
||||
// scatter output rows to destination
|
||||
#pragma omp parallel for num_threads(ctx->n_threads) schedule(static)
|
||||
for (int64_t ir1 = 0; ir1 < cne1; ++ir1) {
|
||||
const mmid_row_mapping & row_mapping = matrix_rows[cur_a][ir1];
|
||||
const int64_t id = row_mapping.i1;
|
||||
const int64_t i1 = id;
|
||||
const int64_t i2 = row_mapping.i2;
|
||||
|
||||
std::memcpy(
|
||||
(char *) dst->data + i1*nb1 + i2*nb2,
|
||||
dst_cur + ir1 * ggml_row_size(dst->type, ne01),
|
||||
ggml_row_size(dst->type, ne01)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// backend interface
|
||||
|
||||
static const char * ggml_backend_zendnn_get_name(ggml_backend_t backend) {
|
||||
|
|
@ -218,6 +382,9 @@ static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggm
|
|||
case GGML_OP_MUL_MAT:
|
||||
ggml_zendnn_compute_forward_mul_mat(ctx, node);
|
||||
break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
ggml_zendnn_compute_forward_mul_mat_id(ctx, node);
|
||||
break;
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
|
|
@ -361,6 +528,7 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
|
|||
return true;
|
||||
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
const ggml_tensor * weights = op->src[0];
|
||||
const ggml_tensor * inputs = op->src[1];
|
||||
|
|
@ -374,6 +542,17 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
|
|||
ne0 < min_batch || ne1 < min_batch || ne10 < min_batch) {
|
||||
return false;
|
||||
}
|
||||
// MUL_MAT_ID performs best with a moderate number of experts due to its
|
||||
// gather + batched matmul + scatter approach. Future versions will leverage
|
||||
// ZenDNN's grouped_gemm for better scalability with larger expert counts:
|
||||
// https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_group_gemm_operator.md
|
||||
if (op->op == GGML_OP_MUL_MAT_ID) {
|
||||
const int64_t n_experts = weights->ne[2];
|
||||
const int64_t max_experts = 32;
|
||||
if (n_experts > max_experts) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
switch (weights->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_BF16:
|
||||
|
|
|
|||
|
|
@ -419,6 +419,7 @@ class MODEL_ARCH(IntEnum):
|
|||
GEMMA2 = auto()
|
||||
GEMMA3 = auto()
|
||||
GEMMA3N = auto()
|
||||
GEMMA4 = auto()
|
||||
GEMMA_EMBEDDING = auto()
|
||||
STARCODER2 = auto()
|
||||
RWKV6 = auto()
|
||||
|
|
@ -535,8 +536,11 @@ class MODEL_TENSOR(IntEnum):
|
|||
FFN_GATE_INP = auto()
|
||||
FFN_GATE_INP_SHEXP = auto()
|
||||
FFN_NORM = auto()
|
||||
FFN_PRE_NORM = auto()
|
||||
FFN_PRE_NORM = auto() # alias of FFN_NORM
|
||||
FFN_PRE_NORM_2 = auto() # gemma4
|
||||
FFN_POST_NORM = auto()
|
||||
FFN_POST_NORM_1 = auto() # gemma4
|
||||
FFN_POST_NORM_2 = auto() # gemma4
|
||||
FFN_GATE = auto()
|
||||
FFN_DOWN = auto()
|
||||
FFN_UP = auto()
|
||||
|
|
@ -558,6 +562,7 @@ class MODEL_TENSOR(IntEnum):
|
|||
ATTN_Q_NORM = auto()
|
||||
ATTN_K_NORM = auto()
|
||||
LAYER_OUT_NORM = auto()
|
||||
LAYER_OUT_SCALE = auto()
|
||||
PER_LAYER_TOKEN_EMBD = auto() # gemma3n
|
||||
PER_LAYER_MODEL_PROJ = auto() # gemma3n
|
||||
PER_LAYER_INP_GATE = auto() # gemma3n
|
||||
|
|
@ -722,8 +727,11 @@ class MODEL_TENSOR(IntEnum):
|
|||
V_ENC_FFN_UP = auto()
|
||||
V_ENC_FFN_GATE = auto()
|
||||
V_ENC_FFN_DOWN = auto()
|
||||
V_ENC_ATTN_POST_NORM = auto() # gemma4
|
||||
V_ENC_FFN_POST_NORM = auto()
|
||||
V_LAYER_SCALE_1 = auto()
|
||||
V_LAYER_SCALE_2 = auto()
|
||||
V_LAYER_OUT_SCALE = auto()
|
||||
V_PRE_NORM = auto()
|
||||
V_POST_NORM = auto()
|
||||
V_MM_POST_NORM = auto()
|
||||
|
|
@ -761,6 +769,8 @@ class MODEL_TENSOR(IntEnum):
|
|||
V_MM_GATE = auto() # cogvlm
|
||||
V_TOK_BOI = auto() # cogvlm
|
||||
V_TOK_EOI = auto() # cogvlm
|
||||
V_STD_BIAS = auto() # gemma4
|
||||
V_STD_SCALE = auto() # gemma4
|
||||
V_SAM_POS_EMBD = auto() # Deepseek-OCR
|
||||
V_SAM_PATCH_EMBD = auto() # Deepseek-OCR
|
||||
V_SAM_PRE_NORM = auto() # Deepseek-OCR
|
||||
|
|
@ -781,6 +791,7 @@ class MODEL_TENSOR(IntEnum):
|
|||
A_ENC_EMBD_POS = auto()
|
||||
A_ENC_EMBD_NORM = auto()
|
||||
A_ENC_EMBD_TO_LOGITS = auto() # lfm2
|
||||
A_ENC_INP_PROJ = auto() # gemma4
|
||||
A_ENC_CONV1D = auto()
|
||||
A_ENC_CONV1D_NORM = auto() # gemma3n
|
||||
A_PRE_NORM = auto()
|
||||
|
|
@ -789,10 +800,13 @@ class MODEL_TENSOR(IntEnum):
|
|||
A_ENC_ATTN_Q = auto()
|
||||
A_ENC_ATTN_K = auto()
|
||||
A_ENC_ATTN_V = auto()
|
||||
A_ENC_ATTN_POST_NORM = auto()
|
||||
A_ENC_ATTN_PRE_NORM = auto()
|
||||
A_ENC_ATTN_K_REL = auto() # gemma4
|
||||
A_ENC_PER_DIM_SCALE = auto() # gemma3n
|
||||
A_ENC_INPUT_NORM = auto()
|
||||
A_ENC_OUTPUT = auto()
|
||||
A_ENC_OUTPUT_NORM = auto()
|
||||
A_ENC_OUTPUT = auto() # TODO @ngxson: rename to ATTN_OUT
|
||||
A_ENC_OUTPUT_NORM = auto() # TODO @ngxson: rename to ATTN_OUT
|
||||
A_ENC_FFN_UP = auto()
|
||||
A_ENC_FFN_NORM = auto()
|
||||
A_ENC_FFN_POST_NORM = auto() # gemma3n
|
||||
|
|
@ -813,6 +827,8 @@ class MODEL_TENSOR(IntEnum):
|
|||
A_MM_HARD_EMB_NORM = auto() # gemma3n
|
||||
A_MM_SOFT_EMB_NORM = auto() # gemma3n
|
||||
A_MM_INP_PROJ = auto() # gemma3n
|
||||
A_PER_DIM_K_SCALE = auto() # gemma4
|
||||
A_PER_DIM_SCALE = auto() # gemma4
|
||||
# nextn/mtp
|
||||
NEXTN_EH_PROJ = auto()
|
||||
NEXTN_EMBED_TOKENS = auto()
|
||||
|
|
@ -882,6 +898,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||
MODEL_ARCH.GEMMA2: "gemma2",
|
||||
MODEL_ARCH.GEMMA3: "gemma3",
|
||||
MODEL_ARCH.GEMMA3N: "gemma3n",
|
||||
MODEL_ARCH.GEMMA4: "gemma4",
|
||||
MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding",
|
||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||
MODEL_ARCH.RWKV6: "rwkv6",
|
||||
|
|
@ -1000,6 +1017,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
|
||||
MODEL_TENSOR.FFN_PRE_NORM_2: "blk.{bid}.pre_ffw_norm_2", # gemma4
|
||||
MODEL_TENSOR.FFN_POST_NORM_1: "blk.{bid}.post_ffw_norm_1", # gemma4
|
||||
MODEL_TENSOR.FFN_POST_NORM_2: "blk.{bid}.post_ffw_norm_2", # gemma4
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
|
|
@ -1019,6 +1039,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.MOE_LATENT_DOWN: "blk.{bid}.ffn_latent_down", # nemotron 3 super
|
||||
MODEL_TENSOR.MOE_LATENT_UP: "blk.{bid}.ffn_latent_up", # nemotron 3 super
|
||||
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
|
||||
MODEL_TENSOR.LAYER_OUT_SCALE: "blk.{bid}.layer_output_scale",
|
||||
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
|
||||
MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n
|
||||
MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n
|
||||
|
|
@ -1183,8 +1204,11 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.V_ENC_ATTN_POST_NORM: "v.blk.{bid}.attn_post_norm",
|
||||
MODEL_TENSOR.V_ENC_FFN_POST_NORM: "v.blk.{bid}.ffn_post_norm",
|
||||
MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1",
|
||||
MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
|
||||
MODEL_TENSOR.V_LAYER_OUT_SCALE: "v.blk.{bid}.out_scale",
|
||||
MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
|
||||
MODEL_TENSOR.V_POST_NORM: "v.post_ln",
|
||||
MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
|
||||
|
|
@ -1222,6 +1246,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.V_MM_GATE: "mm.gate",
|
||||
MODEL_TENSOR.V_TOK_BOI: "v.boi",
|
||||
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
|
||||
MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4
|
||||
MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4
|
||||
# DeepSeek-OCR SAM
|
||||
MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd",
|
||||
MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd",
|
||||
|
|
@ -1243,6 +1269,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
|
||||
MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
|
||||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
|
||||
MODEL_TENSOR.A_ENC_INP_PROJ: "a.input_projection",
|
||||
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
|
||||
MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm",
|
||||
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
|
||||
|
|
@ -1251,6 +1278,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.A_ENC_ATTN_POST_NORM: "a.blk.{bid}.attn_post_norm",
|
||||
MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: "a.blk.{bid}.attn_pre_norm",
|
||||
MODEL_TENSOR.A_ENC_ATTN_K_REL: "a.blk.{bid}.attn_k_rel",
|
||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale",
|
||||
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
|
||||
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
|
||||
|
|
@ -1275,6 +1305,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n
|
||||
MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n
|
||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n
|
||||
MODEL_TENSOR.A_PER_DIM_K_SCALE: "a.blk.{bid}.per_dim_k_scale", # gemma4
|
||||
MODEL_TENSOR.A_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", # gemma4
|
||||
# lfm2 audio
|
||||
MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv",
|
||||
MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos",
|
||||
|
|
@ -1319,8 +1351,11 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.V_ENC_FFN_UP,
|
||||
MODEL_TENSOR.V_ENC_FFN_GATE,
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN,
|
||||
MODEL_TENSOR.V_ENC_ATTN_POST_NORM,
|
||||
MODEL_TENSOR.V_ENC_FFN_POST_NORM,
|
||||
MODEL_TENSOR.V_LAYER_SCALE_1,
|
||||
MODEL_TENSOR.V_LAYER_SCALE_2,
|
||||
MODEL_TENSOR.V_LAYER_OUT_SCALE,
|
||||
MODEL_TENSOR.V_PRE_NORM,
|
||||
MODEL_TENSOR.V_POST_NORM,
|
||||
MODEL_TENSOR.V_MM_POST_NORM,
|
||||
|
|
@ -1358,6 +1393,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.V_MM_GATE,
|
||||
MODEL_TENSOR.V_TOK_BOI,
|
||||
MODEL_TENSOR.V_TOK_EOI,
|
||||
MODEL_TENSOR.V_STD_BIAS,
|
||||
MODEL_TENSOR.V_STD_SCALE,
|
||||
MODEL_TENSOR.V_SAM_POS_EMBD,
|
||||
MODEL_TENSOR.V_SAM_PATCH_EMBD,
|
||||
MODEL_TENSOR.V_SAM_PRE_NORM,
|
||||
|
|
@ -1375,6 +1412,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.A_ENC_EMBD_POS,
|
||||
MODEL_TENSOR.A_ENC_EMBD_NORM,
|
||||
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
|
||||
MODEL_TENSOR.A_ENC_INP_PROJ,
|
||||
MODEL_TENSOR.A_ENC_CONV1D,
|
||||
MODEL_TENSOR.A_ENC_CONV1D_NORM,
|
||||
MODEL_TENSOR.A_PRE_NORM,
|
||||
|
|
@ -1383,6 +1421,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.A_ENC_ATTN_Q,
|
||||
MODEL_TENSOR.A_ENC_ATTN_K,
|
||||
MODEL_TENSOR.A_ENC_ATTN_V,
|
||||
MODEL_TENSOR.A_ENC_ATTN_POST_NORM,
|
||||
MODEL_TENSOR.A_ENC_ATTN_PRE_NORM,
|
||||
MODEL_TENSOR.A_ENC_ATTN_K_REL,
|
||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE,
|
||||
MODEL_TENSOR.A_ENC_INPUT_NORM,
|
||||
MODEL_TENSOR.A_ENC_OUTPUT,
|
||||
|
|
@ -1416,6 +1457,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.A_MM_SOFT_EMB_NORM,
|
||||
MODEL_TENSOR.A_MM_EMBEDDING,
|
||||
MODEL_TENSOR.A_MM_HARD_EMB_NORM,
|
||||
MODEL_TENSOR.A_PER_DIM_K_SCALE,
|
||||
MODEL_TENSOR.A_PER_DIM_SCALE,
|
||||
],
|
||||
MODEL_ARCH.LLAMA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
|
@ -2273,6 +2316,38 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.LAUREL_R,
|
||||
MODEL_TENSOR.LAUREL_POST_NORM,
|
||||
],
|
||||
MODEL_ARCH.GEMMA4: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_UP_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_POST_NORM,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_PRE_NORM,
|
||||
MODEL_TENSOR.FFN_PRE_NORM_2,
|
||||
MODEL_TENSOR.FFN_POST_NORM,
|
||||
MODEL_TENSOR.FFN_POST_NORM_1,
|
||||
MODEL_TENSOR.FFN_POST_NORM_2,
|
||||
MODEL_TENSOR.LAYER_OUT_SCALE,
|
||||
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
|
||||
MODEL_TENSOR.PER_LAYER_MODEL_PROJ,
|
||||
MODEL_TENSOR.PER_LAYER_INP_GATE,
|
||||
MODEL_TENSOR.PER_LAYER_PROJ,
|
||||
MODEL_TENSOR.PER_LAYER_PROJ_NORM,
|
||||
MODEL_TENSOR.PER_LAYER_POST_NORM,
|
||||
],
|
||||
MODEL_ARCH.GEMMA_EMBEDDING: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
|
|
@ -4010,6 +4085,8 @@ class VisionProjectorType:
|
|||
GEMMA3 = "gemma3"
|
||||
GEMMA3NV = "gemma3nv"
|
||||
GEMMA3NA = "gemma3na"
|
||||
GEMMA4V = "gemma4v"
|
||||
GEMMA4A = "gemma4a"
|
||||
PHI4 = "phi4"
|
||||
IDEFICS3 = "idefics3"
|
||||
PIXTRAL = "pixtral"
|
||||
|
|
|
|||
|
|
@ -799,6 +799,7 @@ class GGUFWriter:
|
|||
def add_shared_kv_layers(self, value: int) -> None:
|
||||
self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
|
||||
|
||||
# if input is array, true means SWA and false means full_attention for each layer
|
||||
def add_sliding_window_pattern(self, value: int | Sequence[bool]) -> None:
|
||||
key = Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch)
|
||||
if isinstance(value, int):
|
||||
|
|
|
|||
|
|
@ -401,6 +401,10 @@ class TensorNameMap:
|
|||
"model.layers.{bid}.pre_mlp_layernorm", # afmoe
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_PRE_NORM_2: (
|
||||
"model.layers.{bid}.pre_feedforward_layernorm_2", # gemma4
|
||||
),
|
||||
|
||||
# Post feed-forward norm
|
||||
MODEL_TENSOR.FFN_POST_NORM: (
|
||||
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
|
||||
|
|
@ -411,6 +415,14 @@ class TensorNameMap:
|
|||
"model.layers.{bid}.post_moe_norm", # grok-2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_POST_NORM_1: (
|
||||
"model.layers.{bid}.post_feedforward_layernorm_1", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_POST_NORM_2: (
|
||||
"model.layers.{bid}.post_feedforward_layernorm_2", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
"layers.{bid}.feed_forward.gate", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe
|
||||
|
|
@ -428,6 +440,7 @@ class TensorNameMap:
|
|||
"layers.{bid}.gate", # mistral-large
|
||||
"backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
|
||||
"model.layers.{bid}.moe.gate", # step3.5
|
||||
"model.layers.{bid}.router.proj", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
||||
|
|
@ -570,6 +583,7 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.FFN_GATE_UP_EXP: (
|
||||
"model.layers.{bid}.mlp.experts.gate_up_proj",
|
||||
"model.layers.{bid}.experts.gate_up_proj", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.MOE_LATENT_DOWN: (
|
||||
|
|
@ -629,6 +643,7 @@ class TensorNameMap:
|
|||
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
|
||||
"model.layers.{bid}.block_sparse_moe.experts.down", # smallthinker
|
||||
"model.layers.{bid}.moe.down_proj", # step3.5
|
||||
"model.layers.{bid}.experts.down_proj", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||
|
|
@ -693,6 +708,10 @@ class TensorNameMap:
|
|||
"model.layers.{bid}.final_layernorm", # bailingmoe2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.LAYER_OUT_SCALE: (
|
||||
"model.layers.{bid}.layer_scalar", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (
|
||||
"model.embed_tokens_per_layer", # gemma3n
|
||||
),
|
||||
|
|
@ -1383,6 +1402,7 @@ class TensorNameMap:
|
|||
"model.vision_model.embeddings.patch_embedding", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.embeddings.patch_embedding",
|
||||
"vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
|
||||
"model.vision_tower.patch_embedder.input_proj", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_NORM: (
|
||||
|
|
@ -1400,6 +1420,7 @@ class TensorNameMap:
|
|||
"model.vision.patch_embedding.position_embedding", # cogvlm
|
||||
"visual.embeddings.position_embedding", # glm4v
|
||||
"vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
|
||||
"model.vision_tower.patch_embedder.position_embedding_table", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
|
||||
|
|
@ -1430,12 +1451,14 @@ class TensorNameMap:
|
|||
"vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
|
||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
|
||||
"vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
|
||||
"visual.blocks.{bid}.attn.q_norm", # GLM-OCR
|
||||
"vision_model.model.layers.{bid}.self_attn.q_norm", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_K: (
|
||||
|
|
@ -1450,12 +1473,14 @@ class TensorNameMap:
|
|||
"vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
|
||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
|
||||
"vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
|
||||
"visual.blocks.{bid}.attn.k_norm", # GLM-OCR
|
||||
"vision_model.model.layers.{bid}.self_attn.k_norm", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_V: (
|
||||
|
|
@ -1470,6 +1495,7 @@ class TensorNameMap:
|
|||
"vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
|
||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
|
||||
"vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM: (
|
||||
|
|
@ -1480,7 +1506,7 @@ class TensorNameMap:
|
|||
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
|
||||
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
|
||||
"vision_model.model.layers.{bid}.input_layernorm", # llama4
|
||||
"vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
|
||||
"visual.blocks.{bid}.norm1", # qwen2vl
|
||||
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
|
||||
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
|
||||
|
|
@ -1505,6 +1531,7 @@ class TensorNameMap:
|
|||
"model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
|
||||
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
|
||||
"vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
|
||||
|
|
@ -1522,6 +1549,7 @@ class TensorNameMap:
|
|||
"model.vision_model.transformer.layers.{bid}.layer_norm2", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
|
||||
"vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
|
||||
"vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_UP: (
|
||||
|
|
@ -1540,12 +1568,14 @@ class TensorNameMap:
|
|||
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
|
||||
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
|
||||
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
|
||||
"vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_GATE: (
|
||||
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
|
||||
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
|
||||
"vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN: (
|
||||
|
|
@ -1564,6 +1594,15 @@ class TensorNameMap:
|
|||
"model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
|
||||
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
|
||||
"vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_POST_NORM: (
|
||||
"vision_model.model.layers.{bid}.post_attention_layernorm", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_POST_NORM: (
|
||||
"vision_model.model.layers.{bid}.post_feedforward_layernorm", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_LAYER_SCALE_1: (
|
||||
|
|
@ -1576,6 +1615,10 @@ class TensorNameMap:
|
|||
"model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_LAYER_OUT_SCALE: (
|
||||
"vision_model.model.layers.{bid}.layer_scalar", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_PRE_NORM: (
|
||||
"vision_tower.vision_model.pre_layrnorm",
|
||||
"vision_tower.ln_pre", # pixtral-hf
|
||||
|
|
@ -1763,6 +1806,14 @@ class TensorNameMap:
|
|||
"model.vision.eoi", # cogvlm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_STD_BIAS: (
|
||||
"model.vision_tower.std_bias", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_STD_SCALE: (
|
||||
"model.vision_tower.std_scale", # gemma4
|
||||
),
|
||||
|
||||
# audio (mtmd)
|
||||
|
||||
MODEL_TENSOR.A_ENC_EMBD_POS: (
|
||||
|
|
@ -1782,10 +1833,15 @@ class TensorNameMap:
|
|||
"audio_tower.conv{bid}", # ultravox
|
||||
"conformer.pre_encode.conv.{bid}", # lfm2
|
||||
"model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
|
||||
"conformer.subsample_conv_projection.layer{bid}.conv", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_CONV1D_NORM: (
|
||||
"model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
|
||||
"conformer.subsample_conv_projection.layer{bid}.norm", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_INP_PROJ: (
|
||||
"conformer.subsample_conv_projection.input_proj_linear", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_PRE_NORM: (),
|
||||
|
|
@ -1799,22 +1855,38 @@ class TensorNameMap:
|
|||
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.q_proj", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_K: (
|
||||
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.k_proj", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_V: (
|
||||
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
|
||||
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.v_proj", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_K_REL: (
|
||||
"conformer.layers.{bid}.self_attn.relative_k_proj", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_POST_NORM: (
|
||||
"conformer.layers.{bid}.norm_post_attn", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: (
|
||||
"conformer.layers.{bid}.norm_pre_attn", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
|
||||
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.per_dim_scale", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
|
||||
|
|
@ -1831,6 +1903,7 @@ class TensorNameMap:
|
|||
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
|
||||
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
|
||||
"conformer.layers.{bid}.attention.post", # gemma3n
|
||||
"conformer.layers.{bid}.self_attn.post", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
|
||||
|
|
@ -1842,10 +1915,12 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.A_ENC_FFN_NORM: (
|
||||
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
|
||||
"conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward1.post_layer_norm", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE: (
|
||||
|
|
@ -1856,6 +1931,7 @@ class TensorNameMap:
|
|||
"audio_tower.layers.{bid}.fc1", # ultravox
|
||||
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_GATE: (),
|
||||
|
|
@ -1864,25 +1940,30 @@ class TensorNameMap:
|
|||
"audio_tower.layers.{bid}.fc2", # ultravox
|
||||
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_UP_1: (
|
||||
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
|
||||
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
|
||||
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
|
||||
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
|
||||
"conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
|
||||
"conformer.layers.{bid}.feed_forward2.post_layer_norm", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
|
||||
|
|
@ -1904,7 +1985,8 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.A_ENC_OUT: (
|
||||
"conformer.pre_encode.out", # lfm2
|
||||
"model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
|
||||
"model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n (note: it should be A_ENC_INP_PROJ, this is a mistake; it should be corrected in C++ code when it's supported)
|
||||
"conformer.output_proj", # gemma4
|
||||
),
|
||||
|
||||
# note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
|
||||
|
|
@ -1918,6 +2000,7 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.A_MMPROJ_FC: (
|
||||
"audio.multi_modal_projector.linear", # qwen2audio
|
||||
"audio_tower.proj", # qwen2omni
|
||||
"model.audio_tower.output_proj" # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_MM_NORM_PRE: (
|
||||
|
|
@ -1953,6 +2036,14 @@ class TensorNameMap:
|
|||
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_PER_DIM_K_SCALE: (
|
||||
"conformer.layers.{bid}.attention.attn.per_dim_key_scale", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_PER_DIM_SCALE: (
|
||||
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_MM_EMBEDDING: (
|
||||
"model.embed_audio.embedding", # gemma3n
|
||||
),
|
||||
|
|
|
|||
|
|
@ -0,0 +1,266 @@
|
|||
{%- macro format_parameters(properties, required) -%}
|
||||
{%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
|
||||
{%- set ns = namespace(found_first=false) -%}
|
||||
{%- for key, value in properties | dictsort -%}
|
||||
{%- set add_comma = false -%}
|
||||
{%- if key not in standard_keys -%}
|
||||
{%- if ns.found_first %},{% endif -%}
|
||||
{%- set ns.found_first = true -%}
|
||||
{{ key }}:{
|
||||
{%- if value['description'] -%}
|
||||
description:<|"|>{{ value['description'] }}<|"|>
|
||||
{%- set add_comma = true -%}
|
||||
{%- endif -%}
|
||||
{%- if value['nullable'] %}
|
||||
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
|
||||
nullable:true
|
||||
{%- endif -%}
|
||||
{%- if value['type'] | upper == 'STRING' -%}
|
||||
{%- if value['enum'] -%}
|
||||
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
|
||||
enum:{{ format_argument(value['enum']) }}
|
||||
{%- endif -%}
|
||||
{%- elif value['type'] | upper == 'OBJECT' -%}
|
||||
,properties:{
|
||||
{%- if value['properties'] is defined and value['properties'] is mapping -%}
|
||||
{{- format_parameters(value['properties'], value['required'] | default([])) -}}
|
||||
{%- elif value is mapping -%}
|
||||
{{- format_parameters(value, value['required'] | default([])) -}}
|
||||
{%- endif -%}
|
||||
}
|
||||
{%- if value['required'] -%}
|
||||
,required:[
|
||||
{%- for item in value['required'] | default([]) -%}
|
||||
<|"|>{{- item -}}<|"|>
|
||||
{%- if not loop.last %},{% endif -%}
|
||||
{%- endfor -%}
|
||||
]
|
||||
{%- endif -%}
|
||||
{%- elif value['type'] | upper == 'ARRAY' -%}
|
||||
{%- if value['items'] is mapping and value['items'] -%}
|
||||
,items:{
|
||||
{%- set ns_items = namespace(found_first=false) -%}
|
||||
{%- for item_key, item_value in value['items'] | dictsort -%}
|
||||
{%- if item_value is not none -%}
|
||||
{%- if ns_items.found_first %},{% endif -%}
|
||||
{%- set ns_items.found_first = true -%}
|
||||
{%- if item_key == 'properties' -%}
|
||||
properties:{
|
||||
{%- if item_value is mapping -%}
|
||||
{{- format_parameters(item_value, value['items']['required'] | default([])) -}}
|
||||
{%- endif -%}
|
||||
}
|
||||
{%- elif item_key == 'required' -%}
|
||||
required:[
|
||||
{%- for req_item in item_value -%}
|
||||
<|"|>{{- req_item -}}<|"|>
|
||||
{%- if not loop.last %},{% endif -%}
|
||||
{%- endfor -%}
|
||||
]
|
||||
{%- elif item_key == 'type' -%}
|
||||
{%- if item_value is string -%}
|
||||
type:{{ format_argument(item_value | upper) }}
|
||||
{%- else -%}
|
||||
type:{{ format_argument(item_value | map('upper') | list) }}
|
||||
{%- endif -%}
|
||||
{%- else -%}
|
||||
{{ item_key }}:{{ format_argument(item_value) }}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
|
||||
type:<|"|>{{ value['type'] | upper }}<|"|>}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- endmacro -%}
|
||||
{%- macro format_function_declaration(tool_data) -%}
|
||||
declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
|
||||
{%- set params = tool_data['function']['parameters'] -%}
|
||||
{%- if params -%}
|
||||
,parameters:{
|
||||
{%- if params['properties'] -%}
|
||||
properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
|
||||
{%- endif -%}
|
||||
{%- if params['required'] -%}
|
||||
required:[
|
||||
{%- for item in params['required'] -%}
|
||||
<|"|>{{- item -}}<|"|>
|
||||
{{- ',' if not loop.last -}}
|
||||
{%- endfor -%}
|
||||
],
|
||||
{%- endif -%}
|
||||
{%- if params['type'] -%}
|
||||
type:<|"|>{{- params['type'] | upper -}}<|"|>}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- if 'response' in tool_data['function'] -%}
|
||||
{%- set response_declaration = tool_data['function']['response'] -%}
|
||||
,response:{
|
||||
{%- if response_declaration['description'] -%}
|
||||
description:<|"|>{{- response_declaration['description'] -}}<|"|>,
|
||||
{%- endif -%}
|
||||
{%- if response_declaration['type'] | upper == 'OBJECT' -%}
|
||||
type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
}
|
||||
{%- endmacro -%}
|
||||
{%- macro format_argument(argument, escape_keys=True) -%}
|
||||
{%- if argument is string -%}
|
||||
{{- '<|"|>' + argument + '<|"|>' -}}
|
||||
{%- elif argument is boolean -%}
|
||||
{{- 'true' if argument else 'false' -}}
|
||||
{%- elif argument is mapping -%}
|
||||
{{- '{' -}}
|
||||
{%- set ns = namespace(found_first=false) -%}
|
||||
{%- for key, value in argument | dictsort -%}
|
||||
{%- if ns.found_first %},{% endif -%}
|
||||
{%- set ns.found_first = true -%}
|
||||
{%- if escape_keys -%}
|
||||
{{- '<|"|>' + key + '<|"|>' -}}
|
||||
{%- else -%}
|
||||
{{- key -}}
|
||||
{%- endif -%}
|
||||
:{{- format_argument(value, escape_keys=escape_keys) -}}
|
||||
{%- endfor -%}
|
||||
{{- '}' -}}
|
||||
{%- elif argument is sequence -%}
|
||||
{{- '[' -}}
|
||||
{%- for item in argument -%}
|
||||
{{- format_argument(item, escape_keys=escape_keys) -}}
|
||||
{%- if not loop.last %},{% endif -%}
|
||||
{%- endfor -%}
|
||||
{{- ']' -}}
|
||||
{%- else -%}
|
||||
{{- argument -}}
|
||||
{%- endif -%}
|
||||
{%- endmacro -%}
|
||||
{%- macro strip_thinking(text) -%}
|
||||
{%- set ns = namespace(result='') -%}
|
||||
{%- for part in text.split('<channel|>') -%}
|
||||
{%- if '<|channel>' in part -%}
|
||||
{%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
|
||||
{%- else -%}
|
||||
{%- set ns.result = ns.result + part -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{{- ns.result | trim -}}
|
||||
{%- endmacro -%}
|
||||
|
||||
{%- set ns = namespace(prev_message_type=None) -%}
|
||||
{%- set loop_messages = messages -%}
|
||||
{{ bos_token }}
|
||||
{#- Handle System/Tool Definitions Block -#}
|
||||
{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
|
||||
{{- '<|turn>system\n' -}}
|
||||
|
||||
{#- Inject Thinking token at the very top of the FIRST system turn -#}
|
||||
{%- if enable_thinking is defined and enable_thinking -%}
|
||||
{{- '<|think|>' -}}
|
||||
{%- set ns.prev_message_type = 'think' -%}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if messages[0]['role'] in ['system', 'developer'] -%}
|
||||
{{- messages[0]['content'] | trim -}}
|
||||
{%- set loop_messages = messages[1:] -%}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if tools -%}
|
||||
{%- for tool in tools %}
|
||||
{{- '<|tool>' -}}
|
||||
{{- format_function_declaration(tool) | trim -}}
|
||||
{{- '<tool|>' -}}
|
||||
{%- endfor %}
|
||||
{%- set ns.prev_message_type = 'tool' -%}
|
||||
{%- endif -%}
|
||||
|
||||
{{- '<turn|>\n' -}}
|
||||
{%- endif %}
|
||||
|
||||
{#- Loop through messages -#}
|
||||
{%- for message in loop_messages -%}
|
||||
{%- set ns.prev_message_type = None -%}
|
||||
{%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
|
||||
{{- '<|turn>' + role + '\n' }}
|
||||
|
||||
{%- if message['tool_calls'] -%}
|
||||
{%- for tool_call in message['tool_calls'] -%}
|
||||
{%- set function = tool_call['function'] -%}
|
||||
{{- '<|tool_call>call:' + function['name'] + '{' -}}
|
||||
{%- if function['arguments'] is mapping -%}
|
||||
{%- set ns_args = namespace(found_first=false) -%}
|
||||
{%- for key, value in function['arguments'] | dictsort -%}
|
||||
{%- if ns_args.found_first %},{% endif -%}
|
||||
{%- set ns_args.found_first = true -%}
|
||||
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
|
||||
{%- endfor -%}
|
||||
{%- elif function['arguments'] is string -%}
|
||||
{{- function['arguments'] -}}
|
||||
{%- endif -%}
|
||||
{{- '}<tool_call|>' -}}
|
||||
{%- endfor -%}
|
||||
{%- set ns.prev_message_type = 'tool_call' -%}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if message['tool_responses'] -%}
|
||||
{#- Tool Response handling -#}
|
||||
{%- for tool_response in message['tool_responses'] -%}
|
||||
{{- '<|tool_response>' -}}
|
||||
{%- if tool_response['response'] is mapping -%}
|
||||
{{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}
|
||||
{%- for key, value in tool_response['response'] | dictsort -%}
|
||||
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
|
||||
{%- if not loop.last %},{% endif -%}
|
||||
{%- endfor -%}
|
||||
{{- '}' -}}
|
||||
{%- else -%}
|
||||
{{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}
|
||||
{%- endif -%}
|
||||
{{- '<tool_response|>' -}}
|
||||
{%- endfor -%}
|
||||
{%- set ns.prev_message_type = 'tool_response' -%}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if message['content'] is string -%}
|
||||
{%- if role == 'model' -%}
|
||||
{{- strip_thinking(message['content']) -}}
|
||||
{%- else -%}
|
||||
{{- message['content'] | trim -}}
|
||||
{%- endif -%}
|
||||
{%- elif message['content'] is sequence -%}
|
||||
{%- for item in message['content'] -%}
|
||||
{%- if item['type'] == 'text' -%}
|
||||
{%- if role == 'model' -%}
|
||||
{{- strip_thinking(item['text']) -}}
|
||||
{%- else -%}
|
||||
{{- item['text'] | trim -}}
|
||||
{%- endif -%}
|
||||
{%- elif item['type'] == 'image' -%}
|
||||
{{- '\n\n<|image|>\n\n' -}}
|
||||
{%- set ns.prev_message_type = 'image' -%}
|
||||
{%- elif item['type'] == 'audio' -%}
|
||||
{{- '<|audio|>' -}}
|
||||
{%- set ns.prev_message_type = 'audio' -%}
|
||||
{%- elif item['type'] == 'video' -%}
|
||||
{{- '\n\n<|video|>\n\n' -}}
|
||||
{%- set ns.prev_message_type = 'video' -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if not (message['tool_responses'] and not message['content']) -%}
|
||||
{{- '<turn|>\n' -}}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if add_generation_prompt -%}
|
||||
{%- if ns.prev_message_type != 'tool_response' -%}
|
||||
{{- '<|turn>model\n' -}}
|
||||
{%- endif -%}
|
||||
{%- if not enable_thinking | default(false) -%}
|
||||
{{- '<|channel>thought\n<channel|>' -}}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
|
|
@ -73,6 +73,7 @@ add_library(llama
|
|||
models/gemma2-iswa.cpp
|
||||
models/gemma3.cpp
|
||||
models/gemma3n-iswa.cpp
|
||||
models/gemma4-iswa.cpp
|
||||
models/glm4-moe.cpp
|
||||
models/glm4.cpp
|
||||
models/gpt2.cpp
|
||||
|
|
|
|||
|
|
@ -56,6 +56,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|||
{ LLM_ARCH_GEMMA2, "gemma2" },
|
||||
{ LLM_ARCH_GEMMA3, "gemma3" },
|
||||
{ LLM_ARCH_GEMMA3N, "gemma3n" },
|
||||
{ LLM_ARCH_GEMMA4, "gemma4" },
|
||||
{ LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
|
||||
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
||||
{ LLM_ARCH_MAMBA, "mamba" },
|
||||
|
|
@ -165,6 +166,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
||||
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
|
||||
{ LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
|
||||
{ LLM_KV_EMBEDDING_LENGTH_PER_LAYER, "%s.embedding_length_per_layer_input" },
|
||||
{ LLM_KV_FEATURES_LENGTH, "%s.features_length" },
|
||||
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
|
||||
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
|
||||
|
|
@ -238,6 +240,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||
{ LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
|
||||
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
|
||||
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
|
||||
{ LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
|
||||
|
|
@ -364,6 +367,9 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
|
|||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
|
||||
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
||||
{ LLM_TENSOR_FFN_POST_NORM_1, "blk.%d.post_ffw_norm_1" },
|
||||
{ LLM_TENSOR_FFN_POST_NORM_2, "blk.%d.post_ffw_norm_2" },
|
||||
{ LLM_TENSOR_FFN_PRE_NORM_2, "blk.%d.pre_ffw_norm_2" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
|
|
@ -373,6 +379,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
|
|||
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
||||
{ LLM_TENSOR_LAYER_OUT_SCALE, "blk.%d.layer_output_scale" },
|
||||
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
||||
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
||||
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
|
||||
|
|
@ -1342,6 +1349,38 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||
LLM_TENSOR_LAUREL_R,
|
||||
LLM_TENSOR_LAUREL_POST_NORM,
|
||||
};
|
||||
case LLM_ARCH_GEMMA4:
|
||||
return {
|
||||
LLM_TENSOR_ROPE_FREQS,
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
LLM_TENSOR_ATTN_K,
|
||||
LLM_TENSOR_ATTN_K_NORM,
|
||||
LLM_TENSOR_ATTN_V,
|
||||
LLM_TENSOR_ATTN_OUT,
|
||||
LLM_TENSOR_ATTN_POST_NORM,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_FFN_GATE,
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
LLM_TENSOR_FFN_GATE_UP_EXPS,
|
||||
LLM_TENSOR_FFN_DOWN_EXPS,
|
||||
LLM_TENSOR_FFN_GATE_INP,
|
||||
LLM_TENSOR_FFN_POST_NORM,
|
||||
LLM_TENSOR_FFN_POST_NORM_1,
|
||||
LLM_TENSOR_FFN_POST_NORM_2,
|
||||
LLM_TENSOR_FFN_PRE_NORM_2,
|
||||
LLM_TENSOR_LAYER_OUT_SCALE,
|
||||
LLM_TENSOR_PER_LAYER_TOKEN_EMBD,
|
||||
LLM_TENSOR_PER_LAYER_MODEL_PROJ,
|
||||
LLM_TENSOR_PER_LAYER_PROJ_NORM,
|
||||
LLM_TENSOR_PER_LAYER_INP_GATE,
|
||||
LLM_TENSOR_PER_LAYER_PROJ,
|
||||
LLM_TENSOR_PER_LAYER_POST_NORM,
|
||||
};
|
||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||
return {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
|
|
@ -2654,11 +2693,15 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|||
{LLM_TENSOR_ATTN_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_ATTN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_FFN_PRE_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_FFN_POST_NORM_1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_FFN_POST_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_FFN_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_FFN_NORM_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_LAYER_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_LAYER_OUT_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_ATTN_Q_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_ATTN_KV_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_ATTN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ enum llm_arch {
|
|||
LLM_ARCH_GEMMA2,
|
||||
LLM_ARCH_GEMMA3,
|
||||
LLM_ARCH_GEMMA3N,
|
||||
LLM_ARCH_GEMMA4,
|
||||
LLM_ARCH_GEMMA_EMBEDDING,
|
||||
LLM_ARCH_STARCODER2,
|
||||
LLM_ARCH_MAMBA,
|
||||
|
|
@ -169,6 +170,7 @@ enum llm_kv {
|
|||
LLM_KV_CONTEXT_LENGTH,
|
||||
LLM_KV_EMBEDDING_LENGTH,
|
||||
LLM_KV_EMBEDDING_LENGTH_OUT,
|
||||
LLM_KV_EMBEDDING_LENGTH_PER_LAYER,
|
||||
LLM_KV_FEATURES_LENGTH,
|
||||
LLM_KV_BLOCK_COUNT,
|
||||
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
|
||||
|
|
@ -242,6 +244,7 @@ enum llm_kv {
|
|||
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
|
||||
LLM_KV_ATTENTION_INDEXER_TOP_K,
|
||||
LLM_KV_ATTENTION_SHARED_KV_LAYERS,
|
||||
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
|
||||
|
|
@ -369,6 +372,9 @@ enum llm_tensor {
|
|||
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_FFN_POST_NORM,
|
||||
LLM_TENSOR_FFN_POST_NORM_1,
|
||||
LLM_TENSOR_FFN_POST_NORM_2,
|
||||
LLM_TENSOR_FFN_PRE_NORM_2,
|
||||
LLM_TENSOR_FFN_GATE,
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
|
|
@ -393,6 +399,7 @@ enum llm_tensor {
|
|||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
LLM_TENSOR_ATTN_K_NORM,
|
||||
LLM_TENSOR_LAYER_OUT_NORM,
|
||||
LLM_TENSOR_LAYER_OUT_SCALE,
|
||||
LLM_TENSOR_POST_ATTN_NORM,
|
||||
LLM_TENSOR_POST_MLP_NORM,
|
||||
LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
#pragma once
|
||||
|
||||
#include "llama-context.h"
|
||||
#include "ggml.h"
|
||||
#include "stdint.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
|
||||
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
||||
|
|
@ -10,3 +10,47 @@ LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
|||
uint32_t n_tokens,
|
||||
uint32_t n_seqs,
|
||||
uint32_t n_outputs);
|
||||
|
||||
// Get the default ggml_type for a given ftype.
|
||||
LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype);
|
||||
|
||||
// Quantization state.
|
||||
struct quantize_state_impl;
|
||||
|
||||
LLAMA_API quantize_state_impl * llama_quant_init(
|
||||
const llama_model * model,
|
||||
const llama_model_quantize_params * params);
|
||||
|
||||
LLAMA_API void llama_quant_free(quantize_state_impl * qs);
|
||||
|
||||
// Descriptor for constructing a mock model for quantization testing.
|
||||
struct llama_quant_model_desc {
|
||||
const char * architecture;
|
||||
uint32_t n_embd;
|
||||
uint32_t n_ff;
|
||||
uint32_t n_layer;
|
||||
uint32_t n_head;
|
||||
uint32_t n_head_kv;
|
||||
uint32_t n_expert;
|
||||
uint32_t n_embd_head_k;
|
||||
uint32_t n_embd_head_v;
|
||||
};
|
||||
|
||||
// Create a mock model from a metadata descriptor (for testing).
|
||||
// The returned model must be freed with llama_model_free().
|
||||
LLAMA_API llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc);
|
||||
|
||||
// Returns true if this tensor should be quantized (based on name, dims, params).
|
||||
LLAMA_API bool llama_quant_tensor_allows_quantization(
|
||||
const quantize_state_impl * qs,
|
||||
const ggml_tensor * tensor);
|
||||
|
||||
// Compute quantization type assignments for a list of tensors.
|
||||
// All tensors should be quantizable (use llama_quant_tensor_allows_quantization to filter).
|
||||
// result_types: caller-allocated array of n_tensors elements, filled with assigned types.
|
||||
LLAMA_API void llama_quant_compute_types(
|
||||
quantize_state_impl * qs,
|
||||
llama_ftype ftype,
|
||||
ggml_tensor ** tensors,
|
||||
ggml_type * result_types,
|
||||
size_t n_tensors);
|
||||
|
|
|
|||
|
|
@ -209,6 +209,9 @@ struct llama_hparams {
|
|||
// qwen3vl deepstack
|
||||
uint32_t n_deepstack_layers = 0;
|
||||
|
||||
// gemma4 per-layer embedding
|
||||
uint32_t n_embd_per_layer = 0;
|
||||
|
||||
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/8141
|
||||
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
||||
|
|
|
|||
|
|
@ -66,9 +66,8 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
|
|||
|
||||
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
|
||||
|
||||
// note: the SWA cache is never quantized because it is relatively small
|
||||
kv_swa = std::make_unique<llama_kv_cache>(
|
||||
model, GGML_TYPE_F16, GGML_TYPE_F16,
|
||||
model, type_k, type_v,
|
||||
v_trans, offload, unified, size_swa, n_seq_max, n_pad,
|
||||
hparams.n_swa, hparams.swa_type, filter_swa, reuse);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1261,6 +1261,31 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA4:
|
||||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
|
||||
|
||||
uint32_t n_kv_shared_layers = 0;
|
||||
ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false);
|
||||
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers;
|
||||
hparams.f_attention_scale = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling)
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH_PER_LAYER, hparams.n_embd_per_layer);
|
||||
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
|
||||
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 35: type = LLM_TYPE_E2B; break;
|
||||
case 42: type = LLM_TYPE_E4B; break; // to confirm: E4B or E5B?
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
|
||||
|
|
@ -4229,6 +4254,100 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA4:
|
||||
{
|
||||
const uint32_t n_embd_per_layer = hparams.n_embd_per_layer;
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||
|
||||
if (n_embd_head_k != n_embd_head_v) {
|
||||
throw std::runtime_error("Gemma 4 requires n_embd_head_k == n_embd_head_v");
|
||||
}
|
||||
if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) {
|
||||
throw std::runtime_error("Gemma 4 requires n_embd_head_k_swa == n_embd_head_v_swa");
|
||||
}
|
||||
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (output == NULL) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
if (n_embd_per_layer > 0) {
|
||||
tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_per_layer * n_layer, n_vocab}, 0);
|
||||
per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_per_layer * n_layer}, 0);
|
||||
per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_per_layer}, 0);
|
||||
}
|
||||
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
|
||||
int rope_freqs_flag = 0;
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
const int64_t n_head = hparams.n_head(i);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k(i);
|
||||
const int64_t n_embd_k = hparams.n_embd_k_gqa(i);
|
||||
const int64_t n_embd_v = hparams.n_embd_v_gqa(i);
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
// note: use_alternative_attention (v_proj is optional, if it's not present, use k_proj)
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head * n_head}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v}, TENSOR_NOT_REQUIRED);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head * n_head, n_embd}, 0);
|
||||
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head}, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head}, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), {1u}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
if (!hparams.is_swa(i)) {
|
||||
// full_attention layers use rope_freqs for proportional rope
|
||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_embd_head/2}, rope_freqs_flag);
|
||||
rope_freqs_flag = TENSOR_DUPLICATED;
|
||||
}
|
||||
|
||||
// handle use_double_wide_mlp
|
||||
int64_t n_ff_cur = hparams.n_ff(i);
|
||||
|
||||
// for expert layers, we use normal FFN as shared expert (same as python code)
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_cur}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
|
||||
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
// MoE router
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
|
||||
bool has_expert = layer.ffn_gate_inp != nullptr;
|
||||
|
||||
// norm
|
||||
if (has_expert) {
|
||||
layer.ffn_gate_inp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "scale", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_pre_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_PRE_NORM_2, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_post_norm_1 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_1, "weight", i), {n_embd}, 0);
|
||||
layer.ffn_post_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_2, "weight", i), {n_embd}, 0);
|
||||
|
||||
// MoE FFN
|
||||
layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", i), {n_embd, n_ff_exp * 2, n_expert}, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
|
||||
// per-expert scale will be loaded as down_exps_s at the end of the current switch case
|
||||
}
|
||||
|
||||
// per-layer embeddings
|
||||
if (n_embd_per_layer > 0) {
|
||||
layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_per_layer}, 0);
|
||||
layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_per_layer, n_embd}, 0);
|
||||
layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_STARCODER2:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
|
@ -8233,7 +8352,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
} else {
|
||||
llama_memory_i::layer_reuse_cb reuse = nullptr;
|
||||
|
||||
if (arch == LLM_ARCH_GEMMA3N) {
|
||||
if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) {
|
||||
reuse = [&](int32_t il) {
|
||||
if (il >= (int32_t) hparams.n_layer_kv_from_start) {
|
||||
return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
|
||||
|
|
@ -8486,6 +8605,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|||
{
|
||||
llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA4:
|
||||
{
|
||||
llm = std::make_unique<llm_build_gemma4_iswa>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||
{
|
||||
llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
|
||||
|
|
@ -9006,6 +9129,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|||
case LLM_ARCH_GEMMA2:
|
||||
case LLM_ARCH_GEMMA3:
|
||||
case LLM_ARCH_GEMMA3N:
|
||||
case LLM_ARCH_GEMMA4:
|
||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||
case LLM_ARCH_STARCODER2:
|
||||
case LLM_ARCH_OPENELM:
|
||||
|
|
|
|||
|
|
@ -270,6 +270,9 @@ struct llama_layer {
|
|||
struct ggml_tensor * ffn_norm = nullptr;
|
||||
struct ggml_tensor * ffn_norm_b = nullptr;
|
||||
struct ggml_tensor * ffn_post_norm = nullptr;
|
||||
struct ggml_tensor * ffn_post_norm_1 = nullptr; // gemma4
|
||||
struct ggml_tensor * ffn_post_norm_2 = nullptr; // gemma4
|
||||
struct ggml_tensor * ffn_pre_norm_2 = nullptr; // gemma4
|
||||
struct ggml_tensor * layer_out_norm = nullptr;
|
||||
struct ggml_tensor * layer_out_norm_b = nullptr;
|
||||
struct ggml_tensor * ffn_norm_exps = nullptr;
|
||||
|
|
@ -285,6 +288,7 @@ struct llama_layer {
|
|||
|
||||
// ff MoE
|
||||
struct ggml_tensor * ffn_gate_inp = nullptr;
|
||||
struct ggml_tensor * ffn_gate_inp_s = nullptr; // gemma4
|
||||
struct ggml_tensor * ffn_gate_exps = nullptr;
|
||||
struct ggml_tensor * ffn_down_exps = nullptr;
|
||||
struct ggml_tensor * ffn_up_exps = nullptr;
|
||||
|
|
@ -483,6 +487,9 @@ struct llama_layer {
|
|||
struct ggml_tensor * indexer_attn_k = nullptr;
|
||||
struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
|
||||
|
||||
// gemma4 layer output scale
|
||||
struct ggml_tensor * out_scale = nullptr;
|
||||
|
||||
struct llama_layer_posnet posnet;
|
||||
|
||||
struct llama_layer_convnext convnext;
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
#include "llama.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-model.h"
|
||||
#include "llama-model-loader.h"
|
||||
#include "llama-ext.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <cinttypes>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
|
|
@ -197,6 +197,7 @@ struct quantize_state_impl {
|
|||
|
||||
// per-tensor metadata, computed in the preliminary loop and used in the main loop
|
||||
struct tensor_metadata {
|
||||
std::string name;
|
||||
ggml_type target_type;
|
||||
tensor_category category;
|
||||
std::string remapped_imatrix_name;
|
||||
|
|
@ -788,7 +789,7 @@ static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type ds
|
|||
// given a file type, get the default tensor type
|
||||
//
|
||||
|
||||
static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
||||
ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
||||
switch (ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
|
||||
|
|
@ -827,16 +828,32 @@ static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_MOSTLY_IQ3_S:
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
|
||||
|
||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
default: return GGML_TYPE_COUNT;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata) {
|
||||
for (auto & tm : metadata) {
|
||||
tensor_category cat = tensor_get_category(tm.name);
|
||||
tm.category = cat;
|
||||
|
||||
if (category_is_attn_v(cat)) {
|
||||
++qs.n_attention_wv;
|
||||
}
|
||||
|
||||
if (cat == tensor_category::OUTPUT) {
|
||||
qs.has_tied_embeddings = false;
|
||||
}
|
||||
}
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
|
||||
}
|
||||
|
||||
//
|
||||
// main quantization driver
|
||||
//
|
||||
|
||||
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
||||
ggml_type default_type;
|
||||
llama_ftype ftype = params->ftype;
|
||||
|
||||
int nthread = params->nthread;
|
||||
|
|
@ -845,7 +862,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
nthread = std::thread::hardware_concurrency();
|
||||
}
|
||||
|
||||
default_type = llama_ftype_get_default_type(ftype);
|
||||
ggml_type default_type = llama_ftype_get_default_type(ftype);
|
||||
if (default_type == GGML_TYPE_COUNT) {
|
||||
throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
}
|
||||
|
||||
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
||||
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
||||
|
|
@ -964,6 +984,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
});
|
||||
}
|
||||
|
||||
// compute tensor metadata once and cache it
|
||||
std::vector<tensor_metadata> metadata(tensors.size());
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
metadata[i].name = ggml_get_name(tensors[i]->tensor);
|
||||
}
|
||||
|
||||
// initialize quantization state counters and metadata categories
|
||||
init_quantize_state_counters(qs, metadata);
|
||||
|
||||
int idx = 0;
|
||||
uint16_t n_split = 1;
|
||||
|
||||
|
|
@ -976,25 +1005,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
std::vector<gguf_context_ptr> ctx_outs(n_split);
|
||||
ctx_outs[0] = std::move(ctx_out);
|
||||
|
||||
// compute tensor metadata once and cache it
|
||||
std::vector<tensor_metadata> metadata(tensors.size());
|
||||
|
||||
// initialize quantization state before preliminary loop (counters for use_more_bits)
|
||||
{
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
const auto cat = tensor_get_category(tensors[i]->tensor->name);
|
||||
if (category_is_attn_v(cat)) {
|
||||
++qs.n_attention_wv;
|
||||
}
|
||||
if (cat == tensor_category::OUTPUT) {
|
||||
qs.has_tied_embeddings = false;
|
||||
}
|
||||
metadata[i].category = cat; // save and re-use the category while we're at it
|
||||
}
|
||||
// these also need to be set to n_layer by default
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
|
||||
}
|
||||
|
||||
// flag for --dry-run
|
||||
bool will_require_imatrix = false;
|
||||
|
||||
|
|
@ -1005,7 +1015,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
const auto * it = tensors[i];
|
||||
const struct ggml_tensor * tensor = it->tensor;
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
uint16_t i_split = params->keep_split ? it->idx : 0;
|
||||
if (!ctx_outs[i_split]) {
|
||||
|
|
@ -1034,7 +1043,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
" - offending tensor: %s\n"
|
||||
" - target type: %s\n"
|
||||
"============================================================================\n\n",
|
||||
name.c_str(), ggml_type_name(metadata[i].target_type));
|
||||
metadata[i].name.c_str(), ggml_type_name(metadata[i].target_type));
|
||||
throw std::runtime_error("this quantization requires an imatrix!");
|
||||
}
|
||||
}
|
||||
|
|
@ -1107,7 +1116,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
new_ofstream(weight.idx);
|
||||
}
|
||||
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
const size_t tensor_size = ggml_nbytes(tensor);
|
||||
|
||||
if (!params->dry_run) {
|
||||
|
|
@ -1238,9 +1246,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
total_size_new += new_size;
|
||||
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), metadata[i].name.c_str(), new_type);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), metadata[i].name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), metadata[i].name.c_str(), new_data);
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *) new_data, new_size);
|
||||
|
|
@ -1305,3 +1313,89 @@ uint32_t llama_model_quantize(
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//
|
||||
// Helper functions for external tools exposed in llama-ext.h
|
||||
//
|
||||
|
||||
quantize_state_impl * llama_quant_init(
|
||||
const llama_model * model,
|
||||
const llama_model_quantize_params * params) {
|
||||
return new quantize_state_impl(*model, params);
|
||||
}
|
||||
|
||||
void llama_quant_free(quantize_state_impl * qs) {
|
||||
delete qs;
|
||||
}
|
||||
|
||||
llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc) {
|
||||
struct llama_model_params mparams = llama_model_default_params();
|
||||
auto * model = new llama_model(mparams);
|
||||
|
||||
model->arch = llm_arch_from_string(desc->architecture);
|
||||
|
||||
// infer llm_type: only LLM_TYPE_70B matters for quantization logic
|
||||
if (model->arch == LLM_ARCH_LLAMA && desc->n_layer == 80 && desc->n_head != desc->n_head_kv) {
|
||||
model->type = LLM_TYPE_70B;
|
||||
}
|
||||
|
||||
model->hparams.n_embd = desc->n_embd;
|
||||
model->hparams.n_embd_head_k_full = desc->n_embd_head_k;
|
||||
model->hparams.n_embd_head_v_full = desc->n_embd_head_v;
|
||||
model->hparams.n_layer = desc->n_layer;
|
||||
model->hparams.n_expert = desc->n_expert;
|
||||
|
||||
for (uint32_t i = 0; i < desc->n_layer; i++) {
|
||||
model->hparams.n_head_arr[i] = desc->n_head;
|
||||
model->hparams.n_head_kv_arr[i] = desc->n_head_kv;
|
||||
model->hparams.n_ff_arr[i] = desc->n_ff;
|
||||
}
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
bool llama_quant_tensor_allows_quantization(
|
||||
const quantize_state_impl * qs,
|
||||
const ggml_tensor * tensor) {
|
||||
return tensor_allows_quantization(qs->params, qs->model.arch, tensor);
|
||||
}
|
||||
|
||||
void llama_quant_compute_types(
|
||||
quantize_state_impl * qs,
|
||||
llama_ftype ftype,
|
||||
ggml_tensor ** tensors,
|
||||
ggml_type * result_types,
|
||||
size_t n_tensors) {
|
||||
// reset per-computation state
|
||||
qs->n_attention_wv = 0;
|
||||
qs->n_ffn_down = 0;
|
||||
qs->n_ffn_gate = 0;
|
||||
qs->n_ffn_up = 0;
|
||||
qs->i_attention_wv = 0;
|
||||
qs->i_ffn_down = 0;
|
||||
qs->i_ffn_gate = 0;
|
||||
qs->i_ffn_up = 0;
|
||||
qs->n_fallback = 0;
|
||||
qs->has_imatrix = false;
|
||||
qs->has_tied_embeddings = true;
|
||||
|
||||
// build metadata from tensor names
|
||||
std::vector<tensor_metadata> metadata(n_tensors);
|
||||
for (size_t i = 0; i < n_tensors; i++) {
|
||||
metadata[i].name = ggml_get_name(tensors[i]);
|
||||
}
|
||||
|
||||
// initialize counters and categories
|
||||
init_quantize_state_counters(*qs, metadata);
|
||||
|
||||
// use a local copy of params with the requested ftype
|
||||
llama_model_quantize_params local_params = *qs->params;
|
||||
local_params.ftype = ftype;
|
||||
|
||||
ggml_type default_type = llama_ftype_get_default_type(ftype);
|
||||
|
||||
// compute types
|
||||
for (size_t i = 0; i < n_tensors; i++) {
|
||||
result_types[i] = llama_tensor_get_type(*qs, &local_params, tensors[i], default_type, metadata[i]);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -493,6 +493,16 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_GEMMA4:
|
||||
// Gemma4 uses SPM-style BPE: spaces are replaced with ▁ by the
|
||||
// normalizer, then BPE merges run on the whole text without
|
||||
// word-level pre-splitting. We only need to split on newlines
|
||||
// since BPE merge lookup asserts no newlines in tokens.
|
||||
regex_exprs = {
|
||||
"[^\\n]+|[\\n]+",
|
||||
};
|
||||
byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
|
||||
break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
regex_exprs = {
|
||||
|
|
@ -506,6 +516,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|||
}
|
||||
|
||||
std::vector<std::string> regex_exprs;
|
||||
bool byte_encode = true; // GPT-2 byte encoding; false for SPM-style BPE (raw UTF-8)
|
||||
};
|
||||
|
||||
struct llm_tokenizer_bpe_session {
|
||||
|
|
@ -550,9 +561,10 @@ struct llm_tokenizer_bpe_session {
|
|||
|
||||
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
||||
int final_prev_index = -1;
|
||||
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
|
||||
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
|
||||
|
||||
symbols_final.clear();
|
||||
auto tok_pre = vocab.get_pre_type();
|
||||
|
||||
for (const auto & word : word_collection) {
|
||||
work_queue = llm_bigram_bpe::queue();
|
||||
|
|
@ -565,6 +577,13 @@ struct llm_tokenizer_bpe_session {
|
|||
if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
|
||||
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
||||
offset = word.size();
|
||||
} else if (tok_pre == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && word.find_first_not_of('\n') == std::string::npos) {
|
||||
// fix for gemma 4, ref: https://github.com/ggml-org/llama.cpp/pull/21343
|
||||
auto tok = vocab.text_to_token(word);
|
||||
if (tok != LLAMA_TOKEN_NULL) {
|
||||
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
||||
offset = word.size();
|
||||
}
|
||||
}
|
||||
|
||||
while (offset < word.size()) {
|
||||
|
|
@ -1863,6 +1882,42 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
special_sep_id = LLAMA_TOKEN_NULL;
|
||||
special_pad_id = 3; // <|plamo:pad|>
|
||||
special_mask_id = LLAMA_TOKEN_NULL;
|
||||
} else if (tokenizer_model == "gemma4") {
|
||||
type = LLAMA_VOCAB_TYPE_BPE;
|
||||
|
||||
// read bpe merges and populate bpe ranks
|
||||
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
|
||||
if (merges_keyidx == -1) {
|
||||
throw std::runtime_error("cannot find tokenizer merges in model file\n");
|
||||
}
|
||||
{
|
||||
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
|
||||
for (int i = 0; i < n_merges; i++) {
|
||||
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
|
||||
|
||||
std::string first;
|
||||
std::string second;
|
||||
|
||||
const size_t pos = word.find(' ', 1);
|
||||
|
||||
if (pos != std::string::npos) {
|
||||
first = word.substr(0, pos);
|
||||
second = word.substr(pos + 1);
|
||||
}
|
||||
|
||||
bpe_ranks.emplace(std::make_pair(first, second), i);
|
||||
}
|
||||
}
|
||||
|
||||
// default special tokens (to be read from GGUF)
|
||||
special_bos_id = LLAMA_TOKEN_NULL;
|
||||
special_eos_id = LLAMA_TOKEN_NULL;
|
||||
special_unk_id = LLAMA_TOKEN_NULL;
|
||||
special_sep_id = LLAMA_TOKEN_NULL;
|
||||
special_pad_id = LLAMA_TOKEN_NULL;
|
||||
special_mask_id = LLAMA_TOKEN_NULL;
|
||||
|
||||
tokenizer_pre = "gemma4";
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
||||
}
|
||||
|
|
@ -1870,6 +1925,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
// for now, only BPE models have pre-tokenizers
|
||||
if (type == LLAMA_VOCAB_TYPE_BPE) {
|
||||
add_space_prefix = false;
|
||||
escape_whitespaces = false;
|
||||
clean_spaces = true;
|
||||
if (tokenizer_pre.empty()) {
|
||||
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
||||
|
|
@ -1936,6 +1992,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
} else if (
|
||||
tokenizer_pre == "jais-2") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
|
||||
} else if (
|
||||
tokenizer_pre == "gemma4") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
|
||||
escape_whitespaces = true;
|
||||
} else if (
|
||||
tokenizer_pre == "jina-v1-en" ||
|
||||
tokenizer_pre == "jina-v2-code" ||
|
||||
|
|
@ -2490,6 +2550,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
|| t.first == "[EOS]" // Kimi-K2
|
||||
|| t.first == "<|end_of_text|>"
|
||||
|| t.first == "<end_of_utterance>" // smoldocling
|
||||
|| t.first == "<turn|>" // gemma4
|
||||
|| t.first == "<|end▁of▁sentence|>" // deepseek-ocr
|
||||
) {
|
||||
special_eog_ids.insert(t.second);
|
||||
|
|
@ -3032,6 +3093,10 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
|
|||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
||||
if (escape_whitespaces) {
|
||||
llama_escape_whitespace(text);
|
||||
}
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
|
||||
#endif
|
||||
|
|
@ -3211,6 +3276,12 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
|
|||
return _try_copy(token_text.data(), token_text.size());
|
||||
}
|
||||
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
|
||||
if (escape_whitespaces) {
|
||||
// SPM-style BPE: tokens contain ▁ for spaces
|
||||
std::string result = token_text;
|
||||
llama_unescape_whitespace(result);
|
||||
return _try_copy(result.data(), result.size());
|
||||
}
|
||||
std::string result = llama_decode_text(token_text);
|
||||
return _try_copy(result.data(), result.size());
|
||||
}
|
||||
|
|
@ -3641,9 +3712,7 @@ int llama_vocab::max_token_len() const {
|
|||
|
||||
int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
|
||||
GGML_ASSERT(token_left.find(' ') == std::string::npos);
|
||||
GGML_ASSERT(token_left.find('\n') == std::string::npos);
|
||||
GGML_ASSERT(token_right.find(' ') == std::string::npos);
|
||||
GGML_ASSERT(token_right.find('\n') == std::string::npos);
|
||||
|
||||
auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
|
||||
if (it == pimpl->bpe_ranks.end()) {
|
||||
|
|
|
|||
|
|
@ -58,6 +58,7 @@ enum llama_vocab_pre_type {
|
|||
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
|
||||
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
||||
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
|
||||
};
|
||||
|
||||
struct LLM_KV;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,311 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params),
|
||||
model(model),
|
||||
n_embd_per_layer(model.hparams.n_embd_per_layer) {
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
||||
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
|
||||
cb(inpL, "inp_scaled", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// TODO: is causal == true correct? might need some changes
|
||||
auto * inp_attn = build_attn_inp_kv_iswa();
|
||||
|
||||
// inp_per_layer shape: [n_embd_per_layer, n_tokens, n_layer]
|
||||
ggml_tensor * inp_per_layer = nullptr;
|
||||
if (model.tok_embd_per_layer) {
|
||||
inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
|
||||
}
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k(il);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v(il));
|
||||
|
||||
const int64_t n_head = hparams.n_head(il);
|
||||
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||
|
||||
const float freq_base_l = model.get_rope_freq_base(cparams, il);
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
const int n_rot_l = hparams.n_rot(il);
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
ggml_tensor * freq_factors = nullptr;
|
||||
if (!hparams.is_swa(il)) {
|
||||
// full_attention layers use rope_freqs for proportional rope
|
||||
freq_factors = model.layers[il].rope_freqs;
|
||||
}
|
||||
|
||||
// Q projection (shared for both non-KV and KV layers)
|
||||
// this is to mirror Gemma4Attention in pytorch code
|
||||
ggml_tensor * Qcur;
|
||||
{
|
||||
Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur_pos", il);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
if (hparams.has_kv(il)) {
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = model.layers[il].wv
|
||||
? build_lora_mm(model.layers[il].wv, cur)
|
||||
: Kcur; // if v_proj is not present, use Kcur as Vcur
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
|
||||
Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
|
||||
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
cb(Vcur, "Vcur_normed", il);
|
||||
|
||||
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
cb(Kcur, "Kcur_pos", il);
|
||||
|
||||
cur = build_attn(inp_attn, model.layers[il].wo,
|
||||
nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
|
||||
hparams.f_attention_scale, il);
|
||||
} else {
|
||||
// reuse KV cache of earlier layers
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, nullptr,
|
||||
Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
||||
}
|
||||
|
||||
// TODO @ngxson : strip unused token right after the last KV layer to speed up prompt processing
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||
}
|
||||
cur = build_norm(cur,
|
||||
model.layers[il].attn_post_norm, nullptr,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_post_norm", il);
|
||||
|
||||
ggml_tensor * attn_out = ggml_add(ctx0, cur, inpL);
|
||||
cb(attn_out, "attn_out", il);
|
||||
|
||||
// feed-forward network
|
||||
const bool is_moe_layer = model.layers[il].ffn_gate_inp != nullptr;
|
||||
if (is_moe_layer) {
|
||||
// MLP (shared exp)
|
||||
ggml_tensor * cur_mlp = build_norm(attn_out,
|
||||
model.layers[il].ffn_norm, nullptr,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur_mlp, "ffn_norm_1", il);
|
||||
|
||||
cur_mlp = build_ffn(cur_mlp,
|
||||
model.layers[il].ffn_up, nullptr, nullptr,
|
||||
model.layers[il].ffn_gate, nullptr, nullptr,
|
||||
model.layers[il].ffn_down, nullptr, nullptr,
|
||||
nullptr,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
||||
cur_mlp = build_norm(cur_mlp,
|
||||
model.layers[il].ffn_post_norm_1, nullptr,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur_mlp, "ffn_mlp", il);
|
||||
|
||||
// Expert FFN
|
||||
ggml_tensor * cur_moe = build_norm(attn_out,
|
||||
model.layers[il].ffn_pre_norm_2, nullptr,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur_moe, "ffn_norm_2", il);
|
||||
|
||||
// custom MoE logits calculation (router operates on attn_out, not cur)
|
||||
ggml_tensor * tmp = ggml_rms_norm(ctx0, attn_out, hparams.f_norm_rms_eps);
|
||||
tmp = ggml_scale(ctx0, tmp, 1.0f / sqrtf((float) n_embd));
|
||||
tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_gate_inp_s);
|
||||
ggml_tensor * logits = build_lora_mm(model.layers[il].ffn_gate_inp, tmp); // [n_expert, n_tokens]
|
||||
cb(logits, "ffn_moe_logits", il);
|
||||
|
||||
cur_moe = build_moe_ffn(cur_moe,
|
||||
nullptr, // gate_inp
|
||||
nullptr, // up_exps
|
||||
nullptr, // gate_exps
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr, // exp_probs_b (not used for gemma4)
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_GELU, true,
|
||||
1.0f,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il, logits,
|
||||
model.layers[il].ffn_gate_up_exps,
|
||||
nullptr, // up_exps_s
|
||||
nullptr, // gate_exps_s
|
||||
model.layers[il].ffn_down_exps_s);
|
||||
cur_moe = build_norm(cur_moe,
|
||||
model.layers[il].ffn_post_norm_2, nullptr,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur_moe, "ffn_moe", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur_mlp, cur_moe);
|
||||
cb(cur, "ffn_moe_combined", il);
|
||||
} else {
|
||||
cur = build_norm(attn_out,
|
||||
model.layers[il].ffn_norm, nullptr,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, nullptr, nullptr,
|
||||
model.layers[il].ffn_gate, nullptr, nullptr,
|
||||
model.layers[il].ffn_down, nullptr, nullptr,
|
||||
nullptr,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
cur = build_norm(cur,
|
||||
model.layers[il].ffn_post_norm, nullptr,
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(cur, "ffn_post_norm", il);
|
||||
|
||||
// residual connection
|
||||
cur = ggml_add(ctx0, cur, attn_out);
|
||||
|
||||
// per-layer embedding
|
||||
if (inp_per_layer) {
|
||||
ggml_tensor * pe_in = cur;
|
||||
cb(cur, "pe_in", il);
|
||||
|
||||
cur = build_lora_mm(model.layers[il].per_layer_inp_gate, cur); // [n_embd_per_layer, n_tokens]
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_per_layer, n_tokens]
|
||||
|
||||
// TODO @ngxson : improve this
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
inp_this_layer = ggml_get_rows(ctx0, inp_this_layer, inp_out_ids);
|
||||
}
|
||||
|
||||
cur = ggml_mul(ctx0, cur, inp_this_layer);
|
||||
cur = build_lora_mm(model.layers[il].per_layer_proj, cur); // [n_embd, n_tokens]
|
||||
cur = build_norm(cur, model.layers[il].per_layer_post_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "per_layer_embd_out", il);
|
||||
|
||||
// residual connection
|
||||
cur = ggml_add(ctx0, pe_in, cur);
|
||||
}
|
||||
|
||||
// layer_scalar
|
||||
if (model.layers[il].out_scale) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
|
||||
cb(cur, "out_scaled", il);
|
||||
}
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, nullptr,
|
||||
LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
if (hparams.f_final_logit_softcapping) {
|
||||
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
|
||||
cur = ggml_tanh(ctx0, cur);
|
||||
cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
|
||||
}
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
// get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
|
||||
ggml_tensor * llm_build_gemma4_iswa::view_2d_slice(ggml_tensor * x, int idx) {
|
||||
GGML_ASSERT(idx < (int) x->ne[2]);
|
||||
return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1], ggml_row_size(x->type, x->ne[0]),
|
||||
idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
|
||||
}
|
||||
|
||||
// equivalent to get_per_layer_inputs() in python code
|
||||
// output shape: [n_embd_per_layer, n_layer, n_tokens]
|
||||
ggml_tensor * llm_build_gemma4_iswa::get_per_layer_inputs() {
|
||||
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
|
||||
ggml_tensor * inp_per_layer;
|
||||
if (ubatch.token) {
|
||||
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
|
||||
ggml_set_input(inp->tokens);
|
||||
res->t_inp_tokens = inp->tokens;
|
||||
inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
|
||||
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, n_tokens);
|
||||
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_per_layer));
|
||||
cb(inp_per_layer, "inp_per_layer_selected", -1);
|
||||
res->add_input(std::move(inp));
|
||||
} else {
|
||||
// Vision embedding path: use padding token (ID=0) embedding
|
||||
// TODO: verify if this is the correct behavior in transformers implementation
|
||||
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_per_layer * n_layer
|
||||
|
||||
// Extract and dequantize padding token embedding (row 0)
|
||||
ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
|
||||
inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
|
||||
|
||||
// Reshape to [n_embd_per_layer, n_layer, 1]
|
||||
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, 1);
|
||||
cb(inp_per_layer, "inp_per_layer_vision", -1);
|
||||
}
|
||||
return inp_per_layer;
|
||||
}
|
||||
|
||||
// equivalent to project_per_layer_inputs() in python code
|
||||
// this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
|
||||
// inputs_embeds shape: [n_embd, n_tokens]
|
||||
// inp_per_layer shape: [n_embd_per_layer, n_layer, n_tokens] (from get_per_layer_inputs)
|
||||
// output shape: [n_embd_per_layer, n_tokens, n_layer]
|
||||
ggml_tensor * llm_build_gemma4_iswa::project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
|
||||
const float per_layer_projection_scale = 1.0f / sqrtf((float) n_embd);
|
||||
const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
|
||||
|
||||
ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
|
||||
per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
|
||||
per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_per_layer, n_layer, n_tokens);
|
||||
per_layer_proj = build_norm(per_layer_proj, model.per_layer_proj_norm, nullptr, LLM_NORM_RMS,
|
||||
-1); // [n_embd_per_layer, n_layer, n_tokens]
|
||||
cb(per_layer_proj, "per_layer_proj", -1);
|
||||
|
||||
inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
|
||||
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
|
||||
cb(inp_per_layer, "inp_per_layer", -1);
|
||||
|
||||
// permute to shape: [n_embd_per_layer, n_tokens, n_layer]
|
||||
inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
|
||||
return inp_per_layer;
|
||||
}
|
||||
|
|
@ -266,6 +266,17 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|||
ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il);
|
||||
};
|
||||
|
||||
struct llm_build_gemma4_iswa : public llm_graph_context {
|
||||
const llama_model & model;
|
||||
|
||||
const int64_t n_embd_per_layer;
|
||||
|
||||
llm_build_gemma4_iswa(const llama_model & model, const llm_graph_params & params);
|
||||
ggml_tensor * view_2d_slice(ggml_tensor * x, int idx);
|
||||
ggml_tensor * get_per_layer_inputs();
|
||||
ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer);
|
||||
};
|
||||
|
||||
struct llm_build_gemma_embedding : public llm_graph_context {
|
||||
llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
|
|
|||
|
|
@ -912,7 +912,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
|
|||
return false;
|
||||
}
|
||||
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode) {
|
||||
// unicode categories
|
||||
static const std::map<std::string, int> k_ucat_enum = {
|
||||
{ "\\p{N}", unicode_cpt_flags::NUMBER },
|
||||
|
|
@ -1099,5 +1099,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
|||
start += offset;
|
||||
}
|
||||
|
||||
return unicode_byte_encoding_process(bpe_words);
|
||||
if (byte_encode) {
|
||||
return unicode_byte_encoding_process(bpe_words);
|
||||
}
|
||||
|
||||
return bpe_words;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -108,4 +108,4 @@ uint32_t unicode_tolower(uint32_t cpt);
|
|||
|
||||
bool unicode_cpt_is_han(uint32_t cpt);
|
||||
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode = true);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
*
|
||||
!*.*
|
||||
!snapshots/
|
||||
*.o
|
||||
ggml-common.h
|
||||
**/*.swp
|
||||
|
|
|
|||
|
|
@ -274,6 +274,12 @@ if (TARGET cpp-httplib)
|
|||
add_executable(test-gguf-model-data test-gguf-model-data.cpp)
|
||||
target_link_libraries(test-gguf-model-data PRIVATE gguf-model-data common)
|
||||
llama_test(test-gguf-model-data LABEL "model")
|
||||
|
||||
# test-quant-type-selection requires gguf-model-data for remote model metadata
|
||||
llama_build_and_test(test-quant-type-selection.cpp LABEL "model")
|
||||
target_link_libraries(test-quant-type-selection PRIVATE gguf-model-data)
|
||||
target_compile_definitions(test-quant-type-selection PRIVATE
|
||||
SNAPSHOT_DIR="${CMAKE_CURRENT_SOURCE_DIR}/snapshots")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
|
@ -287,3 +293,7 @@ target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
|
|||
|
||||
llama_build(export-graph-ops.cpp)
|
||||
target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
|
||||
if (TARGET gguf-model-data)
|
||||
target_link_libraries(export-graph-ops PRIVATE gguf-model-data)
|
||||
target_compile_definitions(export-graph-ops PRIVATE LLAMA_HF_FETCH)
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -1,15 +1,26 @@
|
|||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "llama-cpp.h"
|
||||
#include "../src/llama-ext.h"
|
||||
#include "ggml.h"
|
||||
#include "gguf-model-data.h"
|
||||
#include "gguf.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "download.h"
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
|
||||
// Noop because weights are not needed
|
||||
static void set_tensor_data(struct ggml_tensor * tensor, void * userdata) {
|
||||
GGML_UNUSED(tensor);
|
||||
GGML_UNUSED(userdata);
|
||||
}
|
||||
|
||||
struct input_tensor {
|
||||
ggml_type type;
|
||||
|
|
@ -132,9 +143,52 @@ int main(int argc, char ** argv) {
|
|||
|
||||
params.warmup = false;
|
||||
|
||||
auto init_result = common_init_from_params(params);
|
||||
llama_context * ctx;
|
||||
common_init_result_ptr init_result;
|
||||
llama_context_ptr ctx2;
|
||||
llama_model_ptr model;
|
||||
|
||||
llama_context * ctx = init_result->context();
|
||||
if (params.model.hf_repo.empty()) {
|
||||
init_result = common_init_from_params(params);
|
||||
|
||||
ctx = init_result->context();
|
||||
} else {
|
||||
#ifdef LLAMA_HF_FETCH
|
||||
auto [hf_repo, hf_quant] = common_download_split_repo_tag(params.model.hf_repo);
|
||||
if (hf_quant.empty() || hf_quant == "latest") {
|
||||
hf_quant = "Q4_K_M";
|
||||
}
|
||||
|
||||
gguf_context_ptr gguf_ctx = gguf_fetch_gguf_ctx(hf_repo, hf_quant);
|
||||
if (!gguf_ctx) {
|
||||
LOG_ERR("failed to fetch GGUF metadata from %s\n", hf_repo.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.devices = params.devices.data();
|
||||
model_params.no_alloc = true;
|
||||
|
||||
model.reset(llama_model_init_from_user(gguf_ctx.get(), set_tensor_data, nullptr, model_params));
|
||||
|
||||
if (!model) {
|
||||
LOG_ERR("failed to create llama_model from %s\n", hf_repo.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx2.reset(llama_init_from_model(model.get(), ctx_params));
|
||||
ctx = ctx2.get();
|
||||
|
||||
if (!ctx) {
|
||||
LOG_ERR("failed to create llama_context\n");
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
LOG_ERR("export-graph-ops compiled without HF fetch support\n");
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
const uint32_t n_seqs = llama_n_seq_max(ctx);
|
||||
const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
|
||||
|
|
@ -143,13 +197,15 @@ int main(int argc, char ** argv) {
|
|||
|
||||
auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
|
||||
if (!gf_pp) {
|
||||
throw std::runtime_error("failed to reserve prompt processing graph");
|
||||
LOG_ERR("failed to reserve prompt processing graph\n");
|
||||
return 1;
|
||||
}
|
||||
extract_graph_ops(gf_pp, "pp", tests);
|
||||
|
||||
auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
|
||||
if (!gf_tg) {
|
||||
throw std::runtime_error("failed to reserve token generation graph");
|
||||
LOG_ERR("failed to reserve token generation graph\n");
|
||||
return 1;
|
||||
}
|
||||
extract_graph_ops(gf_tg, "tg", tests);
|
||||
|
||||
|
|
@ -158,7 +214,8 @@ int main(int argc, char ** argv) {
|
|||
std::ofstream f(params.out_file);
|
||||
|
||||
if (!f.is_open()) {
|
||||
throw std::runtime_error("Unable to open output file");
|
||||
LOG_ERR("unable to open output file: %s\n", params.out_file.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (const auto& test : tests) {
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
#include "gguf-model-data.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "ggml-cpp.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
|
@ -124,6 +125,35 @@ static bool gguf_skip_value(gguf_buf_reader & r, int32_t vtype) {
|
|||
}
|
||||
|
||||
static bool gguf_read_uint32_val(gguf_buf_reader & r, int32_t vtype, uint32_t & out) {
|
||||
// Handle array-valued fields (e.g. per-layer head counts in hybrid models)
|
||||
// by reading the first element as a representative value.
|
||||
if (vtype == GGUF_TYPE_ARRAY) {
|
||||
int32_t elem_type;
|
||||
uint64_t count;
|
||||
if (!r.read_val(elem_type)) {
|
||||
return false;
|
||||
}
|
||||
if (!r.read_val(count)) {
|
||||
return false;
|
||||
}
|
||||
if (count == 0) {
|
||||
return false;
|
||||
}
|
||||
// Read first element, skip the rest
|
||||
if (!gguf_read_uint32_val(r, elem_type, out)) {
|
||||
return false;
|
||||
}
|
||||
for (uint64_t i = 1; i < count; i++) {
|
||||
size_t sz = gguf_val_type_size(elem_type);
|
||||
if (sz == 0) {
|
||||
return false;
|
||||
}
|
||||
if (!r.skip(sz)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (vtype == GGUF_TYPE_UINT8) {
|
||||
uint8_t v;
|
||||
if (!r.read_val(v)) {
|
||||
|
|
@ -486,7 +516,8 @@ static std::string detect_gguf_filename(const std::string & repo, const std::str
|
|||
static std::optional<gguf_remote_model> fetch_and_parse(
|
||||
const std::string & repo,
|
||||
const std::string & filename,
|
||||
const std::string & cache_path) {
|
||||
const std::string & cache_path,
|
||||
bool verbose) {
|
||||
std::string url = "https://huggingface.co/" + repo + "/resolve/main/" + filename;
|
||||
|
||||
// Progressive download inspired by RangeView.fetchChunk()
|
||||
|
|
@ -495,7 +526,9 @@ static std::optional<gguf_remote_model> fetch_and_parse(
|
|||
const size_t max_chunk = 64 * 1024 * 1024;
|
||||
|
||||
while (chunk_size <= max_chunk) {
|
||||
fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
|
||||
if (verbose) {
|
||||
fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
|
||||
}
|
||||
|
||||
char range_buf[64];
|
||||
snprintf(range_buf, sizeof(range_buf), "bytes=0-%zu", chunk_size - 1);
|
||||
|
|
@ -531,34 +564,42 @@ static std::optional<gguf_remote_model> fetch_and_parse(
|
|||
return std::nullopt;
|
||||
}
|
||||
|
||||
static std::string get_cache_file_path(const std::string& cdir, const std::string& repo_part, const std::string& filename) {
|
||||
std::string fname_part = sanitize_for_path(filename);
|
||||
return cdir + "/" + repo_part + "--" + fname_part + ".partial";
|
||||
}
|
||||
|
||||
// Try cache first, then fetch and parse a single GGUF shard.
|
||||
static std::optional<gguf_remote_model> fetch_or_cached(
|
||||
const std::string & repo,
|
||||
const std::string & filename,
|
||||
const std::string & cdir,
|
||||
const std::string & repo_part) {
|
||||
std::string fname_part = sanitize_for_path(filename);
|
||||
std::string cache_path = cdir + "/" + repo_part + "--" + fname_part + ".partial";
|
||||
const std::string & repo_part,
|
||||
bool verbose) {
|
||||
std::string cache_path = get_cache_file_path(cdir, repo_part, filename);
|
||||
|
||||
{
|
||||
std::vector<char> cached;
|
||||
if (std::filesystem::exists(cache_path) && read_file(cache_path, cached)) {
|
||||
auto result = gguf_parse_meta(cached);
|
||||
if (result.has_value()) {
|
||||
fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
|
||||
if (verbose) {
|
||||
fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fs_create_directory_with_parents(cdir);
|
||||
return fetch_and_parse(repo, filename, cache_path);
|
||||
return fetch_and_parse(repo, filename, cache_path, verbose);
|
||||
}
|
||||
|
||||
std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
||||
const std::string & repo,
|
||||
const std::string & quant,
|
||||
const std::string & cache_dir) {
|
||||
const std::string & cache_dir,
|
||||
bool verbose) {
|
||||
std::string cdir = cache_dir.empty() ? get_default_cache_dir() : cache_dir;
|
||||
std::string repo_part = sanitize_for_path(repo);
|
||||
|
||||
|
|
@ -568,7 +609,7 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
|||
return std::nullopt;
|
||||
}
|
||||
|
||||
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part);
|
||||
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part, verbose);
|
||||
if (!model_opt.has_value()) {
|
||||
fprintf(stderr, "gguf_fetch: failed to fetch %s\n", filename.c_str());
|
||||
return std::nullopt;
|
||||
|
|
@ -583,8 +624,10 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
|||
return std::nullopt;
|
||||
}
|
||||
|
||||
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
|
||||
model.n_split, model.n_split - 1);
|
||||
if (verbose) {
|
||||
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
|
||||
model.n_split, model.n_split - 1);
|
||||
}
|
||||
|
||||
for (int i = 2; i <= model.n_split; i++) {
|
||||
char num_buf[6], total_buf[6];
|
||||
|
|
@ -592,7 +635,7 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
|||
snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
|
||||
std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
|
||||
|
||||
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part);
|
||||
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
|
||||
if (!shard.has_value()) {
|
||||
fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str());
|
||||
return std::nullopt;
|
||||
|
|
@ -611,3 +654,87 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
|||
|
||||
return model_opt;
|
||||
}
|
||||
|
||||
gguf_context_ptr gguf_fetch_gguf_ctx(
|
||||
const std::string & repo,
|
||||
const std::string & quant,
|
||||
const std::string & cache_dir,
|
||||
bool verbose) {
|
||||
std::string cdir = cache_dir.empty() ? get_default_cache_dir() : cache_dir;
|
||||
std::string repo_part = sanitize_for_path(repo);
|
||||
|
||||
std::string split_prefix;
|
||||
std::string filename = detect_gguf_filename(repo, quant, split_prefix);
|
||||
|
||||
if (filename.empty()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part, verbose);
|
||||
if (!model_opt.has_value()) {
|
||||
fprintf(stderr, "gguf_fetch: failed to fetch %s\n", filename.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto & model = model_opt.value();
|
||||
|
||||
const std::string cache_path = get_cache_file_path(cdir, repo_part, filename);
|
||||
|
||||
ggml_context_ptr ggml_ctx_ptr;
|
||||
ggml_context * ggml_ctx{};
|
||||
gguf_init_params params{true, &ggml_ctx};
|
||||
gguf_context_ptr ctx{gguf_init_from_file(cache_path.c_str(), params)};
|
||||
ggml_ctx_ptr.reset(ggml_ctx);
|
||||
|
||||
if (ctx == nullptr) {
|
||||
fprintf(stderr, "gguf_fetch: gguf_init_from_file failed\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// If the model is split across multiple files we need to fetch the remaining shards metadata
|
||||
if (model.n_split > 1) {
|
||||
if (split_prefix.empty()) {
|
||||
fprintf(stderr, "gguf_fetch: model reports %u splits but filename has no split pattern\n", model.n_split);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (verbose) {
|
||||
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
|
||||
model.n_split, model.n_split - 1);
|
||||
}
|
||||
|
||||
for (int i = 2; i <= model.n_split; i++) {
|
||||
char num_buf[6], total_buf[6];
|
||||
snprintf(num_buf, sizeof(num_buf), "%05d", i);
|
||||
snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
|
||||
std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
|
||||
|
||||
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
|
||||
if (!shard.has_value()) {
|
||||
fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Load tensors from shard and add to main gguf_context
|
||||
const std::string shard_path = get_cache_file_path(cdir, repo_part, shard_name);
|
||||
ggml_context_ptr shard_ggml_ctx_ptr;
|
||||
ggml_context * shard_ggml_ctx{};
|
||||
gguf_init_params shard_params{true, &shard_ggml_ctx};
|
||||
gguf_context_ptr shard_ctx{gguf_init_from_file(shard_path.c_str(), shard_params)};
|
||||
shard_ggml_ctx_ptr.reset(shard_ggml_ctx);
|
||||
|
||||
if (shard_ctx == nullptr) {
|
||||
fprintf(stderr, "gguf_fetch: shard gguf_init_from_file failed\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(shard_ggml_ctx); t; t = ggml_get_next_tensor(shard_ggml_ctx, t)) {
|
||||
gguf_add_tensor(ctx.get(), t);
|
||||
}
|
||||
}
|
||||
|
||||
gguf_set_val_u16(ctx.get(), "split.count", 1);
|
||||
}
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpp.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
|
|
@ -39,4 +40,11 @@ struct gguf_remote_model {
|
|||
std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
||||
const std::string & repo,
|
||||
const std::string & quant = "Q8_0",
|
||||
const std::string & cache_dir = ""); // empty = default
|
||||
const std::string & cache_dir = "", // empty = default
|
||||
bool verbose = true);
|
||||
|
||||
gguf_context_ptr gguf_fetch_gguf_ctx(
|
||||
const std::string & repo,
|
||||
const std::string & quant = "Q8_0",
|
||||
const std::string & cache_dir = "",
|
||||
bool verbose = true);
|
||||
|
|
|
|||
|
|
@ -213,6 +213,66 @@ void test_gbnf_generation(testing &t) {
|
|||
)""", gbnf);
|
||||
});
|
||||
|
||||
t.test("tagged choice inside sequence gets parenthesized", [](testing &t) {
|
||||
auto parser = build_peg_parser([](common_peg_parser_builder & p) {
|
||||
return p.literal("a") + p.tag("t", p.literal("b") | p.literal("c"));
|
||||
});
|
||||
|
||||
auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
|
||||
parser.build_grammar(builder);
|
||||
});
|
||||
|
||||
assert_gbnf_equal(t, R"""(
|
||||
root ::= "a" ("b" | "c")
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)""", gbnf);
|
||||
});
|
||||
|
||||
t.test("tagged sequence inside choice gets parenthesized", [](testing &t) {
|
||||
auto parser = build_peg_parser([](common_peg_parser_builder & p) {
|
||||
return p.tag("t", p.literal("a") + p.literal("b")) | p.literal("c");
|
||||
});
|
||||
|
||||
auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
|
||||
parser.build_grammar(builder);
|
||||
});
|
||||
|
||||
assert_gbnf_equal(t, R"""(
|
||||
root ::= "a" "b" | "c"
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)""", gbnf);
|
||||
});
|
||||
|
||||
t.test("atomic choice inside repetition gets parenthesized", [](testing &t) {
|
||||
auto parser = build_peg_parser([](common_peg_parser_builder & p) {
|
||||
return p.one_or_more(p.atomic(p.literal("a") | p.literal("b")));
|
||||
});
|
||||
|
||||
auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
|
||||
parser.build_grammar(builder);
|
||||
});
|
||||
|
||||
assert_gbnf_equal(t, R"""(
|
||||
root ::= ("a" | "b")+
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)""", gbnf);
|
||||
});
|
||||
|
||||
t.test("nested transparent wrappers get parenthesized", [](testing &t) {
|
||||
auto parser = build_peg_parser([](common_peg_parser_builder & p) {
|
||||
return p.literal("x") + p.tag("outer", p.atomic(p.literal("a") | p.literal("b")));
|
||||
});
|
||||
|
||||
auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
|
||||
parser.build_grammar(builder);
|
||||
});
|
||||
|
||||
assert_gbnf_equal(t, R"""(
|
||||
root ::= "x" ("a" | "b")
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)""", gbnf);
|
||||
});
|
||||
|
||||
t.test("emit only trigger rules (and references)", [](testing &t) {
|
||||
auto parser = build_peg_parser([](common_peg_parser_builder & p) {
|
||||
auto rule1 = p.rule("rule-1", p.literal("a") + p.ref("rule-2"));
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -589,6 +589,51 @@ static common_chat_tool amount_tool{
|
|||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool toggle_tool{
|
||||
/* .name = */ "toggle",
|
||||
/* .description = */ "Toggle a feature",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean",
|
||||
"description": "Whether to enable the feature"
|
||||
}
|
||||
},
|
||||
"required": ["enabled"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool nullable_tool{
|
||||
/* .name = */ "set_nullable",
|
||||
/* .description = */ "Set a nullable value",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"value": {
|
||||
"type": "null",
|
||||
"description": "A null value"
|
||||
}
|
||||
},
|
||||
"required": ["value"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool config_tool{
|
||||
/* .name = */ "set_config",
|
||||
/* .description = */ "Set configuration",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"config": {
|
||||
"type": "object",
|
||||
"description": "Configuration dict"
|
||||
}
|
||||
},
|
||||
"required": ["config"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool imaginary_number_tool{
|
||||
/* .name = */ "imaginary_number",
|
||||
/* .description = */ "Imaginary number converter",
|
||||
|
|
@ -612,6 +657,66 @@ static common_chat_tool imaginary_number_tool{
|
|||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool nullable_string_tool{
|
||||
/* .name = */ "set_nullable_str",
|
||||
/* .description = */ "Set a nullable string value",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": ["string", "null"],
|
||||
"description": "A nullable string"
|
||||
}
|
||||
},
|
||||
"required": ["name"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool nullable_string_null_first_tool{
|
||||
/* .name = */ "set_nullable_str_nf",
|
||||
/* .description = */ "Set a nullable string value with null first in type array",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": ["null", "string"],
|
||||
"description": "A nullable string with null first"
|
||||
}
|
||||
},
|
||||
"required": ["name"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool nullable_int_tool{
|
||||
/* .name = */ "set_nullable_int",
|
||||
/* .description = */ "Set a nullable integer value",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"count": {
|
||||
"type": ["integer", "null"],
|
||||
"description": "A nullable integer"
|
||||
}
|
||||
},
|
||||
"required": ["count"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool enum_no_type_tool{
|
||||
/* .name = */ "set_unit",
|
||||
/* .description = */ "Set a temperature unit",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "Temperature unit"
|
||||
}
|
||||
},
|
||||
"required": ["unit"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool string_param_tool{
|
||||
/* .name = */ "string_param",
|
||||
/* .description = */ "Tool with string parameter for testing",
|
||||
|
|
@ -1869,6 +1974,130 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
|||
tst.test("Line 1\nLine 2\nLine 3").expect(simple_assist_msg("Line 1\nLine 2\nLine 3")).expect_reconstruction().run();
|
||||
}
|
||||
|
||||
{
|
||||
// Google Gemma 4 (tool calling with Gemma4 dict format)
|
||||
auto tst = peg_tester("models/templates/gemma4.jinja");
|
||||
|
||||
tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).run();
|
||||
|
||||
// Simple tool call with string argument
|
||||
tst.test(
|
||||
"<|tool_call>call:get_time{city:<|\"|>London<|\"|>}<tool_call|>")
|
||||
.tools({ get_time_tool })
|
||||
.expect(message_with_tool_calls("get_time", R"({"city": "London"})"))
|
||||
.run();
|
||||
|
||||
// Tool call with string argument containing special chars
|
||||
tst.test(
|
||||
"<|tool_call>call:get_time{city:<|\"|>San Francisco<|\"|>}<tool_call|>")
|
||||
.tools({ get_time_tool })
|
||||
.expect(message_with_tool_calls("get_time", R"({"city": "San Francisco"})"))
|
||||
.run();
|
||||
|
||||
// Tool call with empty args
|
||||
tst.test(
|
||||
"<|tool_call>call:empty_args{}<tool_call|>")
|
||||
.tools({ empty_args_tool })
|
||||
.expect(message_with_tool_calls("empty_args", "{}"))
|
||||
.run();
|
||||
|
||||
// Tool call with string and content
|
||||
tst.test(
|
||||
"Hello, world!\nWhat's up?<|tool_call>call:get_time{city:<|\"|>Paris<|\"|>}<tool_call|>")
|
||||
.tools({ get_time_tool })
|
||||
.expect(message_with_content_and_tool_call("Hello, world!\nWhat's up?", "get_time", R"({"city": "Paris"})"))
|
||||
.run();
|
||||
|
||||
// Parallel tool calls
|
||||
tst.test(
|
||||
"<|tool_call>call:get_time{city:<|\"|>London<|\"|>}<tool_call|>"
|
||||
"<|tool_call>call:get_weather{city:<|\"|>Paris<|\"|>}<tool_call|>")
|
||||
.tools({ get_time_tool, get_weather_tool })
|
||||
.parallel_tool_calls(true)
|
||||
.expect_tool_calls({
|
||||
{ "get_time", R"({"city": "London"})", "" },
|
||||
{ "get_weather", R"({"city": "Paris"})", "" },
|
||||
})
|
||||
.run();
|
||||
|
||||
// Tool call with integer argument (number type)
|
||||
tst.test(
|
||||
"<|tool_call>call:special_function{arg1:42}<tool_call|>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_with_tool_calls("special_function", R"({"arg1": 42})"))
|
||||
.run();
|
||||
|
||||
// Tool call with negative number argument
|
||||
tst.test(
|
||||
"<|tool_call>call:special_function{arg1:-7}<tool_call|>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_with_tool_calls("special_function", R"({"arg1": -7})"))
|
||||
.run();
|
||||
|
||||
// Tool call with decimal number argument
|
||||
tst.test(
|
||||
"<|tool_call>call:amount{orig:3.14}<tool_call|>")
|
||||
.tools({ amount_tool })
|
||||
.expect(message_with_tool_calls("amount", R"({"orig": 3.14})"))
|
||||
.run();
|
||||
|
||||
// Tool call with boolean argument (true)
|
||||
tst.test(
|
||||
"<|tool_call>call:toggle{enabled:true}<tool_call|>")
|
||||
.tools({ toggle_tool })
|
||||
.expect(message_with_tool_calls("toggle", R"({"enabled": true})"))
|
||||
.run();
|
||||
|
||||
// Tool call with boolean argument (false)
|
||||
tst.test(
|
||||
"<|tool_call>call:toggle{enabled:false}<tool_call|>")
|
||||
.tools({ toggle_tool })
|
||||
.expect(message_with_tool_calls("toggle", R"({"enabled": false})"))
|
||||
.run();
|
||||
|
||||
// Tool call with null argument
|
||||
tst.test(
|
||||
"<|tool_call>call:set_nullable{value:null}<tool_call|>")
|
||||
.tools({ nullable_tool })
|
||||
.expect(message_with_tool_calls("set_nullable", R"({"value": null})"))
|
||||
.run();
|
||||
|
||||
// Tool call with array argument (todo list)
|
||||
tst.test(
|
||||
"<|tool_call>call:todo_list{todos:[<|\"|>buy milk<|\"|>,<|\"|>walk dog<|\"|>]}<tool_call|>")
|
||||
.tools({ todo_list })
|
||||
.expect(message_with_tool_calls("todo_list", R"({"todos":["buy milk","walk dog"]})"))
|
||||
.run();
|
||||
|
||||
// Tool call with object/dict argument
|
||||
tst.test(
|
||||
"<|tool_call>call:set_config{config:{theme:<|\"|>dark<|\"|>,count:3}}<tool_call|>")
|
||||
.tools({ config_tool })
|
||||
.expect(message_with_tool_calls("set_config", R"({"config":{"theme":"dark","count":3}})"))
|
||||
.run();
|
||||
|
||||
// Tool call with empty array
|
||||
tst.test(
|
||||
"<|tool_call>call:todo_list{todos:[]}<tool_call|>")
|
||||
.tools({ todo_list })
|
||||
.expect(message_with_tool_calls("todo_list", R"({"todos":[]})"))
|
||||
.run();
|
||||
|
||||
// Tool call with empty dict
|
||||
tst.test(
|
||||
"<|tool_call>call:set_config{config:{}}<tool_call|>")
|
||||
.tools({ config_tool })
|
||||
.expect(message_with_tool_calls("set_config", R"({"config":{}})"))
|
||||
.run();
|
||||
|
||||
// Tool call with scientific notation number
|
||||
tst.test(
|
||||
"<|tool_call>call:amount{orig:1.5e10}<tool_call|>")
|
||||
.tools({ amount_tool })
|
||||
.expect(message_with_tool_calls("amount", R"({"orig": 1.5e10})"))
|
||||
.run();
|
||||
}
|
||||
|
||||
{
|
||||
// Qwen-QwQ-32B (reasoning model)
|
||||
auto tst = peg_tester("models/templates/Qwen-QwQ-32B.jinja");
|
||||
|
|
@ -2031,6 +2260,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
|||
}
|
||||
})
|
||||
.run();
|
||||
|
||||
}
|
||||
|
||||
{
|
||||
|
|
@ -2214,6 +2444,58 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
|||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// nullable string type ["string", "null"]
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=set_nullable_str>\n"
|
||||
"<parameter=name>\nhello world\n</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.tools({ nullable_string_tool })
|
||||
.expect_tool_calls({
|
||||
{ "set_nullable_str", R"({"name": "hello world"})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
// nullable string with null first in type array ["null", "string"]
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=set_nullable_str_nf>\n"
|
||||
"<parameter=name>\nhello world\n</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.tools({ nullable_string_null_first_tool })
|
||||
.expect_tool_calls({
|
||||
{ "set_nullable_str_nf", R"({"name": "hello world"})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
// nullable integer type ["integer", "null"] - should use JSON value path, not string
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=set_nullable_int>\n"
|
||||
"<parameter=count>\n42\n</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.tools({ nullable_int_tool })
|
||||
.expect_tool_calls({
|
||||
{ "set_nullable_int", R"({"count": 42})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
// enum without explicit type key - should infer string from enum values
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=set_unit>\n"
|
||||
"<parameter=unit>\ncelsius\n</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.tools({ enum_no_type_tool })
|
||||
.expect_tool_calls({
|
||||
{ "set_unit", R"({"unit": "celsius"})", {} },
|
||||
})
|
||||
.run();
|
||||
}
|
||||
{
|
||||
auto tst = peg_tester("models/templates/deepseek-ai-DeepSeek-V3.1.jinja", detailed_debug);
|
||||
|
|
@ -2372,55 +2654,57 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
|||
// #20424 introduced effective_input = generation_prompt + input, but the throw
|
||||
// uses input.substr(result.end) where result.end is in effective_input space.
|
||||
{
|
||||
auto tmpls = common_chat_templates_ptr(
|
||||
common_chat_templates_init(nullptr, read_file("models/templates/GLM-4.7-Flash.jinja")));
|
||||
if (!g_template_filter.empty() && std::string("models/templates/GLM-4.7-Flash.jinja").find(g_template_filter) != std::string::npos) {
|
||||
auto tmpls = common_chat_templates_ptr(
|
||||
common_chat_templates_init(nullptr, read_file("models/templates/GLM-4.7-Flash.jinja")));
|
||||
|
||||
static common_chat_tool weather_tool{
|
||||
"get_weather", "Get weather",
|
||||
R"({"type":"object","properties":{"city":{"type":"string"}},"required":["city"]})",
|
||||
};
|
||||
static common_chat_tool weather_tool{
|
||||
"get_weather", "Get weather",
|
||||
R"({"type":"object","properties":{"city":{"type":"string"}},"required":["city"]})",
|
||||
};
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.tools = { weather_tool };
|
||||
inputs.enable_thinking = true;
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.use_jinja = true;
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
msg.content = "get_weather";
|
||||
inputs.messages = { msg };
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.tools = { weather_tool };
|
||||
inputs.enable_thinking = true;
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.use_jinja = true;
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
msg.content = "get_weather";
|
||||
inputs.messages = { msg };
|
||||
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
common_peg_arena arena;
|
||||
arena.load(params.parser);
|
||||
common_chat_parser_params pp(params);
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
common_peg_arena arena;
|
||||
arena.load(params.parser);
|
||||
common_chat_parser_params pp(params);
|
||||
|
||||
// generation_prompt is non-empty for thinking models, so result.end
|
||||
// will be offset by generation_prompt.size() into effective_input space.
|
||||
assert(!pp.generation_prompt.empty());
|
||||
// generation_prompt is non-empty for thinking models, so result.end
|
||||
// will be offset by generation_prompt.size() into effective_input space.
|
||||
assert(!pp.generation_prompt.empty());
|
||||
|
||||
std::string bad_input =
|
||||
"Thinking.\n"
|
||||
"</think>"
|
||||
"<tool_call>get_weather"
|
||||
"<arg_key>city</arg_key><arg_value>Tokyo</arg_value>"
|
||||
"</tool_call>\n";
|
||||
std::string bad_input =
|
||||
"Thinking.\n"
|
||||
"</think>"
|
||||
"<tool_call>get_weather"
|
||||
"<arg_key>city</arg_key><arg_value>Tokyo</arg_value>"
|
||||
"</tool_call>\n";
|
||||
|
||||
bool got_runtime_error = false;
|
||||
bool got_out_of_range = false;
|
||||
std::string error_msg;
|
||||
try {
|
||||
common_chat_peg_parse(arena, bad_input, /*is_partial=*/false, pp);
|
||||
} catch (const std::out_of_range & e) {
|
||||
got_out_of_range = true;
|
||||
error_msg = e.what();
|
||||
} catch (const std::runtime_error & e) {
|
||||
got_runtime_error = true;
|
||||
error_msg = e.what();
|
||||
bool got_runtime_error = false;
|
||||
bool got_out_of_range = false;
|
||||
std::string error_msg;
|
||||
try {
|
||||
common_chat_peg_parse(arena, bad_input, /*is_partial=*/false, pp);
|
||||
} catch (const std::out_of_range & e) {
|
||||
got_out_of_range = true;
|
||||
error_msg = e.what();
|
||||
} catch (const std::runtime_error & e) {
|
||||
got_runtime_error = true;
|
||||
error_msg = e.what();
|
||||
}
|
||||
GGML_ASSERT(!got_out_of_range && "throw path crashed with out_of_range (input.substr in effective_input space)");
|
||||
GGML_ASSERT(got_runtime_error && "throw path should produce std::runtime_error with parse position");
|
||||
}
|
||||
GGML_ASSERT(!got_out_of_range && "throw path crashed with out_of_range (input.substr in effective_input space)");
|
||||
GGML_ASSERT(got_runtime_error && "throw path should produce std::runtime_error with parse position");
|
||||
}
|
||||
|
||||
// Kimi-K2-Thinking tests - custom parser
|
||||
|
|
@ -3000,6 +3284,21 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
|||
.expect(message_assist_call_id)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test("[TOOL_CALLS]special_function[CALL_ID]000000001[ARGS]{\"arg1\": 1}"
|
||||
"[TOOL_CALLS]special_function_with_opt[CALL_ID]000000002[ARGS]{\"arg1\": 1, \"arg2\": 2}")
|
||||
.parallel_tool_calls(true)
|
||||
.tools({
|
||||
special_function_tool, special_function_tool_with_optional_param
|
||||
})
|
||||
.expect_tool_calls({
|
||||
{ "special_function", R"({"arg1": 1})", "000000001" },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", "000000002" },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
|
||||
}
|
||||
// Devstral
|
||||
{
|
||||
|
|
@ -3175,6 +3474,24 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
|||
.expect_reasoning("I will execute python to say hello")
|
||||
.expect_content("")
|
||||
.run();
|
||||
|
||||
// Edge cases
|
||||
|
||||
// "<|channel|>commentary to=assistant" before reasoning
|
||||
tst.test(
|
||||
"<|channel|>commentary to=assistant<|channel|>analysis<|message|>I'm\nthinking<|end|><|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's "
|
||||
"up?")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.expect(message_assist_thoughts)
|
||||
.run();
|
||||
|
||||
// "<|channel|>commentary to=assistant" before final message
|
||||
tst.test(
|
||||
"<|channel|>analysis<|message|>I'm\nthinking<|end|><|start|>assistant<|channel|>commentary to=assistant<|channel|>final<|message|>Hello, world!\nWhat's "
|
||||
"up?")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.expect(message_assist_thoughts)
|
||||
.run();
|
||||
}
|
||||
|
||||
{
|
||||
|
|
|
|||
|
|
@ -116,6 +116,39 @@ int main() {
|
|||
// Verify tensor count
|
||||
TEST_ASSERT(model3.tensors.size() == 780, "expected tensor count == 780");
|
||||
|
||||
// Test a hybrid-attention model with array-valued head counts
|
||||
auto result4 = gguf_fetch_model_meta("ggml-org/Step-3.5-Flash-GGUF", "Q4_K");
|
||||
if (!result4.has_value()) {
|
||||
fprintf(stderr, "FAIL: could not fetch Step-3.5-Flash metadata\n");
|
||||
return 1;
|
||||
}
|
||||
const auto & model4 = result4.value();
|
||||
|
||||
fprintf(stderr, "Architecture: %s\n", model4.architecture.c_str());
|
||||
fprintf(stderr, "n_embd: %u\n", model4.n_embd);
|
||||
fprintf(stderr, "n_ff: %u\n", model4.n_ff);
|
||||
fprintf(stderr, "n_vocab: %u\n", model4.n_vocab);
|
||||
fprintf(stderr, "n_layer: %u\n", model4.n_layer);
|
||||
fprintf(stderr, "n_head: %u\n", model4.n_head);
|
||||
fprintf(stderr, "n_head_kv: %u\n", model4.n_head_kv);
|
||||
fprintf(stderr, "n_expert: %u\n", model4.n_expert);
|
||||
fprintf(stderr, "n_embd_head_k: %u\n", model4.n_embd_head_k);
|
||||
fprintf(stderr, "n_embd_head_v: %u\n", model4.n_embd_head_v);
|
||||
fprintf(stderr, "tensors: %zu\n", model4.tensors.size());
|
||||
|
||||
TEST_ASSERT(model4.architecture == "step35", "expected architecture 'step35'");
|
||||
|
||||
TEST_ASSERT(model4.n_layer == 45, "expected n_layer == 45");
|
||||
TEST_ASSERT(model4.n_embd == 4096, "expected n_embd == 4096");
|
||||
TEST_ASSERT(model4.n_ff == 11264, "expected n_ff == 11264");
|
||||
TEST_ASSERT(model4.n_head == 64, "expected n_head == 64 (first element of per-layer array)");
|
||||
TEST_ASSERT(model4.n_head_kv == 8, "expected n_head_kv == 8 (first element of per-layer array)");
|
||||
TEST_ASSERT(model4.n_expert == 288, "expected n_expert == 288");
|
||||
TEST_ASSERT(model4.n_embd_head_k == 128, "expected n_embd_head_k == 128");
|
||||
TEST_ASSERT(model4.n_embd_head_v == 128, "expected n_embd_head_v == 128");
|
||||
TEST_ASSERT(model4.n_vocab == 128896, "expected n_vocab == 128896");
|
||||
TEST_ASSERT(model4.tensors.size() == 754, "expected tensor count == 754");
|
||||
|
||||
fprintf(stderr, "=== ALL TESTS PASSED ===\n");
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -523,6 +523,18 @@ static void test_filters(testing & t) {
|
|||
"hello"
|
||||
);
|
||||
|
||||
test_template(t, "upper array",
|
||||
"{{ items|upper }}",
|
||||
{{"items", json::array({"hello", "world"})}},
|
||||
"['HELLO', 'WORLD']"
|
||||
);
|
||||
|
||||
test_template(t, "upper dict",
|
||||
"{{ items|upper }}",
|
||||
{{"items", {{"hello", "world"}}}},
|
||||
"{'HELLO': 'WORLD'}"
|
||||
);
|
||||
|
||||
test_template(t, "capitalize",
|
||||
"{{ 'heLlo World'|capitalize }}",
|
||||
json::object(),
|
||||
|
|
|
|||
|
|
@ -385,6 +385,9 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
|
|||
if (arch == LLM_ARCH_CHAMELEON) {
|
||||
continue; // Only half-implemented and to be removed in the future.
|
||||
}
|
||||
if (arch == LLM_ARCH_GEMMA4) {
|
||||
continue; // FIXME @ngxson
|
||||
}
|
||||
if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
|
||||
continue; // FIXME
|
||||
}
|
||||
|
|
@ -451,6 +454,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
|
|||
if (arch == LLM_ARCH_CHAMELEON) {
|
||||
continue; // Only half-implemented and to be removed in the future.
|
||||
}
|
||||
if (arch == LLM_ARCH_GEMMA4) {
|
||||
continue; // FIXME @ngxson
|
||||
}
|
||||
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
|
||||
continue; // FIXME CUDA backend crashes.
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,520 @@
|
|||
#include "../src/llama-ext.h"
|
||||
#include "ggml-cpp.h"
|
||||
#include "gguf-model-data.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ftype name <-> enum mapping
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
struct ftype_name_entry {
|
||||
const char * name;
|
||||
llama_ftype ftype;
|
||||
};
|
||||
|
||||
static const ftype_name_entry ftype_name_table[] = {
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32 },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16 },
|
||||
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16 },
|
||||
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0 },
|
||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1 },
|
||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0 },
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1 },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0 },
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S },
|
||||
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M },
|
||||
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L },
|
||||
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S },
|
||||
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M },
|
||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K },
|
||||
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S },
|
||||
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M },
|
||||
{ "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS },
|
||||
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS },
|
||||
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S },
|
||||
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M },
|
||||
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS },
|
||||
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS },
|
||||
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S },
|
||||
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M },
|
||||
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL },
|
||||
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS },
|
||||
{ "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0 },
|
||||
{ "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0 },
|
||||
{ "MXFP4_MOE", LLAMA_FTYPE_MOSTLY_MXFP4_MOE },
|
||||
{ "NVFP4", LLAMA_FTYPE_MOSTLY_NVFP4 },
|
||||
};
|
||||
|
||||
static llama_ftype llama_ftype_from_name(const char * name) {
|
||||
for (const auto & e : ftype_name_table) {
|
||||
if (strcmp(name, e.name) == 0) {
|
||||
return e.ftype;
|
||||
}
|
||||
}
|
||||
return (llama_ftype) -1;
|
||||
}
|
||||
|
||||
static const char * llama_ftype_to_name(llama_ftype ftype) {
|
||||
for (const auto & e : ftype_name_table) {
|
||||
if (e.ftype == ftype) {
|
||||
return e.name;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ggml_type name lookup
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static ggml_type ggml_type_from_name(const std::string & name) {
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
const char * tname = ggml_type_name((ggml_type) i);
|
||||
if (tname && name == tname) {
|
||||
return (ggml_type) i;
|
||||
}
|
||||
}
|
||||
return GGML_TYPE_COUNT;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// File parser for snapshot files (quant type schemas)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
struct snapshot_section {
|
||||
llama_ftype ftype;
|
||||
ggml_type default_type;
|
||||
std::vector<std::pair<std::string, ggml_type>> overrides;
|
||||
};
|
||||
|
||||
// This function is pretty ugly, but it's a trade-off of readable snapshot files
|
||||
// versus readable parsing code
|
||||
static bool parse_snapshot_file(const std::string & path, std::vector<snapshot_section> & sections) {
|
||||
std::ifstream f(path);
|
||||
if (!f.good()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
snapshot_section * cur = nullptr;
|
||||
std::string line;
|
||||
|
||||
while (std::getline(f, line)) {
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// section header: [FTYPE_NAME] default_type
|
||||
if (line[0] == '[') {
|
||||
auto close = line.find(']');
|
||||
if (close == std::string::npos) {
|
||||
fprintf(stderr, "parse error: missing ] in '%s'\n", line.c_str());
|
||||
return false;
|
||||
}
|
||||
std::string ftype_str = line.substr(1, close - 1);
|
||||
std::string default_str;
|
||||
size_t pos = close + 1;
|
||||
while (pos < line.size() && line[pos] == ' ') {
|
||||
pos++;
|
||||
}
|
||||
default_str = line.substr(pos);
|
||||
|
||||
llama_ftype ftype = llama_ftype_from_name(ftype_str.c_str());
|
||||
if ((int) ftype < 0) {
|
||||
fprintf(stderr, "parse error: unknown ftype '%s'\n", ftype_str.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_type dtype = ggml_type_from_name(default_str);
|
||||
if (dtype == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "parse error: unknown default type '%s'\n", default_str.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
sections.push_back({ ftype, dtype, {} });
|
||||
cur = §ions.back();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!cur) {
|
||||
fprintf(stderr, "parse error: tensor line before any section: '%s'\n", line.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto sp = line.rfind(' ');
|
||||
if (sp == std::string::npos) {
|
||||
fprintf(stderr, "parse error: no space in tensor line: '%s'\n", line.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string tname = line.substr(0, sp);
|
||||
std::string ttype = line.substr(sp + 1);
|
||||
|
||||
ggml_type gt = ggml_type_from_name(ttype);
|
||||
if (gt == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "parse error: unknown type '%s' for tensor '%s'\n", ttype.c_str(), tname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
cur->overrides.push_back({ tname, gt });
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Remote model support using gguf-model-data.cpp
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
struct remote_model_spec {
|
||||
const char * repo;
|
||||
const char * quant;
|
||||
};
|
||||
|
||||
// Get model name from repo: strip org prefix, strip -GGUF suffix,
|
||||
// and strip anything up to and including first '_' (e.g. "deepseek-ai_DeepSeek-V3.1").
|
||||
static std::string model_name_from_repo(const char * repo) {
|
||||
std::string s(repo);
|
||||
|
||||
auto slash = s.find('/');
|
||||
if (slash != std::string::npos) {
|
||||
s = s.substr(slash + 1);
|
||||
}
|
||||
|
||||
const std::string suffix = "-GGUF";
|
||||
if (s.size() >= suffix.size() && s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0) {
|
||||
s = s.substr(0, s.size() - suffix.size());
|
||||
}
|
||||
|
||||
auto underscore = s.find('_');
|
||||
if (underscore != std::string::npos) {
|
||||
s = s.substr(underscore + 1);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
static std::string snapshot_file_from_name(const std::string & name) {
|
||||
std::string lower = name;
|
||||
for (auto & c : lower) {
|
||||
c = std::tolower(c);
|
||||
}
|
||||
return lower;
|
||||
}
|
||||
|
||||
static const remote_model_spec model_specs[] = {
|
||||
{ "ggml-org/Qwen3-0.6B-GGUF", "Q8_0" },
|
||||
{ "ggml-org/GLM-4.6V-GGUF", "Q8_0" },
|
||||
{ "ggml-org/Step-3.5-Flash-GGUF", "Q4_K" },
|
||||
{ "ggml-org/Qwen3-Coder-Next-GGUF", "Q8_0" },
|
||||
{ "ggml-org/Qwen3-14B-GGUF", "Q8_0" },
|
||||
{ "ggml-org/Nemotron-Nano-3-30B-A3B-GGUF", "Q8_0" },
|
||||
{ "ggml-org/gpt-oss-120b-GGUF", "mxfp4" },
|
||||
{ "ggml-org/gemma-3-4b-it-GGUF", "Q8_0" },
|
||||
{ "bartowski/Meta-Llama-3.1-70B-Instruct-GGUF", "Q4_K_M" },
|
||||
{ "bartowski/deepseek-ai_DeepSeek-V3.1-GGUF", "IQ1_M" },
|
||||
{ "bartowski/Qwen_Qwen3.5-397B-A17B-GGUF", "IQ1_S" }, // TODO: swap with ggml-org if/when it's released
|
||||
{ "bartowski/Qwen_Qwen3.5-27B-GGUF", "Q8_0" }, // TODO: swap with ggml-org if/when it's released
|
||||
};
|
||||
|
||||
static const int n_model_specs = (int) (sizeof(model_specs) / sizeof(model_specs[0]));
|
||||
|
||||
static llama_model * build_mock_model_from_remote(const gguf_remote_model & remote) {
|
||||
llama_quant_model_desc desc = {};
|
||||
desc.architecture = remote.architecture.c_str();
|
||||
desc.n_embd = remote.n_embd;
|
||||
desc.n_ff = remote.n_ff;
|
||||
desc.n_layer = remote.n_layer;
|
||||
desc.n_head = remote.n_head;
|
||||
desc.n_head_kv = remote.n_head_kv;
|
||||
desc.n_expert = remote.n_expert;
|
||||
desc.n_embd_head_k = remote.n_embd_head_k;
|
||||
desc.n_embd_head_v = remote.n_embd_head_v;
|
||||
return llama_quant_model_from_metadata(&desc);
|
||||
}
|
||||
|
||||
// Single ggml context holding all quantizable tensors for a model.
|
||||
struct mock_tensors {
|
||||
ggml_context_ptr ctx;
|
||||
std::vector<ggml_tensor *> tensors;
|
||||
};
|
||||
|
||||
static mock_tensors build_mock_tensors(const quantize_state_impl * qs, const gguf_remote_model & remote) {
|
||||
const size_t ctx_size = remote.tensors.size() * ggml_tensor_overhead();
|
||||
struct ggml_init_params params = { ctx_size, nullptr, true };
|
||||
ggml_context_ptr ctx(ggml_init(params));
|
||||
|
||||
std::vector<ggml_tensor *> result;
|
||||
|
||||
for (const auto & t : remote.tensors) {
|
||||
ggml_tensor * gt = ggml_new_tensor_4d(ctx.get(), GGML_TYPE_F32, t.ne[0], t.ne[1], t.ne[2], t.ne[3]);
|
||||
ggml_set_name(gt, t.name.c_str());
|
||||
if (llama_quant_tensor_allows_quantization(qs, gt)) {
|
||||
result.push_back(gt);
|
||||
}
|
||||
}
|
||||
|
||||
// sort by layer index then name, matching llama_model_loader::weight_name_comparer
|
||||
std::sort(result.begin(), result.end(), [](const ggml_tensor * a, const ggml_tensor * b) {
|
||||
int a_layer = -1, b_layer = -1;
|
||||
sscanf(a->name, "blk.%d.", &a_layer);
|
||||
sscanf(b->name, "blk.%d.", &b_layer);
|
||||
if (a_layer != b_layer) {
|
||||
return a_layer < b_layer;
|
||||
}
|
||||
return strcmp(a->name, b->name) < 0;
|
||||
});
|
||||
|
||||
return { std::move(ctx), std::move(result) };
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generate mode: regenerate all snapshot files
|
||||
// Use this when either adding new models or modifying quants
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static std::string generate_snapshot(const std::string & name,
|
||||
const gguf_remote_model & remote,
|
||||
quantize_state_impl * qs,
|
||||
mock_tensors & mt) {
|
||||
std::ostringstream out;
|
||||
|
||||
out << "# Model: " << name << "\n";
|
||||
out << "# n_embd=" << remote.n_embd << ", n_ff=" << remote.n_ff << ", n_vocab=" << remote.n_vocab
|
||||
<< ", n_layer=" << remote.n_layer << ", n_head=" << remote.n_head << ", n_head_kv=" << remote.n_head_kv;
|
||||
if (remote.n_expert > 0) {
|
||||
out << ", n_expert=" << remote.n_expert;
|
||||
}
|
||||
out << "\n";
|
||||
|
||||
for (int i = 0; i < LLAMA_FTYPE_GUESSED; i++) {
|
||||
llama_ftype ft = (llama_ftype) i;
|
||||
ggml_type default_type = llama_ftype_get_default_type(ft);
|
||||
if (default_type == GGML_TYPE_COUNT) {
|
||||
continue;
|
||||
}
|
||||
const char * fname = llama_ftype_to_name(ft);
|
||||
if (!fname) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<ggml_type> result_types(mt.tensors.size());
|
||||
llama_quant_compute_types(qs, ft, mt.tensors.data(), result_types.data(), mt.tensors.size());
|
||||
|
||||
out << "\n[" << fname << "] " << ggml_type_name(default_type) << "\n";
|
||||
for (size_t j = 0; j < mt.tensors.size(); j++) {
|
||||
if (result_types[j] != default_type) {
|
||||
out << ggml_get_name(mt.tensors[j]) << " " << ggml_type_name(result_types[j]) << "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return out.str();
|
||||
}
|
||||
|
||||
static int run_generate(const std::string & snapshot_dir) {
|
||||
fprintf(stderr, "This will overwrite all snapshot files in:\n %s\n", snapshot_dir.c_str());
|
||||
fprintf(stderr, "Continue? [y/N] ");
|
||||
int ch = fgetc(stdin);
|
||||
if (ch != 'y' && ch != 'Y') {
|
||||
fprintf(stderr, "Aborted.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
int n_written = 0;
|
||||
|
||||
for (int m = 0; m < n_model_specs; m++) {
|
||||
const auto & spec = model_specs[m];
|
||||
std::string name = model_name_from_repo(spec.repo);
|
||||
|
||||
fprintf(stderr, "Fetching model metadata for %s from %s...\n", name.c_str(), spec.repo);
|
||||
auto result = gguf_fetch_model_meta(spec.repo, spec.quant);
|
||||
if (!result.has_value()) {
|
||||
fprintf(stderr, "ERROR: could not fetch model metadata for %s\n", name.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto & remote = result.value();
|
||||
llama_model * model = build_mock_model_from_remote(remote);
|
||||
llama_model_quantize_params qparams = llama_model_quantize_default_params();
|
||||
quantize_state_impl * qs = llama_quant_init(model, &qparams);
|
||||
auto mt = build_mock_tensors(qs, remote);
|
||||
|
||||
std::string content = generate_snapshot(name, remote, qs, mt);
|
||||
std::string path = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema";
|
||||
|
||||
std::ofstream f(path);
|
||||
if (!f.good()) {
|
||||
fprintf(stderr, "ERROR: could not write %s\n", path.c_str());
|
||||
llama_quant_free(qs);
|
||||
llama_model_free(model);
|
||||
return 1;
|
||||
}
|
||||
f << content;
|
||||
n_written++;
|
||||
fprintf(stderr, " wrote %s\n", path.c_str());
|
||||
llama_quant_free(qs);
|
||||
llama_model_free(model);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%d files written\n", n_written);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test mode: compare against snapshot files
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static bool run_test_section(quantize_state_impl * qs, mock_tensors & mt, const snapshot_section & section) {
|
||||
// verify default_type matches what llama_ftype_get_default_type returns
|
||||
ggml_type computed_default = llama_ftype_get_default_type(section.ftype);
|
||||
if (computed_default != section.default_type) {
|
||||
printf(" FAIL [%s] default type mismatch: file says %s, code says %s\n", llama_ftype_to_name(section.ftype),
|
||||
ggml_type_name(section.default_type), ggml_type_name(computed_default));
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<ggml_type> result_types(mt.tensors.size());
|
||||
llama_quant_compute_types(qs, section.ftype, mt.tensors.data(), result_types.data(), mt.tensors.size());
|
||||
|
||||
std::map<std::string, ggml_type> override_map(section.overrides.begin(), section.overrides.end());
|
||||
|
||||
bool all_pass = true;
|
||||
int n_override_found = 0;
|
||||
|
||||
for (size_t i = 0; i < mt.tensors.size(); i++) {
|
||||
const char * name = ggml_get_name(mt.tensors[i]);
|
||||
ggml_type got = result_types[i];
|
||||
|
||||
ggml_type expected = section.default_type;
|
||||
auto it = override_map.find(name);
|
||||
if (it != override_map.end()) {
|
||||
expected = it->second;
|
||||
n_override_found++;
|
||||
}
|
||||
|
||||
if (got != expected) {
|
||||
printf(" FAIL %-50s %-10s expected %s, got %s\n", name, llama_ftype_to_name(section.ftype),
|
||||
ggml_type_name(expected), ggml_type_name(got));
|
||||
all_pass = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_override_found != (int) section.overrides.size()) {
|
||||
printf(" FAIL [%s] override count mismatch: listed %d, matched %d\n", llama_ftype_to_name(section.ftype),
|
||||
(int) section.overrides.size(), n_override_found);
|
||||
all_pass = false;
|
||||
}
|
||||
|
||||
return all_pass;
|
||||
}
|
||||
|
||||
static int run_remote_tests(const std::string & snapshot_dir, const char * argv0) {
|
||||
int total_pass = 0;
|
||||
int total_fail = 0;
|
||||
int total_skip = 0;
|
||||
|
||||
for (int m = 0; m < n_model_specs; m++) {
|
||||
const auto & spec = model_specs[m];
|
||||
std::string name = model_name_from_repo(spec.repo);
|
||||
printf("=== %s ===\n", name.c_str());
|
||||
|
||||
auto result = gguf_fetch_model_meta(spec.repo, spec.quant, "", false);
|
||||
if (!result.has_value()) {
|
||||
printf(" SKIP (could not fetch model metadata)\n\n");
|
||||
total_skip++;
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto & remote = result.value();
|
||||
llama_model * model = build_mock_model_from_remote(remote);
|
||||
llama_model_quantize_params qparams = llama_model_quantize_default_params();
|
||||
quantize_state_impl * qs = llama_quant_init(model, &qparams);
|
||||
auto mt = build_mock_tensors(qs, remote);
|
||||
|
||||
std::string snapshot_path = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema";
|
||||
std::vector<snapshot_section> sections;
|
||||
if (!parse_snapshot_file(snapshot_path, sections)) {
|
||||
printf(" SKIP (could not read snapshot file: %s)\n\n", snapshot_path.c_str());
|
||||
llama_quant_free(qs);
|
||||
llama_model_free(model);
|
||||
total_skip++;
|
||||
continue;
|
||||
}
|
||||
|
||||
int model_pass = 0;
|
||||
int model_fail = 0;
|
||||
|
||||
for (const auto & section : sections) {
|
||||
bool pass = run_test_section(qs, mt, section);
|
||||
if (pass) {
|
||||
model_pass++;
|
||||
} else {
|
||||
model_fail++;
|
||||
}
|
||||
}
|
||||
|
||||
printf(" %s %s: %d/%d ftype sections passed (%d tensors)\n", model_fail == 0 ? "PASS" : "FAIL", name.c_str(),
|
||||
model_pass, model_pass + model_fail, (int) mt.tensors.size());
|
||||
printf("\n");
|
||||
|
||||
if (model_fail == 0) {
|
||||
total_pass++;
|
||||
} else {
|
||||
total_fail++;
|
||||
}
|
||||
|
||||
llama_quant_free(qs);
|
||||
llama_model_free(model);
|
||||
}
|
||||
|
||||
printf("%d/%d models passed", total_pass, total_pass + total_fail);
|
||||
if (total_skip > 0) {
|
||||
printf(", %d skipped", total_skip);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
if (total_fail > 0) {
|
||||
printf("\nIf these changes are intentional, regenerate snapshot files with:\n");
|
||||
printf(" %s --generate\n", argv0);
|
||||
}
|
||||
|
||||
return total_fail > 0 ? 1 : 0;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::string snapshot_dir = SNAPSHOT_DIR;
|
||||
bool generate = false;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if (strcmp(argv[i], "--generate") == 0) {
|
||||
generate = true;
|
||||
} else if (strcmp(argv[i], "--snapshot-dir") == 0 && i + 1 < argc) {
|
||||
snapshot_dir = argv[++i];
|
||||
}
|
||||
}
|
||||
|
||||
if (generate) {
|
||||
return run_generate(snapshot_dir);
|
||||
}
|
||||
|
||||
// suppress llama log warnings during test (e.g. tensor type fallback messages)
|
||||
llama_log_set([](enum ggml_log_level, const char *, void *) {}, nullptr);
|
||||
|
||||
return run_remote_tests(snapshot_dir, argv[0]);
|
||||
}
|
||||
|
|
@ -176,8 +176,8 @@
|
|||
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
|
||||
| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
|
||||
| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
|
||||
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
|
||||
| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
|
||||
|
|
|
|||
|
|
@ -255,8 +255,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
|||
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
|
||||
| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
|
||||
| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
|
||||
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
|
||||
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ add_library(mtmd
|
|||
models/models.h
|
||||
models/cogvlm.cpp
|
||||
models/conformer.cpp
|
||||
models/gemma4v.cpp
|
||||
models/glm4v.cpp
|
||||
models/internvl.cpp
|
||||
models/kimivl.cpp
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ struct clip_graph {
|
|||
const int n_layer;
|
||||
const int n_mmproj_embd;
|
||||
const float eps;
|
||||
const float kq_scale;
|
||||
float kq_scale; // TODO: maybe move this to hparams
|
||||
const clip_flash_attn_type flash_attn_type;
|
||||
|
||||
ggml_context_ptr ctx0_ptr;
|
||||
|
|
|
|||
|
|
@ -88,8 +88,11 @@
|
|||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||
#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
|
||||
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
|
||||
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
|
||||
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
|
||||
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
|
||||
#define TN_LS_OUT "%s.blk.%d.out_scale.%s" // layer out scale (gemma4)
|
||||
#define TN_ATTN_POST_NORM "%s.blk.%d.attn_post_norm.%s" // post-attn norm (gemma4)
|
||||
#define TN_FFN_POST_NORM "%s.blk.%d.ffn_post_norm.%s" // post-FFN norm (gemma4)
|
||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
#define TN_LN_POST "%s.post_ln.%s"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
|
|
@ -213,6 +216,10 @@
|
|||
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
|
||||
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
|
||||
|
||||
// gemma4
|
||||
#define TN_STD_BIAS "v.std_bias"
|
||||
#define TN_STD_SCALE "v.std_scale"
|
||||
|
||||
|
||||
// align x to upper multiple of n
|
||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||
|
|
@ -233,6 +240,8 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_GEMMA3,
|
||||
PROJECTOR_TYPE_GEMMA3NV,
|
||||
PROJECTOR_TYPE_GEMMA3NA,
|
||||
PROJECTOR_TYPE_GEMMA4V,
|
||||
PROJECTOR_TYPE_GEMMA4A,
|
||||
PROJECTOR_TYPE_PHI4,
|
||||
PROJECTOR_TYPE_IDEFICS3,
|
||||
PROJECTOR_TYPE_PIXTRAL,
|
||||
|
|
@ -272,6 +281,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
|
||||
{ PROJECTOR_TYPE_GEMMA4V, "gemma4v"},
|
||||
{ PROJECTOR_TYPE_GEMMA4A, "gemma4a"},
|
||||
{ PROJECTOR_TYPE_PHI4, "phi4"},
|
||||
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
||||
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
||||
|
|
@ -476,6 +487,18 @@ static std::vector<std::string> string_split_str(std::string s, const std::strin
|
|||
return tokens;
|
||||
}
|
||||
|
||||
// remove when moving to c++20
|
||||
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
|
||||
return str.size() >= prefix.size() &&
|
||||
str.compare(0, prefix.size(), prefix) == 0;
|
||||
}
|
||||
|
||||
// remove when moving to c++20
|
||||
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
|
||||
return str.size() >= suffix.size() &&
|
||||
str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
|
||||
}
|
||||
|
||||
//
|
||||
// gguf utils
|
||||
//
|
||||
|
|
|
|||
|
|
@ -143,6 +143,10 @@ struct clip_hparams {
|
|||
};
|
||||
|
||||
struct clip_layer {
|
||||
// layernorm 1 (or layer input norm, or pre-attention norm)
|
||||
ggml_tensor * ln_1_w = nullptr;
|
||||
ggml_tensor * ln_1_b = nullptr;
|
||||
|
||||
// attention
|
||||
ggml_tensor * k_w = nullptr;
|
||||
ggml_tensor * k_b = nullptr;
|
||||
|
|
@ -159,9 +163,7 @@ struct clip_layer {
|
|||
ggml_tensor * k_norm = nullptr;
|
||||
ggml_tensor * q_norm = nullptr;
|
||||
|
||||
// layernorm 1
|
||||
ggml_tensor * ln_1_w = nullptr;
|
||||
ggml_tensor * ln_1_b = nullptr;
|
||||
ggml_tensor * attn_post_norm_w = nullptr;
|
||||
|
||||
ggml_tensor * ff_up_w = nullptr;
|
||||
ggml_tensor * ff_up_b = nullptr;
|
||||
|
|
@ -170,13 +172,16 @@ struct clip_layer {
|
|||
ggml_tensor * ff_down_w = nullptr;
|
||||
ggml_tensor * ff_down_b = nullptr;
|
||||
|
||||
// layernorm 2
|
||||
// layernorm 2 (or pre-FFN norm)
|
||||
ggml_tensor * ln_2_w = nullptr;
|
||||
ggml_tensor * ln_2_b = nullptr;
|
||||
|
||||
ggml_tensor * ff_post_norm_w = nullptr;
|
||||
|
||||
// layer scale (no bias)
|
||||
ggml_tensor * ls_1_w = nullptr;
|
||||
ggml_tensor * ls_2_w = nullptr;
|
||||
ggml_tensor * ls_1_w = nullptr;
|
||||
ggml_tensor * ls_2_w = nullptr;
|
||||
ggml_tensor * ls_out_w = nullptr; // gemma4
|
||||
|
||||
// qwen3vl deepstack merger
|
||||
ggml_tensor * deepstack_norm_w = nullptr;
|
||||
|
|
@ -437,6 +442,18 @@ struct clip_model {
|
|||
ggml_tensor * pre_encode_out_w = nullptr;
|
||||
ggml_tensor * pre_encode_out_b = nullptr;
|
||||
|
||||
// gemma4
|
||||
ggml_tensor * std_bias = nullptr;
|
||||
ggml_tensor * std_scale = nullptr;
|
||||
// Gemma4ClippableLinear
|
||||
struct clamp_info {
|
||||
float inp_max;
|
||||
float inp_min;
|
||||
float out_max;
|
||||
float out_min;
|
||||
};
|
||||
std::map<std::string, clamp_info> clamp_info_map;
|
||||
|
||||
bool audio_has_avgpool() const {
|
||||
return proj_type == PROJECTOR_TYPE_QWEN2A
|
||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@
|
|||
#include <limits>
|
||||
#include <array>
|
||||
#include <functional>
|
||||
#include <float.h>
|
||||
|
||||
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
|
||||
|
||||
|
|
@ -379,19 +380,34 @@ ggml_tensor * clip_graph::build_vit(
|
|||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
|
||||
if (layer.q_norm) {
|
||||
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
|
||||
cb(Qcur, "Qcur_norm", il);
|
||||
}
|
||||
// if true, norm must be applied after reshaping to (d_head, n_head, n_pos)
|
||||
bool norm_per_head = layer.q_norm && layer.q_norm->ne[0] == d_head;
|
||||
|
||||
if (layer.k_norm) {
|
||||
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
|
||||
cb(Kcur, "Kcur_norm", il);
|
||||
if (!norm_per_head) {
|
||||
if (layer.q_norm) {
|
||||
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
|
||||
cb(Qcur, "Qcur_norm", il);
|
||||
}
|
||||
if (layer.k_norm) {
|
||||
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
|
||||
cb(Kcur, "Kcur_norm", il);
|
||||
}
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
||||
|
||||
if (norm_per_head) {
|
||||
if (layer.q_norm) {
|
||||
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
|
||||
cb(Qcur, "Qcur_norm_per_head", il);
|
||||
}
|
||||
if (layer.k_norm) {
|
||||
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
|
||||
cb(Kcur, "Kcur_norm_per_head", il);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
|
@ -405,6 +421,11 @@ ggml_tensor * clip_graph::build_vit(
|
|||
cb(Kcur, "Kcur_pos", il);
|
||||
}
|
||||
|
||||
if (proj_type == PROJECTOR_TYPE_GEMMA4V) {
|
||||
Vcur = ggml_rms_norm(ctx0, Vcur, eps);
|
||||
cb(Vcur, "Vcur_normed", il);
|
||||
}
|
||||
|
||||
cur = build_attn(layer.o_w, layer.o_b,
|
||||
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
|
|
@ -415,6 +436,11 @@ ggml_tensor * clip_graph::build_vit(
|
|||
cb(cur, "attn_out_scaled", il);
|
||||
}
|
||||
|
||||
if (layer.attn_post_norm_w) {
|
||||
cur = build_norm(cur, layer.attn_post_norm_w, nullptr, norm_t, eps, il);
|
||||
cb(cur, "attn_post_normed", il);
|
||||
}
|
||||
|
||||
// re-add the layer input, e.g., residual
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
|
|
@ -422,7 +448,7 @@ ggml_tensor * clip_graph::build_vit(
|
|||
|
||||
cb(cur, "ffn_inp", il);
|
||||
|
||||
// layernorm2
|
||||
// layernorm2 (pre-ffn norm)
|
||||
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
|
||||
cb(cur, "ffn_inp_normed", il);
|
||||
|
||||
|
|
@ -435,6 +461,11 @@ ggml_tensor * clip_graph::build_vit(
|
|||
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
if (layer.ff_post_norm_w) {
|
||||
cur = build_norm(cur, layer.ff_post_norm_w, nullptr, norm_t, eps, il);
|
||||
cb(cur, "ffn_post_normed", il);
|
||||
}
|
||||
|
||||
if (layer.ls_2_w) {
|
||||
cur = ggml_mul(ctx0, cur, layer.ls_2_w);
|
||||
cb(cur, "ffn_out_scaled", il);
|
||||
|
|
@ -444,6 +475,11 @@ ggml_tensor * clip_graph::build_vit(
|
|||
cur = ggml_add(ctx0, inpL, cur);
|
||||
cb(cur, "layer_out", il);
|
||||
|
||||
if (layer.ls_out_w) {
|
||||
cur = ggml_mul(ctx0, cur, layer.ls_out_w);
|
||||
cb(cur, "layer_out_scaled", il);
|
||||
}
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
|
|
@ -808,6 +844,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
{
|
||||
builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4V:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_gemma4v>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
|
|
@ -1257,6 +1297,17 @@ struct clip_model_loader {
|
|||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_GEMMA4V:
|
||||
{
|
||||
hparams.rope_theta = 100.0f;
|
||||
hparams.n_merge = 3; // pooling_kernel_size
|
||||
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
// @ngxson : the model performs quite poor with small images, we need to bump minimum image tokens to 40 to avoid that
|
||||
hparams.set_limit_image_tokens(252, 280);
|
||||
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
|
||||
} break;
|
||||
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
|
||||
|
|
@ -1442,6 +1493,11 @@ struct clip_model_loader {
|
|||
std::map<std::string, size_t> tensor_offset;
|
||||
std::vector<ggml_tensor *> tensors_to_load;
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
|
||||
// TODO @ngxson : support both audio and video in the future
|
||||
const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
|
||||
|
||||
|
|
@ -1478,6 +1534,18 @@ struct clip_model_loader {
|
|||
return cur;
|
||||
};
|
||||
|
||||
auto get_scalar = [&](const std::string & name, float default_val) {
|
||||
auto it = tensor_offset.find(name);
|
||||
if (it == tensor_offset.end()) {
|
||||
return default_val;
|
||||
}
|
||||
size_t offset = it->second;
|
||||
fin.seekg(offset, std::ios::beg);
|
||||
float value;
|
||||
fin.read(reinterpret_cast<char*>(&value), sizeof(float));
|
||||
return value;
|
||||
};
|
||||
|
||||
model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
|
||||
|
||||
model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
|
||||
|
|
@ -1512,8 +1580,11 @@ struct clip_model_loader {
|
|||
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
|
||||
layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
|
||||
layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
|
||||
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
|
||||
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
|
||||
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
|
||||
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
|
||||
layer.ls_out_w = get_tensor(string_format(TN_LS_OUT, prefix, il, "weight"), false); // no bias
|
||||
layer.attn_post_norm_w = get_tensor(string_format(TN_ATTN_POST_NORM, prefix, il, "weight"), false); // no bias
|
||||
layer.ff_post_norm_w = get_tensor(string_format(TN_FFN_POST_NORM, prefix, il, "weight"), false); // no bias
|
||||
|
||||
layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
|
||||
layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
|
||||
|
|
@ -1713,6 +1784,32 @@ struct clip_model_loader {
|
|||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4V:
|
||||
{
|
||||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||
model.std_bias = get_tensor(TN_STD_BIAS, false);
|
||||
model.std_scale = get_tensor(TN_STD_SCALE, false);
|
||||
// load scalar for Gemma4ClippableLinear
|
||||
for (auto * tensor : tensors_to_load) {
|
||||
std::string name = tensor->name;
|
||||
if (string_ends_with(name, ".weight")) {
|
||||
std::string name_inp_max = name;
|
||||
std::string name_inp_min = name;
|
||||
std::string name_out_max = name;
|
||||
std::string name_out_min = name;
|
||||
string_replace_all(name_inp_max, ".weight", ".input_max");
|
||||
string_replace_all(name_inp_min, ".weight", ".input_min");
|
||||
string_replace_all(name_out_max, ".weight", ".output_max");
|
||||
string_replace_all(name_out_min, ".weight", ".output_min");
|
||||
model.clamp_info_map[name] = {
|
||||
get_scalar(name_inp_max, FLT_MAX),
|
||||
get_scalar(name_inp_min, -FLT_MAX),
|
||||
get_scalar(name_out_max, FLT_MAX),
|
||||
get_scalar(name_out_min, -FLT_MAX)
|
||||
};
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
|
||||
|
|
@ -2042,11 +2139,6 @@ struct clip_model_loader {
|
|||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
|
||||
// alloc memory and offload data
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
|
||||
ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
|
||||
|
|
@ -2345,7 +2437,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
|||
|
||||
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
|
||||
// we can remove this check when we implement audio support for Gemma 3N
|
||||
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
|
||||
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV
|
||||
|| ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA4V;
|
||||
}
|
||||
|
||||
if (loader.has_audio && !skip_audio) {
|
||||
|
|
@ -2581,6 +2674,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA4V:
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
||||
|
|
@ -3031,6 +3125,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
}
|
||||
set_input_i32("patches", patches);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4V:
|
||||
{
|
||||
// set (col, row) patch positions for learned positional embedding
|
||||
const int n_cols = image_size_width / patch_size;
|
||||
std::vector<int> pos_x(num_patches), pos_y(num_patches);
|
||||
for (int i = 0; i < num_patches; i++) {
|
||||
pos_x[i] = i % n_cols;
|
||||
pos_y[i] = i / n_cols;
|
||||
}
|
||||
set_input_i32("pos_x", pos_x);
|
||||
set_input_i32("pos_y", pos_y);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
{
|
||||
GGML_ASSERT(pos_w == pos_h);
|
||||
|
|
@ -3218,6 +3324,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
return ctx->model.mm_input_proj_w->ne[0];
|
||||
case PROJECTOR_TYPE_GEMMA4V:
|
||||
return ctx->model.mm_input_proj_w->ne[1];
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
return ctx->model.mm_fc_w->ne[1];
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,151 @@
|
|||
#include "models.h"
|
||||
#include <cmath>
|
||||
|
||||
ggml_cgraph * clip_graph_gemma4v::build() {
|
||||
ggml_tensor * inp_raw = build_inp_raw();
|
||||
|
||||
// patches = 2 * (patches - 0.5)
|
||||
// equivalent to: patches * 2 - 1
|
||||
inp_raw = ggml_scale_bias(ctx0, inp_raw, 2.0f, -1.0f);
|
||||
ggml_set_name(inp_raw, "inp_raw_scaled");
|
||||
|
||||
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
|
||||
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
|
||||
ggml_set_name(inp, "inp");
|
||||
// note: no patch bias
|
||||
|
||||
ggml_tensor * pos_x = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
|
||||
ggml_set_name(pos_x, "pos_x");
|
||||
ggml_set_input(pos_x);
|
||||
|
||||
ggml_tensor * pos_y = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
|
||||
ggml_set_name(pos_y, "pos_y");
|
||||
ggml_set_input(pos_y);
|
||||
|
||||
{
|
||||
const int64_t pos_size = model.position_embeddings->ne[1];
|
||||
const size_t nb1 = ggml_row_size(model.position_embeddings->type, n_embd);
|
||||
|
||||
// positional embeddings are stored as lookup tables (one for x, one for y)
|
||||
ggml_tensor * tbl_x = ggml_view_2d(ctx0, model.position_embeddings,
|
||||
n_embd, pos_size, nb1, 0);
|
||||
ggml_tensor * tbl_y = ggml_view_2d(ctx0, model.position_embeddings,
|
||||
n_embd, pos_size, nb1, pos_size * nb1);
|
||||
|
||||
// ggml_get_rows: [n_embd, n_patches]
|
||||
ggml_tensor * emb_x = ggml_get_rows(ctx0, tbl_x, pos_x);
|
||||
ggml_tensor * emb_y = ggml_get_rows(ctx0, tbl_y, pos_y);
|
||||
|
||||
inp = ggml_add(ctx0, inp, emb_x);
|
||||
inp = ggml_add(ctx0, inp, emb_y);
|
||||
cb(inp, "pos_embd", -1);
|
||||
}
|
||||
|
||||
// similar to build_rope_2d, but use neox ordering
|
||||
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
||||
const int64_t n_dim = cur->ne[0];
|
||||
const int64_t n_head = cur->ne[1];
|
||||
const int64_t n_pos = cur->ne[2];
|
||||
|
||||
// first half
|
||||
ggml_tensor * first;
|
||||
{
|
||||
first = ggml_view_3d(ctx0, cur,
|
||||
n_dim/2, n_head, n_pos,
|
||||
cur->nb[1],
|
||||
cur->nb[2],
|
||||
0);
|
||||
first = ggml_rope_ext(
|
||||
ctx0,
|
||||
first,
|
||||
pos_x, // positions
|
||||
nullptr, // freq factors
|
||||
n_dim/2, // n_dims
|
||||
GGML_ROPE_TYPE_NEOX, 0, hparams.rope_theta,
|
||||
1.0f, 0.0f, 1.0f, 0.0f, 0.0f
|
||||
);
|
||||
}
|
||||
|
||||
// second half
|
||||
ggml_tensor * second;
|
||||
{
|
||||
second = ggml_view_3d(ctx0, cur,
|
||||
n_dim/2, n_head, n_pos,
|
||||
cur->nb[1],
|
||||
cur->nb[2],
|
||||
n_dim/2 * ggml_element_size(cur));
|
||||
second = ggml_rope_ext(
|
||||
ctx0,
|
||||
second,
|
||||
pos_y, // positions
|
||||
nullptr, // freq factors
|
||||
n_dim/2, // n_dims
|
||||
GGML_ROPE_TYPE_NEOX, 0, hparams.rope_theta,
|
||||
1.0f, 0.0f, 1.0f, 0.0f, 0.0f
|
||||
);
|
||||
}
|
||||
|
||||
cur = ggml_concat(ctx0, first, second, 0);
|
||||
return cur;
|
||||
};
|
||||
|
||||
kq_scale = 1.0f;
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_patches,
|
||||
NORM_TYPE_RMS,
|
||||
hparams.ffn_op,
|
||||
nullptr, // pos embd is already handled above
|
||||
add_pos);
|
||||
|
||||
// Gemma4VisionPooler
|
||||
{
|
||||
const int kernel_size = hparams.n_merge;
|
||||
GGML_ASSERT(kernel_size > 0);
|
||||
|
||||
// [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1]
|
||||
cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1);
|
||||
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG,
|
||||
kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
|
||||
const int out_x = n_patches_x / kernel_size;
|
||||
const int out_y = n_patches_y / kernel_size;
|
||||
// [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y]
|
||||
cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1);
|
||||
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
||||
cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd));
|
||||
cb(cur, "pooled", -1);
|
||||
}
|
||||
|
||||
// hidden_states = (hidden_states - self.std_bias) * self.std_scale
|
||||
if (model.std_bias && model.std_scale) {
|
||||
cur = ggml_sub(ctx0, cur, model.std_bias);
|
||||
cur = ggml_mul(ctx0, cur, model.std_scale);
|
||||
cb(cur, "std_scaled", -1);
|
||||
}
|
||||
|
||||
// Gemma4MultimodalEmbedder
|
||||
cur = build_mm(model.mm_input_proj_w, cur);
|
||||
cb(cur, "projected", -1);
|
||||
|
||||
// embedding_post_projection_norm
|
||||
cur = ggml_rms_norm(ctx0, cur, hparams.eps);
|
||||
cb(cur, "projected_normed", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
return gf;
|
||||
}
|
||||
|
||||
ggml_tensor * clip_graph_gemma4v::build_mm(ggml_tensor * w, ggml_tensor * x) const {
|
||||
// Gemma4ClippableLinear
|
||||
|
||||
auto it = model.clamp_info_map.find(w->name);
|
||||
if (it == model.clamp_info_map.end()) {
|
||||
return ggml_mul_mat(ctx0, w, x);
|
||||
} else {
|
||||
const auto & clamp_info = it->second;
|
||||
ggml_tensor * clamped = ggml_clamp(ctx0, x, clamp_info.inp_min, clamp_info.inp_max);
|
||||
ggml_tensor * out = ggml_mul_mat(ctx0, w, clamped);
|
||||
out = ggml_clamp(ctx0, out, clamp_info.out_min, clamp_info.out_max);
|
||||
return out;
|
||||
}
|
||||
}
|
||||
|
|
@ -12,6 +12,12 @@ struct clip_graph_siglip : clip_graph {
|
|||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_gemma4v : clip_graph {
|
||||
clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
|
||||
};
|
||||
|
||||
struct clip_graph_pixtral : clip_graph {
|
||||
clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
|
|
|||
|
|
@ -394,6 +394,13 @@ struct mtmd_context {
|
|||
img_end = "<|IMAGE_END|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4V:
|
||||
{
|
||||
// <|image> ... (image embeddings) ... <image|>
|
||||
img_beg = "<|image>";
|
||||
img_end = "<image|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
{
|
||||
img_end = "\n"; // prevent empty batch on llama-server
|
||||
|
|
@ -974,6 +981,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
|
|||
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
||||
switch (ctx->proj_type_v()) {
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA4V:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -5,15 +5,15 @@
|
|||
#include "gguf.h"
|
||||
#include "jinja/runtime.h"
|
||||
#include "log.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <numeric>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "peg-parser.h"
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
enum class output_mode {
|
||||
|
|
@ -34,14 +34,14 @@ enum class input_message_type {
|
|||
};
|
||||
|
||||
struct debug_options {
|
||||
std::string template_path;
|
||||
bool with_tools = true;
|
||||
bool generation_prompt = true;
|
||||
bool enable_reasoning = true;
|
||||
bool debug_jinja = false;
|
||||
bool force_tool_call = false;
|
||||
output_mode mode = output_mode::BOTH;
|
||||
input_message_type input_message = input_message_type::NONE;
|
||||
std::string template_path;
|
||||
bool with_tools = true;
|
||||
bool generation_prompt = true;
|
||||
bool enable_reasoning = true;
|
||||
bool debug_jinja = false;
|
||||
bool force_tool_call = false;
|
||||
output_mode mode = output_mode::BOTH;
|
||||
input_message_type input_message = input_message_type::NONE;
|
||||
};
|
||||
|
||||
static std::string read_file(const std::string & path) {
|
||||
|
|
@ -274,7 +274,7 @@ static void render_scenario(const common_chat_template & tmpl,
|
|||
json final_messages = messages;
|
||||
if (add_generation_prompt && !messages.empty() && messages.back().value("role", "") == "assistant") {
|
||||
final_messages.push_back(json{
|
||||
{ "role", "user" },
|
||||
{ "role", "user" },
|
||||
{ "content", "Now please continue with another response." }
|
||||
});
|
||||
}
|
||||
|
|
@ -305,7 +305,7 @@ static void render_all_scenarios(const common_chat_template & tmpl,
|
|||
const json & tools,
|
||||
bool add_generation_prompt,
|
||||
bool enable_thinking,
|
||||
input_message_type message_type) {
|
||||
input_message_type message_type) {
|
||||
json user_msg = build_user_message();
|
||||
|
||||
auto render_if = [&](input_message_type type, const std::string & name, const json & assistant_msg) {
|
||||
|
|
@ -335,6 +335,24 @@ static void render_all_scenarios(const common_chat_template & tmpl,
|
|||
}
|
||||
}
|
||||
|
||||
static autoparser::generation_params prepare_params(const debug_options & opts, const json & tools) {
|
||||
autoparser::generation_params params;
|
||||
params.messages = json::array({ build_user_message() });
|
||||
params.reasoning_format = opts.enable_reasoning ? COMMON_REASONING_FORMAT_DEEPSEEK : COMMON_REASONING_FORMAT_NONE;
|
||||
params.enable_thinking = opts.enable_reasoning;
|
||||
params.add_generation_prompt = opts.generation_prompt;
|
||||
|
||||
if (opts.with_tools) {
|
||||
params.tools = tools;
|
||||
params.tool_choice = opts.force_tool_call ? COMMON_CHAT_TOOL_CHOICE_REQUIRED : COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
} else {
|
||||
params.tools = json();
|
||||
params.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
}
|
||||
params.parallel_tool_calls = false;
|
||||
return params;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// Set log level to most verbose to capture all debug output
|
||||
common_log_set_verbosity_thold(99);
|
||||
|
|
@ -369,49 +387,41 @@ int main(int argc, char ** argv) {
|
|||
try {
|
||||
common_chat_template chat_template(template_source, "", "");
|
||||
|
||||
// Build tools definition
|
||||
json tools = opts.with_tools ? build_tools_definition() : json();
|
||||
|
||||
// Render template scenarios if requested
|
||||
if (opts.input_message != input_message_type::NONE &&
|
||||
(opts.mode == output_mode::TEMPLATE || opts.mode == output_mode::BOTH)) {
|
||||
autoparser::generation_params params = prepare_params(opts, tools);
|
||||
common_chat_params parser_data;
|
||||
if (std::optional<common_chat_params> spec_tmpl =
|
||||
common_chat_try_specialized_template(chat_template, template_source, params)) {
|
||||
LOG_ERR("\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
LOG_ERR(" TEMPLATE RENDERING OUTPUT\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
LOG_ERR("This template uses a specialized parser, analysis results will not be available.");
|
||||
parser_data = *spec_tmpl;
|
||||
} else {
|
||||
// Render template scenarios if requested
|
||||
if (opts.input_message != input_message_type::NONE &&
|
||||
(opts.mode == output_mode::TEMPLATE || opts.mode == output_mode::BOTH)) {
|
||||
LOG_ERR("\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
LOG_ERR(" TEMPLATE RENDERING OUTPUT\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
|
||||
render_all_scenarios(chat_template, tools, opts.generation_prompt, opts.enable_reasoning,
|
||||
opts.input_message);
|
||||
}
|
||||
|
||||
// Output analysis if requested
|
||||
if (opts.mode == output_mode::ANALYSIS || opts.mode == output_mode::BOTH) {
|
||||
LOG_ERR("\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
LOG_ERR(" TEMPLATE ANALYSIS\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
|
||||
autoparser::autoparser analysis;
|
||||
analysis.analyze_template(chat_template);
|
||||
|
||||
// Generate Parser
|
||||
autoparser::generation_params params;
|
||||
params.messages = json::array({ build_user_message() });
|
||||
params.reasoning_format =
|
||||
opts.enable_reasoning ? COMMON_REASONING_FORMAT_DEEPSEEK : COMMON_REASONING_FORMAT_NONE;
|
||||
params.enable_thinking = opts.enable_reasoning;
|
||||
params.add_generation_prompt = opts.generation_prompt;
|
||||
|
||||
if (opts.with_tools) {
|
||||
params.tools = tools;
|
||||
params.tool_choice = opts.force_tool_call ? COMMON_CHAT_TOOL_CHOICE_REQUIRED : COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
} else {
|
||||
params.tools = json();
|
||||
params.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
render_all_scenarios(chat_template, tools, opts.generation_prompt, opts.enable_reasoning,
|
||||
opts.input_message);
|
||||
}
|
||||
params.parallel_tool_calls = false;
|
||||
|
||||
auto parser_data = autoparser::peg_generator::generate_parser(chat_template, params, analysis);
|
||||
// Output analysis if requested
|
||||
if (opts.mode == output_mode::ANALYSIS || opts.mode == output_mode::BOTH) {
|
||||
LOG_ERR("\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
LOG_ERR(" TEMPLATE ANALYSIS\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
|
||||
autoparser::autoparser analysis;
|
||||
analysis.analyze_template(chat_template);
|
||||
|
||||
// Generate Parser
|
||||
parser_data = autoparser::peg_generator::generate_parser(chat_template, params, analysis);
|
||||
}
|
||||
|
||||
LOG_ERR("\n=== Generated Parser ===\n");
|
||||
common_peg_arena arena;
|
||||
|
|
|
|||
|
|
@ -167,6 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
|||
| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
|
||||
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
|
||||
| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
|
||||
| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CLEAR_IDLE) |
|
||||
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
|
||||
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
|
||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||
|
|
@ -221,8 +222,8 @@ For the full list of features, please refer to [server's changelog](https://gith
|
|||
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
|
||||
| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
|
||||
| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
|
||||
| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
|
||||
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
|
||||
|
|
|
|||
|
|
@ -605,6 +605,17 @@ private:
|
|||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
void slot_save_and_clear(server_slot & slot) {
|
||||
if (slot.prompt.n_tokens() == 0) {
|
||||
return;
|
||||
}
|
||||
SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
|
||||
SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");
|
||||
slot.prompt_save(*prompt_cache);
|
||||
slot.prompt_clear(false);
|
||||
prompt_cache->update();
|
||||
}
|
||||
|
||||
void handle_sleeping_state(bool new_state) {
|
||||
GGML_ASSERT(sleeping != new_state);
|
||||
if (new_state) {
|
||||
|
|
@ -864,6 +875,19 @@ private:
|
|||
|
||||
metrics.init();
|
||||
|
||||
if (params_base.clear_idle) {
|
||||
if (!params_base.kv_unified) {
|
||||
SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
|
||||
params_base.clear_idle = false;
|
||||
} else if (params_base.cache_ram_mib == 0) {
|
||||
SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
|
||||
params_base.clear_idle = false;
|
||||
} else {
|
||||
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
|
||||
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
|
||||
}
|
||||
}
|
||||
|
||||
// populate webui settings
|
||||
{
|
||||
if (!params_base.webui_config_json.empty()) {
|
||||
|
|
@ -1010,15 +1034,15 @@ private:
|
|||
// cache prompts only for completion tasks
|
||||
update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
|
||||
|
||||
// don't update the cache if the slot's context is empty
|
||||
update_cache = update_cache && tokens.size() > 0;
|
||||
|
||||
if (update_cache) {
|
||||
SRV_WRN("%s", "updating prompt cache\n");
|
||||
|
||||
const int64_t t_start = ggml_time_us();
|
||||
|
||||
ret->prompt_save(*prompt_cache);
|
||||
// don't save the slot's state if its context is empty
|
||||
if (tokens.size() > 0) {
|
||||
ret->prompt_save(*prompt_cache);
|
||||
}
|
||||
|
||||
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
|
||||
ret->prompt_clear(false);
|
||||
|
|
@ -1692,9 +1716,7 @@ private:
|
|||
const int id_slot = task.id_slot;
|
||||
const int id_task = task.id;
|
||||
|
||||
server_slot * slot = id_slot != -1
|
||||
? get_slot_by_id(id_slot)
|
||||
: get_available_slot(task);
|
||||
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
|
||||
|
||||
//
|
||||
// slot scheduling logic
|
||||
|
|
@ -1731,6 +1753,14 @@ private:
|
|||
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
|
||||
break; // drop the task
|
||||
}
|
||||
|
||||
if (params_base.clear_idle) {
|
||||
for (auto & s : slots) {
|
||||
if (!s.is_processing()) {
|
||||
slot_save_and_clear(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case SERVER_TASK_TYPE_CANCEL:
|
||||
{
|
||||
|
|
|
|||
|
|
@ -2008,7 +2008,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t
|
|||
bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) {
|
||||
const int lcp_best = prompt.tokens.get_common_prefix(tokens_new);
|
||||
|
||||
float f_keep_best = float(lcp_best) / prompt.tokens.size();
|
||||
float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins
|
||||
float sim_best = float(lcp_best) / tokens_new.size();
|
||||
|
||||
SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,115 @@
|
|||
import os
|
||||
import tempfile
|
||||
import pytest
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
class LogReader:
|
||||
def __init__(self, path):
|
||||
self.path = path
|
||||
self.pos = 0
|
||||
def drain(self):
|
||||
with open(self.path) as f:
|
||||
f.seek(self.pos)
|
||||
content = f.read()
|
||||
self.pos = f.tell()
|
||||
return content
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
server.n_slots = 2
|
||||
server.n_predict = 4
|
||||
server.temperature = 0.0
|
||||
server.server_slots = True
|
||||
server.cache_ram = 100
|
||||
server.kv_unified = True
|
||||
server.debug = True
|
||||
fd, server.log_path = tempfile.mkstemp(suffix='.log')
|
||||
os.close(fd)
|
||||
yield
|
||||
|
||||
|
||||
LONG_PROMPT = (
|
||||
"Once upon a time in a land far away, there lived a brave knight "
|
||||
"who traveled across mountains and rivers to find the legendary "
|
||||
"golden sword hidden deep within the enchanted forest of whispers. "
|
||||
"He met many creatures along the way including dragons and fairies "
|
||||
"and wizards who helped him on his noble quest to save the kingdom."
|
||||
)
|
||||
|
||||
|
||||
# idle slot cleared on launch should restore from cache-ram
|
||||
def test_clear_and_restore():
|
||||
global server
|
||||
server.start()
|
||||
log = LogReader(server.log_path)
|
||||
|
||||
# verify feature is enabled
|
||||
assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain()
|
||||
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": LONG_PROMPT,
|
||||
"id_slot": 0,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
original_prompt_n = res.body["timings"]["prompt_n"]
|
||||
|
||||
# Slot 0 is the only slot with KV — should NOT be cleared
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
|
||||
|
||||
# Launching slot 1 clears idle slot 0
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "The quick brown fox",
|
||||
"id_slot": 1,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in log.drain()
|
||||
|
||||
# Re-send same prompt — should restore from cache-ram
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": LONG_PROMPT,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "updating prompt cache" in log.drain()
|
||||
assert res.body["timings"]["cache_n"] > 0
|
||||
assert res.body["timings"]["prompt_n"] < original_prompt_n
|
||||
|
||||
# Follow-up — slot 0 kept its KV, no clearing needed
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": LONG_PROMPT + " The knight finally reached the castle gates.",
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
|
||||
|
||||
|
||||
def test_disabled_with_flag():
|
||||
global server
|
||||
server.no_clear_idle = True
|
||||
server.start()
|
||||
log = LogReader(server.log_path)
|
||||
|
||||
# Feature should not be enabled
|
||||
assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain()
|
||||
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": LONG_PROMPT,
|
||||
"id_slot": 0,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
|
||||
# Request on different slot — should NOT trigger clearing
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "The quick brown fox",
|
||||
"id_slot": 1,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
|
||||
|
|
@ -70,7 +70,7 @@ def test_v1_models_supports_multimodal_capability():
|
|||
("What is this:\n", "malformed", False, None),
|
||||
("What is this:\n", "https://google.com/404", False, None), # non-existent image
|
||||
("What is this:\n", "https://ggml.ai", False, None), # non-image data
|
||||
# TODO @ngxson : test with multiple images, no images and with audio
|
||||
# TODO @ngxson : test with audio (needs a model that supports audio input)
|
||||
]
|
||||
)
|
||||
def test_vision_chat_completion(prompt, image_url, success, re_content):
|
||||
|
|
@ -97,6 +97,69 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
|
|||
assert res.status_code != 200
|
||||
|
||||
|
||||
def test_vision_chat_completion_multiple_images():
|
||||
"""Test sending multiple images in a single chat completion request."""
|
||||
global server
|
||||
server.n_ctx = 2048
|
||||
server.n_slots = 1
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"messages": [
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": "What are these:\n"},
|
||||
{"type": "image_url", "image_url": {
|
||||
"url": get_img_url("IMG_URL_0"),
|
||||
}},
|
||||
{"type": "image_url", "image_url": {
|
||||
"url": get_img_url("IMG_URL_1"),
|
||||
}},
|
||||
]},
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
choice = res.body["choices"][0]
|
||||
assert "assistant" == choice["message"]["role"]
|
||||
assert len(choice["message"]["content"]) > 0
|
||||
|
||||
|
||||
def test_vision_chat_completion_no_image():
|
||||
"""Test sending a text-only message to a multimodal model (no images)."""
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"messages": [
|
||||
{"role": "user", "content": "Hello, how are you?"},
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
choice = res.body["choices"][0]
|
||||
assert "assistant" == choice["message"]["role"]
|
||||
assert len(choice["message"]["content"]) > 0
|
||||
|
||||
|
||||
def test_vision_chat_completion_no_image_content_parts():
|
||||
"""Test sending content parts with only text (no image_url parts)."""
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"messages": [
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": "Hello, how are you?"},
|
||||
]},
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
choice = res.body["choices"][0]
|
||||
assert "assistant" == choice["message"]["role"]
|
||||
assert len(choice["message"]["content"]) > 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"prompt, image_data, success, re_content",
|
||||
[
|
||||
|
|
|
|||
|
|
@ -102,6 +102,9 @@ class ServerProcess:
|
|||
mmproj_url: str | None = None
|
||||
media_path: str | None = None
|
||||
sleep_idle_seconds: int | None = None
|
||||
cache_ram: int | None = None
|
||||
no_clear_idle: bool = False
|
||||
log_path: str | None = None
|
||||
webui_mcp_proxy: bool = False
|
||||
|
||||
# session variables
|
||||
|
|
@ -237,6 +240,10 @@ class ServerProcess:
|
|||
server_args.extend(["--media-path", self.media_path])
|
||||
if self.sleep_idle_seconds is not None:
|
||||
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
|
||||
if self.cache_ram is not None:
|
||||
server_args.extend(["--cache-ram", self.cache_ram])
|
||||
if self.no_clear_idle:
|
||||
server_args.append("--no-clear-idle")
|
||||
if self.webui_mcp_proxy:
|
||||
server_args.append("--webui-mcp-proxy")
|
||||
|
||||
|
|
@ -249,11 +256,16 @@ class ServerProcess:
|
|||
flags |= subprocess.CREATE_NEW_PROCESS_GROUP
|
||||
flags |= subprocess.CREATE_NO_WINDOW
|
||||
|
||||
if self.log_path:
|
||||
self._log = open(self.log_path, "w")
|
||||
else:
|
||||
self._log = sys.stdout
|
||||
|
||||
self.process = subprocess.Popen(
|
||||
[str(arg) for arg in [server_path, *server_args]],
|
||||
creationflags=flags,
|
||||
stdout=sys.stdout,
|
||||
stderr=sys.stdout,
|
||||
stdout=self._log,
|
||||
stderr=self._log if self._log != sys.stdout else sys.stdout,
|
||||
env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None,
|
||||
)
|
||||
server_instances.add(self)
|
||||
|
|
@ -298,6 +310,8 @@ class ServerProcess:
|
|||
except Exception as e:
|
||||
print(f"Error waiting for server: {e}")
|
||||
self.process = None
|
||||
if hasattr(self, '_log') and self._log != sys.stdout:
|
||||
self._log.close()
|
||||
|
||||
def make_request(
|
||||
self,
|
||||
|
|
|
|||
Loading…
Reference in New Issue