Compare commits
122 Commits
| Author | SHA1 | Date |
|---|---|---|
|
|
9e2e2198b0 | |
|
|
88915cb55c | |
|
|
ebbf544ed1 | |
|
|
b91d7dfe5b | |
|
|
ae40cd27c8 | |
|
|
ceef6b5233 | |
|
|
07c6a59b4f | |
|
|
8b7d340b6f | |
|
|
559646472d | |
|
|
cf45437d35 | |
|
|
9cd4ebcfb1 | |
|
|
89d0aec042 | |
|
|
b9da4444df | |
|
|
617db241aa | |
|
|
1a3d8edbba | |
|
|
6b10a82c00 | |
|
|
d23355afc3 | |
|
|
b30a5fdf37 | |
|
|
b4768955c4 | |
|
|
fc350fdf96 | |
|
|
3a6f059909 | |
|
|
609ea50026 | |
|
|
9f774e45ee | |
|
|
94d0262277 | |
|
|
a93c0ef0fa | |
|
|
710878a7dd | |
|
|
0685848bc6 | |
|
|
0024a69b70 | |
|
|
d0b79aaa2f | |
|
|
f2c0dfb739 | |
|
|
9789c4ecdc | |
|
|
77e20cc107 | |
|
|
5a32a9b8a5 | |
|
|
3b439504ba | |
|
|
463b6a963c | |
|
|
e30f1fdf74 | |
|
|
1430c35948 | |
|
|
f17b3be63f | |
|
|
d7ba99c485 | |
|
|
fbaa95bc29 | |
|
|
b5e1212063 | |
|
|
8f974d2392 | |
|
|
2948e6049a | |
|
|
73c9eb8ced | |
|
|
983df142a9 | |
|
|
57819b8d4b | |
|
|
557fe2d913 | |
|
|
0e810413bb | |
|
|
128142fe7d | |
|
|
6de1bc631d | |
|
|
0a10c34dc1 | |
|
|
deee23863b | |
|
|
c3e3f9e533 | |
|
|
40c550d4f6 | |
|
|
de190154c8 | |
|
|
05039967da | |
|
|
e4cff0956b | |
|
|
4cc6eb158c | |
|
|
246ffc4b05 | |
|
|
aa429cf507 | |
|
|
5866e3bbc8 | |
|
|
0516e04bf9 | |
|
|
3d9ab225e7 | |
|
|
d63aa398de | |
|
|
a8304b4d27 | |
|
|
fdb17643d3 | |
|
|
1eea6a2968 | |
|
|
4a748b8f15 | |
|
|
f2ab047f27 | |
|
|
d28961d81e | |
|
|
f90bd1dd84 | |
|
|
5eae9cb1d9 | |
|
|
3ca19b0e9f | |
|
|
eaf1d7930c | |
|
|
76ea1c1c46 | |
|
|
bd1ec818e9 | |
|
|
b541241104 | |
|
|
c363256839 | |
|
|
ecac98ee53 | |
|
|
182acfe5c5 | |
|
|
b5fe4559ae | |
|
|
acb7c79069 | |
|
|
5f91b1d5d5 | |
|
|
9ef7523ee9 | |
|
|
00de615345 | |
|
|
e1a399992b | |
|
|
4f2f0a163d | |
|
|
0cec84f999 | |
|
|
b2e1427c9b | |
|
|
4d99d45084 | |
|
|
10e5b148b0 | |
|
|
90b2731894 | |
|
|
aa2d278a11 | |
|
|
6c770d16ca | |
|
|
8d880ac012 | |
|
|
0f1e9d14cc | |
|
|
1274fbee9e | |
|
|
a7b3dee7a5 | |
|
|
ec947d2b16 | |
|
|
0cd4f4720b | |
|
|
af237f3026 | |
|
|
1a5631beaa | |
|
|
1dab5f5a44 | |
|
|
c96f608d98 | |
|
|
0842b9b465 | |
|
|
59db9a357d | |
|
|
23fbfcb1ad | |
|
|
e22cd0aa15 | |
|
|
96cfc4992c | |
|
|
ed0007aa32 | |
|
|
344ee2a38a | |
|
|
d6e1556499 | |
|
|
f76565db92 | |
|
|
43e1cbd6c1 | |
|
|
107d599952 | |
|
|
e8bbc736cb | |
|
|
b518195101 | |
|
|
e2763a6723 | |
|
|
0beb8db3a0 | |
|
|
b2f460bd3c | |
|
|
5f4cdac385 | |
|
|
ae87863dc1 |
|
|
@ -0,0 +1,138 @@
|
||||||
|
ARG OPENVINO_VERSION_MAJOR=2026.0
|
||||||
|
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
|
||||||
|
ARG UBUNTU_VERSION=24.04
|
||||||
|
|
||||||
|
# Optional proxy build arguments - empty by default
|
||||||
|
ARG http_proxy=
|
||||||
|
ARG https_proxy=
|
||||||
|
|
||||||
|
## Build Image
|
||||||
|
FROM ubuntu:${UBUNTU_VERSION} AS build
|
||||||
|
|
||||||
|
# Pass proxy args to build stage
|
||||||
|
ARG http_proxy
|
||||||
|
ARG https_proxy
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
ca-certificates \
|
||||||
|
gnupg \
|
||||||
|
wget \
|
||||||
|
git \
|
||||||
|
cmake \
|
||||||
|
ninja-build \
|
||||||
|
build-essential \
|
||||||
|
libtbb12 \
|
||||||
|
libssl-dev \
|
||||||
|
ocl-icd-opencl-dev \
|
||||||
|
opencl-headers \
|
||||||
|
opencl-clhpp-headers \
|
||||||
|
intel-opencl-icd && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install OpenVINO for Ubuntu 24.04
|
||||||
|
ARG OPENVINO_VERSION_MAJOR
|
||||||
|
ARG OPENVINO_VERSION_FULL
|
||||||
|
RUN mkdir -p /opt/intel && \
|
||||||
|
wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
|
||||||
|
tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
|
||||||
|
mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
|
||||||
|
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
|
||||||
|
echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
|
||||||
|
cd - && \
|
||||||
|
ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
|
||||||
|
|
||||||
|
ENV OpenVINO_DIR=/opt/intel/openvino
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Build Stage
|
||||||
|
RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
|
||||||
|
cmake -B build/ReleaseOV -G Ninja \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DGGML_OPENVINO=ON && \
|
||||||
|
cmake --build build/ReleaseOV -j$(nproc)"
|
||||||
|
|
||||||
|
# Copy all necessary libraries
|
||||||
|
RUN mkdir -p /app/lib && \
|
||||||
|
find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
|
||||||
|
find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
|
||||||
|
find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
|
||||||
|
|
||||||
|
# Create runtime directories and copy binaries
|
||||||
|
RUN mkdir -p /app/full \
|
||||||
|
&& cp build/ReleaseOV/bin/* /app/full/ \
|
||||||
|
&& cp *.py /app/full \
|
||||||
|
&& cp -r gguf-py /app/full \
|
||||||
|
&& cp -r requirements /app/full \
|
||||||
|
&& cp requirements.txt /app/full \
|
||||||
|
&& cp .devops/tools.sh /app/full/tools.sh
|
||||||
|
|
||||||
|
## Base Runtime Image
|
||||||
|
FROM ubuntu:${UBUNTU_VERSION} AS base
|
||||||
|
|
||||||
|
# Pass proxy args to runtime stage
|
||||||
|
ARG http_proxy
|
||||||
|
ARG https_proxy
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y libgomp1 libtbb12 curl\
|
||||||
|
&& apt autoremove -y \
|
||||||
|
&& apt clean -y \
|
||||||
|
&& rm -rf /tmp/* /var/tmp/* \
|
||||||
|
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||||
|
&& find /var/cache -type f -delete
|
||||||
|
|
||||||
|
COPY --from=build /app/lib/ /app/
|
||||||
|
|
||||||
|
### Full (all binaries)
|
||||||
|
FROM base AS full
|
||||||
|
|
||||||
|
ARG http_proxy
|
||||||
|
ARG https_proxy
|
||||||
|
|
||||||
|
COPY --from=build /app/full /app/
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
git \
|
||||||
|
python3 \
|
||||||
|
python3-venv \
|
||||||
|
python3-pip && \
|
||||||
|
python3 -m venv /ov-venv && \
|
||||||
|
/ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
|
||||||
|
/ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
|
||||||
|
apt-get autoremove -y && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /tmp/* /var/tmp/* && \
|
||||||
|
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
|
||||||
|
find /var/cache -type f -delete
|
||||||
|
|
||||||
|
ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
|
||||||
|
|
||||||
|
|
||||||
|
### Light, CLI only
|
||||||
|
FROM base AS light
|
||||||
|
|
||||||
|
COPY --from=build /app/full/llama-cli /app/
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/llama-cli" ]
|
||||||
|
|
||||||
|
### Server, Server only
|
||||||
|
FROM base AS server
|
||||||
|
|
||||||
|
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||||
|
|
||||||
|
COPY --from=build /app/full/llama-server /app/
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/llama-server" ]
|
||||||
|
|
@ -53,10 +53,11 @@ RUN apt-get update \
|
||||||
&& apt-get install -y \
|
&& apt-get install -y \
|
||||||
build-essential \
|
build-essential \
|
||||||
git \
|
git \
|
||||||
python3 \
|
python3.13 \
|
||||||
python3-dev \
|
python3.13-dev \
|
||||||
python3-pip \
|
python3-pip \
|
||||||
python3-wheel \
|
python3-wheel \
|
||||||
|
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.13 100 \
|
||||||
&& pip install --break-system-packages --upgrade setuptools \
|
&& pip install --break-system-packages --upgrade setuptools \
|
||||||
&& pip install --break-system-packages -r requirements.txt \
|
&& pip install --break-system-packages -r requirements.txt \
|
||||||
&& apt autoremove -y \
|
&& apt autoremove -y \
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
name: "Linux - Setup OpenVINO Toolkit"
|
||||||
|
description: "Setup OpenVINO Toolkit for Linux"
|
||||||
|
inputs:
|
||||||
|
path:
|
||||||
|
description: "Installation path"
|
||||||
|
required: true
|
||||||
|
version_major:
|
||||||
|
description: "OpenVINO major version (e.g., 2025.3)"
|
||||||
|
required: true
|
||||||
|
version_full:
|
||||||
|
description: "OpenVINO full version (e.g., 2025.3.0.19807.44526285f24)"
|
||||||
|
required: true
|
||||||
|
|
||||||
|
runs:
|
||||||
|
using: "composite"
|
||||||
|
steps:
|
||||||
|
- name: Setup OpenVINO Toolkit
|
||||||
|
id: setup
|
||||||
|
uses: ./.github/actions/unarchive-tar
|
||||||
|
with:
|
||||||
|
url: https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/linux/openvino_toolkit_ubuntu24_${{ inputs.version_full }}_x86_64.tgz
|
||||||
|
path: ${{ inputs.path }}
|
||||||
|
type: z
|
||||||
|
strip: 1
|
||||||
|
|
||||||
|
|
@ -0,0 +1,57 @@
|
||||||
|
name: CI (3rd-party)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows manual triggering
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-3rd-party.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/.cmake',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp'
|
||||||
|
]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
GGML_NLOOP: 3
|
||||||
|
GGML_N_THREADS: 1
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
ubuntu-24-llguidance:
|
||||||
|
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Dependencies
|
||||||
|
id: depends
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install build-essential libssl-dev
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
cmake -B build \
|
||||||
|
-DLLAMA_FATAL_WARNINGS=ON \
|
||||||
|
-DLLAMA_LLGUIDANCE=ON
|
||||||
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: cmake_test
|
||||||
|
run: |
|
||||||
|
cd build
|
||||||
|
ctest -L main --verbose --timeout 900
|
||||||
|
|
||||||
|
|
@ -0,0 +1,140 @@
|
||||||
|
name: CI (android)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows manual triggering
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-android.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/.cmake',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp'
|
||||||
|
]
|
||||||
|
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-android.yml',
|
||||||
|
'examples/llama.android/**'
|
||||||
|
]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
GGML_NLOOP: 3
|
||||||
|
GGML_N_THREADS: 1
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
android:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
# Disabled due to size (400MB) and always 0 cache hits
|
||||||
|
# - name: ccache
|
||||||
|
# uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
# with:
|
||||||
|
# key: android-build
|
||||||
|
# evict-old-files: 1d
|
||||||
|
|
||||||
|
- name: Set up JDK
|
||||||
|
uses: actions/setup-java@v5
|
||||||
|
with:
|
||||||
|
java-version: 17
|
||||||
|
distribution: zulu
|
||||||
|
|
||||||
|
- name: Setup Android SDK
|
||||||
|
uses: android-actions/setup-android@v3
|
||||||
|
with:
|
||||||
|
log-accepted-android-sdk-licenses: false
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
run: |
|
||||||
|
cd examples/llama.android
|
||||||
|
./gradlew build --no-daemon
|
||||||
|
|
||||||
|
android-ndk:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
env:
|
||||||
|
OPENCL_VERSION: 2025.07.22
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- build: 'arm64-cpu'
|
||||||
|
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
|
||||||
|
- build: 'arm64-snapdragon'
|
||||||
|
defines: '--preset arm64-android-snapdragon-release'
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Install OpenCL Headers and Libs
|
||||||
|
id: install_opencl
|
||||||
|
if: ${{ matrix.build == 'arm64-snapdragon' }}
|
||||||
|
run: |
|
||||||
|
mkdir opencl
|
||||||
|
curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
|
||||||
|
curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
|
||||||
|
curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
|
||||||
|
tar -xaf opencl/headers.tar.gz -C opencl
|
||||||
|
tar -xaf opencl/clhpp.tar.gz -C opencl
|
||||||
|
tar -xaf opencl/icd-loader.tar.gz -C opencl
|
||||||
|
sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
|
||||||
|
sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
|
||||||
|
cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
|
||||||
|
cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
|
||||||
|
cmake --build build
|
||||||
|
sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
|
||||||
|
rm -rf opencl
|
||||||
|
|
||||||
|
- name: Install Hexagon SDK
|
||||||
|
id: install_hexsdk
|
||||||
|
if: ${{ matrix.build == 'arm64-snapdragon' }}
|
||||||
|
env:
|
||||||
|
HEXSDK_VER: 6.4.0.2
|
||||||
|
HEXTLS_VER: 19.0.04
|
||||||
|
run: |
|
||||||
|
curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
|
||||||
|
mkdir hex-sdk
|
||||||
|
tar -xaf hex-sdk.tar.gz -C hex-sdk
|
||||||
|
ls -l hex-sdk
|
||||||
|
sudo mv hex-sdk /opt/hexagon
|
||||||
|
echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV"
|
||||||
|
echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV"
|
||||||
|
echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV"
|
||||||
|
echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV"
|
||||||
|
echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV"
|
||||||
|
echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV"
|
||||||
|
|
||||||
|
- name: Update CMake presets
|
||||||
|
id: update_presets
|
||||||
|
if: ${{ matrix.build == 'arm64-snapdragon' }}
|
||||||
|
run: |
|
||||||
|
cp docs/backend/snapdragon/CMakeUserPresets.json .
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: ndk_build
|
||||||
|
run: |
|
||||||
|
cmake ${{ matrix.defines }} -B build
|
||||||
|
cmake --build build
|
||||||
|
cmake --install build --prefix pkg-adb/llama.cpp
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: cmake_test
|
||||||
|
run: |
|
||||||
|
echo "FIXME: test on devices"
|
||||||
|
|
@ -0,0 +1,214 @@
|
||||||
|
name: CI (apple)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows manual triggering
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-apple.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/.cmake',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp',
|
||||||
|
'**/*.swift',
|
||||||
|
'**/*.m',
|
||||||
|
'**/*.metal'
|
||||||
|
]
|
||||||
|
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-apple.yml',
|
||||||
|
'ggml/src/ggml-metal/**'
|
||||||
|
]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
GGML_NLOOP: 3
|
||||||
|
GGML_N_THREADS: 1
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
macOS-latest-ios:
|
||||||
|
runs-on: macos-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: ccache
|
||||||
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
with:
|
||||||
|
key: macOS-latest-ios
|
||||||
|
evict-old-files: 1d
|
||||||
|
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
sysctl -a
|
||||||
|
cmake -B build -G Xcode \
|
||||||
|
-DGGML_METAL_USE_BF16=ON \
|
||||||
|
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||||
|
-DLLAMA_BUILD_COMMON=OFF \
|
||||||
|
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||||
|
-DLLAMA_BUILD_TOOLS=OFF \
|
||||||
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
|
-DLLAMA_BUILD_SERVER=OFF \
|
||||||
|
-DCMAKE_SYSTEM_NAME=iOS \
|
||||||
|
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||||
|
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||||
|
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||||
|
|
||||||
|
macos-latest-ios-xcode:
|
||||||
|
runs-on: macos-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Setup Xcode
|
||||||
|
uses: ggml-org/setup-xcode@v1
|
||||||
|
with:
|
||||||
|
xcode-version: latest-stable
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
sysctl -a
|
||||||
|
cmake -B build -G Xcode \
|
||||||
|
-DGGML_METAL_USE_BF16=ON \
|
||||||
|
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||||
|
-DLLAMA_OPENSSL=OFF \
|
||||||
|
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||||
|
-DLLAMA_BUILD_TOOLS=OFF \
|
||||||
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
|
-DLLAMA_BUILD_SERVER=OFF \
|
||||||
|
-DCMAKE_SYSTEM_NAME=iOS \
|
||||||
|
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||||
|
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||||
|
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||||
|
|
||||||
|
- name: xcodebuild for swift package
|
||||||
|
id: xcodebuild
|
||||||
|
run: |
|
||||||
|
./build-xcframework.sh
|
||||||
|
|
||||||
|
- name: Upload xcframework artifact
|
||||||
|
uses: actions/upload-artifact@v6
|
||||||
|
with:
|
||||||
|
name: llama-xcframework
|
||||||
|
path: build-apple/llama.xcframework/
|
||||||
|
retention-days: 1
|
||||||
|
|
||||||
|
- name: Build Xcode project
|
||||||
|
run: |
|
||||||
|
xcodebuild -downloadPlatform iOS
|
||||||
|
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||||
|
|
||||||
|
macOS-latest-tvos:
|
||||||
|
runs-on: macos-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: ccache
|
||||||
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
with:
|
||||||
|
key: macOS-latest-tvos
|
||||||
|
evict-old-files: 1d
|
||||||
|
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
sysctl -a
|
||||||
|
cmake -B build -G Xcode \
|
||||||
|
-DGGML_METAL_USE_BF16=ON \
|
||||||
|
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||||
|
-DLLAMA_BUILD_COMMON=OFF \
|
||||||
|
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||||
|
-DLLAMA_BUILD_TOOLS=OFF \
|
||||||
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
|
-DLLAMA_BUILD_SERVER=OFF \
|
||||||
|
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||||
|
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||||
|
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||||
|
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||||
|
|
||||||
|
macOS-latest-visionos:
|
||||||
|
runs-on: macos-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
sysctl -a
|
||||||
|
cmake -B build -G Xcode \
|
||||||
|
-DGGML_METAL_USE_BF16=ON \
|
||||||
|
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||||
|
-DLLAMA_BUILD_COMMON=OFF \
|
||||||
|
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||||
|
-DLLAMA_BUILD_TOOLS=OFF \
|
||||||
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
|
-DLLAMA_BUILD_SERVER=OFF \
|
||||||
|
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||||
|
-DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
|
||||||
|
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||||
|
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||||
|
|
||||||
|
macOS-latest-swift:
|
||||||
|
runs-on: macos-latest
|
||||||
|
needs: macos-latest-ios-xcode
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: ccache
|
||||||
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
with:
|
||||||
|
key: macOS-latest-swift
|
||||||
|
evict-old-files: 1d
|
||||||
|
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||||
|
|
||||||
|
- name: Download xcframework artifact
|
||||||
|
uses: actions/download-artifact@v7
|
||||||
|
with:
|
||||||
|
name: llama-xcframework
|
||||||
|
path: build-apple/llama.xcframework/
|
||||||
|
|
||||||
|
- name: Build llama.cpp with CMake
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
sysctl -a
|
||||||
|
cmake -B build -G Xcode \
|
||||||
|
-DGGML_METAL_USE_BF16=ON \
|
||||||
|
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||||
|
-DLLAMA_OPENSSL=OFF \
|
||||||
|
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||||
|
-DLLAMA_BUILD_TOOLS=OFF \
|
||||||
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
|
-DLLAMA_BUILD_SERVER=OFF \
|
||||||
|
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
|
||||||
|
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
@ -37,12 +37,39 @@ jobs:
|
||||||
path: ./vulkan_sdk
|
path: ./vulkan_sdk
|
||||||
version: ${{ env.VULKAN_SDK_VERSION }}
|
version: ${{ env.VULKAN_SDK_VERSION }}
|
||||||
|
|
||||||
ubuntu-24-spacemit-cache:
|
#ubuntu-24-spacemit-cache:
|
||||||
|
# runs-on: ubuntu-24.04
|
||||||
|
|
||||||
|
# env:
|
||||||
|
# # Make sure this is in sync with build-linux-cross.yml
|
||||||
|
# SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
|
||||||
|
|
||||||
|
# steps:
|
||||||
|
# - name: Clone
|
||||||
|
# id: checkout
|
||||||
|
# uses: actions/checkout@v6
|
||||||
|
|
||||||
|
# - name: Setup Cache
|
||||||
|
# uses: actions/cache@v5
|
||||||
|
# id: cache-toolchain
|
||||||
|
# with:
|
||||||
|
# path: ./spacemit_toolchain
|
||||||
|
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||||
|
|
||||||
|
# - name: Setup SpacemiT Toolchain
|
||||||
|
# if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||||
|
# uses: ./.github/actions/linux-setup-spacemit
|
||||||
|
# with:
|
||||||
|
# path: ./spacemit_toolchain
|
||||||
|
# version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
|
||||||
|
|
||||||
|
ubuntu-24-openvino-cache:
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ubuntu-24.04
|
||||||
|
|
||||||
env:
|
env:
|
||||||
# Make sure this is in sync with build-linux-cross.yml
|
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||||
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
|
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||||
|
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
|
|
@ -51,17 +78,18 @@ jobs:
|
||||||
|
|
||||||
- name: Setup Cache
|
- name: Setup Cache
|
||||||
uses: actions/cache@v5
|
uses: actions/cache@v5
|
||||||
id: cache-toolchain
|
id: cache-openvino
|
||||||
with:
|
with:
|
||||||
path: ./spacemit_toolchain
|
path: ./openvino_toolkit
|
||||||
key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||||
|
|
||||||
- name: Setup SpacemiT Toolchain
|
- name: Setup OpenVINO Toolkit
|
||||||
if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||||
uses: ./.github/actions/linux-setup-spacemit
|
uses: ./.github/actions/linux-setup-openvino
|
||||||
with:
|
with:
|
||||||
path: ./spacemit_toolchain
|
path: ./openvino_toolkit
|
||||||
version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
|
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
|
||||||
|
version_full: ${{ env.OPENVINO_VERSION_FULL }}
|
||||||
|
|
||||||
windows-2022-rocm-cache:
|
windows-2022-rocm-cache:
|
||||||
runs-on: windows-2022
|
runs-on: windows-2022
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,102 @@
|
||||||
|
name: CI (cann)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows manual triggering
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-cann.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/.cmake',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp'
|
||||||
|
]
|
||||||
|
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-cann.yml',
|
||||||
|
'ggml/src/ggml-cann/**'
|
||||||
|
]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
GGML_NLOOP: 3
|
||||||
|
GGML_N_THREADS: 1
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
openEuler-latest-cann:
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
shell: bash -el {0}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
arch: [x86, aarch64]
|
||||||
|
chip_type: ['910b', '310p']
|
||||||
|
build: ['Release']
|
||||||
|
use_acl_graph: ['on', 'off']
|
||||||
|
exclude:
|
||||||
|
# 310P does not support USE_ACL_GRAPH=on
|
||||||
|
- chip_type: '310p'
|
||||||
|
use_acl_graph: 'on'
|
||||||
|
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Free up disk space
|
||||||
|
uses: ggml-org/free-disk-space@v1.3.1
|
||||||
|
with:
|
||||||
|
tool-cache: true
|
||||||
|
|
||||||
|
- name: Set container image
|
||||||
|
id: cann-image
|
||||||
|
run: |
|
||||||
|
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
|
||||||
|
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||||
|
|
||||||
|
- name: Pull container image
|
||||||
|
run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
env:
|
||||||
|
BUILD_TYPE: ${{ matrix.build }}
|
||||||
|
SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||||
|
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||||
|
run: |
|
||||||
|
HOST_UID=$(id -u)
|
||||||
|
HOST_GID=$(id -g)
|
||||||
|
|
||||||
|
docker run --rm \
|
||||||
|
-v "${PWD}:/workspace" \
|
||||||
|
-w /workspace \
|
||||||
|
-e SOC_TYPE=${SOC_TYPE} \
|
||||||
|
-e BUILD_TYPE=${BUILD_TYPE} \
|
||||||
|
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||||
|
"${{ steps.cann-image.outputs.image }}" \
|
||||||
|
bash -lc '
|
||||||
|
set -e
|
||||||
|
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||||
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
git config --global --add safe.directory "/workspace"
|
||||||
|
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||||
|
cmake -S . -B build \
|
||||||
|
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||||
|
-DGGML_CANN=on \
|
||||||
|
-DSOC_TYPE=${SOC_TYPE} \
|
||||||
|
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||||
|
cmake --build build -j $(nproc)
|
||||||
|
|
||||||
|
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||||
|
'
|
||||||
|
|
@ -5,7 +5,7 @@ on:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
linux:
|
linux:
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ubuntu-slim
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
|
|
@ -14,7 +14,7 @@ jobs:
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
sudo apt update
|
sudo apt update
|
||||||
sudo apt install -y build-essential tcl
|
sudo apt install -y build-essential tcl cmake
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
run: |
|
run: |
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,24 @@
|
||||||
name: Build on Linux using cross-compiler
|
name: CI (cross)
|
||||||
on:
|
on:
|
||||||
|
# only manual triggers due to low-importance of the workflows
|
||||||
|
# TODO: for regular runs, provision dedicated self-hosted runners
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
workflow_call:
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-cross.yml',
|
||||||
|
'ggml/src/spacemit/*',
|
||||||
|
'ggml/src/arch/loongarch/*'
|
||||||
|
]
|
||||||
|
# run once every week
|
||||||
|
schedule:
|
||||||
|
- cron: '0 0 * * 0'
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
# ubuntu-24-riscv64-cpu-cross:
|
# ubuntu-24-riscv64-cpu-cross:
|
||||||
|
|
@ -142,7 +159,7 @@ jobs:
|
||||||
# cmake --build build --config Release -j $(nproc)
|
# cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
debian-13-loongarch64-cpu-cross:
|
debian-13-loongarch64-cpu-cross:
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||||
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
|
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
|
@ -197,7 +214,7 @@ jobs:
|
||||||
cmake --build build --config Release -j $(nproc)
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
debian-13-loongarch64-vulkan-cross:
|
debian-13-loongarch64-vulkan-cross:
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||||
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
|
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
|
@ -264,15 +281,15 @@ jobs:
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v6
|
||||||
|
|
||||||
- name: Use SpacemiT Toolchain Cache
|
#- name: Use SpacemiT Toolchain Cache
|
||||||
uses: actions/cache@v5
|
# uses: actions/cache@v5
|
||||||
id: cache-toolchain
|
# id: cache-toolchain
|
||||||
with:
|
# with:
|
||||||
path: ./spacemit_toolchain
|
# path: ./spacemit_toolchain
|
||||||
key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||||
|
|
||||||
- name: Setup SpacemiT Toolchain
|
- name: Setup SpacemiT Toolchain
|
||||||
if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
#if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||||
uses: ./.github/actions/linux-setup-spacemit
|
uses: ./.github/actions/linux-setup-spacemit
|
||||||
with:
|
with:
|
||||||
path: ./spacemit_toolchain
|
path: ./spacemit_toolchain
|
||||||
|
|
@ -0,0 +1,72 @@
|
||||||
|
name: CI (msys)
|
||||||
|
|
||||||
|
on:
|
||||||
|
# only manual triggers due to low-importance of the workflows
|
||||||
|
# TODO: for regular runs, provision dedicated self-hosted runners
|
||||||
|
workflow_dispatch:
|
||||||
|
# run once every week
|
||||||
|
schedule:
|
||||||
|
- cron: '0 0 * * 0'
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
GGML_NLOOP: 3
|
||||||
|
GGML_N_THREADS: 1
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
windows-msys2:
|
||||||
|
runs-on: windows-2025
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- { sys: UCRT64, env: ucrt-x86_64, build: Release }
|
||||||
|
- { sys: CLANG64, env: clang-x86_64, build: Release }
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
#- name: ccache
|
||||||
|
# uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
# with:
|
||||||
|
# key: windows-msys2
|
||||||
|
# variant: ccache
|
||||||
|
# evict-old-files: 1d
|
||||||
|
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||||
|
|
||||||
|
- name: Setup ${{ matrix.sys }}
|
||||||
|
uses: msys2/setup-msys2@v2
|
||||||
|
with:
|
||||||
|
update: true
|
||||||
|
msystem: ${{matrix.sys}}
|
||||||
|
install: >-
|
||||||
|
base-devel
|
||||||
|
git
|
||||||
|
mingw-w64-${{matrix.env}}-toolchain
|
||||||
|
mingw-w64-${{matrix.env}}-cmake
|
||||||
|
mingw-w64-${{matrix.env}}-openblas
|
||||||
|
|
||||||
|
- name: Build using CMake
|
||||||
|
shell: msys2 {0}
|
||||||
|
run: |
|
||||||
|
cmake -B build
|
||||||
|
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||||
|
|
||||||
|
- name: Clean after building using CMake
|
||||||
|
shell: msys2 {0}
|
||||||
|
run: |
|
||||||
|
rm -rf build
|
||||||
|
|
||||||
|
- name: Build using CMake w/ OpenBLAS
|
||||||
|
shell: msys2 {0}
|
||||||
|
run: |
|
||||||
|
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||||
|
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||||
|
|
@ -0,0 +1,136 @@
|
||||||
|
name: CI (riscv)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows manual triggering
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-riscv.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/.cmake',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp'
|
||||||
|
]
|
||||||
|
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-riscv.yml',
|
||||||
|
'ggml/src/ggml-cpu/arch/riscv/**'
|
||||||
|
]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
GGML_NLOOP: 3
|
||||||
|
GGML_N_THREADS: 1
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
ubuntu-riscv64-native-sanitizer:
|
||||||
|
runs-on: RISCV64
|
||||||
|
|
||||||
|
continue-on-error: true
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||||
|
build_type: [Debug]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
|
||||||
|
# Install necessary packages
|
||||||
|
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs
|
||||||
|
|
||||||
|
# Set gcc-14 and g++-14 as the default compilers
|
||||||
|
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||||
|
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||||
|
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
|
||||||
|
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
|
||||||
|
|
||||||
|
# Install Rust stable version
|
||||||
|
rustup install stable
|
||||||
|
rustup default stable
|
||||||
|
|
||||||
|
git lfs install
|
||||||
|
|
||||||
|
- name: GCC version check
|
||||||
|
run: |
|
||||||
|
gcc --version
|
||||||
|
g++ --version
|
||||||
|
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Setup ccache
|
||||||
|
run: |
|
||||||
|
# Unique cache directory per matrix combination
|
||||||
|
export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
|
||||||
|
mkdir -p "$CCACHE_DIR"
|
||||||
|
|
||||||
|
# Configure ccache
|
||||||
|
ccache --set-config=max_size=5G
|
||||||
|
ccache --set-config=compression=true
|
||||||
|
ccache --set-config=compression_level=6
|
||||||
|
ccache --set-config=cache_dir="$CCACHE_DIR"
|
||||||
|
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
|
||||||
|
ccache --set-config=hash_dir=false
|
||||||
|
|
||||||
|
# Export for subsequent steps
|
||||||
|
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
|
||||||
|
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||||
|
run: |
|
||||||
|
cmake -B build \
|
||||||
|
-DLLAMA_OPENSSL=OFF \
|
||||||
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
|
-DGGML_OPENMP=ON \
|
||||||
|
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||||
|
-DLLAMA_BUILD_TOOLS=ON \
|
||||||
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
|
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||||
|
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||||
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
|
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||||
|
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||||
|
|
||||||
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||||
|
|
||||||
|
- name: Build (no OpenMP)
|
||||||
|
id: cmake_build_no_openmp
|
||||||
|
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||||
|
run: |
|
||||||
|
cmake -B build \
|
||||||
|
-DLLAMA_OPENSSL=OFF \
|
||||||
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
|
-DGGML_OPENMP=OFF \
|
||||||
|
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||||
|
-DLLAMA_BUILD_TOOLS=ON \
|
||||||
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
|
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||||
|
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||||
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
|
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||||
|
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||||
|
|
||||||
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: cmake_test
|
||||||
|
run: |
|
||||||
|
cd build
|
||||||
|
ctest -L main --verbose --timeout 900
|
||||||
|
|
@ -0,0 +1,87 @@
|
||||||
|
name: CI (sanitize)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows manual triggering
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-sanitize.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/.cmake',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp'
|
||||||
|
]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
GGML_NLOOP: 3
|
||||||
|
GGML_N_THREADS: 1
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
ubuntu-latest-sanitizer:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
continue-on-error: true
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||||
|
build_type: [Debug]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: ccache
|
||||||
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
with:
|
||||||
|
key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
|
||||||
|
evict-old-files: 1d
|
||||||
|
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||||
|
|
||||||
|
- name: Dependencies
|
||||||
|
id: depends
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install build-essential libssl-dev
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||||
|
run: |
|
||||||
|
cmake -B build \
|
||||||
|
-DLLAMA_FATAL_WARNINGS=ON \
|
||||||
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
|
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||||
|
|
||||||
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||||
|
|
||||||
|
- name: Build (no OpenMP)
|
||||||
|
id: cmake_build_no_openmp
|
||||||
|
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||||
|
run: |
|
||||||
|
cmake -B build \
|
||||||
|
-DLLAMA_FATAL_WARNINGS=ON \
|
||||||
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
|
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
|
-DGGML_OPENMP=OFF
|
||||||
|
|
||||||
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: cmake_test
|
||||||
|
run: |
|
||||||
|
cd build
|
||||||
|
ctest -L main --verbose --timeout 900
|
||||||
|
|
@ -0,0 +1,242 @@
|
||||||
|
name: CI (self-hosted)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows manual triggering
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/.cmake',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp',
|
||||||
|
'**/*.cu',
|
||||||
|
'**/*.cuh',
|
||||||
|
'**/*.swift',
|
||||||
|
'**/*.m',
|
||||||
|
'**/*.metal',
|
||||||
|
'**/*.comp',
|
||||||
|
'**/*.glsl',
|
||||||
|
'**/*.wgsl'
|
||||||
|
]
|
||||||
|
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-self-hosted.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/.cmake',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp',
|
||||||
|
'**/*.cu',
|
||||||
|
'**/*.cuh',
|
||||||
|
'**/*.swift',
|
||||||
|
'**/*.m',
|
||||||
|
'**/*.metal',
|
||||||
|
'**/*.comp',
|
||||||
|
'**/*.glsl',
|
||||||
|
'**/*.wgsl'
|
||||||
|
]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
GGML_NLOOP: 3
|
||||||
|
GGML_N_THREADS: 1
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
ggml-ci-nvidia-cuda:
|
||||||
|
runs-on: [self-hosted, Linux, NVIDIA]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: ggml-ci
|
||||||
|
run: |
|
||||||
|
nvidia-smi
|
||||||
|
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||||
|
|
||||||
|
ggml-ci-nvidia-vulkan-cm:
|
||||||
|
runs-on: [self-hosted, Linux, NVIDIA]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: ggml-ci
|
||||||
|
run: |
|
||||||
|
vulkaninfo --summary
|
||||||
|
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||||
|
|
||||||
|
ggml-ci-nvidia-vulkan-cm2:
|
||||||
|
runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: ggml-ci
|
||||||
|
run: |
|
||||||
|
vulkaninfo --summary
|
||||||
|
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||||
|
|
||||||
|
ggml-ci-cpu-amx:
|
||||||
|
runs-on: [self-hosted, Linux, CPU, AMX]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: ggml-ci
|
||||||
|
run: |
|
||||||
|
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||||
|
|
||||||
|
# ggml-ci-amd-vulkan:
|
||||||
|
# runs-on: [self-hosted, Linux, AMD]
|
||||||
|
|
||||||
|
# steps:
|
||||||
|
# - name: Clone
|
||||||
|
# id: checkout
|
||||||
|
# uses: actions/checkout@v6
|
||||||
|
|
||||||
|
# - name: Test
|
||||||
|
# id: ggml-ci
|
||||||
|
# run: |
|
||||||
|
# vulkaninfo --summary
|
||||||
|
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||||
|
|
||||||
|
# ggml-ci-amd-rocm:
|
||||||
|
# runs-on: [self-hosted, Linux, AMD]
|
||||||
|
|
||||||
|
# steps:
|
||||||
|
# - name: Clone
|
||||||
|
# id: checkout
|
||||||
|
# uses: actions/checkout@v6
|
||||||
|
|
||||||
|
# - name: Test
|
||||||
|
# id: ggml-ci
|
||||||
|
# run: |
|
||||||
|
# amd-smi static
|
||||||
|
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||||
|
|
||||||
|
ggml-ci-mac-metal:
|
||||||
|
runs-on: [self-hosted, macOS, ARM64]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: ggml-ci
|
||||||
|
run: |
|
||||||
|
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||||
|
|
||||||
|
ggml-ci-mac-webgpu:
|
||||||
|
runs-on: [self-hosted, macOS, ARM64]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Dawn Dependency
|
||||||
|
id: dawn-depends
|
||||||
|
run: |
|
||||||
|
DAWN_VERSION="v2.0.0"
|
||||||
|
DAWN_OWNER="reeselevine"
|
||||||
|
DAWN_REPO="dawn"
|
||||||
|
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
|
||||||
|
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||||
|
curl -L -o artifact.zip \
|
||||||
|
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||||
|
mkdir dawn
|
||||||
|
unzip artifact.zip
|
||||||
|
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: ggml-ci
|
||||||
|
run: |
|
||||||
|
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||||
|
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||||
|
|
||||||
|
ggml-ci-mac-vulkan:
|
||||||
|
runs-on: [self-hosted, macOS, ARM64]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: ggml-ci
|
||||||
|
run: |
|
||||||
|
vulkaninfo --summary
|
||||||
|
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||||
|
|
||||||
|
ggml-ci-linux-intel-vulkan:
|
||||||
|
runs-on: [self-hosted, Linux, Intel]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
persist-credentials: false
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: ggml-ci
|
||||||
|
run: |
|
||||||
|
vulkaninfo --summary
|
||||||
|
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||||
|
|
||||||
|
ggml-ci-intel-openvino-gpu-low-perf:
|
||||||
|
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||||
|
|
||||||
|
env:
|
||||||
|
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||||
|
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||||
|
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Setup OpenVINO Toolkit
|
||||||
|
uses: ./.github/actions/linux-setup-openvino
|
||||||
|
with:
|
||||||
|
path: ./openvino_toolkit
|
||||||
|
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
|
||||||
|
version_full: ${{ env.OPENVINO_VERSION_FULL }}
|
||||||
|
|
||||||
|
- name: Install OpenVINO dependencies
|
||||||
|
run: |
|
||||||
|
cd ./openvino_toolkit
|
||||||
|
chmod +x ./install_dependencies/install_openvino_dependencies.sh
|
||||||
|
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: ggml-ci
|
||||||
|
run: |
|
||||||
|
source ./openvino_toolkit/setupvars.sh
|
||||||
|
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
|
|
@ -0,0 +1,96 @@
|
||||||
|
name: CI (vulkan)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows manual triggering
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-vulkan.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/.cmake',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp',
|
||||||
|
'**/*.comp',
|
||||||
|
'**/*.glsl'
|
||||||
|
]
|
||||||
|
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/build-vulkan.yml',
|
||||||
|
'ggml/src/ggml-vulkan/**'
|
||||||
|
]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
GGML_NLOOP: 3
|
||||||
|
GGML_N_THREADS: 1
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
ubuntu-24-vulkan-llvmpipe:
|
||||||
|
runs-on: ubuntu-24.04
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: ccache
|
||||||
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
with:
|
||||||
|
key: ubuntu-24-vulkan-llvmpipe
|
||||||
|
evict-old-files: 1d
|
||||||
|
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||||
|
|
||||||
|
- name: Dependencies
|
||||||
|
id: depends
|
||||||
|
run: |
|
||||||
|
sudo add-apt-repository -y ppa:kisak/kisak-mesa
|
||||||
|
sudo apt-get update -y
|
||||||
|
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
|
||||||
|
|
||||||
|
- name: Get latest Vulkan SDK version
|
||||||
|
id: vulkan_sdk_version
|
||||||
|
run: |
|
||||||
|
echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
|
||||||
|
|
||||||
|
- name: Use Vulkan SDK Cache
|
||||||
|
uses: actions/cache@v5
|
||||||
|
id: cache-sdk
|
||||||
|
with:
|
||||||
|
path: ./vulkan_sdk
|
||||||
|
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||||
|
|
||||||
|
- name: Setup Vulkan SDK
|
||||||
|
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||||
|
uses: ./.github/actions/linux-setup-vulkan-llvmpipe
|
||||||
|
with:
|
||||||
|
path: ./vulkan_sdk
|
||||||
|
version: ${{ env.VULKAN_SDK_VERSION }}
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
source ./vulkan_sdk/setup-env.sh
|
||||||
|
cmake -B build \
|
||||||
|
-DGGML_VULKAN=ON
|
||||||
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
id: cmake_test
|
||||||
|
run: |
|
||||||
|
cd build
|
||||||
|
export GGML_VK_VISIBLE_DEVICES=0
|
||||||
|
export GGML_VK_DISABLE_F16=1
|
||||||
|
export GGML_VK_DISABLE_COOPMAT=1
|
||||||
|
# This is using llvmpipe and runs slower than other backends
|
||||||
|
ctest -L main --verbose --timeout 4800
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -47,6 +47,7 @@ jobs:
|
||||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||||
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
|
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
|
||||||
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||||
|
- { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||||
steps:
|
steps:
|
||||||
- name: Check out the repo
|
- name: Check out the repo
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v6
|
||||||
|
|
|
||||||
|
|
@ -4,10 +4,16 @@ on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
paths: [
|
||||||
|
'.github/workflows/python-lint.yml',
|
||||||
|
'**/*.py'
|
||||||
|
]
|
||||||
pull_request:
|
pull_request:
|
||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
paths: [
|
||||||
|
'.github/workflows/python-lint.yml',
|
||||||
|
'**/*.py'
|
||||||
|
]
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,22 @@ on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
paths: [
|
||||||
|
'.github/workflows/release.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/.cmake',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp',
|
||||||
|
'**/*.cu',
|
||||||
|
'**/*.cuh',
|
||||||
|
'**/*.swift',
|
||||||
|
'**/*.m',
|
||||||
|
'**/*.metal',
|
||||||
|
'**/*.comp',
|
||||||
|
'**/*.glsl'
|
||||||
|
]
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
|
@ -34,7 +49,7 @@ jobs:
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: macOS-latest-cmake-arm64
|
key: macOS-latest-arm64
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
|
|
@ -81,7 +96,7 @@ jobs:
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: macOS-latest-cmake-x64
|
key: macOS-latest-x64
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
|
|
@ -140,7 +155,7 @@ jobs:
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-cpu-cmake-${{ matrix.build }}
|
key: ubuntu-cpu-${{ matrix.build }}
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
|
|
@ -191,7 +206,7 @@ jobs:
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-22-cmake-vulkan
|
key: ubuntu-22-vulkan
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
|
|
@ -231,6 +246,86 @@ jobs:
|
||||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
|
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
|
||||||
name: llama-bin-ubuntu-vulkan-x64.tar.gz
|
name: llama-bin-ubuntu-vulkan-x64.tar.gz
|
||||||
|
|
||||||
|
ubuntu-24-openvino:
|
||||||
|
runs-on: ubuntu-24.04
|
||||||
|
|
||||||
|
outputs:
|
||||||
|
openvino_version: ${{ steps.openvino_version.outputs.value }}
|
||||||
|
|
||||||
|
env:
|
||||||
|
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||||
|
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||||
|
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Set OpenVINO version output
|
||||||
|
id: openvino_version
|
||||||
|
run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: ccache
|
||||||
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
with:
|
||||||
|
key: ubuntu-24-openvino-release-no-preset-v1
|
||||||
|
evict-old-files: 1d
|
||||||
|
|
||||||
|
- name: Dependencies
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
|
||||||
|
sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
|
||||||
|
|
||||||
|
- name: Use OpenVINO Toolkit Cache
|
||||||
|
uses: actions/cache@v5
|
||||||
|
id: cache-openvino
|
||||||
|
with:
|
||||||
|
path: ./openvino_toolkit
|
||||||
|
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||||
|
|
||||||
|
- name: Setup OpenVINO Toolkit
|
||||||
|
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||||
|
uses: ./.github/actions/linux-setup-openvino
|
||||||
|
with:
|
||||||
|
path: ./openvino_toolkit
|
||||||
|
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
|
||||||
|
version_full: ${{ env.OPENVINO_VERSION_FULL }}
|
||||||
|
|
||||||
|
- name: Install OpenVINO dependencies
|
||||||
|
run: |
|
||||||
|
cd ./openvino_toolkit
|
||||||
|
chmod +x ./install_dependencies/install_openvino_dependencies.sh
|
||||||
|
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
source ./openvino_toolkit/setupvars.sh
|
||||||
|
cmake -B build/ReleaseOV -G Ninja \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DGGML_OPENVINO=ON
|
||||||
|
cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||||
|
|
||||||
|
- name: Determine tag name
|
||||||
|
id: tag
|
||||||
|
uses: ./.github/actions/get-tag-name
|
||||||
|
|
||||||
|
- name: Pack artifacts
|
||||||
|
id: pack_artifacts
|
||||||
|
run: |
|
||||||
|
cp LICENSE ./build/ReleaseOV/bin/
|
||||||
|
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/ReleaseOV/bin .
|
||||||
|
|
||||||
|
- name: Upload artifacts
|
||||||
|
uses: actions/upload-artifact@v6
|
||||||
|
with:
|
||||||
|
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
|
||||||
|
name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
|
||||||
|
|
||||||
windows-cpu:
|
windows-cpu:
|
||||||
runs-on: windows-2025
|
runs-on: windows-2025
|
||||||
|
|
||||||
|
|
@ -249,7 +344,7 @@ jobs:
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-latest-cmake-cpu-${{ matrix.arch }}
|
key: windows-latest-cpu-${{ matrix.arch }}
|
||||||
variant: ccache
|
variant: ccache
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
|
|
@ -310,7 +405,7 @@ jobs:
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
|
key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }}
|
||||||
variant: ccache
|
variant: ccache
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
|
|
@ -456,7 +551,7 @@ jobs:
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-latest-cmake-sycl
|
key: windows-latest-sycl
|
||||||
variant: ccache
|
variant: ccache
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
|
|
@ -536,7 +631,7 @@ jobs:
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
|
key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
|
|
@ -646,7 +741,7 @@ jobs:
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
|
key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
||||||
- name: Install ROCm
|
- name: Install ROCm
|
||||||
|
|
@ -872,7 +967,7 @@ jobs:
|
||||||
permissions:
|
permissions:
|
||||||
contents: write # for creating release
|
contents: write # for creating release
|
||||||
|
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-slim
|
||||||
|
|
||||||
needs:
|
needs:
|
||||||
- windows
|
- windows
|
||||||
|
|
@ -883,6 +978,7 @@ jobs:
|
||||||
- ubuntu-22-rocm
|
- ubuntu-22-rocm
|
||||||
- ubuntu-22-cpu
|
- ubuntu-22-cpu
|
||||||
- ubuntu-22-vulkan
|
- ubuntu-22-vulkan
|
||||||
|
- ubuntu-24-openvino
|
||||||
- macOS-arm64
|
- macOS-arm64
|
||||||
- macOS-x64
|
- macOS-x64
|
||||||
- ios-xcode-build
|
- ios-xcode-build
|
||||||
|
|
@ -967,6 +1063,7 @@ jobs:
|
||||||
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
|
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
|
||||||
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
||||||
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
|
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
|
||||||
|
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
|
||||||
|
|
||||||
**Windows:**
|
**Windows:**
|
||||||
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
|
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,105 @@
|
||||||
|
name: Server (sanitize)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows manual triggering
|
||||||
|
inputs:
|
||||||
|
sha:
|
||||||
|
description: 'Commit SHA1 to build'
|
||||||
|
required: false
|
||||||
|
type: string
|
||||||
|
slow_tests:
|
||||||
|
description: 'Run slow tests'
|
||||||
|
required: true
|
||||||
|
type: boolean
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: [
|
||||||
|
'.github/workflows/server-sanitize.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/Makefile',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp',
|
||||||
|
'tools/server/**.*'
|
||||||
|
]
|
||||||
|
|
||||||
|
env:
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
LLAMA_LOG_VERBOSITY: 10
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
server:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
|
||||||
|
build_type: [RelWithDebInfo]
|
||||||
|
fail-fast: false
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Dependencies
|
||||||
|
id: depends
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get -y install \
|
||||||
|
build-essential \
|
||||||
|
xxd \
|
||||||
|
git \
|
||||||
|
cmake \
|
||||||
|
curl \
|
||||||
|
wget \
|
||||||
|
language-pack-en \
|
||||||
|
libssl-dev
|
||||||
|
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
cmake -B build \
|
||||||
|
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||||
|
-DGGML_SCHED_NO_REALLOC=ON \
|
||||||
|
-DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
||||||
|
-DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
||||||
|
-DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
|
||||||
|
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
||||||
|
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
||||||
|
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
|
||||||
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||||
|
|
||||||
|
- name: Python setup
|
||||||
|
id: setup_python
|
||||||
|
uses: actions/setup-python@v6
|
||||||
|
with:
|
||||||
|
python-version: '3.11'
|
||||||
|
pip-install: -r tools/server/tests/requirements.txt
|
||||||
|
|
||||||
|
- name: Tests
|
||||||
|
id: server_integration_tests
|
||||||
|
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||||
|
run: |
|
||||||
|
cd tools/server/tests
|
||||||
|
export ${{ matrix.extra_args }}
|
||||||
|
pytest -v -x -m "not slow"
|
||||||
|
|
||||||
|
- name: Slow tests
|
||||||
|
id: server_integration_tests_slow
|
||||||
|
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||||
|
run: |
|
||||||
|
cd tools/server/tests
|
||||||
|
export ${{ matrix.extra_args }}
|
||||||
|
SLOW_TESTS=1 pytest -v -x
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
name: Server-Metal
|
name: Server (self-hosted)
|
||||||
|
|
||||||
on:
|
on:
|
||||||
workflow_dispatch: # allows manual triggering
|
workflow_dispatch: # allows manual triggering
|
||||||
|
|
@ -14,7 +14,19 @@ on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
|
paths: [
|
||||||
|
'.github/workflows/server-self-hosted.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/Makefile',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp',
|
||||||
|
'**/*.cu',
|
||||||
|
'**/*.swift',
|
||||||
|
'**/*.m',
|
||||||
|
'tools/server/**.*'
|
||||||
|
]
|
||||||
|
|
||||||
env:
|
env:
|
||||||
LLAMA_LOG_COLORS: 1
|
LLAMA_LOG_COLORS: 1
|
||||||
|
|
@ -28,7 +40,7 @@ concurrency:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
server-metal:
|
server-metal:
|
||||||
runs-on: [self-hosted, macOS, ARM64]
|
runs-on: [self-hosted, llama-server, macOS, ARM64]
|
||||||
|
|
||||||
name: server-metal (${{ matrix.wf_name }})
|
name: server-metal (${{ matrix.wf_name }})
|
||||||
strategy:
|
strategy:
|
||||||
|
|
@ -71,3 +83,42 @@ jobs:
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
export ${{ matrix.extra_args }}
|
export ${{ matrix.extra_args }}
|
||||||
pytest -v -x -m "not slow"
|
pytest -v -x -m "not slow"
|
||||||
|
|
||||||
|
server-cuda:
|
||||||
|
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||||
|
|
||||||
|
name: server-cuda (${{ matrix.wf_name }})
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
build_type: [Release]
|
||||||
|
wf_name: ["GPUx1"]
|
||||||
|
include:
|
||||||
|
- build_type: Release
|
||||||
|
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||||
|
wf_name: "GPUx1, backend-sampling"
|
||||||
|
fail-fast: false
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||||
|
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||||
|
|
||||||
|
- name: Tests
|
||||||
|
id: server_integration_tests
|
||||||
|
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||||
|
run: |
|
||||||
|
cd tools/server/tests
|
||||||
|
python3 -m venv venv
|
||||||
|
source venv/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
export ${{ matrix.extra_args }}
|
||||||
|
pytest -v -x -m "not slow"
|
||||||
|
|
@ -1,4 +1,3 @@
|
||||||
# Server WebUI build and tests
|
|
||||||
name: Server WebUI
|
name: Server WebUI
|
||||||
|
|
||||||
on:
|
on:
|
||||||
|
|
@ -11,10 +10,20 @@ on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
|
paths: [
|
||||||
|
'.github/workflows/server-webui.yml',
|
||||||
|
'tools/server/webui/**.*',
|
||||||
|
'tools/server/tests/**.*',
|
||||||
|
'tools/server/public/**'
|
||||||
|
]
|
||||||
pull_request:
|
pull_request:
|
||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
|
paths: [
|
||||||
|
'.github/workflows/server-webui.yml',
|
||||||
|
'tools/server/webui/**.*',
|
||||||
|
'tools/server/tests/**.*',
|
||||||
|
'tools/server/public/**'
|
||||||
|
]
|
||||||
|
|
||||||
env:
|
env:
|
||||||
LLAMA_LOG_COLORS: 1
|
LLAMA_LOG_COLORS: 1
|
||||||
|
|
@ -29,7 +38,7 @@ concurrency:
|
||||||
jobs:
|
jobs:
|
||||||
webui-check:
|
webui-check:
|
||||||
name: WebUI Checks
|
name: WebUI Checks
|
||||||
runs-on: ubuntu-latest
|
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||||
continue-on-error: true
|
continue-on-error: true
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,3 @@
|
||||||
# Server build and tests
|
|
||||||
name: Server
|
name: Server
|
||||||
|
|
||||||
on:
|
on:
|
||||||
|
|
@ -15,10 +14,34 @@ on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
|
paths: [
|
||||||
|
'.github/workflows/server.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/Makefile',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp',
|
||||||
|
'**/*.cu',
|
||||||
|
'**/*.swift',
|
||||||
|
'**/*.m',
|
||||||
|
'tools/server/**.*'
|
||||||
|
]
|
||||||
pull_request:
|
pull_request:
|
||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
|
paths: [
|
||||||
|
'.github/workflows/server.yml',
|
||||||
|
'**/CMakeLists.txt',
|
||||||
|
'**/Makefile',
|
||||||
|
'**/*.h',
|
||||||
|
'**/*.hpp',
|
||||||
|
'**/*.c',
|
||||||
|
'**/*.cpp',
|
||||||
|
'**/*.cu',
|
||||||
|
'**/*.swift',
|
||||||
|
'**/*.m',
|
||||||
|
'tools/server/**.*'
|
||||||
|
]
|
||||||
|
|
||||||
env:
|
env:
|
||||||
LLAMA_LOG_COLORS: 1
|
LLAMA_LOG_COLORS: 1
|
||||||
|
|
@ -34,17 +57,18 @@ jobs:
|
||||||
server:
|
server:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
name: server (${{ matrix.wf_name }})
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
|
build_type: [Release]
|
||||||
build_type: [RelWithDebInfo]
|
wf_name: ["default"]
|
||||||
include:
|
include:
|
||||||
- build_type: Release
|
- build_type: Release
|
||||||
sanitizer: ""
|
|
||||||
extra_args: ""
|
extra_args: ""
|
||||||
|
wf_name: "default"
|
||||||
- build_type: Release
|
- build_type: Release
|
||||||
sanitizer: ""
|
|
||||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||||
|
wf_name: "backend-sampling"
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
|
@ -74,13 +98,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||||
-DGGML_SCHED_NO_REALLOC=ON \
|
-DGGML_SCHED_NO_REALLOC=ON
|
||||||
-DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
|
||||||
-DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
|
||||||
-DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
|
|
||||||
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
|
||||||
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
|
||||||
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
|
|
||||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||||
|
|
||||||
- name: Python setup
|
- name: Python setup
|
||||||
|
|
|
||||||
|
|
@ -124,6 +124,11 @@ poetry.toml
|
||||||
# Scripts
|
# Scripts
|
||||||
!/scripts/install-oneapi.bat
|
!/scripts/install-oneapi.bat
|
||||||
|
|
||||||
|
# Generated by scripts
|
||||||
|
/hellaswag_val_full.txt
|
||||||
|
/winogrande-debiased-eval.csv
|
||||||
|
/wikitext-2-raw/
|
||||||
|
|
||||||
# Test models for lora adapters
|
# Test models for lora adapters
|
||||||
/lora-tests
|
/lora-tests
|
||||||
|
|
||||||
|
|
|
||||||
57
CODEOWNERS
57
CODEOWNERS
|
|
@ -2,29 +2,13 @@
|
||||||
# multiplie collaborators per item can be specified
|
# multiplie collaborators per item can be specified
|
||||||
|
|
||||||
/.devops/*.Dockerfile @ngxson
|
/.devops/*.Dockerfile @ngxson
|
||||||
/.github/actions/ @CISC
|
/.github/actions/ @ggml-org/ci
|
||||||
/.github/workflows/ @CISC
|
/.github/workflows/ @ggml-org/ci
|
||||||
/ci/ @ggerganov
|
/ci/ @ggerganov
|
||||||
/cmake/ @ggerganov
|
/cmake/ @ggerganov
|
||||||
/common/CMakeLists.txt @ggerganov
|
/common/ @ggml-org/llama-common
|
||||||
/common/arg.* @ggerganov
|
/common/jinja/ @CISC
|
||||||
/common/base64.hpp.* @ggerganov
|
|
||||||
/common/build-info.* @ggerganov
|
|
||||||
/common/chat.* @pwilkin
|
|
||||||
/common/chat-auto*.* @pwilkin
|
|
||||||
/common/chat-diff-analyzer.* @pwilkin
|
|
||||||
/common/chat-peg-parser.* @aldehir
|
|
||||||
/common/common.* @ggerganov
|
|
||||||
/common/console.* @ggerganov
|
|
||||||
/common/http.* @angt
|
|
||||||
/common/jinja/ @ngxson @CISC @aldehir
|
|
||||||
/common/llguidance.* @ggerganov
|
|
||||||
/common/log.* @ggerganov
|
|
||||||
/common/ngram-map.* @srogmann
|
/common/ngram-map.* @srogmann
|
||||||
/common/peg-parser.* @aldehir
|
|
||||||
/common/sampling.* @ggerganov
|
|
||||||
/common/speculative.* @ggerganov
|
|
||||||
/common/unicode.* @aldehir
|
|
||||||
/convert_*.py @CISC
|
/convert_*.py @CISC
|
||||||
/examples/batched.swift/ @ggerganov
|
/examples/batched.swift/ @ggerganov
|
||||||
/examples/batched/ @ggerganov
|
/examples/batched/ @ggerganov
|
||||||
|
|
@ -51,29 +35,28 @@
|
||||||
/examples/speculative/ @ggerganov
|
/examples/speculative/ @ggerganov
|
||||||
/ggml/cmake/ @ggerganov
|
/ggml/cmake/ @ggerganov
|
||||||
/ggml/include/ @ggerganov
|
/ggml/include/ @ggerganov
|
||||||
|
/ggml/src/ggml-cann/ @ggml-org/ggml-cann
|
||||||
/ggml/src/ggml-common.h @ggerganov
|
/ggml/src/ggml-common.h @ggerganov
|
||||||
/ggml/src/ggml-cpu/ @ggerganov
|
/ggml/src/ggml-cpu/ @ggerganov
|
||||||
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
|
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
|
||||||
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
|
/ggml/src/ggml-cuda/ @ggml-org/ggml-cuda
|
||||||
/ggml/src/ggml-cuda/mmf.* @JohannesGaessler @am17an
|
|
||||||
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-cuda/mmvf.* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
|
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
|
||||||
/ggml/src/ggml-hip/ @IMbackK
|
/ggml/src/ggml-hip/ @IMbackK
|
||||||
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
|
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
|
||||||
/ggml/src/ggml-impl.h @ggerganov
|
/ggml/src/ggml-impl.h @ggerganov
|
||||||
/ggml/src/ggml-metal/ @ggerganov
|
/ggml/src/ggml-metal/ @ggml-org/ggml-metal
|
||||||
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
|
/ggml/src/ggml-opencl/ @ggml-org/ggml-opencl
|
||||||
/ggml/src/ggml-hexagon/ @max-krasnyansky @lhez
|
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
|
||||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
||||||
/ggml/src/ggml-quants.* @ggerganov
|
/ggml/src/ggml-quants.* @ggerganov
|
||||||
/ggml/src/ggml-rpc/ @rgerganov
|
/ggml/src/ggml-rpc/ @ggml-org/ggml-rpc
|
||||||
|
/ggml/src/ggml-sycl/ @ggml-org/ggml-sycl
|
||||||
/ggml/src/ggml-threading.* @ggerganov
|
/ggml/src/ggml-threading.* @ggerganov
|
||||||
/ggml/src/ggml-vulkan/ @0cc4m
|
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
|
||||||
/ggml/src/ggml-virtgpu/ @kpouget
|
/ggml/src/ggml-virtgpu/ @kpouget
|
||||||
/ggml/src/ggml-webgpu/ @reeselevine
|
/ggml/src/ggml-webgpu/ @ggml-org/ggml-webgpu
|
||||||
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
|
/ggml/src/ggml-zdnn/ @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
|
||||||
|
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
|
||||||
/ggml/src/ggml.c @ggerganov
|
/ggml/src/ggml.c @ggerganov
|
||||||
/ggml/src/ggml.cpp @ggerganov
|
/ggml/src/ggml.cpp @ggerganov
|
||||||
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
|
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
|
||||||
|
|
@ -92,16 +75,18 @@
|
||||||
/src/models/ @CISC
|
/src/models/ @CISC
|
||||||
/tests/ @ggerganov
|
/tests/ @ggerganov
|
||||||
/tests/test-chat.* @pwilkin
|
/tests/test-chat.* @pwilkin
|
||||||
|
/tests/test-llama-archs.cpp @JohannesGaessler
|
||||||
/tools/batched-bench/ @ggerganov
|
/tools/batched-bench/ @ggerganov
|
||||||
/tools/cli/ @ngxson
|
/tools/cli/ @ngxson
|
||||||
/tools/completion/ @ggerganov
|
/tools/completion/ @ggerganov
|
||||||
/tools/mtmd/ @ngxson
|
/tools/mtmd/ @ggml-org/llama-mtmd
|
||||||
/tools/perplexity/ @ggerganov
|
/tools/perplexity/ @ggerganov
|
||||||
/tools/parser/ @pwilkin
|
/tools/parser/ @pwilkin
|
||||||
/tools/quantize/ @ggerganov
|
/tools/quantize/ @ggerganov
|
||||||
/tools/rpc/ @rgerganov
|
/tools/rpc/ @ggml-org/ggml-rpc
|
||||||
/tools/server/* @ngxson @ggerganov # no subdir
|
/tools/server/* @ggml-org/llama-server # no subdir
|
||||||
/tools/server/webui/ @allozaur
|
/tools/server/tests/ @ggml-org/llama-server
|
||||||
|
/tools/server/webui/ @ggml-org/llama-webui
|
||||||
/tools/tokenize/ @ggerganov
|
/tools/tokenize/ @ggerganov
|
||||||
/tools/tts/ @ggerganov
|
/tools/tts/ @ggerganov
|
||||||
/vendor/ @ggerganov
|
/vendor/ @ggerganov
|
||||||
|
|
|
||||||
|
|
@ -30,15 +30,21 @@ Before submitting your PR:
|
||||||
- Search for existing PRs to prevent duplicating efforts
|
- Search for existing PRs to prevent duplicating efforts
|
||||||
- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
|
- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
|
||||||
- Test your changes:
|
- Test your changes:
|
||||||
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||||
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
|
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
|
||||||
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
|
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
|
||||||
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
|
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
|
||||||
- Create separate PRs for each feature or fix:
|
- Create separate PRs for each feature or fix:
|
||||||
- Avoid combining unrelated changes in a single PR
|
- Avoid combining unrelated changes in a single PR
|
||||||
- For intricate features, consider opening a feature request first to discuss and align expectations
|
- For intricate features, consider opening a feature request first to discuss and align expectations
|
||||||
- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
|
- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
|
||||||
|
- In particular, adding new data types (extension of the `ggml_type` enum) carries with it a disproportionate maintenance burden. As such, to add a new quantization type you will need to meet the following *additional* criteria *at minimum*:
|
||||||
|
- convert a small model to GGUF using the new type and upload it to HuggingFace
|
||||||
|
- provide [perplexity](https://github.com/ggml-org/llama.cpp/tree/master/tools/perplexity) comparisons to FP16/BF16 (whichever is the native precision) as well as to types of similar size
|
||||||
|
- provide KL divergence data calculated vs. the FP16/BF16 (whichever is the native precision) version for both the new type as well as types of similar size
|
||||||
|
- provide [performance data](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) for the new type in comparison to types of similar size on pure CPU
|
||||||
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
|
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
|
||||||
|
- If you are a new contributor, limit your open PRs to 1.
|
||||||
|
|
||||||
After submitting your PR:
|
After submitting your PR:
|
||||||
- Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
|
- Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
|
||||||
|
|
|
||||||
|
|
@ -279,6 +279,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||||
| [BLAS](docs/build.md#blas-build) | All |
|
| [BLAS](docs/build.md#blas-build) | All |
|
||||||
| [BLIS](docs/backend/BLIS.md) | All |
|
| [BLIS](docs/backend/BLIS.md) | All |
|
||||||
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
|
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
|
||||||
|
| [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
|
||||||
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
|
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
|
||||||
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
|
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
|
||||||
| [HIP](docs/build.md#hip) | AMD GPU |
|
| [HIP](docs/build.md#hip) | AMD GPU |
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,72 @@
|
||||||
|
# NVIDIA DGX Spark
|
||||||
|
|
||||||
|
## System info
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uname --all
|
||||||
|
Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
|
||||||
|
|
||||||
|
g++ --version
|
||||||
|
g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
|
||||||
|
|
||||||
|
nvidia-smi
|
||||||
|
Fri Mar 6 11:39:45 2026
|
||||||
|
+-----------------------------------------------------------------------------------------+
|
||||||
|
| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
|
||||||
|
+-----------------------------------------+------------------------+----------------------+
|
||||||
|
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
||||||
|
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
||||||
|
| | | MIG M. |
|
||||||
|
|=========================================+========================+======================|
|
||||||
|
| 0 NVIDIA GB10 On | 0000000F:01:00.0 Off | N/A |
|
||||||
|
| N/A 52C P0 13W / N/A | Not Supported | 0% Default |
|
||||||
|
| | | N/A |
|
||||||
|
+-----------------------------------------+------------------------+----------------------+
|
||||||
|
```
|
||||||
|
|
||||||
|
## ggml-org/nemotron-3-super-120b-GGUF
|
||||||
|
|
||||||
|
Model: https://huggingface.co/ggml-org/nemotron-3-super-120b-GGUF
|
||||||
|
|
||||||
|
- `llama-batched-bench`
|
||||||
|
|
||||||
|
main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20
|
||||||
|
|
||||||
|
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||||
|
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||||
|
| 512 | 32 | 1 | 544 | 1.094 | 468.05 | 1.621 | 19.74 | 2.715 | 200.37 |
|
||||||
|
| 512 | 32 | 2 | 1088 | 1.463 | 700.16 | 2.437 | 26.26 | 3.900 | 279.01 |
|
||||||
|
| 512 | 32 | 4 | 2176 | 2.647 | 773.76 | 4.043 | 31.66 | 6.689 | 325.29 |
|
||||||
|
| 512 | 32 | 8 | 4352 | 5.291 | 774.14 | 6.151 | 41.62 | 11.442 | 380.37 |
|
||||||
|
| 512 | 32 | 16 | 8704 | 10.603 | 772.62 | 10.385 | 49.30 | 20.987 | 414.72 |
|
||||||
|
| 512 | 32 | 32 | 17408 | 21.231 | 771.69 | 18.235 | 56.16 | 39.466 | 441.09 |
|
||||||
|
| 4096 | 32 | 1 | 4128 | 5.340 | 767.05 | 1.616 | 19.81 | 6.956 | 593.47 |
|
||||||
|
| 4096 | 32 | 2 | 8256 | 10.673 | 767.55 | 2.454 | 26.08 | 13.127 | 628.94 |
|
||||||
|
| 4096 | 32 | 4 | 16512 | 21.348 | 767.46 | 4.072 | 31.44 | 25.420 | 649.57 |
|
||||||
|
| 4096 | 32 | 8 | 33024 | 42.714 | 767.15 | 6.277 | 40.78 | 48.991 | 674.08 |
|
||||||
|
| 4096 | 32 | 16 | 66048 | 85.385 | 767.54 | 10.596 | 48.32 | 95.981 | 688.14 |
|
||||||
|
| 4096 | 32 | 32 | 132096 | 170.819 | 767.32 | 18.619 | 55.00 | 189.437 | 697.31 |
|
||||||
|
| 8192 | 32 | 1 | 8224 | 10.690 | 766.32 | 1.619 | 19.76 | 12.310 | 668.10 |
|
||||||
|
| 8192 | 32 | 2 | 16448 | 21.382 | 766.24 | 2.467 | 25.94 | 23.850 | 689.65 |
|
||||||
|
| 8192 | 32 | 4 | 32896 | 42.782 | 765.92 | 4.098 | 31.23 | 46.881 | 701.69 |
|
||||||
|
| 8192 | 32 | 8 | 65792 | 85.582 | 765.77 | 6.368 | 40.20 | 91.951 | 715.52 |
|
||||||
|
| 8192 | 32 | 16 | 131584 | 171.066 | 766.21 | 10.774 | 47.52 | 181.840 | 723.62 |
|
||||||
|
| 8192 | 32 | 32 | 263168 | 342.140 | 766.19 | 18.969 | 53.98 | 361.109 | 728.78 |
|
||||||
|
|
||||||
|
|
||||||
|
- `llama-bench`
|
||||||
|
|
||||||
|
| model | size | params | backend | n_ubatch | fa | test | t/s |
|
||||||
|
| ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: |
|
||||||
|
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 | 768.84 ± 0.90 |
|
||||||
|
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 | 19.94 ± 0.16 |
|
||||||
|
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d4096 | 764.51 ± 0.50 |
|
||||||
|
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d4096 | 19.95 ± 0.18 |
|
||||||
|
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d8192 | 759.53 ± 0.71 |
|
||||||
|
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d8192 | 19.83 ± 0.18 |
|
||||||
|
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d16384 | 747.98 ± 1.58 |
|
||||||
|
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d16384 | 19.84 ± 0.18 |
|
||||||
|
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d32768 | 724.40 ± 2.70 |
|
||||||
|
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d32768 | 19.45 ± 0.18 |
|
||||||
|
|
||||||
|
build: 04a65daab (8268)
|
||||||
22
ci/run.sh
22
ci/run.sh
|
|
@ -25,6 +25,9 @@
|
||||||
# # with KLEIDIAI support
|
# # with KLEIDIAI support
|
||||||
# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
#
|
#
|
||||||
|
# # with OPENVINO support
|
||||||
|
# GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
|
#
|
||||||
|
|
||||||
if [ -z "$2" ]; then
|
if [ -z "$2" ]; then
|
||||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||||
|
|
@ -46,6 +49,7 @@ cd $sd/../
|
||||||
SRC=`pwd`
|
SRC=`pwd`
|
||||||
|
|
||||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
|
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
|
||||||
|
CTEST_EXTRA=""
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||||
|
|
@ -165,6 +169,18 @@ if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
|
||||||
-DBUILD_SHARED_LIBS=OFF"
|
-DBUILD_SHARED_LIBS=OFF"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
|
||||||
|
if [ -z ${OpenVINO_DIR} ]; then
|
||||||
|
echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:"
|
||||||
|
echo "source /opt/intel/openvino/setupvars.sh"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON"
|
||||||
|
|
||||||
|
# TODO: fix and re-enable the `test-llama-archs` test below
|
||||||
|
CTEST_EXTRA="-E test-llama-archs"
|
||||||
|
fi
|
||||||
|
|
||||||
## helpers
|
## helpers
|
||||||
|
|
||||||
# download a file if it does not exist or if it is outdated
|
# download a file if it does not exist or if it is outdated
|
||||||
|
|
@ -222,7 +238,7 @@ function gg_run_ctest_debug {
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
(time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
}
|
}
|
||||||
|
|
@ -254,9 +270,9 @@ function gg_run_ctest_release {
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
(time ctest --output-on-failure -L 'main|python' ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(time ctest --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
else
|
else
|
||||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(time ctest --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
fi
|
fi
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
|
|
|
||||||
|
|
@ -81,6 +81,8 @@ add_library(${TARGET} STATIC
|
||||||
preset.cpp
|
preset.cpp
|
||||||
preset.h
|
preset.h
|
||||||
regex-partial.cpp
|
regex-partial.cpp
|
||||||
|
reasoning-budget.cpp
|
||||||
|
reasoning-budget.h
|
||||||
regex-partial.h
|
regex-partial.h
|
||||||
sampling.cpp
|
sampling.cpp
|
||||||
sampling.h
|
sampling.h
|
||||||
|
|
|
||||||
|
|
@ -732,23 +732,28 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
||||||
"llama-completion",
|
"llama-completion",
|
||||||
"llama-convert-llama2c-to-ggml",
|
"llama-convert-llama2c-to-ggml",
|
||||||
"llama-cvector-generator",
|
"llama-cvector-generator",
|
||||||
|
"llama-debug",
|
||||||
|
"llama-diffusion-cli",
|
||||||
"llama-embedding",
|
"llama-embedding",
|
||||||
"llama-eval-callback",
|
"llama-eval-callback",
|
||||||
"llama-export-lora",
|
"llama-export-lora",
|
||||||
|
"llama-finetune",
|
||||||
|
"llama-fit-params",
|
||||||
|
"llama-gemma3-cli",
|
||||||
"llama-gen-docs",
|
"llama-gen-docs",
|
||||||
"llama-gguf",
|
"llama-gguf",
|
||||||
"llama-gguf-hash",
|
"llama-gguf-hash",
|
||||||
"llama-gguf-split",
|
"llama-gguf-split",
|
||||||
"llama-gritlm",
|
"llama-idle",
|
||||||
"llama-imatrix",
|
"llama-imatrix",
|
||||||
"llama-infill",
|
"llama-llava-cli",
|
||||||
"llama-mtmd-cli",
|
|
||||||
"llama-llava-clip-quantize-cli",
|
|
||||||
"llama-lookahead",
|
"llama-lookahead",
|
||||||
"llama-lookup",
|
"llama-lookup",
|
||||||
"llama-lookup-create",
|
"llama-lookup-create",
|
||||||
"llama-lookup-merge",
|
"llama-lookup-merge",
|
||||||
"llama-lookup-stats",
|
"llama-lookup-stats",
|
||||||
|
"llama-minicpmv-cli",
|
||||||
|
"llama-mtmd-cli",
|
||||||
"llama-parallel",
|
"llama-parallel",
|
||||||
"llama-passkey",
|
"llama-passkey",
|
||||||
"llama-perplexity",
|
"llama-perplexity",
|
||||||
|
|
@ -2427,11 +2432,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
if (split_arg.size() == 1) {
|
if (split_arg.size() == 1) {
|
||||||
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
|
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoull(split_arg[0]) * 1024*1024);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < split_arg.size(); i++) {
|
for (size_t i = 0; i < split_arg.size(); i++) {
|
||||||
params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
|
params.fit_params_target[i] = std::stoull(split_arg[i]) * 1024*1024;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_FIT_TARGET"));
|
).set_env("LLAMA_ARG_FIT_TARGET"));
|
||||||
|
|
@ -2666,7 +2671,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.out_file = value;
|
params.out_file = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS}));
|
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
|
||||||
|
LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ofreq", "--output-frequency"}, "N",
|
{"-ofreq", "--output-frequency"}, "N",
|
||||||
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
||||||
|
|
@ -2913,6 +2919,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
auto parsed = json::parse(value);
|
auto parsed = json::parse(value);
|
||||||
for (const auto & item : parsed.items()) {
|
for (const auto & item : parsed.items()) {
|
||||||
|
if (item.key() == "enable_thinking") {
|
||||||
|
LOG_WRN("Setting 'enable_thinking' via --chat-template-kwargs is deprecated. "
|
||||||
|
"Use --reasoning on / --reasoning off instead.\n");
|
||||||
|
}
|
||||||
params.default_template_kwargs[item.key()] = item.value().dump();
|
params.default_template_kwargs[item.key()] = item.value().dump();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -3048,14 +3058,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.reasoning_format = common_reasoning_format_from_name(value);
|
params.reasoning_format = common_reasoning_format_from_name(value);
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
|
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-rea", "--reasoning"}, "[on|off|auto]",
|
||||||
|
"Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))",
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
if (is_truthy(value)) {
|
||||||
|
params.enable_reasoning = 1;
|
||||||
|
params.default_template_kwargs["enable_thinking"] = "true";
|
||||||
|
} else if (is_falsey(value)) {
|
||||||
|
params.enable_reasoning = 0;
|
||||||
|
params.default_template_kwargs["enable_thinking"] = "false";
|
||||||
|
} else if (is_autoy(value)) {
|
||||||
|
params.enable_reasoning = -1;
|
||||||
|
} else {
|
||||||
|
throw std::invalid_argument(
|
||||||
|
string_format("error: unknown value for --reasoning: '%s'\n", value.c_str()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--reasoning-budget"}, "N",
|
{"--reasoning-budget"}, "N",
|
||||||
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
|
"token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
|
if (value < -1) { throw std::invalid_argument("invalid value"); }
|
||||||
params.reasoning_budget = value;
|
params.reasoning_budget = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
|
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--reasoning-budget-message"}, "MESSAGE",
|
||||||
|
"message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
params.reasoning_budget_message = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--chat-template"}, "JINJA_TEMPLATE",
|
{"--chat-template"}, "JINJA_TEMPLATE",
|
||||||
string_format(
|
string_format(
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
#include "chat.h"
|
#include "chat.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "json-schema-to-grammar.h"
|
#include "json-schema-to-grammar.h"
|
||||||
|
#include "log.h"
|
||||||
#include "nlohmann/json.hpp"
|
#include "nlohmann/json.hpp"
|
||||||
|
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
@ -90,7 +91,7 @@ common_peg_arena autoparser::build_parser(const templates_params & inputs) const
|
||||||
// pre-register a json-string rule that accepts both quote styles. This must happen
|
// pre-register a json-string rule that accepts both quote styles. This must happen
|
||||||
// before any call to p.json() so that all JSON parsing inherits the flexible rule.
|
// before any call to p.json() so that all JSON parsing inherits the flexible rule.
|
||||||
if (tools.format.uses_python_dicts) {
|
if (tools.format.uses_python_dicts) {
|
||||||
p.rule("json-string", [&]() { return p.choice({ p.double_quoted_string(), p.single_quoted_string() }); });
|
p.rule("json-string", p.quoted_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
parser_build_context ctx(p, inputs);
|
parser_build_context ctx(p, inputs);
|
||||||
|
|
@ -135,7 +136,9 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
|
||||||
if (thinking_forced_open || thinking_forced_closed) {
|
if (thinking_forced_open || thinking_forced_closed) {
|
||||||
// Thinking is forced open OR forced closed with enable_thinking=true
|
// Thinking is forced open OR forced closed with enable_thinking=true
|
||||||
// In both cases, expect only the closing tag (opening was in template)
|
// In both cases, expect only the closing tag (opening was in template)
|
||||||
return p.reasoning(p.until(end)) + end;
|
// However, since we might have incorrectly detected the open/close pattern,
|
||||||
|
// we admit an optional starting marker
|
||||||
|
return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
|
||||||
}
|
}
|
||||||
if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
|
if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
|
||||||
// Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
|
// Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
|
||||||
|
|
@ -180,7 +183,10 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
|
||||||
case tool_format::TAG_WITH_TAGGED:
|
case tool_format::TAG_WITH_TAGGED:
|
||||||
return build_tool_parser_tag_tagged(ctx);
|
return build_tool_parser_tag_tagged(ctx);
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("Unable to create tool parser");
|
LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
|
||||||
|
"Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
|
||||||
|
"report an issue at https://github.com/ggml-org/llama.cpp/issues\n");
|
||||||
|
return ctx.p.eps();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -479,6 +479,7 @@ analyze_content::analyze_content(const common_chat_template & tmpl, const analyz
|
||||||
|
|
||||||
if (!comparison_with_tools || !comparison_with_reasoning) {
|
if (!comparison_with_tools || !comparison_with_reasoning) {
|
||||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
|
LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto & diff_tools = comparison_with_tools->diff;
|
const auto & diff_tools = comparison_with_tools->diff;
|
||||||
|
|
@ -911,8 +912,10 @@ void analyze_tools::extract_function_markers() {
|
||||||
// we'll have to rely on an extra diff with no-calls version
|
// we'll have to rely on an extra diff with no-calls version
|
||||||
auto notool_comp = compare_variants(
|
auto notool_comp = compare_variants(
|
||||||
*tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_nocall }); });
|
*tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_nocall }); });
|
||||||
auto nt_diff = notool_comp->diff;
|
if (notool_comp) {
|
||||||
closer_suffix = nt_diff.left.substr(nt_diff.left.find("YYYY") + 4);
|
auto nt_diff = notool_comp->diff;
|
||||||
|
closer_suffix = nt_diff.left.substr(nt_diff.left.find("YYYY") + 4);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
closer_suffix = diff.suffix.substr(0, diff.suffix.find(suffix_marker));
|
closer_suffix = diff.suffix.substr(0, diff.suffix.find(suffix_marker));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@
|
||||||
|
|
||||||
#include <nlohmann/json.hpp>
|
#include <nlohmann/json.hpp>
|
||||||
|
|
||||||
using json = nlohmann::ordered_json;
|
using ordered_json = nlohmann::ordered_json;
|
||||||
|
|
||||||
static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
|
static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
|
@ -68,7 +68,7 @@ static int json_brace_depth(const std::string & s) {
|
||||||
|
|
||||||
// JSON-escape a string and return the inner content (without surrounding quotes).
|
// JSON-escape a string and return the inner content (without surrounding quotes).
|
||||||
static std::string escape_json_string_inner(const std::string & s) {
|
static std::string escape_json_string_inner(const std::string & s) {
|
||||||
std::string escaped = json(s).dump();
|
std::string escaped = ordered_json(s).dump();
|
||||||
if (escaped.size() >= 2 && escaped.front() == '"' && escaped.back() == '"') {
|
if (escaped.size() >= 2 && escaped.front() == '"' && escaped.back() == '"') {
|
||||||
return escaped.substr(1, escaped.size() - 2);
|
return escaped.substr(1, escaped.size() - 2);
|
||||||
}
|
}
|
||||||
|
|
@ -309,7 +309,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
|
||||||
if (arg_count > 0) {
|
if (arg_count > 0) {
|
||||||
arg_entry = ",";
|
arg_entry = ",";
|
||||||
}
|
}
|
||||||
arg_entry += json(trim(node.text)).dump() + ":";
|
arg_entry += ordered_json(trim(node.text)).dump() + ":";
|
||||||
++arg_count;
|
++arg_count;
|
||||||
|
|
||||||
auto & target = args_target();
|
auto & target = args_target();
|
||||||
|
|
@ -343,7 +343,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
|
||||||
|
|
||||||
// Try to parse as JSON value (number, bool, null, object, array)
|
// Try to parse as JSON value (number, bool, null, object, array)
|
||||||
try {
|
try {
|
||||||
json parsed = json::parse(value_content);
|
ordered_json parsed = ordered_json::parse(value_content);
|
||||||
if (parsed.is_string()) {
|
if (parsed.is_string()) {
|
||||||
// Don't add closing quote yet (added by arg_close) for monotonic streaming
|
// Don't add closing quote yet (added by arg_close) for monotonic streaming
|
||||||
std::string escaped = parsed.dump();
|
std::string escaped = parsed.dump();
|
||||||
|
|
@ -408,7 +408,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
|
||||||
|
|
||||||
common_peg_parser common_chat_peg_builder::standard_constructed_tools(
|
common_peg_parser common_chat_peg_builder::standard_constructed_tools(
|
||||||
const std::map<std::string, std::string> & markers,
|
const std::map<std::string, std::string> & markers,
|
||||||
const nlohmann::json & tools,
|
const ordered_json & tools,
|
||||||
bool parallel_tool_calls,
|
bool parallel_tool_calls,
|
||||||
bool force_tool_calls) {
|
bool force_tool_calls) {
|
||||||
if (!tools.is_array() || tools.empty()) {
|
if (!tools.is_array() || tools.empty()) {
|
||||||
|
|
@ -439,7 +439,7 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
|
||||||
}
|
}
|
||||||
const auto & function = tool_def.at("function");
|
const auto & function = tool_def.at("function");
|
||||||
std::string name = function.at("name");
|
std::string name = function.at("name");
|
||||||
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
|
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
|
||||||
|
|
||||||
// Build argument parsers
|
// Build argument parsers
|
||||||
auto args = eps();
|
auto args = eps();
|
||||||
|
|
@ -479,8 +479,8 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
|
||||||
// Python-style tool calls: name(arg1="value1", arg2=123)
|
// Python-style tool calls: name(arg1="value1", arg2=123)
|
||||||
// Used only by LFM2 for now, so we don't merge it into autoparser
|
// Used only by LFM2 for now, so we don't merge it into autoparser
|
||||||
common_peg_parser common_chat_peg_builder::python_style_tool_calls(
|
common_peg_parser common_chat_peg_builder::python_style_tool_calls(
|
||||||
const nlohmann::json & tools,
|
const ordered_json & tools,
|
||||||
bool parallel_tool_calls) {
|
bool parallel_tool_calls) {
|
||||||
if (!tools.is_array() || tools.empty()) {
|
if (!tools.is_array() || tools.empty()) {
|
||||||
return eps();
|
return eps();
|
||||||
}
|
}
|
||||||
|
|
@ -493,7 +493,7 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
|
||||||
}
|
}
|
||||||
const auto & function = tool_def.at("function");
|
const auto & function = tool_def.at("function");
|
||||||
std::string name = function.at("name");
|
std::string name = function.at("name");
|
||||||
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
|
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
|
||||||
|
|
||||||
auto args = eps();
|
auto args = eps();
|
||||||
if (params.contains("properties") && !params["properties"].empty()) {
|
if (params.contains("properties") && !params["properties"].empty()) {
|
||||||
|
|
@ -507,8 +507,8 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
|
||||||
|
|
||||||
common_peg_parser arg_value_parser = eps();
|
common_peg_parser arg_value_parser = eps();
|
||||||
auto string_value_parser = choice({
|
auto string_value_parser = choice({
|
||||||
literal("\"") + tool_arg_string_value(json_string_content()) + literal("\""),
|
literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
|
||||||
literal("'") + tool_arg_string_value(json_string_content()) + literal("'")
|
literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
|
||||||
});
|
});
|
||||||
|
|
||||||
if (is_string_type) {
|
if (is_string_type) {
|
||||||
|
|
@ -555,11 +555,11 @@ static std::pair<std::string, std::string> parse_key_spec(const std::string & ke
|
||||||
|
|
||||||
// Mode 1: function_is_key — parse {"function_name": {...}}
|
// Mode 1: function_is_key — parse {"function_name": {...}}
|
||||||
common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
|
common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
|
||||||
const nlohmann::json & tools,
|
const ordered_json & tools,
|
||||||
const std::string & args_key,
|
const std::string & args_key,
|
||||||
const std::string & effective_args_key,
|
const std::string & effective_args_key,
|
||||||
const std::string & call_id_key,
|
const std::string & call_id_key,
|
||||||
const std::string & gen_call_id_key) {
|
const std::string & gen_call_id_key) {
|
||||||
|
|
||||||
auto tool_choices = choice();
|
auto tool_choices = choice();
|
||||||
|
|
||||||
|
|
@ -569,7 +569,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
|
||||||
}
|
}
|
||||||
const auto & function = tool_def.at("function");
|
const auto & function = tool_def.at("function");
|
||||||
std::string name = function.at("name");
|
std::string name = function.at("name");
|
||||||
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
|
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
|
||||||
|
|
||||||
// Build inner object fields
|
// Build inner object fields
|
||||||
std::vector<common_peg_parser> inner_fields;
|
std::vector<common_peg_parser> inner_fields;
|
||||||
|
|
@ -577,7 +577,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
|
||||||
if (!call_id_key.empty()) {
|
if (!call_id_key.empty()) {
|
||||||
auto id_parser = atomic(
|
auto id_parser = atomic(
|
||||||
literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
|
literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
|
||||||
literal("\"") + tool_id(json_string_content()) + literal("\"")
|
literal("\"") + tool_id(string_content('"')) + literal("\"")
|
||||||
);
|
);
|
||||||
inner_fields.push_back(optional(id_parser + space() + optional(literal(",") + space())));
|
inner_fields.push_back(optional(id_parser + space() + optional(literal(",") + space())));
|
||||||
}
|
}
|
||||||
|
|
@ -586,7 +586,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
|
||||||
auto gen_id_parser = atomic(
|
auto gen_id_parser = atomic(
|
||||||
literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
|
literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
|
||||||
choice({
|
choice({
|
||||||
literal("\"") + tool_id(json_string_content()) + literal("\""),
|
literal("\"") + tool_id(string_content('"')) + literal("\""),
|
||||||
tool_id(json_number())
|
tool_id(json_number())
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
@ -634,11 +634,11 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
|
||||||
|
|
||||||
// Mode 2: Nested keys (dot notation like "function.name")
|
// Mode 2: Nested keys (dot notation like "function.name")
|
||||||
common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
|
common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
|
||||||
const nlohmann::json & tools,
|
const ordered_json & tools,
|
||||||
const std::string & effective_name_key,
|
const std::string & effective_name_key,
|
||||||
const std::string & effective_args_key,
|
const std::string & effective_args_key,
|
||||||
const std::string & call_id_key,
|
const std::string & call_id_key,
|
||||||
const std::string & gen_call_id_key) {
|
const std::string & gen_call_id_key) {
|
||||||
|
|
||||||
auto tool_choices = choice();
|
auto tool_choices = choice();
|
||||||
|
|
||||||
|
|
@ -655,7 +655,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
|
||||||
}
|
}
|
||||||
const auto & function = tool_def.at("function");
|
const auto & function = tool_def.at("function");
|
||||||
std::string name = function.at("name");
|
std::string name = function.at("name");
|
||||||
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
|
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
|
||||||
|
|
||||||
auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
|
auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
|
||||||
literal("\"") + tool_name(literal(name)) + literal("\"");
|
literal("\"") + tool_name(literal(name)) + literal("\"");
|
||||||
|
|
@ -675,7 +675,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
|
||||||
if (id_spec.first.empty()) {
|
if (id_spec.first.empty()) {
|
||||||
auto id_parser = atomic(
|
auto id_parser = atomic(
|
||||||
literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
|
literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
|
||||||
literal("\"") + tool_id(json_string_content()) + literal("\"")
|
literal("\"") + tool_id(string_content('"')) + literal("\"")
|
||||||
);
|
);
|
||||||
tool_parser_body = tool_parser_body + optional(id_parser + space() + literal(",") + space());
|
tool_parser_body = tool_parser_body + optional(id_parser + space() + literal(",") + space());
|
||||||
}
|
}
|
||||||
|
|
@ -687,7 +687,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
|
||||||
auto gen_id_parser = atomic(
|
auto gen_id_parser = atomic(
|
||||||
literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
|
literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
|
||||||
choice({
|
choice({
|
||||||
literal("\"") + tool_id(json_string_content()) + literal("\""),
|
literal("\"") + tool_id(string_content('"')) + literal("\""),
|
||||||
tool_id(json_number())
|
tool_id(json_number())
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
@ -706,7 +706,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
|
||||||
|
|
||||||
// Mode 3: Flat keys with optional ID fields and parameter ordering
|
// Mode 3: Flat keys with optional ID fields and parameter ordering
|
||||||
common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
||||||
const nlohmann::json & tools,
|
const ordered_json & tools,
|
||||||
const std::string & effective_name_key,
|
const std::string & effective_name_key,
|
||||||
const std::string & effective_args_key,
|
const std::string & effective_args_key,
|
||||||
const std::string & call_id_key,
|
const std::string & call_id_key,
|
||||||
|
|
@ -723,7 +723,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
||||||
}
|
}
|
||||||
const auto & function = tool_def.at("function");
|
const auto & function = tool_def.at("function");
|
||||||
std::string name = function.at("name");
|
std::string name = function.at("name");
|
||||||
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
|
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
|
||||||
|
|
||||||
auto tool_name_ = name_key_parser + space() + literal(":") + space() +
|
auto tool_name_ = name_key_parser + space() + literal(":") + space() +
|
||||||
literal("\"") + tool_name(literal(name)) + literal("\"");
|
literal("\"") + tool_name(literal(name)) + literal("\"");
|
||||||
|
|
@ -736,7 +736,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
||||||
id_parser = atomic(
|
id_parser = atomic(
|
||||||
literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
|
literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
|
||||||
choice({
|
choice({
|
||||||
literal("\"") + tool_id(json_string_content()) + literal("\""),
|
literal("\"") + tool_id(string_content('"')) + literal("\""),
|
||||||
tool_id(json_number())
|
tool_id(json_number())
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
@ -747,7 +747,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
||||||
gen_id_parser = atomic(
|
gen_id_parser = atomic(
|
||||||
literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
|
literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
|
||||||
choice({
|
choice({
|
||||||
literal("\"") + tool_id(json_string_content()) + literal("\""),
|
literal("\"") + tool_id(string_content('"')) + literal("\""),
|
||||||
tool_id(json_number())
|
tool_id(json_number())
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
@ -791,7 +791,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
||||||
common_peg_parser common_chat_peg_builder::standard_json_tools(
|
common_peg_parser common_chat_peg_builder::standard_json_tools(
|
||||||
const std::string & section_start,
|
const std::string & section_start,
|
||||||
const std::string & section_end,
|
const std::string & section_end,
|
||||||
const nlohmann::json & tools,
|
const ordered_json & tools,
|
||||||
bool parallel_tool_calls,
|
bool parallel_tool_calls,
|
||||||
bool force_tool_calls,
|
bool force_tool_calls,
|
||||||
const std::string & name_key,
|
const std::string & name_key,
|
||||||
|
|
|
||||||
|
|
@ -94,7 +94,7 @@ class common_chat_peg_builder : public common_peg_parser_builder {
|
||||||
// parameters_order: order in which JSON fields should be parsed
|
// parameters_order: order in which JSON fields should be parsed
|
||||||
common_peg_parser standard_json_tools(const std::string & section_start,
|
common_peg_parser standard_json_tools(const std::string & section_start,
|
||||||
const std::string & section_end,
|
const std::string & section_end,
|
||||||
const nlohmann::json & tools,
|
const nlohmann::ordered_json & tools,
|
||||||
bool parallel_tool_calls,
|
bool parallel_tool_calls,
|
||||||
bool force_tool_calls,
|
bool force_tool_calls,
|
||||||
const std::string & name_key = "",
|
const std::string & name_key = "",
|
||||||
|
|
@ -108,30 +108,30 @@ class common_chat_peg_builder : public common_peg_parser_builder {
|
||||||
// Legacy-compatible helper for building XML/tagged style tool calls
|
// Legacy-compatible helper for building XML/tagged style tool calls
|
||||||
// Used by tests and manual parsers
|
// Used by tests and manual parsers
|
||||||
common_peg_parser standard_constructed_tools(const std::map<std::string, std::string> & markers,
|
common_peg_parser standard_constructed_tools(const std::map<std::string, std::string> & markers,
|
||||||
const nlohmann::json & tools,
|
const nlohmann::ordered_json & tools,
|
||||||
bool parallel_tool_calls,
|
bool parallel_tool_calls,
|
||||||
bool force_tool_calls);
|
bool force_tool_calls);
|
||||||
|
|
||||||
// Helper for Python-style function call format: name(arg1="value1", arg2=123)
|
// Helper for Python-style function call format: name(arg1="value1", arg2=123)
|
||||||
// Used by LFM2 and similar templates
|
// Used by LFM2 and similar templates
|
||||||
common_peg_parser python_style_tool_calls(const nlohmann::json & tools,
|
common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
|
||||||
bool parallel_tool_calls);
|
bool parallel_tool_calls);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
|
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
|
||||||
common_peg_parser build_json_tools_function_is_key(const nlohmann::json & tools,
|
common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
|
||||||
const std::string & args_key,
|
const std::string & args_key,
|
||||||
const std::string & effective_args_key,
|
const std::string & effective_args_key,
|
||||||
const std::string & call_id_key,
|
const std::string & call_id_key,
|
||||||
const std::string & gen_call_id_key);
|
const std::string & gen_call_id_key);
|
||||||
|
|
||||||
common_peg_parser build_json_tools_nested_keys(const nlohmann::json & tools,
|
common_peg_parser build_json_tools_nested_keys(const nlohmann::ordered_json & tools,
|
||||||
const std::string & effective_name_key,
|
const std::string & effective_name_key,
|
||||||
const std::string & effective_args_key,
|
const std::string & effective_args_key,
|
||||||
const std::string & call_id_key,
|
const std::string & call_id_key,
|
||||||
const std::string & gen_call_id_key);
|
const std::string & gen_call_id_key);
|
||||||
|
|
||||||
common_peg_parser build_json_tools_flat_keys(const nlohmann::json & tools,
|
common_peg_parser build_json_tools_flat_keys(const nlohmann::ordered_json & tools,
|
||||||
const std::string & effective_name_key,
|
const std::string & effective_name_key,
|
||||||
const std::string & effective_args_key,
|
const std::string & effective_args_key,
|
||||||
const std::string & call_id_key,
|
const std::string & call_id_key,
|
||||||
|
|
|
||||||
121
common/chat.cpp
121
common/chat.cpp
|
|
@ -857,7 +857,9 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
|
||||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||||
auto include_grammar = true;
|
auto include_grammar = true;
|
||||||
|
|
||||||
data.supports_thinking = true;
|
data.supports_thinking = true;
|
||||||
|
data.thinking_start_tag = "[THINK]";
|
||||||
|
data.thinking_end_tag = "[/THINK]";
|
||||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
|
data.prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
|
||||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||||
data.preserved_tokens = {
|
data.preserved_tokens = {
|
||||||
|
|
@ -1165,9 +1167,11 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
|
||||||
const autoparser::templates_params & inputs) {
|
const autoparser::templates_params & inputs) {
|
||||||
common_chat_params data;
|
common_chat_params data;
|
||||||
|
|
||||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||||
data.supports_thinking = true;
|
data.supports_thinking = true;
|
||||||
|
data.thinking_start_tag = "<think>";
|
||||||
|
data.thinking_end_tag = "</think>";
|
||||||
data.preserved_tokens = {
|
data.preserved_tokens = {
|
||||||
"<|tool_calls_section_begin|>",
|
"<|tool_calls_section_begin|>",
|
||||||
"<|tool_calls_section_end|>",
|
"<|tool_calls_section_end|>",
|
||||||
|
|
@ -1350,8 +1354,90 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static common_chat_params common_chat_params_init_gigachat_v3(
|
||||||
|
const common_chat_template & tmpl,
|
||||||
|
const autoparser::templates_params & inputs) {
|
||||||
|
|
||||||
|
common_chat_params data;
|
||||||
|
|
||||||
|
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||||
|
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||||
|
data.supports_thinking = false;
|
||||||
|
data.preserved_tokens = {
|
||||||
|
"<|message_sep|>\n\n",
|
||||||
|
"<|role_sep|>\n",
|
||||||
|
};
|
||||||
|
|
||||||
|
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||||
|
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||||
|
auto tool_call_start_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n";
|
||||||
|
|
||||||
|
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||||
|
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||||
|
// Build a choice of all available tools
|
||||||
|
auto tool_choice = p.choice();
|
||||||
|
for (const auto & tool : inputs.tools) {
|
||||||
|
const auto & function = tool.at("function");
|
||||||
|
std::string name = function.at("name");
|
||||||
|
const auto & schema = function.at("parameters");
|
||||||
|
|
||||||
|
auto tool_name = p.json_member("name", "\"" + p.tool_name(p.literal(name)) + "\"");
|
||||||
|
auto tool_args = p.json_member("arguments", p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema)));
|
||||||
|
|
||||||
|
auto tool_open = p.tool_open(p.literal("{") << tool_name);
|
||||||
|
|
||||||
|
tool_choice |= p.rule("tool-" + name, tool_open << "," << tool_args << "}");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Define the tool call structure
|
||||||
|
auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
|
||||||
|
auto max_calls = 1; // parallel toolcalls are not supported
|
||||||
|
auto tool_call = p.rule("tool-call", p.literal(tool_call_start_prefix) + tool_choice);
|
||||||
|
auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));
|
||||||
|
|
||||||
|
return p.content(p.until("<|message_sep|>\n\n")) << tool_calls;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Content only parser
|
||||||
|
include_grammar = false;
|
||||||
|
return p.content(p.rest());
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
data.parser = parser.save();
|
||||||
|
|
||||||
|
if (include_grammar) {
|
||||||
|
data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||||
|
|
||||||
|
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||||
|
foreach_function(inputs.tools, [&](const json & tool) {
|
||||||
|
const auto & function = tool.at("function");
|
||||||
|
auto schema = function.at("parameters");
|
||||||
|
builder.resolve_refs(schema);
|
||||||
|
});
|
||||||
|
parser.build_grammar(builder, data.grammar_lazy);
|
||||||
|
});
|
||||||
|
|
||||||
|
data.grammar_triggers = {
|
||||||
|
{COMMON_GRAMMAR_TRIGGER_TYPE_WORD, tool_call_start_prefix}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
namespace workaround {
|
namespace workaround {
|
||||||
|
|
||||||
|
static void map_developer_role_to_system(json & messages) {
|
||||||
|
for (auto & message : messages) {
|
||||||
|
if (message.contains("role")) {
|
||||||
|
if (message["role"] == "developer") {
|
||||||
|
message["role"] = "system";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// if first message is system and template does not support it, merge it with next message
|
// if first message is system and template does not support it, merge it with next message
|
||||||
static void system_message_not_supported(json & messages) {
|
static void system_message_not_supported(json & messages) {
|
||||||
if (!messages.empty() && messages.front().at("role") == "system") {
|
if (!messages.empty() && messages.front().at("role") == "system") {
|
||||||
|
|
@ -1429,6 +1515,10 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||||
params.add_bos = tmpls->add_bos;
|
params.add_bos = tmpls->add_bos;
|
||||||
params.add_eos = tmpls->add_eos;
|
params.add_eos = tmpls->add_eos;
|
||||||
|
|
||||||
|
if (src.find("<|channel|>") == std::string::npos) {
|
||||||
|
// map developer to system for all models except for GPT-OSS
|
||||||
|
workaround::map_developer_role_to_system(params.messages);
|
||||||
|
}
|
||||||
workaround::func_args_not_string(params.messages);
|
workaround::func_args_not_string(params.messages);
|
||||||
|
|
||||||
if (!tmpl.original_caps().supports_system_role) {
|
if (!tmpl.original_caps().supports_system_role) {
|
||||||
|
|
@ -1506,12 +1596,31 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||||
return common_chat_params_init_lfm2(tmpl, params);
|
return common_chat_params_init_lfm2(tmpl, params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GigaChatV3 format detection
|
||||||
|
if (src.find("<|role_sep|>") != std::string::npos &&
|
||||||
|
src.find("<|message_sep|>") != std::string::npos &&
|
||||||
|
src.find("<|function_call|>") == std::string::npos
|
||||||
|
) {
|
||||||
|
LOG_DBG("Using specialized template: GigaChatV3\n");
|
||||||
|
return common_chat_params_init_gigachat_v3(tmpl, params);
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
LOG_DBG("Using differential autoparser\n");
|
LOG_DBG("Using differential autoparser\n");
|
||||||
struct autoparser::autoparser autoparser;
|
struct autoparser::autoparser autoparser;
|
||||||
autoparser.analyze_template(tmpl);
|
autoparser.analyze_template(tmpl);
|
||||||
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
|
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
|
||||||
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
|
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
|
||||||
|
if (auto_params.supports_thinking) {
|
||||||
|
auto_params.thinking_start_tag = autoparser.reasoning.start;
|
||||||
|
auto_params.thinking_end_tag = autoparser.reasoning.end;
|
||||||
|
// FORCED_OPEN and FORCED_CLOSED both put <think> in the generation prompt
|
||||||
|
// (FORCED_CLOSED forces empty <think></think> when thinking is disabled,
|
||||||
|
// but forces <think> open when thinking is enabled)
|
||||||
|
auto_params.thinking_forced_open =
|
||||||
|
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_OPEN ||
|
||||||
|
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_CLOSED;
|
||||||
|
}
|
||||||
return auto_params;
|
return auto_params;
|
||||||
} catch (const std::exception & e) {
|
} catch (const std::exception & e) {
|
||||||
throw std::invalid_argument(std::string("Unable to generate parser for this template. Automatic parser generation failed: ") + e.what());
|
throw std::invalid_argument(std::string("Unable to generate parser for this template. Automatic parser generation failed: ") + e.what());
|
||||||
|
|
@ -1605,8 +1714,8 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
||||||
build_chat_peg_parser([](common_chat_peg_builder & p) { return p.content(p.rest()) + p.end(); }) :
|
build_chat_peg_parser([](common_chat_peg_builder & p) { return p.content(p.rest()) + p.end(); }) :
|
||||||
src_parser;
|
src_parser;
|
||||||
|
|
||||||
if (src_parser.empty()) {
|
if (src_parser.empty()) {
|
||||||
LOG_WRN("No parser definition detected, assuming pure content parser.");
|
LOG_DBG("No parser definition detected, assuming pure content parser.");
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());
|
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());
|
||||||
|
|
|
||||||
|
|
@ -213,6 +213,8 @@ struct common_chat_params {
|
||||||
bool grammar_lazy = false;
|
bool grammar_lazy = false;
|
||||||
bool thinking_forced_open = false;
|
bool thinking_forced_open = false;
|
||||||
bool supports_thinking = false;
|
bool supports_thinking = false;
|
||||||
|
std::string thinking_start_tag; // e.g., "<think>"
|
||||||
|
std::string thinking_end_tag; // e.g., "</think>"
|
||||||
std::vector<common_grammar_trigger> grammar_triggers;
|
std::vector<common_grammar_trigger> grammar_triggers;
|
||||||
std::vector<std::string> preserved_tokens;
|
std::vector<std::string> preserved_tokens;
|
||||||
std::vector<std::string> additional_stops;
|
std::vector<std::string> additional_stops;
|
||||||
|
|
|
||||||
|
|
@ -105,6 +105,7 @@ enum llama_example {
|
||||||
LLAMA_EXAMPLE_FINETUNE,
|
LLAMA_EXAMPLE_FINETUNE,
|
||||||
LLAMA_EXAMPLE_FIT_PARAMS,
|
LLAMA_EXAMPLE_FIT_PARAMS,
|
||||||
LLAMA_EXAMPLE_RESULTS,
|
LLAMA_EXAMPLE_RESULTS,
|
||||||
|
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
|
||||||
|
|
||||||
LLAMA_EXAMPLE_COUNT,
|
LLAMA_EXAMPLE_COUNT,
|
||||||
};
|
};
|
||||||
|
|
@ -235,6 +236,14 @@ struct common_params_sampling {
|
||||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||||
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
|
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
|
||||||
|
|
||||||
|
// reasoning budget sampler parameters
|
||||||
|
// these are populated by the server/CLI based on chat template params
|
||||||
|
int32_t reasoning_budget_tokens = -1; // -1 = disabled, >= 0 = token budget
|
||||||
|
bool reasoning_budget_activate_immediately = false;
|
||||||
|
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
|
||||||
|
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
|
||||||
|
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
|
||||||
|
|
||||||
bool backend_sampling = false;
|
bool backend_sampling = false;
|
||||||
|
|
||||||
bool has_logit_bias() const {
|
bool has_logit_bias() const {
|
||||||
|
|
@ -536,7 +545,9 @@ struct common_params {
|
||||||
bool use_jinja = true; // NOLINT
|
bool use_jinja = true; // NOLINT
|
||||||
bool enable_chat_template = true;
|
bool enable_chat_template = true;
|
||||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||||
|
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
|
||||||
int reasoning_budget = -1;
|
int reasoning_budget = -1;
|
||||||
|
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
|
||||||
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
|
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
|
||||||
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
|
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
|
||||||
|
|
||||||
|
|
@ -916,7 +927,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||||
// MoE utils
|
// MoE utils
|
||||||
//
|
//
|
||||||
|
|
||||||
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
|
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate|gate_up)_(ch|)exps";
|
||||||
|
|
||||||
inline std::string llm_ffn_exps_block_regex(int idx) {
|
inline std::string llm_ffn_exps_block_regex(int idx) {
|
||||||
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
|
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ struct common_http_url {
|
||||||
std::string user;
|
std::string user;
|
||||||
std::string password;
|
std::string password;
|
||||||
std::string host;
|
std::string host;
|
||||||
|
int port;
|
||||||
std::string path;
|
std::string path;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -47,6 +48,20 @@ static common_http_url common_http_parse_url(const std::string & url) {
|
||||||
parts.host = rest;
|
parts.host = rest;
|
||||||
parts.path = "/";
|
parts.path = "/";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto colon_pos = parts.host.find(':');
|
||||||
|
|
||||||
|
if (colon_pos != std::string::npos) {
|
||||||
|
parts.port = std::stoi(parts.host.substr(colon_pos + 1));
|
||||||
|
parts.host = parts.host.substr(0, colon_pos);
|
||||||
|
} else if (parts.scheme == "http") {
|
||||||
|
parts.port = 80;
|
||||||
|
} else if (parts.scheme == "https") {
|
||||||
|
parts.port = 443;
|
||||||
|
} else {
|
||||||
|
throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
|
||||||
|
}
|
||||||
|
|
||||||
return parts;
|
return parts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -68,7 +83,7 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
httplib::Client cli(parts.scheme + "://" + parts.host);
|
httplib::Client cli(parts.scheme + "://" + parts.host + ":" + std::to_string(parts.port));
|
||||||
|
|
||||||
if (!parts.user.empty()) {
|
if (!parts.user.empty()) {
|
||||||
cli.set_basic_auth(parts.user, parts.password);
|
cli.set_basic_auth(parts.user, parts.password);
|
||||||
|
|
|
||||||
|
|
@ -790,7 +790,7 @@ public:
|
||||||
} else if (target.is_array()) {
|
} else if (target.is_array()) {
|
||||||
size_t sel_index;
|
size_t sel_index;
|
||||||
try {
|
try {
|
||||||
sel_index = std::stoul(sel);
|
sel_index = std::stoull(sel);
|
||||||
} catch (const std::invalid_argument & e) {
|
} catch (const std::invalid_argument & e) {
|
||||||
sel_index = target.size();
|
sel_index = target.size();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -658,7 +658,7 @@ struct parser_executor {
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) {
|
static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos, const char delimiter) {
|
||||||
++pos; // consume '\'
|
++pos; // consume '\'
|
||||||
if (pos >= ctx.input.size()) {
|
if (pos >= ctx.input.size()) {
|
||||||
if (!ctx.is_lenient()) {
|
if (!ctx.is_lenient()) {
|
||||||
|
|
@ -667,23 +667,14 @@ struct parser_executor {
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
|
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (ctx.input[pos]) {
|
char c = ctx.input[pos];
|
||||||
case '"':
|
if (c == delimiter || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || c == 'r' || c == 't') {
|
||||||
case '\'':
|
++pos;
|
||||||
case '\\':
|
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
|
||||||
case '/':
|
} else if (c == 'u') {
|
||||||
case 'b':
|
return handle_unicode_escape(ctx, start, pos);
|
||||||
case 'f':
|
} else {
|
||||||
case 'n':
|
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
|
||||||
case 'r':
|
|
||||||
case 't':
|
|
||||||
++pos;
|
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
|
|
||||||
case 'u':
|
|
||||||
return handle_unicode_escape(ctx, start, pos);
|
|
||||||
default:
|
|
||||||
// Invalid escape sequence
|
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -704,62 +695,20 @@ struct parser_executor {
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
|
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parse_result operator()(const common_peg_json_string_parser & /* p */) {
|
common_peg_parse_result operator()(const common_peg_string_parser & p) {
|
||||||
auto pos = start_pos;
|
auto pos = start_pos;
|
||||||
|
|
||||||
// Parse string content (without quotes)
|
// Parse string content (without quotes)
|
||||||
while (pos < ctx.input.size()) {
|
while (pos < ctx.input.size()) {
|
||||||
char c = ctx.input[pos];
|
char c = ctx.input[pos];
|
||||||
|
|
||||||
if (c == '"') {
|
if (c == p.delimiter) {
|
||||||
// Found closing quote - success (don't consume it)
|
// Found closing delimiter - success (don't consume it)
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c == '\\') {
|
if (c == '\\') {
|
||||||
auto result = handle_escape_sequence(ctx, start_pos, pos);
|
auto result = handle_escape_sequence(ctx, start_pos, pos, p.delimiter);
|
||||||
if (!result.success()) {
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos);
|
|
||||||
|
|
||||||
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
|
|
||||||
if (!ctx.is_lenient()) {
|
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
||||||
}
|
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (utf8_result.status == utf8_parse_result::INVALID) {
|
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
pos += utf8_result.bytes_consumed;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reached end without finding closing quote
|
|
||||||
if (!ctx.is_lenient()) {
|
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
|
|
||||||
}
|
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
common_peg_parse_result operator()(const common_peg_python_dict_string_parser & /* p */) {
|
|
||||||
auto pos = start_pos;
|
|
||||||
|
|
||||||
// Parse string content (without quotes)
|
|
||||||
while (pos < ctx.input.size()) {
|
|
||||||
char c = ctx.input[pos];
|
|
||||||
|
|
||||||
if (c == '\'') {
|
|
||||||
// Found closing quote - success (don't consume it)
|
|
||||||
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (c == '\\') {
|
|
||||||
auto result = handle_escape_sequence(ctx, start_pos, pos);
|
|
||||||
if (!result.success()) {
|
if (!result.success()) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
@ -988,8 +937,7 @@ void common_peg_arena::resolve_refs() {
|
||||||
std::is_same_v<T, common_peg_ref_parser> ||
|
std::is_same_v<T, common_peg_ref_parser> ||
|
||||||
std::is_same_v<T, common_peg_until_parser> ||
|
std::is_same_v<T, common_peg_until_parser> ||
|
||||||
std::is_same_v<T, common_peg_literal_parser> ||
|
std::is_same_v<T, common_peg_literal_parser> ||
|
||||||
std::is_same_v<T, common_peg_json_string_parser> ||
|
std::is_same_v<T, common_peg_string_parser> ||
|
||||||
std::is_same_v<T, common_peg_python_dict_string_parser> ||
|
|
||||||
std::is_same_v<T, common_peg_chars_parser> ||
|
std::is_same_v<T, common_peg_chars_parser> ||
|
||||||
std::is_same_v<T, common_peg_any_parser> ||
|
std::is_same_v<T, common_peg_any_parser> ||
|
||||||
std::is_same_v<T, common_peg_space_parser>) {
|
std::is_same_v<T, common_peg_space_parser>) {
|
||||||
|
|
@ -1065,10 +1013,8 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id
|
||||||
return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)";
|
return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)";
|
||||||
}
|
}
|
||||||
return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
|
return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
|
} else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
|
||||||
return "JsonString()";
|
return "String(" + std::string(1, p.delimiter) + ")";
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_python_dict_string_parser>) {
|
|
||||||
return "PythonDictString()";
|
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
|
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
|
||||||
return "Until(" + string_join(p.delimiters, " | ") + ")";
|
return "Until(" + string_join(p.delimiters, " | ") + ")";
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
|
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
|
||||||
|
|
@ -1281,47 +1227,25 @@ common_peg_arena common_peg_parser_builder::build() {
|
||||||
|
|
||||||
// String primitives
|
// String primitives
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::json_string_content() {
|
common_peg_parser common_peg_parser_builder::string_content(char delimiter) {
|
||||||
return wrap(arena_.add_parser(common_peg_json_string_parser{}));
|
return wrap(arena_.add_parser(common_peg_string_parser{delimiter}));
|
||||||
}
|
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::single_quoted_string_content() {
|
|
||||||
return wrap(arena_.add_parser(common_peg_python_dict_string_parser{}));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::double_quoted_string() {
|
common_peg_parser common_peg_parser_builder::double_quoted_string() {
|
||||||
return rule("dq-string",
|
return rule("double-quoted-string", [this]() {
|
||||||
[this]() { return sequence({ literal("\""), json_string_content(), literal("\""), space() }); });
|
return sequence({literal("\""), string_content('"'), literal("\""), space()});
|
||||||
}
|
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::single_quoted_string() {
|
|
||||||
return rule("sq-string",
|
|
||||||
[this]() { return sequence({ literal("'"), single_quoted_string_content(), literal("'"), space() }); });
|
|
||||||
}
|
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::flexible_string() {
|
|
||||||
return rule("flexible-string", [this]() { return choice({ double_quoted_string(), single_quoted_string() }); });
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generic helpers for object/array structure
|
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::generic_object(const std::string & name,
|
|
||||||
const common_peg_parser & string_parser,
|
|
||||||
const common_peg_parser & value_parser) {
|
|
||||||
return rule(name, [this, string_parser, value_parser]() {
|
|
||||||
auto ws = space();
|
|
||||||
auto member = sequence({ string_parser, ws, literal(":"), ws, value_parser });
|
|
||||||
auto members = sequence({ member, zero_or_more(sequence({ ws, literal(","), ws, member })) });
|
|
||||||
return sequence({ literal("{"), ws, choice({ literal("}"), sequence({ members, ws, literal("}") }) }) });
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::generic_array(const std::string & name,
|
common_peg_parser common_peg_parser_builder::single_quoted_string() {
|
||||||
const common_peg_parser & value_parser) {
|
return rule("single-quoted-string", [this]() {
|
||||||
return rule(name, [this, value_parser]() {
|
return sequence({literal("'"), string_content('\''), literal("'"), space()});
|
||||||
auto ws = space();
|
});
|
||||||
auto elements = sequence({ value_parser, zero_or_more(sequence({ literal(","), ws, value_parser })) });
|
}
|
||||||
return sequence({ literal("["), ws, choice({ literal("]"), sequence({ elements, ws, literal("]") }) }) });
|
|
||||||
|
common_peg_parser common_peg_parser_builder::quoted_string() {
|
||||||
|
return rule("quoted-string", [this]() {
|
||||||
|
return choice({double_quoted_string(), single_quoted_string()});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1344,7 +1268,7 @@ common_peg_parser common_peg_parser_builder::json_number() {
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::json_string() {
|
common_peg_parser common_peg_parser_builder::json_string() {
|
||||||
return rule("json-string", [this]() {
|
return rule("json-string", [this]() {
|
||||||
return sequence({literal("\""), json_string_content(), literal("\""), space()});
|
return sequence({literal("\""), string_content('"'), literal("\""), space()});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1361,11 +1285,36 @@ common_peg_parser common_peg_parser_builder::json_null() {
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::json_object() {
|
common_peg_parser common_peg_parser_builder::json_object() {
|
||||||
return generic_object("json-object", json_string(), json());
|
return rule("json-object", [this]() {
|
||||||
|
auto ws = space();
|
||||||
|
auto member = sequence({json_string(), ws, literal(":"), ws, json()});
|
||||||
|
auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))});
|
||||||
|
return sequence({
|
||||||
|
literal("{"),
|
||||||
|
ws,
|
||||||
|
choice({
|
||||||
|
literal("}"),
|
||||||
|
sequence({members, ws, literal("}")})
|
||||||
|
}),
|
||||||
|
ws
|
||||||
|
});
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::json_array() {
|
common_peg_parser common_peg_parser_builder::json_array() {
|
||||||
return generic_array("json-array", json());
|
return rule("json-array", [this]() {
|
||||||
|
auto ws = space();
|
||||||
|
auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
|
||||||
|
return sequence({
|
||||||
|
literal("["),
|
||||||
|
ws,
|
||||||
|
choice({
|
||||||
|
literal("]"),
|
||||||
|
sequence({elements, ws, literal("]")})
|
||||||
|
}),
|
||||||
|
ws
|
||||||
|
});
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::json() {
|
common_peg_parser common_peg_parser_builder::json() {
|
||||||
|
|
@ -1382,7 +1331,9 @@ common_peg_parser common_peg_parser_builder::json() {
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::python_string() {
|
common_peg_parser common_peg_parser_builder::python_string() {
|
||||||
return rule("python-string", [this]() { return choice({ double_quoted_string(), single_quoted_string() }); });
|
return rule("python-string", [this]() {
|
||||||
|
return choice({double_quoted_string(), single_quoted_string()});
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::python_number() {
|
common_peg_parser common_peg_parser_builder::python_number() {
|
||||||
|
|
@ -1390,24 +1341,63 @@ common_peg_parser common_peg_parser_builder::python_number() {
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::python_bool() {
|
common_peg_parser common_peg_parser_builder::python_bool() {
|
||||||
return rule("python-bool", [this]() { return sequence({ choice({ literal("True"), literal("False") }), space() }); });
|
return rule("python-bool", [this]() {
|
||||||
|
return sequence({
|
||||||
|
choice({literal("True"), literal("False")}),
|
||||||
|
space()
|
||||||
|
});
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::python_null() {
|
common_peg_parser common_peg_parser_builder::python_null() {
|
||||||
return rule("python-none", [this]() { return sequence({ literal("None"), space() }); });
|
return rule("python-none", [this]() {
|
||||||
|
return sequence({literal("None"), space()});
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::python_dict() {
|
common_peg_parser common_peg_parser_builder::python_dict() {
|
||||||
return generic_object("python-dict", python_string(), python_value());
|
return rule("python-dict", [this]() {
|
||||||
|
auto ws = space();
|
||||||
|
auto member = sequence({python_string(), ws, literal(":"), ws, python_value()});
|
||||||
|
auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))});
|
||||||
|
return sequence({
|
||||||
|
literal("{"),
|
||||||
|
ws,
|
||||||
|
choice({
|
||||||
|
literal("}"),
|
||||||
|
sequence({members, ws, literal("}")})
|
||||||
|
}),
|
||||||
|
ws
|
||||||
|
});
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::python_array() {
|
common_peg_parser common_peg_parser_builder::python_array() {
|
||||||
return generic_array("python-array", python_value());
|
return rule("python-array", [this]() {
|
||||||
|
auto ws = space();
|
||||||
|
auto elements = sequence({python_value(), zero_or_more(sequence({literal(","), ws, python_value()}))});
|
||||||
|
return sequence({
|
||||||
|
literal("["),
|
||||||
|
ws,
|
||||||
|
choice({
|
||||||
|
literal("]"),
|
||||||
|
sequence({elements, ws, literal("]")})
|
||||||
|
}),
|
||||||
|
ws
|
||||||
|
});
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
common_peg_parser common_peg_parser_builder::python_value() {
|
common_peg_parser common_peg_parser_builder::python_value() {
|
||||||
return rule("python-value", [this]() {
|
return rule("python-value", [this]() {
|
||||||
return choice({ python_dict(), python_array(), python_string(), python_number(), python_bool(), python_null() });
|
return choice({
|
||||||
|
python_dict(),
|
||||||
|
python_array(),
|
||||||
|
python_string(),
|
||||||
|
python_number(),
|
||||||
|
python_bool(),
|
||||||
|
python_null()
|
||||||
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1528,8 +1518,7 @@ static std::unordered_set<std::string> collect_reachable_rules(
|
||||||
std::is_same_v<T, common_peg_chars_parser> ||
|
std::is_same_v<T, common_peg_chars_parser> ||
|
||||||
std::is_same_v<T, common_peg_space_parser> ||
|
std::is_same_v<T, common_peg_space_parser> ||
|
||||||
std::is_same_v<T, common_peg_any_parser> ||
|
std::is_same_v<T, common_peg_any_parser> ||
|
||||||
std::is_same_v<T, common_peg_json_string_parser> ||
|
std::is_same_v<T, common_peg_string_parser>) {
|
||||||
std::is_same_v<T, common_peg_python_dict_string_parser>) {
|
|
||||||
// These parsers do not have any children
|
// These parsers do not have any children
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
|
} else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
|
||||||
for (auto child : p.children) {
|
for (auto child : p.children) {
|
||||||
|
|
@ -1665,10 +1654,9 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
|
||||||
return result + "{" + std::to_string(p.min_count) + "}";
|
return result + "{" + std::to_string(p.min_count) + "}";
|
||||||
}
|
}
|
||||||
return result + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
|
return result + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
|
} else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
|
||||||
return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
|
const std::string delim(1, p.delimiter);
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_python_dict_string_parser>) {
|
return R"(( [^)" + delim + R"(\\] | "\\" ( [)" + delim + R"(\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
|
||||||
return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
|
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
|
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
|
||||||
if (p.delimiters.empty()) {
|
if (p.delimiters.empty()) {
|
||||||
return ".*";
|
return ".*";
|
||||||
|
|
@ -1798,10 +1786,8 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant &
|
||||||
{"min_count", p.min_count},
|
{"min_count", p.min_count},
|
||||||
{"max_count", p.max_count}
|
{"max_count", p.max_count}
|
||||||
};
|
};
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
|
} else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
|
||||||
return json{{"type", "json_string"}};
|
return json{{"type", "string"}, {"delimiter", std::string(1, p.delimiter)}};
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_python_dict_string_parser>) {
|
|
||||||
return json{{ "type", "python_dict_string" }};
|
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
|
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
|
||||||
return json{{"type", "until"}, {"delimiters", p.delimiters}};
|
return json{{"type", "until"}, {"delimiters", p.delimiters}};
|
||||||
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
|
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
|
||||||
|
|
@ -1928,11 +1914,15 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json
|
||||||
}
|
}
|
||||||
return parser;
|
return parser;
|
||||||
}
|
}
|
||||||
if (type == "json_string") {
|
if (type == "string") {
|
||||||
return common_peg_json_string_parser{};
|
if (!j.contains("delimiter")) {
|
||||||
}
|
throw std::runtime_error("string parser missing delimiter field.");
|
||||||
if (type == "python_dict_string") {
|
}
|
||||||
return common_peg_python_dict_string_parser{};
|
std::string delimiter = j["delimiter"];
|
||||||
|
if (delimiter.empty()) {
|
||||||
|
throw std::runtime_error("string parser delimiter is empty.");
|
||||||
|
}
|
||||||
|
return common_peg_string_parser{delimiter[0]};
|
||||||
}
|
}
|
||||||
if (type == "until") {
|
if (type == "until") {
|
||||||
if (!j.contains("delimiters") || !j["delimiters"].is_array()) {
|
if (!j.contains("delimiters") || !j["delimiters"].is_array()) {
|
||||||
|
|
|
||||||
|
|
@ -231,8 +231,9 @@ struct common_peg_chars_parser {
|
||||||
int max_count; // -1 for unbounded
|
int max_count; // -1 for unbounded
|
||||||
};
|
};
|
||||||
|
|
||||||
struct common_peg_json_string_parser {};
|
struct common_peg_string_parser {
|
||||||
struct common_peg_python_dict_string_parser {};
|
char delimiter;
|
||||||
|
};
|
||||||
|
|
||||||
struct common_peg_until_parser {
|
struct common_peg_until_parser {
|
||||||
std::vector<std::string> delimiters;
|
std::vector<std::string> delimiters;
|
||||||
|
|
@ -280,8 +281,7 @@ using common_peg_parser_variant = std::variant<
|
||||||
common_peg_any_parser,
|
common_peg_any_parser,
|
||||||
common_peg_space_parser,
|
common_peg_space_parser,
|
||||||
common_peg_chars_parser,
|
common_peg_chars_parser,
|
||||||
common_peg_json_string_parser,
|
common_peg_string_parser,
|
||||||
common_peg_python_dict_string_parser,
|
|
||||||
common_peg_until_parser,
|
common_peg_until_parser,
|
||||||
common_peg_schema_parser,
|
common_peg_schema_parser,
|
||||||
common_peg_rule_parser,
|
common_peg_rule_parser,
|
||||||
|
|
@ -340,10 +340,6 @@ class common_peg_parser_builder {
|
||||||
common_peg_parser wrap(common_peg_parser_id id) { return common_peg_parser(id, *this); }
|
common_peg_parser wrap(common_peg_parser_id id) { return common_peg_parser(id, *this); }
|
||||||
common_peg_parser add(const common_peg_parser_variant & p) { return wrap(arena_.add_parser(p)); }
|
common_peg_parser add(const common_peg_parser_variant & p) { return wrap(arena_.add_parser(p)); }
|
||||||
|
|
||||||
// Generic helpers for building object/array structures with configurable string/value parsers.
|
|
||||||
common_peg_parser generic_object(const std::string & name, const common_peg_parser & string_parser, const common_peg_parser & value_parser);
|
|
||||||
common_peg_parser generic_array(const std::string & name, const common_peg_parser & value_parser);
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
common_peg_parser_builder();
|
common_peg_parser_builder();
|
||||||
|
|
||||||
|
|
@ -444,13 +440,10 @@ class common_peg_parser_builder {
|
||||||
common_peg_parser single_quoted_string();
|
common_peg_parser single_quoted_string();
|
||||||
|
|
||||||
// Matches a string that accepts both double-quoted and single-quoted styles.
|
// Matches a string that accepts both double-quoted and single-quoted styles.
|
||||||
common_peg_parser flexible_string();
|
common_peg_parser quoted_string();
|
||||||
|
|
||||||
// Matches double-quoted string content without the surrounding quotes.
|
// Matches string content without the surrounding delimiter.
|
||||||
common_peg_parser json_string_content();
|
common_peg_parser string_content(char delimiter);
|
||||||
|
|
||||||
// Matches single-quoted string content without the surrounding quotes.
|
|
||||||
common_peg_parser single_quoted_string_content();
|
|
||||||
|
|
||||||
// Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
|
// Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
|
||||||
// value -> object | array | string | number | true | false | null
|
// value -> object | array | string | number | true | false | null
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,219 @@
|
||||||
|
#include "reasoning-budget.h"
|
||||||
|
#include "common.h"
|
||||||
|
#include "unicode.h"
|
||||||
|
|
||||||
|
#include "log.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
struct token_matcher {
|
||||||
|
std::vector<llama_token> tokens;
|
||||||
|
size_t pos = 0;
|
||||||
|
|
||||||
|
bool advance(llama_token token) {
|
||||||
|
if (tokens.empty()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (token == tokens[pos]) {
|
||||||
|
pos++;
|
||||||
|
if (pos >= tokens.size()) {
|
||||||
|
pos = 0;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
pos = 0;
|
||||||
|
if (token == tokens[0]) {
|
||||||
|
pos = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset() { pos = 0; }
|
||||||
|
};
|
||||||
|
|
||||||
|
struct common_reasoning_budget_ctx {
|
||||||
|
const llama_vocab * vocab;
|
||||||
|
|
||||||
|
token_matcher start_matcher;
|
||||||
|
token_matcher end_matcher;
|
||||||
|
std::vector<llama_token> forced_tokens;
|
||||||
|
|
||||||
|
int32_t budget; // maximum tokens in reasoning block
|
||||||
|
int32_t remaining; // tokens remaining in budget
|
||||||
|
|
||||||
|
common_reasoning_budget_state state;
|
||||||
|
|
||||||
|
// for forcing
|
||||||
|
size_t force_pos; // next position in forced_tokens to force
|
||||||
|
};
|
||||||
|
|
||||||
|
static const char * common_reasoning_budget_name(const struct llama_sampler * /*smpl*/) {
|
||||||
|
return "reasoning-budget";
|
||||||
|
}
|
||||||
|
|
||||||
|
static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_token token) {
|
||||||
|
auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
|
||||||
|
|
||||||
|
switch (ctx->state) {
|
||||||
|
case REASONING_BUDGET_IDLE:
|
||||||
|
{
|
||||||
|
if (ctx->start_matcher.advance(token)) {
|
||||||
|
ctx->state = REASONING_BUDGET_COUNTING;
|
||||||
|
ctx->remaining = ctx->budget;
|
||||||
|
LOG_INF("reasoning-budget: activated, budget=%d tokens\n", ctx->budget);
|
||||||
|
|
||||||
|
if (ctx->remaining <= 0) {
|
||||||
|
ctx->state = REASONING_BUDGET_FORCING;
|
||||||
|
ctx->force_pos = 0;
|
||||||
|
LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case REASONING_BUDGET_COUNTING:
|
||||||
|
case REASONING_BUDGET_WAITING_UTF8:
|
||||||
|
{
|
||||||
|
if (ctx->end_matcher.advance(token)) {
|
||||||
|
ctx->state = REASONING_BUDGET_DONE;
|
||||||
|
LOG_INF("reasoning-budget: deactivated (natural end)\n");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool utf8_complete = true;
|
||||||
|
if (ctx->vocab != nullptr) {
|
||||||
|
const std::string piece = common_token_to_piece(ctx->vocab, token, false);
|
||||||
|
utf8_complete = common_utf8_is_complete(piece);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx->state == REASONING_BUDGET_WAITING_UTF8) {
|
||||||
|
if (utf8_complete) {
|
||||||
|
ctx->state = REASONING_BUDGET_FORCING;
|
||||||
|
ctx->force_pos = 0;
|
||||||
|
ctx->end_matcher.reset();
|
||||||
|
LOG_INF("reasoning-budget: UTF-8 complete, now forcing end sequence\n");
|
||||||
|
}
|
||||||
|
} else if (ctx->state == REASONING_BUDGET_COUNTING) {
|
||||||
|
ctx->remaining--;
|
||||||
|
if (ctx->remaining <= 0) {
|
||||||
|
if (utf8_complete) {
|
||||||
|
ctx->state = REASONING_BUDGET_FORCING;
|
||||||
|
ctx->force_pos = 0;
|
||||||
|
ctx->end_matcher.reset();
|
||||||
|
LOG_INF("reasoning-budget: budget exhausted, forcing end sequence\n");
|
||||||
|
} else {
|
||||||
|
ctx->state = REASONING_BUDGET_WAITING_UTF8;
|
||||||
|
ctx->end_matcher.reset();
|
||||||
|
LOG_INF("reasoning-budget: budget exhausted, waiting for UTF-8 completion\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case REASONING_BUDGET_FORCING:
|
||||||
|
// force_pos is advanced in apply(), not here.
|
||||||
|
// This ensures the first forced token isn't skipped when the sampler
|
||||||
|
// is initialized directly in FORCING state (e.g. COUNTING + budget=0)
|
||||||
|
break;
|
||||||
|
case REASONING_BUDGET_DONE:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||||
|
auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
|
||||||
|
|
||||||
|
if (ctx->state != REASONING_BUDGET_FORCING) {
|
||||||
|
// passthrough — don't modify logits
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx->force_pos >= ctx->forced_tokens.size()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const llama_token forced = ctx->forced_tokens[ctx->force_pos];
|
||||||
|
|
||||||
|
// set all logits to -inf except the forced token
|
||||||
|
for (size_t i = 0; i < cur_p->size; i++) {
|
||||||
|
if (cur_p->data[i].id != forced) {
|
||||||
|
cur_p->data[i].logit = -INFINITY;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// advance to next forced token (done here rather than in accept so that
|
||||||
|
// the first forced token isn't skipped when starting in FORCING state)
|
||||||
|
ctx->force_pos++;
|
||||||
|
if (ctx->force_pos >= ctx->forced_tokens.size()) {
|
||||||
|
ctx->state = REASONING_BUDGET_DONE;
|
||||||
|
LOG_INF("reasoning-budget: forced sequence complete, done\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
|
||||||
|
auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
|
||||||
|
ctx->state = REASONING_BUDGET_IDLE;
|
||||||
|
ctx->remaining = ctx->budget;
|
||||||
|
ctx->start_matcher.reset();
|
||||||
|
ctx->end_matcher.reset();
|
||||||
|
ctx->force_pos = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
|
||||||
|
const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
|
||||||
|
return common_reasoning_budget_init(
|
||||||
|
ctx->vocab,
|
||||||
|
ctx->start_matcher.tokens,
|
||||||
|
ctx->end_matcher.tokens,
|
||||||
|
ctx->forced_tokens,
|
||||||
|
ctx->budget,
|
||||||
|
ctx->state);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void common_reasoning_budget_free(struct llama_sampler * smpl) {
|
||||||
|
delete (common_reasoning_budget_ctx *) smpl->ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct llama_sampler_i common_reasoning_budget_i = {
|
||||||
|
/* .name = */ common_reasoning_budget_name,
|
||||||
|
/* .accept = */ common_reasoning_budget_accept,
|
||||||
|
/* .apply = */ common_reasoning_budget_apply,
|
||||||
|
/* .reset = */ common_reasoning_budget_reset,
|
||||||
|
/* .clone = */ common_reasoning_budget_clone,
|
||||||
|
/* .free = */ common_reasoning_budget_free,
|
||||||
|
/* .backend_init = */ nullptr,
|
||||||
|
/* .backend_accept = */ nullptr,
|
||||||
|
/* .backend_apply = */ nullptr,
|
||||||
|
/* .backend_set_input = */ nullptr,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_sampler * common_reasoning_budget_init(
|
||||||
|
const struct llama_vocab * vocab,
|
||||||
|
const std::vector<llama_token> & start_tokens,
|
||||||
|
const std::vector<llama_token> & end_tokens,
|
||||||
|
const std::vector<llama_token> & forced_tokens,
|
||||||
|
int32_t budget,
|
||||||
|
common_reasoning_budget_state initial_state) {
|
||||||
|
// promote COUNTING with budget <= 0 to FORCING
|
||||||
|
if (initial_state == REASONING_BUDGET_COUNTING && budget <= 0) {
|
||||||
|
initial_state = REASONING_BUDGET_FORCING;
|
||||||
|
}
|
||||||
|
|
||||||
|
return llama_sampler_init(
|
||||||
|
/* .iface = */ &common_reasoning_budget_i,
|
||||||
|
/* .ctx = */ new common_reasoning_budget_ctx {
|
||||||
|
/* .vocab = */ vocab,
|
||||||
|
/* .start_matcher = */ { start_tokens, 0 },
|
||||||
|
/* .end_matcher = */ { end_tokens, 0 },
|
||||||
|
/* .forced_tokens = */ forced_tokens,
|
||||||
|
/* .budget = */ budget,
|
||||||
|
/* .remaining = */ budget,
|
||||||
|
/* .state = */ initial_state,
|
||||||
|
/* .force_pos = */ 0,
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,41 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
enum common_reasoning_budget_state {
|
||||||
|
REASONING_BUDGET_IDLE, // waiting for start sequence
|
||||||
|
REASONING_BUDGET_COUNTING, // counting down tokens
|
||||||
|
REASONING_BUDGET_FORCING, // forcing budget message + end sequence
|
||||||
|
REASONING_BUDGET_WAITING_UTF8, // budget exhausted, waiting for UTF-8 completion
|
||||||
|
REASONING_BUDGET_DONE, // passthrough forever
|
||||||
|
};
|
||||||
|
|
||||||
|
// Creates a reasoning budget sampler that limits token generation inside a
|
||||||
|
// reasoning block (e.g. between <think> and </think>).
|
||||||
|
//
|
||||||
|
// State machine: IDLE -> COUNTING -> WAITING_UTF8 -> FORCING -> DONE
|
||||||
|
// IDLE: passthrough, watching for start_tokens sequence
|
||||||
|
// COUNTING: counting down remaining tokens, watching for natural end_tokens
|
||||||
|
// WAITING_UTF8: budget exhausted, allowing tokens to complete a UTF-8 sequence
|
||||||
|
// FORCING: forces forced_tokens token-by-token (all other logits -> -inf)
|
||||||
|
// DONE: passthrough forever
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// vocab - vocabulary (used for UTF-8 boundary detection; can be nullptr)
|
||||||
|
// start_tokens - token sequence that activates counting
|
||||||
|
// end_tokens - token sequence for natural deactivation
|
||||||
|
// forced_tokens - token sequence forced when budget expires
|
||||||
|
// budget - max tokens allowed in the reasoning block
|
||||||
|
// initial_state - initial state of the sampler (e.g. IDLE or COUNTING)
|
||||||
|
// note: COUNTING with budget <= 0 is promoted to FORCING
|
||||||
|
//
|
||||||
|
struct llama_sampler * common_reasoning_budget_init(
|
||||||
|
const struct llama_vocab * vocab,
|
||||||
|
const std::vector<llama_token> & start_tokens,
|
||||||
|
const std::vector<llama_token> & end_tokens,
|
||||||
|
const std::vector<llama_token> & forced_tokens,
|
||||||
|
int32_t budget,
|
||||||
|
common_reasoning_budget_state initial_state);
|
||||||
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
#include "reasoning-budget.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
@ -250,6 +251,17 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// reasoning budget sampler — added first so it can force tokens before other samplers
|
||||||
|
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
|
||||||
|
samplers.push_back(common_reasoning_budget_init(
|
||||||
|
vocab,
|
||||||
|
params.reasoning_budget_start,
|
||||||
|
params.reasoning_budget_end,
|
||||||
|
params.reasoning_budget_forced,
|
||||||
|
params.reasoning_budget_tokens,
|
||||||
|
params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
|
||||||
|
}
|
||||||
|
|
||||||
if (params.has_logit_bias()) {
|
if (params.has_logit_bias()) {
|
||||||
samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
|
samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,10 @@
|
||||||
#include "unicode.h"
|
#include "unicode.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
// implementation adopted from src/unicode.cpp
|
// implementation adopted from src/unicode.cpp
|
||||||
|
|
||||||
|
|
@ -67,6 +69,20 @@ utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t off
|
||||||
return utf8_parse_result(utf8_parse_result::INVALID);
|
return utf8_parse_result(utf8_parse_result::INVALID);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool common_utf8_is_complete(const std::string & s) {
|
||||||
|
if (s.empty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
for (int i = 1; i <= std::min(4, (int)s.size()); i++) {
|
||||||
|
unsigned char c = s[s.size() - i];
|
||||||
|
if ((c & 0xC0) != 0x80) {
|
||||||
|
int expected = (c >= 0xF0) ? 4 : (c >= 0xE0) ? 3 : (c >= 0xC0) ? 2 : 1;
|
||||||
|
return i >= expected;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
||||||
std::string result;
|
std::string result;
|
||||||
for (size_t i = 0; i < cps.size(); ++i) {
|
for (size_t i = 0; i < cps.size(); ++i) {
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,9 @@ struct utf8_parse_result {
|
||||||
// Returns 0 for invalid first bytes
|
// Returns 0 for invalid first bytes
|
||||||
size_t common_utf8_sequence_length(unsigned char first_byte);
|
size_t common_utf8_sequence_length(unsigned char first_byte);
|
||||||
|
|
||||||
|
// Check if a string ends with a complete UTF-8 sequence.
|
||||||
|
bool common_utf8_is_complete(const std::string & s);
|
||||||
|
|
||||||
// Parse a single UTF-8 codepoint from input
|
// Parse a single UTF-8 codepoint from input
|
||||||
utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset);
|
utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -144,6 +144,7 @@ class ModelBase:
|
||||||
self.metadata_override = metadata_override
|
self.metadata_override = metadata_override
|
||||||
self.model_name = model_name
|
self.model_name = model_name
|
||||||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||||
|
self._is_nvfp4 = False
|
||||||
|
|
||||||
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
|
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
|
||||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||||
|
|
@ -271,6 +272,9 @@ class ModelBase:
|
||||||
return tensors
|
return tensors
|
||||||
|
|
||||||
def dequant_model(self):
|
def dequant_model(self):
|
||||||
|
if self._is_nvfp4:
|
||||||
|
return # NVFP4 weights are repacked in _generate_nvfp4_tensors
|
||||||
|
|
||||||
tensors_to_remove: list[str] = []
|
tensors_to_remove: list[str] = []
|
||||||
new_tensors: dict[str, Callable[[], Tensor]] = {}
|
new_tensors: dict[str, Callable[[], Tensor]] = {}
|
||||||
|
|
||||||
|
|
@ -516,6 +520,13 @@ class ModelBase:
|
||||||
raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
|
raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
# skip NVFP4 auxiliary tensors (handled in _generate_nvfp4_tensors)
|
||||||
|
if self._is_nvfp4:
|
||||||
|
if name.endswith((".weight_scale", ".weight_scale_2", ".input_scale", ".k_scale", ".v_scale")):
|
||||||
|
return []
|
||||||
|
if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
|
||||||
|
return []
|
||||||
|
|
||||||
new_name = self.map_tensor_name(name)
|
new_name = self.map_tensor_name(name)
|
||||||
|
|
||||||
# Handle gate/up expert tensor fusion if enabled
|
# Handle gate/up expert tensor fusion if enabled
|
||||||
|
|
@ -551,9 +562,135 @@ class ModelBase:
|
||||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||||
return ()
|
return ()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:
|
||||||
|
"""Repack NVFP4 ModelOpt tensors into ggml super-block layout.
|
||||||
|
Preserves original E4M3 scale bits as UE4M3 (strip sign bit).
|
||||||
|
The per-tensor scale2 factor is stored as a separate tensor and applied at inference time via ggml_mul().
|
||||||
|
Returns (raw_data, logical_shape)."""
|
||||||
|
|
||||||
|
out_features = weight.shape[0]
|
||||||
|
n_blocks = scale.shape[1]
|
||||||
|
|
||||||
|
# Unpack ModelOpt nibble-packed weights
|
||||||
|
w = weight.reshape(out_features, n_blocks, 8)
|
||||||
|
vals = torch.stack([w & 0x0F, w >> 4], dim=-1).reshape(out_features, n_blocks, 16)
|
||||||
|
|
||||||
|
# Preserve original E4M3 scale bits as UE4M3 (strip sign bit)
|
||||||
|
d_ue = scale.view(torch.uint8).numpy().reshape(out_features, n_blocks) & 0x7F
|
||||||
|
qs = (vals[:, :, :8] | (vals[:, :, 8:] << 4)).to(torch.uint8).numpy()
|
||||||
|
|
||||||
|
# Pack into super-blocks: [4 UE4M3 scales, 32 qs bytes] = 36 bytes per 64 elements
|
||||||
|
n_super = n_blocks // 4
|
||||||
|
d_grouped = d_ue.reshape(out_features, n_super, 4)
|
||||||
|
qs_grouped = qs.reshape(out_features, n_super, 4, 8).reshape(out_features, n_super, 32)
|
||||||
|
raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
|
||||||
|
return raw, [out_features, n_super * 64]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
|
||||||
|
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
|
||||||
|
|
||||||
|
def _repack_nvfp4(self, new_name: str, weight: Tensor, scale: Tensor, scale2: Tensor):
|
||||||
|
raw, shape = self._nvfp4_pack(weight, scale)
|
||||||
|
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
|
||||||
|
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
|
||||||
|
|
||||||
|
# Emit per-tensor scale2 as a separate F32 tensor when non-trivial
|
||||||
|
if not self._nvfp4_scale2_is_trivial(scale2):
|
||||||
|
scale2_f32 = scale2.float().numpy().flatten()
|
||||||
|
scale_name = new_name.replace(".weight", ".scale")
|
||||||
|
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
|
||||||
|
self.gguf_writer.add_tensor(scale_name, scale2_f32)
|
||||||
|
|
||||||
|
def _generate_nvfp4_tensors(self):
|
||||||
|
# Per-layer expert merging to avoid holding all experts in memory
|
||||||
|
expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {}
|
||||||
|
expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
|
||||||
|
expert_shapes: dict[tuple[int, str], list[int]] = {}
|
||||||
|
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
|
||||||
|
|
||||||
|
for name in list(self.model_tensors.keys()):
|
||||||
|
if not name.endswith(".weight"):
|
||||||
|
continue
|
||||||
|
scale_name = name.replace(".weight", ".weight_scale")
|
||||||
|
scale2_name = name.replace(".weight", ".weight_scale_2")
|
||||||
|
if scale_name not in self.model_tensors:
|
||||||
|
continue
|
||||||
|
# Force eager materialization of lazy tensors
|
||||||
|
weight = LazyTorchTensor.to_eager(self.model_tensors[name]())
|
||||||
|
scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]())
|
||||||
|
scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
|
||||||
|
|
||||||
|
# Check if this is a per-expert tensor
|
||||||
|
m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
|
||||||
|
if m:
|
||||||
|
expert_id = int(m.group(1))
|
||||||
|
proj_type = m.group(2)
|
||||||
|
bid_m = re.search(r'\.layers\.(\d+)\.', name)
|
||||||
|
bid = int(bid_m.group(1)) if bid_m else 0
|
||||||
|
key = (bid, proj_type)
|
||||||
|
|
||||||
|
raw, shape = self._nvfp4_pack(weight, scale)
|
||||||
|
|
||||||
|
if key not in expert_blocks:
|
||||||
|
expert_blocks[key] = []
|
||||||
|
expert_scales[key] = []
|
||||||
|
expert_shapes[key] = shape
|
||||||
|
expert_blocks[key].append((expert_id, raw.copy()))
|
||||||
|
# Collect per-expert scale2 (scalar per expert)
|
||||||
|
expert_scales[key].append((expert_id, float(scale2.float().sum())))
|
||||||
|
|
||||||
|
# Flush when all experts for this (layer, proj) are collected
|
||||||
|
if n_experts > 0 and len(expert_blocks[key]) >= n_experts:
|
||||||
|
self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_shapes, bid, proj_type)
|
||||||
|
else:
|
||||||
|
new_name = self.map_tensor_name(name)
|
||||||
|
self._repack_nvfp4(new_name, weight, scale, scale2)
|
||||||
|
|
||||||
|
# Flush any remaining experts (fallback if n_experts was unknown)
|
||||||
|
for (bid, proj_type) in list(expert_blocks.keys()):
|
||||||
|
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
|
||||||
|
|
||||||
|
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
|
||||||
|
experts = expert_blocks.pop(key)
|
||||||
|
scales = expert_scales.pop(key)
|
||||||
|
shape = expert_shapes.pop(key)
|
||||||
|
|
||||||
|
experts.sort(key=lambda x: x[0])
|
||||||
|
merged = np.stack([e[1] for e in experts], axis=0)
|
||||||
|
merged_name = f"model.layers.{bid}.mlp.experts.{proj_type}.weight"
|
||||||
|
new_name = self.map_tensor_name(merged_name)
|
||||||
|
logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
|
||||||
|
self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
|
||||||
|
|
||||||
|
# Emit per-expert scale2 tensor if any expert has non-trivial scale2
|
||||||
|
scales.sort(key=lambda x: x[0])
|
||||||
|
scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
|
||||||
|
if not np.allclose(scale_vals, 1.0, atol=1e-6):
|
||||||
|
scale_name = new_name.replace(".weight", ".scale")
|
||||||
|
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
|
||||||
|
self.gguf_writer.add_tensor(scale_name, scale_vals)
|
||||||
|
|
||||||
|
del experts, merged
|
||||||
|
|
||||||
def prepare_tensors(self):
|
def prepare_tensors(self):
|
||||||
|
# detect NVFP4 quantization (ModelOpt format)
|
||||||
|
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
|
||||||
|
quant_config_file = self.dir_model / "hf_quant_config.json"
|
||||||
|
|
||||||
|
if not quant_algo and quant_config_file.is_file():
|
||||||
|
with open(quant_config_file, "r", encoding="utf-8") as f:
|
||||||
|
quant_algo = (json.load(f).get("quantization") or {}).get("quant_algo")
|
||||||
|
|
||||||
|
self._is_nvfp4 = quant_algo == "NVFP4"
|
||||||
|
|
||||||
self.dequant_model()
|
self.dequant_model()
|
||||||
|
|
||||||
|
# NVFP4 weights are repacked and written directly to gguf_writer
|
||||||
|
if self._is_nvfp4:
|
||||||
|
self._generate_nvfp4_tensors()
|
||||||
|
|
||||||
# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
|
# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
|
||||||
if self.tensor_map.mapping:
|
if self.tensor_map.mapping:
|
||||||
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
||||||
|
|
@ -2057,6 +2194,8 @@ class GPTNeoXModel(TextModel):
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||||
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||||
|
assert n_head is not None
|
||||||
|
assert n_embed is not None
|
||||||
|
|
||||||
if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
|
if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
|
||||||
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
||||||
|
|
@ -2094,6 +2233,8 @@ class BloomModel(TextModel):
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||||
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||||
|
assert n_head is not None
|
||||||
|
assert n_embed is not None
|
||||||
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
|
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
|
||||||
self.gguf_writer.add_embedding_length(n_embed)
|
self.gguf_writer.add_embedding_length(n_embed)
|
||||||
self.gguf_writer.add_feed_forward_length(4 * n_embed)
|
self.gguf_writer.add_feed_forward_length(4 * n_embed)
|
||||||
|
|
@ -2106,6 +2247,8 @@ class BloomModel(TextModel):
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||||
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||||
|
assert n_head is not None
|
||||||
|
assert n_embed is not None
|
||||||
|
|
||||||
name = re.sub(r'transformer\.', '', name)
|
name = re.sub(r'transformer\.', '', name)
|
||||||
|
|
||||||
|
|
@ -3716,6 +3859,7 @@ class LLaDAModel(TextModel):
|
||||||
|
|
||||||
if (rope_dim := hparams.get("head_dim")) is None:
|
if (rope_dim := hparams.get("head_dim")) is None:
|
||||||
n_heads = hparams.get("num_attention_heads", hparams.get("n_heads"))
|
n_heads = hparams.get("num_attention_heads", hparams.get("n_heads"))
|
||||||
|
assert n_heads is not None
|
||||||
rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads
|
rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads
|
||||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||||
|
|
||||||
|
|
@ -3747,6 +3891,7 @@ class LLaDAModel(TextModel):
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads"))
|
n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads"))
|
||||||
|
assert n_head is not None
|
||||||
n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads"))
|
n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads"))
|
||||||
|
|
||||||
if self.undo_permute:
|
if self.undo_permute:
|
||||||
|
|
@ -4303,6 +4448,14 @@ class Qwen2MoeModel(TextModel):
|
||||||
# process the experts separately
|
# process the experts separately
|
||||||
name = name.replace("language_model.", "") # InternVL
|
name = name.replace("language_model.", "") # InternVL
|
||||||
|
|
||||||
|
# NVFP4 expert weights are handled in _generate_nvfp4_tensors
|
||||||
|
if self._is_nvfp4 and "experts" in name:
|
||||||
|
if name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")):
|
||||||
|
if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
|
||||||
|
return
|
||||||
|
if not name.endswith(".weight"):
|
||||||
|
return
|
||||||
|
|
||||||
# handle aggregated expert tensors
|
# handle aggregated expert tensors
|
||||||
# GGUF stores dimensions reversed from PyTorch, so:
|
# GGUF stores dimensions reversed from PyTorch, so:
|
||||||
# PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
|
# PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
|
||||||
|
|
@ -4390,15 +4543,31 @@ class Qwen3Model(Qwen2Model):
|
||||||
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
||||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||||
|
|
||||||
# a bit hacky, but currently the only way to detect if this is a rerank model
|
if self._is_qwen3_reranker():
|
||||||
# ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
|
self._find_rerank_config()
|
||||||
|
|
||||||
|
def _is_qwen3_reranker(self) -> bool:
|
||||||
readme_path = self.dir_model / "README.md"
|
readme_path = self.dir_model / "README.md"
|
||||||
readme_text = ""
|
readme_text = ""
|
||||||
if readme_path.exists():
|
if readme_path.exists():
|
||||||
with readme_path.open("r", encoding="utf-8") as f:
|
with readme_path.open("r", encoding="utf-8") as f:
|
||||||
readme_text = f.read()
|
readme_text = f.read()
|
||||||
if "# Qwen3-Reranker" in readme_text:
|
|
||||||
self._find_rerank_config()
|
name_hints = [
|
||||||
|
str(self.dir_model.name),
|
||||||
|
str(self.hparams.get("_name_or_path", "")),
|
||||||
|
str(self.hparams.get("model_type", "")),
|
||||||
|
str(self.origin_hf_arch or ""),
|
||||||
|
]
|
||||||
|
name_hints = [hint.lower() for hint in name_hints if hint]
|
||||||
|
|
||||||
|
if "# qwen3-reranker" in readme_text.lower() or "# qwen3-vl-reranker" in readme_text.lower():
|
||||||
|
return True
|
||||||
|
|
||||||
|
if any("qwen3-reranker" in hint or "qwen3-vl-reranker" in hint for hint in name_hints):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return "sequenceclassification" in (self.origin_hf_arch or "").lower()
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
# deal with intern-s1-mini
|
# deal with intern-s1-mini
|
||||||
|
|
@ -4901,7 +5070,7 @@ class Phi2Model(TextModel):
|
||||||
self.gguf_writer.add_add_bos_token(False)
|
self.gguf_writer.add_add_bos_token(False)
|
||||||
|
|
||||||
|
|
||||||
@ModelBase.register("Phi3ForCausalLM")
|
@ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV")
|
||||||
class Phi3MiniModel(TextModel):
|
class Phi3MiniModel(TextModel):
|
||||||
model_arch = gguf.MODEL_ARCH.PHI3
|
model_arch = gguf.MODEL_ARCH.PHI3
|
||||||
|
|
||||||
|
|
@ -5076,6 +5245,129 @@ class Phi3MiniModel(TextModel):
|
||||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
|
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
|
||||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
|
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
|
||||||
|
|
||||||
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
if name.startswith(("model.vision_tower.", "vision_tower.", "model.mm_projector.", "mm_projector.")):
|
||||||
|
return
|
||||||
|
|
||||||
|
yield from super().modify_tensors(data_torch, name, bid)
|
||||||
|
|
||||||
|
|
||||||
|
@ModelBase.register("Phi4ForCausalLMV")
|
||||||
|
class Phi4VisionMmprojModel(MmprojModel):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
assert self.hparams_vision is not None
|
||||||
|
|
||||||
|
self.vision_total_layers = int(self.find_vparam(self.n_block_keys))
|
||||||
|
if self.vision_total_layers < 2:
|
||||||
|
raise ValueError(
|
||||||
|
f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Phi-4 uses SigLIP2 hidden_states[-2], so export one fewer encoder block and
|
||||||
|
# drop post-layernorm/head weights. This makes the GGUF runtime output match
|
||||||
|
# the feature map consumed by the patched siglip.cpp Phi-4 projector path.
|
||||||
|
self.vision_export_layers = self.vision_total_layers - 1
|
||||||
|
self.vision_last_layer_idx = self.vision_total_layers - 1
|
||||||
|
|
||||||
|
for key in self.n_block_keys:
|
||||||
|
if key in self.hparams_vision:
|
||||||
|
self.hparams_vision[key] = self.vision_export_layers
|
||||||
|
break
|
||||||
|
|
||||||
|
self.block_count = self.vision_export_layers
|
||||||
|
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
|
||||||
|
|
||||||
|
patch_size = self.preprocessor_config.get("patch_size")
|
||||||
|
if patch_size is None:
|
||||||
|
raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json")
|
||||||
|
|
||||||
|
self.hparams_vision["patch_size"] = patch_size
|
||||||
|
|
||||||
|
pos_emb_name = next(
|
||||||
|
(
|
||||||
|
name for name in self.model_tensors
|
||||||
|
if name.endswith("vision_model.embeddings.position_embedding.weight")
|
||||||
|
),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
if pos_emb_name is None:
|
||||||
|
raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight")
|
||||||
|
|
||||||
|
pos_emb_shape = self.model_tensors[pos_emb_name]().shape
|
||||||
|
base_grid_tokens = int(pos_emb_shape[0])
|
||||||
|
grid_side = math.isqrt(base_grid_tokens)
|
||||||
|
if grid_side * grid_side != base_grid_tokens:
|
||||||
|
raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}")
|
||||||
|
|
||||||
|
self.hparams_vision["image_size"] = grid_side * patch_size
|
||||||
|
|
||||||
|
min_num_patches = self.preprocessor_config.get("min_num_patches", self.global_config.get("min_num_patches"))
|
||||||
|
max_num_patches = self.preprocessor_config.get("max_num_patches", self.global_config.get("max_num_patches"))
|
||||||
|
if min_num_patches is None or max_num_patches is None:
|
||||||
|
raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches")
|
||||||
|
|
||||||
|
self.min_pixels = int(min_num_patches) * patch_size * patch_size
|
||||||
|
self.max_pixels = int(max_num_patches) * patch_size * patch_size
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
assert self.hparams_vision is not None
|
||||||
|
|
||||||
|
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4)
|
||||||
|
self.gguf_writer.add_vision_min_pixels(self.min_pixels)
|
||||||
|
self.gguf_writer.add_vision_max_pixels(self.max_pixels)
|
||||||
|
self.gguf_writer.add_vision_use_gelu(True)
|
||||||
|
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
|
||||||
|
|
||||||
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
if name.startswith(("model.vision_tower.vision_tower.", "vision_tower.")):
|
||||||
|
if ".vision_model.head." in name:
|
||||||
|
return
|
||||||
|
|
||||||
|
new_name = name.replace("model.vision_tower.vision_tower.", "vision_tower.")
|
||||||
|
|
||||||
|
if ".vision_model.post_layernorm." in new_name:
|
||||||
|
return
|
||||||
|
|
||||||
|
if bid is not None and bid == self.vision_last_layer_idx:
|
||||||
|
return
|
||||||
|
|
||||||
|
if new_name.endswith("vision_model.embeddings.patch_embedding.weight"):
|
||||||
|
assert self.hparams_vision is not None
|
||||||
|
if data_torch.ndim != 2:
|
||||||
|
raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}")
|
||||||
|
|
||||||
|
patch_area = self.hparams_vision["patch_size"] ** 2
|
||||||
|
in_features = data_torch.shape[1]
|
||||||
|
if in_features % patch_area != 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}"
|
||||||
|
)
|
||||||
|
|
||||||
|
num_channels = in_features // patch_area
|
||||||
|
patch_size = self.hparams_vision["patch_size"]
|
||||||
|
data_torch = data_torch.view(data_torch.shape[0], patch_size, patch_size, num_channels)
|
||||||
|
data_torch = data_torch.permute(0, 3, 1, 2)
|
||||||
|
|
||||||
|
yield from super().modify_tensors(data_torch, new_name, bid)
|
||||||
|
return
|
||||||
|
|
||||||
|
if name.startswith(("model.mm_projector.", "mm_projector.")):
|
||||||
|
local_name = name
|
||||||
|
local_name = local_name.replace("model.mm_projector.", "")
|
||||||
|
local_name = local_name.replace("mm_projector.", "")
|
||||||
|
|
||||||
|
if not (local_name.startswith("0.") or local_name.startswith("2.")):
|
||||||
|
return
|
||||||
|
|
||||||
|
suffix = ".bias" if local_name.endswith(".bias") else ".weight"
|
||||||
|
mm_idx = int(local_name.split(".", maxsplit=1)[0])
|
||||||
|
yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch)
|
||||||
|
return
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
@ModelBase.register("PhiMoEForCausalLM")
|
@ModelBase.register("PhiMoEForCausalLM")
|
||||||
class PhiMoeModel(Phi3MiniModel):
|
class PhiMoeModel(Phi3MiniModel):
|
||||||
|
|
@ -9201,7 +9493,9 @@ class ChatGLMModel(TextModel):
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||||
|
assert n_embed is not None
|
||||||
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||||
|
assert n_head is not None
|
||||||
n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
|
n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
|
||||||
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
|
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
|
||||||
self.gguf_writer.add_embedding_length(n_embed)
|
self.gguf_writer.add_embedding_length(n_embed)
|
||||||
|
|
@ -9727,20 +10021,35 @@ class NemotronHModel(GraniteHybridModel):
|
||||||
# M: Mamba2, *: Attention, -: MLP
|
# M: Mamba2, *: Attention, -: MLP
|
||||||
# MoE:
|
# MoE:
|
||||||
# M: Mamba2, *: Attention, E: Expert
|
# M: Mamba2, *: Attention, E: Expert
|
||||||
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
|
pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type")
|
||||||
self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
|
if pattern is None:
|
||||||
self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]
|
self._ssm_layers = []
|
||||||
|
self._mlp_layers = []
|
||||||
|
elif isinstance(pattern, str):
|
||||||
|
self._ssm_layers = [i for i, val in enumerate(pattern) if val == "M"]
|
||||||
|
self._mlp_layers = [i for i, val in enumerate(pattern) if val == ("E" if self.is_moe else "-")]
|
||||||
|
else:
|
||||||
|
self._ssm_layers = [i for i, val in enumerate(pattern) if val == "mamba"]
|
||||||
|
self._mlp_layers = [i for i, val in enumerate(pattern) if val == "moe"]
|
||||||
|
|
||||||
def get_attn_layers(self):
|
def get_attn_layers(self):
|
||||||
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
|
pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type")
|
||||||
assert len(hybrid_override_pattern) == self.block_count, "Mismatch between hybrid override and num_hidden_layers!"
|
if pattern is None:
|
||||||
return [i for i, val in enumerate(hybrid_override_pattern) if val == "*"]
|
return []
|
||||||
|
assert len(pattern) == self.block_count, f"Mismatch between pattern ({len(pattern)}) and block_count ({self.block_count})!"
|
||||||
|
if isinstance(pattern, str):
|
||||||
|
return [i for i, val in enumerate(pattern) if val == "*"]
|
||||||
|
|
||||||
|
return [i for i, val in enumerate(pattern) if val == "attention"]
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
|
|
||||||
self.gguf_writer.add_key_length(self.head_dim)
|
head_dim = self.head_dim
|
||||||
self.gguf_writer.add_value_length(self.head_dim)
|
if head_dim is None:
|
||||||
|
raise ValueError("Could not find the attention head dim in config")
|
||||||
|
self.gguf_writer.add_key_length(head_dim)
|
||||||
|
self.gguf_writer.add_value_length(head_dim)
|
||||||
|
|
||||||
# Set feed_forward_length
|
# Set feed_forward_length
|
||||||
# NOTE: This will trigger an override warning. This is preferable to
|
# NOTE: This will trigger an override warning. This is preferable to
|
||||||
|
|
@ -9768,6 +10077,9 @@ class NemotronHModel(GraniteHybridModel):
|
||||||
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
|
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
|
||||||
self.gguf_writer.add_expert_used_count(n_experts_used)
|
self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||||
|
|
||||||
|
if (latent_size := self.hparams.get("moe_latent_size")) is not None:
|
||||||
|
self.gguf_writer.add_moe_latent_size(latent_size)
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
super().set_vocab()
|
super().set_vocab()
|
||||||
|
|
||||||
|
|
@ -9787,6 +10099,13 @@ class NemotronHModel(GraniteHybridModel):
|
||||||
name = name[len("language_model."):]
|
name = name[len("language_model."):]
|
||||||
|
|
||||||
if self.is_moe and bid is not None:
|
if self.is_moe and bid is not None:
|
||||||
|
# Skip Multi-Token Prediction (MTP) tensors. These are used for
|
||||||
|
# for speculative decoding but we don't include them in this model
|
||||||
|
# conversion. See https://github.com/ggml-org/llama.cpp/pull/18886
|
||||||
|
if name.startswith("mtp."):
|
||||||
|
logger.info(f"gguf: Skipping MTP (Speculative) layer: {name}")
|
||||||
|
return
|
||||||
|
|
||||||
if name.endswith("mixer.gate.e_score_correction_bias"):
|
if name.endswith("mixer.gate.e_score_correction_bias"):
|
||||||
new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
||||||
yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
|
yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
|
||||||
|
|
|
||||||
|
|
@ -128,6 +128,12 @@ class LoraTorchTensor:
|
||||||
assert dim is None
|
assert dim is None
|
||||||
return self.shape
|
return self.shape
|
||||||
|
|
||||||
|
def contiguous(self) -> LoraTorchTensor:
|
||||||
|
return LoraTorchTensor(
|
||||||
|
self._lora_A.contiguous(),
|
||||||
|
self._lora_B.contiguous(),
|
||||||
|
)
|
||||||
|
|
||||||
def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
|
def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
|
||||||
if isinstance(shape[0], tuple):
|
if isinstance(shape[0], tuple):
|
||||||
new_shape: tuple[int, ...] = shape[0]
|
new_shape: tuple[int, ...] = shape[0]
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,343 @@
|
||||||
|
# OpenVINO Backend for llama.cpp
|
||||||
|
[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge.
|
||||||
|
This document describes the [OpenVINO backend for llama.cpp](../../src/ggml-openvino), which enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.
|
||||||
|
|
||||||
|
The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a translation layer for core GGML operations. The OpenVINO backend replaces the standard GGML graph execution path with Intel's OpenVINO inference engine. This approach allows the same GGUF model file to run on Intel CPUs, Intel GPUs (integrated and discrete), and Intel NPUs without changes to the model or the rest of the llama.cpp stack. When a `ggml_cgraph` is dispatched to OpenVINO backend, it:
|
||||||
|
|
||||||
|
- Walks the GGML graph and identifies inputs, outputs, weights, and KV cache tensors.
|
||||||
|
- Translates the GGML operations into an `ov::Model` using OpenVINO's frontend API.
|
||||||
|
- Compiles and caches the model for the target device.
|
||||||
|
- Binds GGML tensor memory to OpenVINO inference tensors and runs inference.
|
||||||
|
|
||||||
|
## Supported Devices
|
||||||
|
|
||||||
|
OpenVINO backend supports the following hardware:
|
||||||
|
|
||||||
|
- Intel CPUs
|
||||||
|
- Intel GPUs (integrated and discrete)
|
||||||
|
- Intel NPUs
|
||||||
|
|
||||||
|
Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html), the llama.cpp OpenVINO backend has been validated specifically on AI PCs such as the Intel® Core™ Ultra Series 1 and Series 2.
|
||||||
|
|
||||||
|
## Supported Model Precisions
|
||||||
|
|
||||||
|
- `FP16`
|
||||||
|
- `BF16` (on Intel Xeon)
|
||||||
|
- `Q8_0`
|
||||||
|
- `Q4_0`
|
||||||
|
- `Q4_1`
|
||||||
|
- `Q4_K`
|
||||||
|
- `Q4_K_M`
|
||||||
|
- `Q5_K` (converted to Q8_0_C at runtime)
|
||||||
|
- `Q6_K` (converted to Q8_0_C at runtime)
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> Accuracy validation and performance optimizations for quantized models are a work in progress.
|
||||||
|
|
||||||
|
## Quantization Support Details
|
||||||
|
|
||||||
|
### CPU and GPU
|
||||||
|
|
||||||
|
- **`Q4_0`, `Q4_1`, `Q4_K_M`, `Q6_K` models are supported**
|
||||||
|
- `Q5_K` and `Q6_K` tensors are converted to `Q8_0_C`
|
||||||
|
|
||||||
|
### NPU
|
||||||
|
|
||||||
|
- **Primary supported quantization scheme is `Q4_0`**
|
||||||
|
- `Q6_K` tensors are requantized to `Q4_0_128` in general. For embedding weights, `Q6_K` tensors are requantized to `Q8_0_C` except for the token embedding matrix which is dequantized to fp16
|
||||||
|
|
||||||
|
### Additional Notes
|
||||||
|
|
||||||
|
- Both `Q4_0` and `Q4_1` models use `Q6_K` for the token embedding tensor and the final matmul weight tensor (often the same tensor)
|
||||||
|
- `Q4_0` models may produce some `Q4_1` tensors if an imatrix is provided during quantization using `llama-quantize`
|
||||||
|
- `Q4_K_M` models may include both `Q6_K` and `Q5_K` tensors (observed in Phi-3)
|
||||||
|
|
||||||
|
## Validated Models
|
||||||
|
|
||||||
|
The following models have been validated for functionality on Intel® Core™ Ultra Series 1 and Series 2:
|
||||||
|
|
||||||
|
- [Llama-3.2-1B-Instruct-GGUF](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/)
|
||||||
|
- [Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF)
|
||||||
|
- [microsoft/Phi-3-mini-4k-instruct-gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf)
|
||||||
|
- [Qwen/Qwen2.5-1.5B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF)
|
||||||
|
- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B-GGUF)
|
||||||
|
- [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-S-1B-sft-gguf)
|
||||||
|
- [tencent/Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF)
|
||||||
|
- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF)
|
||||||
|
- [bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF)
|
||||||
|
|
||||||
|
## Build Instructions
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
- Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
|
||||||
|
- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).
|
||||||
|
|
||||||
|
- **Linux:**
|
||||||
|
- Git, CMake, and Ninja software tools are needed for building.
|
||||||
|
```bash
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
|
||||||
|
```
|
||||||
|
- OpenCL
|
||||||
|
```bash
|
||||||
|
sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Windows:**
|
||||||
|
- Download and install [Microsoft Visual Studio 2022 Build Tools](https://aka.ms/vs/17/release/vs_BuildTools.exe). During installation, select the **"Desktop development with C++"** workload.
|
||||||
|
|
||||||
|
- Install required tools:
|
||||||
|
```powershell
|
||||||
|
# Windows PowerShell
|
||||||
|
winget install Git.Git
|
||||||
|
winget install GNU.Wget
|
||||||
|
winget install Ninja-build.Ninja
|
||||||
|
```
|
||||||
|
|
||||||
|
- Install **OpenCL** using **vcpkg**:
|
||||||
|
```powershell
|
||||||
|
# Windows PowerShell
|
||||||
|
cd C:\
|
||||||
|
git clone https://github.com/microsoft/vcpkg
|
||||||
|
cd vcpkg
|
||||||
|
.\bootstrap-vcpkg.bat
|
||||||
|
.\vcpkg install opencl
|
||||||
|
# Optional but recommended: Integrate vcpkg with Visual Studio / CMake:
|
||||||
|
.\vcpkg integrate install
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1. Install OpenVINO Runtime
|
||||||
|
|
||||||
|
- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html)
|
||||||
|
|
||||||
|
- **Linux:**
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>📦 Click to expand OpenVINO installation from an archive file on Ubuntu</summary>
|
||||||
|
<br>
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
|
||||||
|
chmod +x install-openvino-from-archive.sh
|
||||||
|
./install-openvino-from-archive.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify OpenVINO is initialized properly:
|
||||||
|
```bash
|
||||||
|
echo $OpenVINO_DIR
|
||||||
|
```
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
### 2. Build llama.cpp with OpenVINO Backend
|
||||||
|
|
||||||
|
Clone the OpenVINO-enabled llama.cpp fork and build it:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/ggml-org/llama.cpp
|
||||||
|
cd llama.cpp
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Linux:**
|
||||||
|
```bash
|
||||||
|
source /opt/intel/openvino/setupvars.sh
|
||||||
|
cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
|
||||||
|
cmake --build build/ReleaseOV --parallel
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Windows:**
|
||||||
|
```cmd
|
||||||
|
# x64 Native Tools Command Prompt for VS 2022
|
||||||
|
"C:\Program Files (x86)\Intel\openvino_2026.0\setupvars.bat"
|
||||||
|
cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
|
||||||
|
cmake --build build\ReleaseOV --parallel
|
||||||
|
```
|
||||||
|
> [!NOTE]
|
||||||
|
> Use `x64 Native Tools Command Prompt` for Windows build. After building, you could use either `cmd` or `PowerShell` to run the OpenVINO backend.
|
||||||
|
|
||||||
|
### 3. Download Sample Model
|
||||||
|
|
||||||
|
Download models for testing:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Linux
|
||||||
|
mkdir -p ~/models/
|
||||||
|
wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \
|
||||||
|
-O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
|
||||||
|
|
||||||
|
# Windows PowerShell
|
||||||
|
mkdir C:\models
|
||||||
|
Invoke-WebRequest -Uri https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -OutFile C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
|
||||||
|
|
||||||
|
# Windows Command Line
|
||||||
|
mkdir C:\models
|
||||||
|
curl -L https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -o C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Run Inference with OpenVINO Backend
|
||||||
|
|
||||||
|
When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# If device is unset or unavailable, defaults to CPU.
|
||||||
|
# If the system has multiple GPUs, use GPU.0 or GPU.1 to explicitly target a specific GPU.
|
||||||
|
|
||||||
|
# Linux
|
||||||
|
export GGML_OPENVINO_DEVICE=GPU
|
||||||
|
# To run llama-simple:
|
||||||
|
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
|
||||||
|
# To run in chat mode:
|
||||||
|
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
|
||||||
|
|
||||||
|
# Windows Command Line
|
||||||
|
set GGML_OPENVINO_DEVICE=GPU
|
||||||
|
# Windows PowerShell
|
||||||
|
$env:GGML_OPENVINO_DEVICE = "GPU"
|
||||||
|
|
||||||
|
# To run llama-simple
|
||||||
|
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
|
||||||
|
# To run in chat mode:
|
||||||
|
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
|
||||||
|
|
||||||
|
```
|
||||||
|
> [!NOTE]
|
||||||
|
> On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) for more details.
|
||||||
|
|
||||||
|
|
||||||
|
### Docker Build
|
||||||
|
|
||||||
|
You can build and run llama.cpp with OpenVINO backend using Docker.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build the base runtime image with compiled shared libraries and minimal dependencies.
|
||||||
|
docker build -t llama-openvino:base -f .devops/openvino.Dockerfile .
|
||||||
|
|
||||||
|
# Build the complete image with all binaries, Python tools, gguf-py library, and model conversion utilities.
|
||||||
|
docker build --target=full -t llama-openvino:full -f .devops/openvino.Dockerfile .
|
||||||
|
|
||||||
|
# Build a minimal CLI-only image containing just the llama-cli executable.
|
||||||
|
docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
|
||||||
|
|
||||||
|
# Build a server-only image with the llama-server executable, health check endpoint, and REST API support.
|
||||||
|
docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .
|
||||||
|
|
||||||
|
# If you are behind a proxy:
|
||||||
|
docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
|
||||||
|
```
|
||||||
|
|
||||||
|
Run llama.cpp with OpenVINO backend Docker container.
|
||||||
|
Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run Docker container
|
||||||
|
docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
|
||||||
|
|
||||||
|
# With Intel GPU access (iGPU or dGPU)
|
||||||
|
docker run --rm -it -v ~/models:/models \
|
||||||
|
--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
|
||||||
|
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
|
||||||
|
|
||||||
|
# With Intel NPU access
|
||||||
|
docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
|
||||||
|
--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
|
||||||
|
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
|
||||||
|
```
|
||||||
|
|
||||||
|
Run Llama.cpp Server with OpenVINO Backend:
|
||||||
|
```bash
|
||||||
|
# Run the Server Docker container
|
||||||
|
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
|
||||||
|
|
||||||
|
# In a NEW terminal, test the server with curl
|
||||||
|
|
||||||
|
# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
|
||||||
|
export NO_PROXY=localhost,127.0.0.1
|
||||||
|
|
||||||
|
# Test health endpoint
|
||||||
|
curl -f http://localhost:8080/health
|
||||||
|
|
||||||
|
# Test with a simple prompt
|
||||||
|
curl -X POST "http://localhost:8080/v1/chat/completions" -H "Content-Type: application/json" \
|
||||||
|
-d '{"messages":[{"role":"user","content":"Write a poem about OpenVINO"}],"max_tokens":100}' | jq .
|
||||||
|
```
|
||||||
|
|
||||||
|
## Runtime Configuration
|
||||||
|
|
||||||
|
The OpenVINO backend can be configured using the following environment variables at runtime to control device selection, caching, debugging, and profiling behavior.
|
||||||
|
|
||||||
|
### Configuration Options
|
||||||
|
|
||||||
|
| Variable | Default | Description |
|
||||||
|
|-----------------------------------|------------|-------------------------------------------------------------------------------------------------------------|
|
||||||
|
| `GGML_OPENVINO_DEVICE` | `CPU` | Specify the target device (CPU, GPU, NPU). On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html). When set to **NPU**, static compilation mode is enabled for optimal performance. |
|
||||||
|
| `GGML_OPENVINO_CACHE_DIR` | `not set` | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** |
|
||||||
|
| `GGML_OPENVINO_PREFILL_CHUNK_SIZE`| `256` | Token chunk size for **NPU** prefill. |
|
||||||
|
| `GGML_OPENVINO_STATEFUL_EXECUTION`| `0`        | Enable stateful KV cache for better performance. Recommended on CPU and GPU. |
|
||||||
|
| `GGML_OPENVINO_PROFILING` | `0` | Enable execution-time profiling. |
|
||||||
|
| `GGML_OPENVINO_DUMP_CGRAPH` | `0` | Dump the GGML compute graph to `cgraph_ov.txt`. |
|
||||||
|
| `GGML_OPENVINO_DUMP_IR` | `0` | Serialize OpenVINO IR files with timestamps. |
|
||||||
|
| `GGML_OPENVINO_DEBUG_INPUT` | `0` | Enable input debugging and print input tensor info. |
|
||||||
|
| `GGML_OPENVINO_DEBUG_OUTPUT` | `0` | Enable output debugging and print output tensor info. |
|
||||||
|
| `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS` | `0` | Print tensor address map once. |
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
>`GGML_OPENVINO_STATEFUL_EXECUTION` is an **Experimental** feature that allows stateful execution, managing the KV cache internally inside the OpenVINO model and improving performance on CPUs and GPUs. Stateful execution is not effective on NPUs, and not all models currently support this feature. It has been validated only with the llama-simple, llama-cli, llama-bench, and llama-run applications; enabling it is recommended for the best performance with those applications. Other applications, such as llama-server and llama-perplexity, are not yet supported.
|
||||||
|
|
||||||
|
### Example Usage
|
||||||
|
|
||||||
|
#### GPU Inference with Profiling
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# If the system has multiple GPUs, use GPU.0 or GPU.1 to explicitly target a specific GPU.
|
||||||
|
|
||||||
|
# Linux
|
||||||
|
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
|
||||||
|
export GGML_OPENVINO_PROFILING=1
|
||||||
|
export GGML_OPENVINO_DEVICE=GPU
|
||||||
|
|
||||||
|
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
|
||||||
|
|
||||||
|
# Windows Command Line
|
||||||
|
set GGML_OPENVINO_CACHE_DIR=C:\tmp\ov_cache
|
||||||
|
set GGML_OPENVINO_PROFILING=1
|
||||||
|
set GGML_OPENVINO_DEVICE=GPU
|
||||||
|
|
||||||
|
# Windows PowerShell
|
||||||
|
$env:GGML_OPENVINO_CACHE_DIR = "C:\tmp\ov_cache"
|
||||||
|
$env:GGML_OPENVINO_PROFILING = "1"
|
||||||
|
$env:GGML_OPENVINO_DEVICE = "GPU"
|
||||||
|
|
||||||
|
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
#### llama-bench
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# -fa 1 is required when running llama-bench with the OpenVINO backend.
|
||||||
|
GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1
|
||||||
|
```
|
||||||
|
|
||||||
|
### NPU Notes
|
||||||
|
|
||||||
|
- Model caching is not yet supported
|
||||||
|
- Does not support llama-server -np > 1 (multiple parallel sequences)
|
||||||
|
- Only supports llama-perplexity -b 512 or smaller
|
||||||
|
|
||||||
|
## Llama.cpp Tools
|
||||||
|
|
||||||
|
The following tools work with the OpenVINO backend on CPU, GPU, NPU:
|
||||||
|
- llama-simple
|
||||||
|
- llama-run
|
||||||
|
- llama-cli
|
||||||
|
- llama-server
|
||||||
|
- llama-bench
|
||||||
|
- llama-perplexity
|
||||||
|
|
||||||
|
## Work in Progress
|
||||||
|
|
||||||
|
- Performance and memory optimizations
|
||||||
|
- Accuracy validation
|
||||||
|
- Broader quantization coverage
|
||||||
|
- Support for additional model architectures
|
||||||
|
|
@ -382,17 +382,27 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||||
|
|
||||||
## Windows
|
## Windows
|
||||||
|
|
||||||
### I. Setup Environment
|
### Install GPU driver
|
||||||
|
|
||||||
1. Install GPU driver
|
|
||||||
|
|
||||||
Intel GPU drivers instructions guide and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
|
Intel GPU drivers instructions guide and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
|
||||||
|
|
||||||
2. Install Visual Studio
|
### Option 1: download the binary package directly
|
||||||
|
|
||||||
|
Download the binary package for Windows from: https://github.com/ggml-org/llama.cpp/releases.
|
||||||
|
|
||||||
|
Extract the package to a local folder and run the llama tools directly. Refer to [Run the inference](#iii-run-the-inference-1).
|
||||||
|
|
||||||
|
Note: the package includes the SYCL runtime and all required DLL files, so there is no need to install the oneAPI package and activate it.
|
||||||
|
|
||||||
|
### Option 2: build locally from the source code.
|
||||||
|
|
||||||
|
#### I. Setup environment
|
||||||
|
|
||||||
|
1. Install Visual Studio
|
||||||
|
|
||||||
If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).
|
If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).
|
||||||
|
|
||||||
3. Install Intel® oneAPI Base toolkit
|
2. Install Intel® oneAPI Base toolkit
|
||||||
|
|
||||||
SYCL backend depends on:
|
SYCL backend depends on:
|
||||||
- Intel® oneAPI DPC++/C++ compiler/runtime.
|
- Intel® oneAPI DPC++/C++ compiler/runtime.
|
||||||
|
|
@ -443,25 +453,25 @@ Output (example):
|
||||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
|
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
|
||||||
```
|
```
|
||||||
|
|
||||||
4. Install build tools
|
3. Install build tools
|
||||||
|
|
||||||
a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
|
a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
|
||||||
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
|
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
|
||||||
|
|
||||||
|
|
||||||
### II. Build llama.cpp
|
#### II. Build llama.cpp
|
||||||
|
|
||||||
You can download the release package for Windows directly, which includes the binary files and required oneAPI DLL files.
|
You can download the release package for Windows directly, which includes the binary files and required oneAPI DLL files.
|
||||||
|
|
||||||
Choose one of the following methods to build from the source code.
|
Choose one of the following methods to build from the source code.
|
||||||
|
|
||||||
#### 1. Script
|
##### Option 1: Script
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
.\examples\sycl\win-build-sycl.bat
|
.\examples\sycl\win-build-sycl.bat
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 2. CMake
|
##### Option 2: CMake
|
||||||
|
|
||||||
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
|
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
|
||||||
|
|
||||||
|
|
@ -490,7 +500,7 @@ cmake --preset x64-windows-sycl-debug
|
||||||
cmake --build build-x64-windows-sycl-debug -j --target llama-completion
|
cmake --build build-x64-windows-sycl-debug -j --target llama-completion
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 3. Visual Studio
|
##### Option 3: Visual Studio
|
||||||
|
|
||||||
You have two options to use Visual Studio to build llama.cpp:
|
You have two options to use Visual Studio to build llama.cpp:
|
||||||
- As CMake Project using CMake presets.
|
- As CMake Project using CMake presets.
|
||||||
|
|
@ -500,7 +510,7 @@ You have two options to use Visual Studio to build llama.cpp:
|
||||||
|
|
||||||
All following commands are executed in PowerShell.
|
All following commands are executed in PowerShell.
|
||||||
|
|
||||||
##### - Open as a CMake Project
|
###### - Open as a CMake Project
|
||||||
|
|
||||||
You can use Visual Studio to open the `llama.cpp` folder directly as a CMake project. Before compiling, select one of the SYCL CMake presets:
|
You can use Visual Studio to open the `llama.cpp` folder directly as a CMake project. Before compiling, select one of the SYCL CMake presets:
|
||||||
|
|
||||||
|
|
@ -515,7 +525,7 @@ You can use Visual Studio to open the `llama.cpp` folder directly as a CMake pro
|
||||||
cmake --build build --config Release -j --target llama-completion
|
cmake --build build --config Release -j --target llama-completion
|
||||||
```
|
```
|
||||||
|
|
||||||
##### - Generating a Visual Studio Solution
|
###### - Generating a Visual Studio Solution
|
||||||
|
|
||||||
You can use Visual Studio solution to build and work on llama.cpp on Windows. You need to convert the CMake Project into a `.sln` file.
|
You can use Visual Studio solution to build and work on llama.cpp on Windows. You need to convert the CMake Project into a `.sln` file.
|
||||||
|
|
||||||
|
|
@ -603,7 +613,7 @@ found 2 SYCL devices:
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Choose level-zero devices
|
##### Choose level-zero devices
|
||||||
|
|
||||||
|Chosen Device ID|Setting|
|
|Chosen Device ID|Setting|
|
||||||
|-|-|
|
|-|-|
|
||||||
|
|
@ -611,7 +621,7 @@ found 2 SYCL devices:
|
||||||
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|
||||||
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"` or `set ONEAPI_DEVICE_SELECTOR="level_zero:*"`|
|
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"` or `set ONEAPI_DEVICE_SELECTOR="level_zero:*"`|
|
||||||
|
|
||||||
#### Execute
|
##### Execute
|
||||||
|
|
||||||
Choose one of the following methods to run.
|
Choose one of the following methods to run.
|
||||||
|
|
||||||
|
|
@ -669,7 +679,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||||
|
|
||||||
## Environment Variable
|
## Environment Variable
|
||||||
|
|
||||||
#### Build
|
### Build
|
||||||
|
|
||||||
| Name | Value | Function |
|
| Name | Value | Function |
|
||||||
|--------------------|---------------------------------------|---------------------------------------------|
|
|--------------------|---------------------------------------|---------------------------------------------|
|
||||||
|
|
@ -684,7 +694,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||||
|
|
||||||
1. FP32 and FP16 have different performance impacts on LLMs. It is recommended to test both for better prompt processing performance on your models. You need to rebuild the code after changing `GGML_SYCL_F16=OFF/ON`.
|
1. FP32 and FP16 have different performance impacts on LLMs. It is recommended to test both for better prompt processing performance on your models. You need to rebuild the code after changing `GGML_SYCL_F16=OFF/ON`.
|
||||||
|
|
||||||
#### Runtime
|
### Runtime
|
||||||
|
|
||||||
| Name | Value | Function |
|
| Name | Value | Function |
|
||||||
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
|
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
|
@ -777,7 +787,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||||
```
|
```
|
||||||
|
|
||||||
### **GitHub contribution**:
|
### **GitHub contribution**:
|
||||||
Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
|
Please add the `[SYCL]` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
|
||||||
|
|
||||||
## TODO
|
## TODO
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -55,7 +55,8 @@ LLAMA_MAC_BUILD=$PWD/build/ggml-virtgpu-backend
|
||||||
cmake -S . -B $LLAMA_MAC_BUILD \
|
cmake -S . -B $LLAMA_MAC_BUILD \
|
||||||
-DGGML_NATIVE=OFF \
|
-DGGML_NATIVE=OFF \
|
||||||
-DLLAMA_CURL=ON \
|
-DLLAMA_CURL=ON \
|
||||||
-DGGML_REMOTINGBACKEND=ONLY \
|
-DGGML_VIRTGPU=ON \
|
||||||
|
-DGGML_VIRTGPU_BACKEND=ONLY \
|
||||||
-DGGML_METAL=ON
|
-DGGML_METAL=ON
|
||||||
|
|
||||||
TARGETS="ggml-metal"
|
TARGETS="ggml-metal"
|
||||||
|
|
@ -71,6 +72,7 @@ cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $EXTRA_TARGETS
|
||||||
```bash
|
```bash
|
||||||
# Build virglrenderer with APIR support
|
# Build virglrenderer with APIR support
|
||||||
mkdir virglrenderer
|
mkdir virglrenderer
|
||||||
|
cd virglrenderer
|
||||||
git clone https://gitlab.freedesktop.org/kpouget/virglrenderer -b main-macos src
|
git clone https://gitlab.freedesktop.org/kpouget/virglrenderer -b main-macos src
|
||||||
cd src
|
cd src
|
||||||
|
|
||||||
|
|
@ -95,7 +97,7 @@ mkdir llama.cpp
|
||||||
git clone https://github.com/ggml-org/llama.cpp.git src
|
git clone https://github.com/ggml-org/llama.cpp.git src
|
||||||
cd src
|
cd src
|
||||||
|
|
||||||
LLAMA_LINUX_BUILD=$PWD//build-virtgpu
|
LLAMA_LINUX_BUILD=$PWD/build-virtgpu
|
||||||
|
|
||||||
cmake -S . -B $LLAMA_LINUX_BUILD \
|
cmake -S . -B $LLAMA_LINUX_BUILD \
|
||||||
-DGGML_VIRTGPU=ON
|
-DGGML_VIRTGPU=ON
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,21 @@ cd llama.cpp
|
||||||
|
|
||||||
The following sections describe how to build with different backends and options.
|
The following sections describe how to build with different backends and options.
|
||||||
|
|
||||||
|
* [CPU Build](#cpu-build)
|
||||||
|
* [BLAS Build](#blas-build)
|
||||||
|
* [Metal Build](#metal-build)
|
||||||
|
* [SYCL](#sycl)
|
||||||
|
* [CUDA](#cuda)
|
||||||
|
* [MUSA](#musa)
|
||||||
|
* [HIP](#hip)
|
||||||
|
* [Vulkan](#vulkan)
|
||||||
|
* [CANN](#cann)
|
||||||
|
* [Arm® KleidiAI™](#arm-kleidiai)
|
||||||
|
* [OpenCL](#opencl)
|
||||||
|
* [Android](#android-1)
|
||||||
|
* [OpenVINO](#openvino)
|
||||||
|
* [Notes about GPU-accelerated backends](#notes-about-gpu-accelerated-backends)
|
||||||
|
|
||||||
## CPU Build
|
## CPU Build
|
||||||
|
|
||||||
Build llama.cpp using `CMake`:
|
Build llama.cpp using `CMake`:
|
||||||
|
|
@ -254,6 +269,14 @@ The environment variable [`CUDA_SCALE_LAUNCH_QUEUES`](https://docs.nvidia.com/cu
|
||||||
|
|
||||||
Consider setting `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.
|
Consider setting `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.
|
||||||
|
|
||||||
|
#### GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F
|
||||||
|
|
||||||
|
Use `GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F` environment variable to use FP32 compute type on all GPUs in FP16 cuBLAS for preventing possible numerical overflows in exchange for slower prompt processing (small impact on RTX PRO/Datacenter products and significant on GeForce products).
|
||||||
|
|
||||||
|
#### GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F
|
||||||
|
|
||||||
|
Use `GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F` environment variable to force use FP16 compute type (instead of default FP32) in FP16 cuBLAS for V100, CDNA and RDNA4.
|
||||||
|
|
||||||
### Unified Memory
|
### Unified Memory
|
||||||
|
|
||||||
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
|
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
|
||||||
|
|
@ -265,7 +288,7 @@ The following compilation options are also available to tweak performance:
|
||||||
| Option | Legal values | Default | Description |
|
| Option | Legal values | Default | Description |
|
||||||
|-------------------------------|------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|-------------------------------|------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
||||||
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models. There may be issues with numerical overflows (except for CDNA and RDNA4) and memory use will be higher. Prompt processing may become faster on recent datacenter GPUs (the custom kernels were tuned primarily for RTX 3000/4000). |
|
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models. There may be issues with numerical overflows (except for V100, CDNA and RDNA4 which use FP32 compute type by default) and memory use will be higher. Prompt processing may become faster on recent datacenter GPUs (the custom kernels were tuned primarily for RTX 3000/4000). |
|
||||||
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
||||||
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
|
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
|
||||||
|
|
||||||
|
|
@ -599,7 +622,13 @@ If KleidiAI is enabled, the output will contain a line similar to:
|
||||||
```
|
```
|
||||||
load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
|
load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
|
||||||
```
|
```
|
||||||
KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
|
KleidiAI’s microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm, SVE, and SME. Llama.cpp selects the most efficient kernels at runtime based on detected CPU capabilities.
|
||||||
|
On CPUs that support SME, SME microkernels are enabled automatically using runtime detection.
|
||||||
|
The environment variable GGML_KLEIDIAI_SME can be used to control SME behavior:
|
||||||
|
- Not set: enable SME automatically if supported and detected.
|
||||||
|
- 0: disable SME.
|
||||||
|
- <n> > 0: enable SME and assume <n> available SME units (override auto detection).
|
||||||
|
If SME is not supported by the CPU, SME microkernels are always disabled.
|
||||||
|
|
||||||
Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
|
Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
|
||||||
|
|
||||||
|
|
@ -718,6 +747,14 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m
|
||||||
|
|
||||||
To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
|
To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
|
||||||
|
|
||||||
|
## OpenVINO
|
||||||
|
|
||||||
|
[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware (CPUs, GPUs, and NPUs).
|
||||||
|
|
||||||
|
For build instructions and usage examples, refer to [OPENVINO.md](backend/OPENVINO.md).
|
||||||
|
|
||||||
|
|
||||||
|
---
|
||||||
## Notes about GPU-accelerated backends
|
## Notes about GPU-accelerated backends
|
||||||
|
|
||||||
The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
|
The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
|
||||||
|
|
|
||||||
23
docs/ops.md
23
docs/ops.md
|
|
@ -15,7 +15,7 @@ Legend:
|
||||||
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
|
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
|
||||||
|-----------|------|------|------|------|------|------|------|------|------|------|------|
|
|-----------|------|------|------|------|------|------|------|------|------|------|------|
|
||||||
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
|
|
@ -23,7 +23,7 @@ Legend:
|
||||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||||
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
|
|
@ -31,7 +31,7 @@ Legend:
|
||||||
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||||
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
|
|
@ -47,6 +47,7 @@ Legend:
|
||||||
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||||
|
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
|
|
@ -63,7 +64,7 @@ Legend:
|
||||||
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||||
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||||
|
|
@ -75,34 +76,34 @@ Legend:
|
||||||
| OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
|
| OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
|
||||||
| PAD | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
| PAD | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||||
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| POOL_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
| POOL_1D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
| SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||||
| SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
| SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||||
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
|
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||||
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||||
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||||
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||||
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
|
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
|
||||||
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||||
|
|
|
||||||
8525
docs/ops/CPU.csv
8525
docs/ops/CPU.csv
File diff suppressed because it is too large
Load Diff
14142
docs/ops/SYCL.csv
14142
docs/ops/SYCL.csv
File diff suppressed because it is too large
Load Diff
|
|
@ -1,8 +1,8 @@
|
||||||
"backend_name","op_name","op_params","test_mode","supported","error_message","backend_reg_name"
|
"backend_name","op_name","op_params","test_mode","supported","error_message","backend_reg_name"
|
||||||
"Vulkan0","ABS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
"Vulkan0","ABS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","ABS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
|
"Vulkan0","ABS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
|
"Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
|
"Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","NEG","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
"Vulkan0","NEG","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","NEG","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
|
"Vulkan0","NEG","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","STEP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
"Vulkan0","STEP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
||||||
|
|
@ -85,8 +85,8 @@
|
||||||
"Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan"
|
"Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan"
|
||||||
"Vulkan0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
"Vulkan0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
|
"Vulkan0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan"
|
"Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan"
|
"Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","NEG","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
"Vulkan0","NEG","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","NEG","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
|
"Vulkan0","NEG","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan"
|
||||||
"Vulkan0","STEP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
"Vulkan0","STEP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan"
|
||||||
|
|
@ -13591,3 +13591,16 @@
|
||||||
"Vulkan0","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[30000,1,1,1]","support","0","no","Vulkan"
|
"Vulkan0","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[30000,1,1,1]","support","0","no","Vulkan"
|
||||||
"Vulkan0","OPT_STEP_ADAMW","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
|
"Vulkan0","OPT_STEP_ADAMW","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
|
||||||
"Vulkan0","OPT_STEP_SGD","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
|
"Vulkan0","OPT_STEP_SGD","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=32,head_size=128,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=16,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=0","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=0","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=1,kda=0","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1","support","0","no","Vulkan"
|
||||||
|
"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1","support","0","no","Vulkan"
|
||||||
|
|
|
||||||
|
Can't render this file because it is too large.
|
|
|
@ -5023,20 +5023,20 @@
|
||||||
"WebGPU: WebGPU","ARGMAX","type=f32,ne=[1024,12,1,1]","support","1","yes","WebGPU"
|
"WebGPU: WebGPU","ARGMAX","type=f32,ne=[1024,12,1,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","ARGMAX","type=f32,ne=[2000,10,1,1]","support","1","yes","WebGPU"
|
"WebGPU: WebGPU","ARGMAX","type=f32,ne=[2000,10,1,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","ARGMAX","type=f32,ne=[5438,3,1,1]","support","1","yes","WebGPU"
|
"WebGPU: WebGPU","ARGMAX","type=f32,ne=[5438,3,1,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,1]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[2,1,1,1]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,2,1,1]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,2,1,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,2,1]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,2,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,2,1]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,2,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,1,1,1],v=0","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,1,1,1],v=0","support","0","no","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[2,1,1,1],v=0","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[2,1,1,1],v=0","support","0","no","WebGPU"
|
||||||
"WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,2,1,1],v=0","support","0","no","WebGPU"
|
"WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,2,1,1],v=0","support","0","no","WebGPU"
|
||||||
|
|
|
||||||
|
Can't render this file because it is too large.
|
|
|
@ -633,7 +633,7 @@ class SchemaConverter:
|
||||||
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
|
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
|
||||||
|
|
||||||
elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
|
elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
|
||||||
items = schema.get('items') or schema['prefixItems']
|
items = schema.get('items', schema.get('prefixItems'))
|
||||||
if isinstance(items, list):
|
if isinstance(items, list):
|
||||||
return self._add_rule(
|
return self._add_rule(
|
||||||
rule_name,
|
rule_name,
|
||||||
|
|
|
||||||
|
|
@ -248,12 +248,14 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
||||||
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
||||||
"ggml: sycl device architecture")
|
"ggml: sycl device architecture")
|
||||||
|
|
||||||
|
option(GGML_OPENVINO "ggml: use OPENVINO" OFF)
|
||||||
|
|
||||||
option(GGML_OPENCL "ggml: use OpenCL" OFF)
|
option(GGML_OPENCL "ggml: use OpenCL" OFF)
|
||||||
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
|
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
|
||||||
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
|
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
|
||||||
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
|
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
|
||||||
set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
|
set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
|
||||||
"gmml: OpenCL API version to target")
|
"ggml: OpenCL API version to target")
|
||||||
|
|
||||||
option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
|
option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
|
||||||
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")
|
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")
|
||||||
|
|
@ -327,6 +329,7 @@ set(GGML_PUBLIC_HEADERS
|
||||||
include/ggml-vulkan.h
|
include/ggml-vulkan.h
|
||||||
include/ggml-webgpu.h
|
include/ggml-webgpu.h
|
||||||
include/ggml-zendnn.h
|
include/ggml-zendnn.h
|
||||||
|
include/ggml-openvino.h
|
||||||
include/gguf.h)
|
include/gguf.h)
|
||||||
|
|
||||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,37 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define GGML_OPENVINO_NAME "OPENVINO"
|
||||||
|
|
||||||
|
// backend API
|
||||||
|
GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);
|
||||||
|
|
||||||
|
GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);
|
||||||
|
|
||||||
|
GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer);
|
||||||
|
|
||||||
|
GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft);
|
||||||
|
|
||||||
|
GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);
|
||||||
|
|
||||||
|
GGML_BACKEND_API size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer);
|
||||||
|
|
||||||
|
// device buffer
|
||||||
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);
|
||||||
|
|
||||||
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device);
|
||||||
|
|
||||||
|
GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
|
||||||
|
|
||||||
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
@ -8,7 +8,12 @@ extern "C" {
|
||||||
|
|
||||||
#define RPC_PROTO_MAJOR_VERSION 3
|
#define RPC_PROTO_MAJOR_VERSION 3
|
||||||
#define RPC_PROTO_MINOR_VERSION 6
|
#define RPC_PROTO_MINOR_VERSION 6
|
||||||
#define RPC_PROTO_PATCH_VERSION 0
|
#define RPC_PROTO_PATCH_VERSION 1
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
|
||||||
|
#endif
|
||||||
|
|
||||||
#define GGML_RPC_MAX_SERVERS 16
|
#define GGML_RPC_MAX_SERVERS 16
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
|
|
|
||||||
|
|
@ -427,7 +427,8 @@ extern "C" {
|
||||||
// GGML_TYPE_IQ4_NL_4_8 = 37,
|
// GGML_TYPE_IQ4_NL_4_8 = 37,
|
||||||
// GGML_TYPE_IQ4_NL_8_8 = 38,
|
// GGML_TYPE_IQ4_NL_8_8 = 38,
|
||||||
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
|
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
|
||||||
GGML_TYPE_COUNT = 40,
|
GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
|
||||||
|
GGML_TYPE_COUNT = 41,
|
||||||
};
|
};
|
||||||
|
|
||||||
// precision
|
// precision
|
||||||
|
|
@ -463,6 +464,7 @@ extern "C" {
|
||||||
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
|
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
|
||||||
|
GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors
|
||||||
};
|
};
|
||||||
|
|
||||||
// available tensor operations:
|
// available tensor operations:
|
||||||
|
|
@ -2464,6 +2466,8 @@ extern "C" {
|
||||||
bool lower,
|
bool lower,
|
||||||
bool uni);
|
bool uni);
|
||||||
|
|
||||||
|
// TODO: add ggml_gated_delta_net_set_bcast() to be able to configure Q, K broadcast type: tiled vs interleaved [TAG_GGML_GDN_BCAST]
|
||||||
|
// ref: https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306
|
||||||
GGML_API struct ggml_tensor * ggml_gated_delta_net(
|
GGML_API struct ggml_tensor * ggml_gated_delta_net(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * q,
|
struct ggml_tensor * q,
|
||||||
|
|
|
||||||
|
|
@ -460,6 +460,7 @@ ggml_add_backend(zDNN)
|
||||||
ggml_add_backend(OpenCL)
|
ggml_add_backend(OpenCL)
|
||||||
ggml_add_backend(Hexagon)
|
ggml_add_backend(Hexagon)
|
||||||
ggml_add_backend(ZenDNN)
|
ggml_add_backend(ZenDNN)
|
||||||
|
ggml_add_backend(OPENVINO)
|
||||||
|
|
||||||
foreach (target ggml-base ggml)
|
foreach (target ggml-base ggml)
|
||||||
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
|
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
|
||||||
|
|
|
||||||
|
|
@ -82,6 +82,10 @@
|
||||||
#include "ggml-zendnn.h"
|
#include "ggml-zendnn.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_OPENVINO
|
||||||
|
#include "ggml-openvino.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace fs = std::filesystem;
|
namespace fs = std::filesystem;
|
||||||
|
|
||||||
static std::string path_str(const fs::path & path) {
|
static std::string path_str(const fs::path & path) {
|
||||||
|
|
@ -154,6 +158,9 @@ struct ggml_backend_registry {
|
||||||
#ifdef GGML_USE_RPC
|
#ifdef GGML_USE_RPC
|
||||||
register_backend(ggml_backend_rpc_reg());
|
register_backend(ggml_backend_rpc_reg());
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GGML_USE_OPENVINO
|
||||||
|
register_backend(ggml_backend_openvino_reg());
|
||||||
|
#endif
|
||||||
#ifdef GGML_USE_CPU
|
#ifdef GGML_USE_CPU
|
||||||
register_backend(ggml_backend_cpu_reg());
|
register_backend(ggml_backend_cpu_reg());
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -557,6 +564,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
|
||||||
ggml_backend_load_best("opencl", silent, dir_path);
|
ggml_backend_load_best("opencl", silent, dir_path);
|
||||||
ggml_backend_load_best("hexagon", silent, dir_path);
|
ggml_backend_load_best("hexagon", silent, dir_path);
|
||||||
ggml_backend_load_best("musa", silent, dir_path);
|
ggml_backend_load_best("musa", silent, dir_path);
|
||||||
|
ggml_backend_load_best("openvino", silent, dir_path);
|
||||||
ggml_backend_load_best("cpu", silent, dir_path);
|
ggml_backend_load_best("cpu", silent, dir_path);
|
||||||
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
|
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
|
||||||
const char * backend_path = std::getenv("GGML_BACKEND_PATH");
|
const char * backend_path = std::getenv("GGML_BACKEND_PATH");
|
||||||
|
|
|
||||||
|
|
@ -1455,10 +1455,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||||
int split_backend_id = split->backend_id;
|
int split_backend_id = split->backend_id;
|
||||||
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
||||||
|
|
||||||
if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
|
|
||||||
ggml_backend_synchronize(split_backend);
|
|
||||||
}
|
|
||||||
|
|
||||||
// copy the input tensors to the split backend
|
// copy the input tensors to the split backend
|
||||||
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
|
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
|
||||||
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
|
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
|
||||||
|
|
@ -1469,12 +1465,16 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||||
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
||||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||||
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
||||||
|
} else {
|
||||||
|
ggml_backend_synchronize(split_backend);
|
||||||
}
|
}
|
||||||
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
|
ggml_backend_tensor_copy(input, input_cpy);
|
||||||
} else {
|
} else {
|
||||||
// wait for the split backend to finish using the input before overwriting it
|
// wait for the split backend to finish using the input before overwriting it
|
||||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||||
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
||||||
|
} else {
|
||||||
|
ggml_backend_synchronize(split_backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
|
// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
|
||||||
|
|
@ -1578,10 +1578,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
|
|
||||||
ggml_backend_synchronize(split_backend);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sched->callback_eval) {
|
if (!sched->callback_eval) {
|
||||||
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
||||||
if (ec != GGML_STATUS_SUCCESS) {
|
if (ec != GGML_STATUS_SUCCESS) {
|
||||||
|
|
|
||||||
|
|
@ -102,6 +102,9 @@ typedef sycl::half2 ggml_half2;
|
||||||
#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
|
#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
|
||||||
#define QR_MXFP4 2
|
#define QR_MXFP4 2
|
||||||
|
|
||||||
|
#define QI_NVFP4 (QK_NVFP4 / (4 * QR_NVFP4))
|
||||||
|
#define QR_NVFP4 2
|
||||||
|
|
||||||
#define QI5_0 (QK5_0 / (4 * QR5_0))
|
#define QI5_0 (QK5_0 / (4 * QR5_0))
|
||||||
#define QR5_0 2
|
#define QR5_0 2
|
||||||
|
|
||||||
|
|
@ -194,6 +197,14 @@ typedef struct {
|
||||||
} block_mxfp4;
|
} block_mxfp4;
|
||||||
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
|
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
|
||||||
|
|
||||||
|
#define QK_NVFP4 64
|
||||||
|
#define QK_NVFP4_SUB 16 // sub-block size for per-group scales
|
||||||
|
typedef struct {
|
||||||
|
uint8_t d[QK_NVFP4/QK_NVFP4_SUB]; // UE4M3 scales (4 bytes, one per 16-element sub-block)
|
||||||
|
uint8_t qs[QK_NVFP4/2]; // packed 4-bit E2M1 values (32 bytes)
|
||||||
|
} block_nvfp4;
|
||||||
|
static_assert(sizeof(block_nvfp4) == sizeof(uint8_t)*(QK_NVFP4/QK_NVFP4_SUB) + QK_NVFP4/2, "wrong nvfp4 block size/padding");
|
||||||
|
|
||||||
#define QK5_0 32
|
#define QK5_0 32
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ggml_half d; // delta
|
ggml_half d; // delta
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@
|
||||||
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
||||||
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
|
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
|
||||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||||
|
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||||
|
|
@ -79,6 +80,8 @@
|
||||||
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
|
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
|
||||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||||
|
// quants.c
|
||||||
|
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||||
// repack.cpp
|
// repack.cpp
|
||||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||||
|
|
@ -108,6 +111,7 @@
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
|
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
|
||||||
// quants.c
|
// quants.c
|
||||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||||
|
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||||
|
|
@ -155,6 +159,7 @@
|
||||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||||
|
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||||
// repack.cpp
|
// repack.cpp
|
||||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||||
|
|
@ -194,16 +199,11 @@
|
||||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||||
#elif defined(__riscv)
|
#elif defined(__riscv)
|
||||||
// quants.c
|
// quants.c
|
||||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
|
||||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
|
||||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
|
||||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
|
||||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
|
||||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
|
||||||
// repack.cpp
|
// repack.cpp
|
||||||
|
#define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
|
||||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
#define ggml_quantize_mat_q8_K_4x1_generic ggml_quantize_mat_q8_K_4x1
|
||||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||||
|
|
@ -239,6 +239,7 @@
|
||||||
#elif defined(__s390x__)
|
#elif defined(__s390x__)
|
||||||
// quants.c
|
// quants.c
|
||||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||||
|
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||||
|
|
@ -301,6 +302,7 @@
|
||||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||||
|
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||||
// repack.cpp
|
// repack.cpp
|
||||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||||
|
|
|
||||||
|
|
@ -650,6 +650,90 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
assert(n % QK_NVFP4 == 0);
|
||||||
|
|
||||||
|
const block_nvfp4 * GGML_RESTRICT x = vx;
|
||||||
|
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||||
|
|
||||||
|
// Each NVFP4 super-block (64 elements) spans 2 q8_0 blocks
|
||||||
|
const int nb = n / QK_NVFP4;
|
||||||
|
|
||||||
|
float sumf = 0;
|
||||||
|
|
||||||
|
#if defined __ARM_NEON
|
||||||
|
const int8x16_t values = vld1q_s8(kvalues_mxfp4);
|
||||||
|
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||||
|
float32x4_t acc = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
for (int ib = 0; ib < nb; ++ib) {
|
||||||
|
const uint8x16_t q4bits_0 = vld1q_u8(x[ib].qs);
|
||||||
|
const uint8x16_t q4bits_1 = vld1q_u8(x[ib].qs + 16);
|
||||||
|
|
||||||
|
const int8x16_t q4_lo_0 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_0, m4b));
|
||||||
|
const int8x16_t q4_hi_0 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_0, 4));
|
||||||
|
const int8x16_t q4_lo_1 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_1, m4b));
|
||||||
|
const int8x16_t q4_hi_1 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_1, 4));
|
||||||
|
|
||||||
|
const int8x16_t q8_0a = vld1q_s8(y[2*ib].qs);
|
||||||
|
const int8x16_t q8_0b = vld1q_s8(y[2*ib].qs + 16);
|
||||||
|
const int8x16_t q8_lo_0 = vcombine_s8(vget_low_s8(q8_0a), vget_low_s8(q8_0b));
|
||||||
|
const int8x16_t q8_hi_0 = vcombine_s8(vget_high_s8(q8_0a), vget_high_s8(q8_0b));
|
||||||
|
|
||||||
|
const int8x16_t q8_1a = vld1q_s8(y[2*ib+1].qs);
|
||||||
|
const int8x16_t q8_1b = vld1q_s8(y[2*ib+1].qs + 16);
|
||||||
|
const int8x16_t q8_lo_1 = vcombine_s8(vget_low_s8(q8_1a), vget_low_s8(q8_1b));
|
||||||
|
const int8x16_t q8_hi_1 = vcombine_s8(vget_high_s8(q8_1a), vget_high_s8(q8_1b));
|
||||||
|
|
||||||
|
const int32x4_t p0 = vaddq_s32(
|
||||||
|
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
|
||||||
|
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
|
||||||
|
const int32x4_t p1 = vaddq_s32(
|
||||||
|
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
|
||||||
|
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
|
||||||
|
|
||||||
|
const int32x4_t sums = vpaddq_s32(p0, p1);
|
||||||
|
|
||||||
|
// Decode 4 UE4M3 scales to f32 and multiply with q8 scales
|
||||||
|
const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
|
||||||
|
const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
|
||||||
|
const float32x4_t nvsc = {
|
||||||
|
ggml_ue4m3_to_fp32(x[ib].d[0]),
|
||||||
|
ggml_ue4m3_to_fp32(x[ib].d[1]),
|
||||||
|
ggml_ue4m3_to_fp32(x[ib].d[2]),
|
||||||
|
ggml_ue4m3_to_fp32(x[ib].d[3])
|
||||||
|
};
|
||||||
|
const float32x4_t scales = vmulq_f32(nvsc, (float32x4_t){dy0, dy0, dy1, dy1});
|
||||||
|
|
||||||
|
acc = vfmaq_f32(acc, vcvtq_f32_s32(sums), scales);
|
||||||
|
}
|
||||||
|
sumf = vaddvq_f32(acc);
|
||||||
|
#else
|
||||||
|
for (int ib = 0; ib < nb; ++ib) {
|
||||||
|
for (int si = 0; si < 4; ++si) {
|
||||||
|
const float d = ggml_ue4m3_to_fp32(x[ib].d[si]);
|
||||||
|
const int q8b = si / 2;
|
||||||
|
const int q8o = (si % 2) * QK_NVFP4_SUB;
|
||||||
|
const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8b].d);
|
||||||
|
|
||||||
|
int sumi_lo = 0, sumi_hi = 0;
|
||||||
|
for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
|
||||||
|
const uint8_t qv = x[ib].qs[si*(QK_NVFP4_SUB/2) + j];
|
||||||
|
sumi_lo += y[2*ib + q8b].qs[q8o + j + 0] * kvalues_mxfp4[qv & 0xf];
|
||||||
|
sumi_hi += y[2*ib + q8b].qs[q8o + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >> 4];
|
||||||
|
}
|
||||||
|
sumf += dy * d * (sumi_lo + sumi_hi);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
*s = sumf;
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -270,6 +270,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
|
[GGML_TYPE_NVFP4] = {
|
||||||
|
.from_float = quantize_row_nvfp4,
|
||||||
|
.vec_dot = ggml_vec_dot_nvfp4_q8_0,
|
||||||
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
.nrows = 1,
|
||||||
|
},
|
||||||
[GGML_TYPE_Q2_K] = {
|
[GGML_TYPE_Q2_K] = {
|
||||||
.from_float = quantize_row_q2_K,
|
.from_float = quantize_row_q2_K,
|
||||||
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
||||||
|
|
|
||||||
|
|
@ -520,7 +520,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
/* .required_cpu = */ CPU_FEATURE_I8MM,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||||
/* .op_type = */ GGML_TYPE_F32,
|
/* .op_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -631,7 +631,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
/* .required_cpu = */ CPU_FEATURE_I8MM,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||||
/* .op_type = */ GGML_TYPE_F32,
|
/* .op_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -801,7 +801,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
/* .required_cpu = */ CPU_FEATURE_I8MM,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
/* .rhs_type = */ GGML_TYPE_Q8_0,
|
/* .rhs_type = */ GGML_TYPE_Q8_0,
|
||||||
/* .op_type = */ GGML_TYPE_F32,
|
/* .op_type = */ GGML_TYPE_F32,
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -670,6 +670,7 @@ void ggml_compute_forward_add(
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_MXFP4:
|
case GGML_TYPE_MXFP4:
|
||||||
|
case GGML_TYPE_NVFP4:
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
|
@ -1119,6 +1120,7 @@ void ggml_compute_forward_add1(
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_MXFP4:
|
case GGML_TYPE_MXFP4:
|
||||||
|
case GGML_TYPE_NVFP4:
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
|
@ -1247,6 +1249,7 @@ void ggml_compute_forward_acc(
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_MXFP4:
|
case GGML_TYPE_MXFP4:
|
||||||
|
case GGML_TYPE_NVFP4:
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
|
@ -4334,6 +4337,7 @@ void ggml_compute_forward_out_prod(
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_MXFP4:
|
case GGML_TYPE_MXFP4:
|
||||||
|
case GGML_TYPE_NVFP4:
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
|
@ -4609,6 +4613,7 @@ void ggml_compute_forward_set(
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_MXFP4:
|
case GGML_TYPE_MXFP4:
|
||||||
|
case GGML_TYPE_NVFP4:
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
|
@ -4831,6 +4836,7 @@ void ggml_compute_forward_get_rows(
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_MXFP4:
|
case GGML_TYPE_MXFP4:
|
||||||
|
case GGML_TYPE_NVFP4:
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
|
@ -5555,6 +5561,7 @@ void ggml_compute_forward_clamp(
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_MXFP4:
|
case GGML_TYPE_MXFP4:
|
||||||
|
case GGML_TYPE_NVFP4:
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
|
@ -9617,7 +9624,7 @@ void ggml_compute_forward_win_unpart(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//gmml_compute_forward_unary
|
//ggml_compute_forward_unary
|
||||||
|
|
||||||
void ggml_compute_forward_unary(
|
void ggml_compute_forward_unary(
|
||||||
const ggml_compute_params * params,
|
const ggml_compute_params * params,
|
||||||
|
|
@ -10436,8 +10443,8 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||||
|
|
||||||
const float * state_in_base = (const float *)src_state->data;
|
const float * state_in_base = (const float *)src_state->data;
|
||||||
|
|
||||||
const int64_t rq1 = nev1 / neq1;
|
//const int64_t rq1 = nev1 / neq1;
|
||||||
const int64_t rk1 = nev1 / nek1;
|
//const int64_t rk1 = nev1 / nek1;
|
||||||
const int64_t rq3 = nev3 / neq3;
|
const int64_t rq3 = nev3 / neq3;
|
||||||
const int64_t rk3 = nev3 / nek3;
|
const int64_t rk3 = nev3 / nek3;
|
||||||
|
|
||||||
|
|
@ -10447,8 +10454,8 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||||
const int64_t iv1 = ir % H; // head_index
|
const int64_t iv1 = ir % H; // head_index
|
||||||
const int64_t iv3 = ir / H; // sequence
|
const int64_t iv3 = ir / H; // sequence
|
||||||
|
|
||||||
const int64_t iq1 = iv1 / rq1;
|
const int64_t iq1 = iv1 % neq1;
|
||||||
const int64_t ik1 = iv1 / rk1;
|
const int64_t ik1 = iv1 % nek1;
|
||||||
|
|
||||||
const int64_t iq3 = iv3 / rq3;
|
const int64_t iq3 = iv3 / rq3;
|
||||||
const int64_t ik3 = iv3 / rk3;
|
const int64_t ik3 = iv3 / rk3;
|
||||||
|
|
@ -10468,40 +10475,45 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
||||||
const float * v_d = (const float *)((const char *)src_v->data + iv3 * nbv3 + t * nbv2 + iv1 * nbv1);
|
const float * v_d = (const float *)((const char *)src_v->data + iv3 * nbv3 + t * nbv2 + iv1 * nbv1);
|
||||||
|
|
||||||
const float beta_val = *(const float *)((const char *)src_beta->data + iv3 * nbb3 + t * nbb2 + iv1 * nbb1);
|
const float beta_val = *(const float *)((const char *)src_beta->data + iv3 * nbb3 + t * nbb2 + iv1 * nbb1);
|
||||||
const float * g_d = (const float *)((const char *)src_g->data + iv3 * nbg3 + t * nbg2 + iv1 * nbg1);
|
const float * g_d = (const float *)((const char *)src_g->data + iv3 * nbg3 + t * nbg2 + iv1 * nbg1);
|
||||||
|
|
||||||
|
// state is stored transposed: s_out[j*S_v + i] = S[i][j]
|
||||||
|
// so row j of s_out = column j of S (contiguous access)
|
||||||
|
|
||||||
if (kda) {
|
if (kda) {
|
||||||
|
// precompute exp(g) into delta scratch (reused below)
|
||||||
for (int64_t i = 0; i < S_v; ++i) {
|
for (int64_t i = 0; i < S_v; ++i) {
|
||||||
ggml_vec_scale_f32(S_v, &s_out[i * S_v], expf(g_d[i]));
|
delta[i] = expf(g_d[i]);
|
||||||
|
}
|
||||||
|
// S[i][:] *= exp(g[i]) => for each row j of M: M[j][i] *= exp(g[i])
|
||||||
|
for (int64_t j = 0; j < S_v; ++j) {
|
||||||
|
ggml_vec_mul_f32(S_v, &s_out[j * S_v], &s_out[j * S_v], delta);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
ggml_vec_scale_f32(S_v * S_v, s_out, expf(g_d[0]));
|
ggml_vec_scale_f32(S_v * S_v, s_out, expf(g_d[0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
// delta[j] = sum_i S[j][i] * k[i]
|
// delta[j] = sum_i S[i][j] * k[i] = dot(row j of M, k)
|
||||||
memset(delta, 0, S_v * sizeof(float));
|
|
||||||
for (int64_t i = 0; i < S_v; ++i) {
|
|
||||||
ggml_vec_mad_f32(S_v, delta, &s_out[i * S_v], k_d[i]);
|
|
||||||
}
|
|
||||||
for (int64_t j = 0; j < S_v; ++j) {
|
for (int64_t j = 0; j < S_v; ++j) {
|
||||||
delta[j] = (v_d[j] - delta[j]) * beta_val;
|
float sum = 0.0f;
|
||||||
|
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1);
|
||||||
|
delta[j] = (v_d[j] - sum) * beta_val;
|
||||||
}
|
}
|
||||||
|
|
||||||
// outer product: S[j][i] += k[i] * delta[j]
|
// outer product: S[i][j] += k[i] * delta[j] => M[j][i] += delta[j] * k[i]
|
||||||
for (int64_t i = 0; i < S_v; ++i) {
|
for (int64_t j = 0; j < S_v; ++j) {
|
||||||
ggml_vec_mad_f32(S_v, &s_out[i * S_v], delta, k_d[i]);
|
ggml_vec_mad_f32(S_v, &s_out[j * S_v], k_d, delta[j]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// attn_out[j] = sum_i S[j][i] * q[i]
|
// attn_out[j] = sum_i S[i][j] * q[i] = dot(row j of M, q)
|
||||||
memset(attn_data, 0, S_v * sizeof(float));
|
for (int64_t j = 0; j < S_v; ++j) {
|
||||||
for (int64_t i = 0; i < S_v; ++i) {
|
float sum = 0.0f;
|
||||||
ggml_vec_mad_f32(S_v, attn_data, &s_out[i * S_v], q_d[i]);
|
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1);
|
||||||
|
attn_data[j] = sum * scale;
|
||||||
}
|
}
|
||||||
ggml_vec_scale_f32(S_v, attn_data, scale);
|
|
||||||
|
|
||||||
attn_data += S_v * H; // advance to next token
|
attn_data += S_v * H; // advance to next token
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -50,6 +50,10 @@ void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, i
|
||||||
quantize_row_mxfp4_ref(x, y, k);
|
quantize_row_mxfp4_ref(x, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void quantize_row_nvfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
||||||
|
quantize_row_nvfp4_ref(x, y, k);
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// 2-6 bit quantization in super-blocks
|
// 2-6 bit quantization in super-blocks
|
||||||
//
|
//
|
||||||
|
|
@ -216,6 +220,42 @@ void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NVFP4: super-block of 64 elements = 4 sub-blocks of 16 = 2 q8_0 blocks
|
||||||
|
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
assert(n % QK_NVFP4 == 0);
|
||||||
|
|
||||||
|
const block_nvfp4 * GGML_RESTRICT x = vx;
|
||||||
|
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||||
|
|
||||||
|
const int nb = n / QK_NVFP4;
|
||||||
|
|
||||||
|
float sumf = 0;
|
||||||
|
|
||||||
|
for (int ib = 0; ib < nb; ++ib) {
|
||||||
|
for (int s_idx = 0; s_idx < 4; ++s_idx) {
|
||||||
|
const float d = ggml_ue4m3_to_fp32(x[ib].d[s_idx]);
|
||||||
|
const int q8_block = s_idx / 2;
|
||||||
|
const int q8_off = (s_idx % 2) * QK_NVFP4_SUB;
|
||||||
|
const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8_block].d);
|
||||||
|
|
||||||
|
int sumi_lo = 0, sumi_hi = 0;
|
||||||
|
for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
|
||||||
|
const uint8_t qv = x[ib].qs[s_idx*(QK_NVFP4_SUB/2) + j];
|
||||||
|
sumi_lo += y[2*ib + q8_block].qs[q8_off + j + 0] * kvalues_mxfp4[qv & 0xf];
|
||||||
|
sumi_hi += y[2*ib + q8_block].qs[q8_off + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >> 4];
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf += dy * d * (sumi_lo + sumi_hi);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*s = sumf;
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
||||||
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_nvfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
@ -42,6 +43,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
@ -73,6 +75,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||||
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -28,13 +28,17 @@ template <int K, int N> struct block {
|
||||||
// control size
|
// control size
|
||||||
static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
|
static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
|
||||||
static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
|
static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
|
||||||
|
static_assert(sizeof(block<4, 16>) == 16 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<4,16> size/padding");
|
||||||
static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
|
static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
|
||||||
static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
|
static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
|
||||||
|
static_assert(sizeof(block<8, 16>) == 16 * sizeof(ggml_half) + QK8_0 * 16, "wrong block<8,16> size/padding");
|
||||||
|
|
||||||
using block_q4_0x4 = block<4, 4>;
|
using block_q4_0x4 = block<4, 4>;
|
||||||
using block_q4_0x8 = block<4, 8>;
|
using block_q4_0x8 = block<4, 8>;
|
||||||
|
using block_q4_0x16 = block<4, 16>;
|
||||||
using block_q8_0x4 = block<8, 4>;
|
using block_q8_0x4 = block<8, 4>;
|
||||||
using block_q8_0x8 = block<8, 8>;
|
using block_q8_0x8 = block<8, 8>;
|
||||||
|
using block_q8_0x16 = block<8, 16>;
|
||||||
|
|
||||||
struct block_q4_Kx8 {
|
struct block_q4_Kx8 {
|
||||||
ggml_half d[8]; // super-block scale for quantized scales
|
ggml_half d[8]; // super-block scale for quantized scales
|
||||||
|
|
@ -44,7 +48,14 @@ struct block_q4_Kx8 {
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
|
static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
|
||||||
|
struct block_q4_Kx16 {
|
||||||
|
ggml_half d[16]; // super-block scale for quantized scales
|
||||||
|
ggml_half dmin[16]; // super-block scale for quantized mins
|
||||||
|
uint8_t scales[192]; // scales and mins, quantized with 6 bits
|
||||||
|
uint8_t qs[2048]; // 4--bit quants
|
||||||
|
};
|
||||||
|
|
||||||
|
static_assert(sizeof(block_q4_Kx16) == sizeof(ggml_half) * 32 + K_SCALE_SIZE * 16 + QK_K * 8, "wrong q4_K block size/padding");
|
||||||
struct block_q2_Kx8 {
|
struct block_q2_Kx8 {
|
||||||
ggml_half d[8]; // super-block scale for quantized scales
|
ggml_half d[8]; // super-block scale for quantized scales
|
||||||
ggml_half dmin[8]; // super-block scale for quantized mins
|
ggml_half dmin[8]; // super-block scale for quantized mins
|
||||||
|
|
@ -53,6 +64,13 @@ struct block_q2_Kx8 {
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
|
static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
|
||||||
|
struct block_q2_Kx16 {
|
||||||
|
ggml_half d[16]; // Super-block scale for quantized scales
|
||||||
|
ggml_half dmin[16]; // Super-block scale for quantized mins
|
||||||
|
uint8_t scales[256]; // Sub-block scales (16 cols * 16 sub-blocks)
|
||||||
|
uint8_t qs[1024]; // Data (16 cols * 64 bytes per block)
|
||||||
|
};
|
||||||
|
static_assert(sizeof(block_q2_Kx16) == sizeof(ggml_half) * 32 + QK_K + QK_K * 4, "wrong q2_K block size/padding");
|
||||||
|
|
||||||
struct block_q5_Kx8 {
|
struct block_q5_Kx8 {
|
||||||
ggml_half d[8]; // super-block scale for quantized scales
|
ggml_half d[8]; // super-block scale for quantized scales
|
||||||
|
|
@ -97,6 +115,12 @@ struct block_iq4_nlx8 {
|
||||||
|
|
||||||
static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
|
static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
|
||||||
|
|
||||||
|
struct block_iq4_nlx16 {
|
||||||
|
ggml_half d[16]; // deltas for 16 iq4_nl blocks
|
||||||
|
uint8_t qs[QK4_NL * 8]; // nibbles / quants for 16 iq4_nl blocks
|
||||||
|
};
|
||||||
|
|
||||||
|
static_assert(sizeof(block_iq4_nlx16) == 16 * sizeof(ggml_half) + QK4_NL * 8, "wrong iq4_nlx16 block size/padding");
|
||||||
struct block_mxfp4x4 {
|
struct block_mxfp4x4 {
|
||||||
uint8_t e[4];
|
uint8_t e[4];
|
||||||
uint8_t qs[QK_MXFP4 * 2];
|
uint8_t qs[QK_MXFP4 * 2];
|
||||||
|
|
@ -109,7 +133,6 @@ struct block_mxfp4x8 {
|
||||||
};
|
};
|
||||||
static_assert(sizeof(block_mxfp4x8) == 8 + QK_MXFP4 * 4, "wrong mxfp4x8 block size/padding");
|
static_assert(sizeof(block_mxfp4x8) == 8 + QK_MXFP4 * 4, "wrong mxfp4x8 block size/padding");
|
||||||
|
|
||||||
|
|
||||||
#if defined(__cplusplus)
|
#if defined(__cplusplus)
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -132,6 +155,8 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||||
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_mxfp4_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_mxfp4_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
|
@ -146,10 +171,22 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||||
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_mxfp4_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_mxfp4_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
||||||
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
||||||
void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
#if defined __riscv_zvfh
|
||||||
|
void ggml_quantize_mat_q8_0_4x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||||
|
void ggml_quantize_mat_q8_K_4x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||||
|
void ggml_gemv_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
#endif
|
||||||
|
|
||||||
// Native implementations
|
// Native implementations
|
||||||
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||||
|
|
@ -170,6 +207,8 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||||
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
|
@ -184,10 +223,22 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||||
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
||||||
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
||||||
void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
#if defined __riscv_zvfh
|
||||||
|
void ggml_quantize_mat_q8_0_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||||
|
void ggml_quantize_mat_q8_K_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||||
|
void ggml_gemv_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__cplusplus)
|
#if defined(__cplusplus)
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
|
|
|
||||||
|
|
@ -479,13 +479,51 @@ do { \
|
||||||
|
|
||||||
// F16 AVX512
|
// F16 AVX512
|
||||||
|
|
||||||
// F16 AVX
|
#if defined(__AVX512FP16__)
|
||||||
|
|
||||||
|
#define GGML_F16_STEP 128
|
||||||
|
#define GGML_F16_EPR 32
|
||||||
|
|
||||||
|
#define GGML_F16x32 __m512h
|
||||||
|
#define GGML_F16x32_ZERO _mm512_setzero_ph()
|
||||||
|
#define GGML_F16x32_SET1(x) _mm512_set1_ph(__extension__(_Float16)(x))
|
||||||
|
#define GGML_F16x32_LOAD(x) _mm512_loadu_ph(x)
|
||||||
|
#define GGML_F16x32_STORE(x, y) _mm512_storeu_ph(x, y)
|
||||||
|
#define GGML_F16x32_FMA(a, b, c) _mm512_fmadd_ph(b, c, a)
|
||||||
|
#define GGML_F16x32_ADD _mm512_add_ph
|
||||||
|
#define GGML_F16x32_MUL _mm512_mul_ph
|
||||||
|
#define GGML_F16x32_REDUCE(res, x) \
|
||||||
|
do { \
|
||||||
|
int offset = GGML_F16_ARR >> 1; \
|
||||||
|
for (int i = 0; i < offset; ++i) { \
|
||||||
|
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
|
||||||
|
} \
|
||||||
|
offset >>= 1; \
|
||||||
|
for (int i = 0; i < offset; ++i) { \
|
||||||
|
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
|
||||||
|
} \
|
||||||
|
offset >>= 1; \
|
||||||
|
for (int i = 0; i < offset; ++i) { \
|
||||||
|
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
|
||||||
|
} \
|
||||||
|
res = (ggml_float) _mm512_reduce_add_ph(x[0]); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define GGML_F16_VEC GGML_F16x32
|
||||||
|
#define GGML_F16_VEC_ZERO GGML_F16x32_ZERO
|
||||||
|
#define GGML_F16_VEC_SET1 GGML_F16x32_SET1
|
||||||
|
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x32_LOAD(p)
|
||||||
|
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x32_STORE(p, r[i])
|
||||||
|
#define GGML_F16_VEC_FMA GGML_F16x32_FMA
|
||||||
|
#define GGML_F16_VEC_ADD GGML_F16x32_ADD
|
||||||
|
#define GGML_F16_VEC_MUL GGML_F16x32_MUL
|
||||||
|
#define GGML_F16_VEC_REDUCE GGML_F16x32_REDUCE
|
||||||
|
|
||||||
|
#else // Fallback FP16 <-> FP32
|
||||||
|
|
||||||
#define GGML_F16_STEP 64
|
#define GGML_F16_STEP 64
|
||||||
#define GGML_F16_EPR 16
|
#define GGML_F16_EPR 16
|
||||||
|
|
||||||
// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
|
|
||||||
|
|
||||||
#define GGML_F32Cx16 __m512
|
#define GGML_F32Cx16 __m512
|
||||||
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
|
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
|
||||||
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
|
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
|
||||||
|
|
@ -525,6 +563,8 @@ do { \
|
||||||
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
|
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
|
||||||
|
|
||||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
|
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
|
||||||
|
|
||||||
|
#endif // __AVX512FP16__
|
||||||
#elif defined(__AVX__)
|
#elif defined(__AVX__)
|
||||||
|
|
||||||
#define GGML_SIMD
|
#define GGML_SIMD
|
||||||
|
|
|
||||||
|
|
@ -56,7 +56,8 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
|
||||||
const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset
|
const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset
|
||||||
const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
|
const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
|
||||||
|
|
||||||
__shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
|
__shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
|
||||||
|
int cur_tile_buf = 0;
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {
|
for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {
|
||||||
|
|
@ -70,7 +71,7 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
|
||||||
if(x < ne01 && y + j < ne00){
|
if(x < ne01 && y + j < ne00){
|
||||||
const int row = threadIdx.y+j;
|
const int row = threadIdx.y+j;
|
||||||
const int col = threadIdx.x * sizeof(float)/sizeof(T);
|
const int col = threadIdx.x * sizeof(float)/sizeof(T);
|
||||||
T *tile2 = reinterpret_cast<T*>(tile[row]);
|
T *tile2 = reinterpret_cast<T*>(tile[cur_tile_buf][row]);
|
||||||
tile2[col] = src[imat*n + (y+j)*ne01 + x];
|
tile2[col] = src[imat*n + (y+j)*ne01 + x];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -81,10 +82,12 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
|
||||||
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
|
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
|
||||||
if (ty + j < ne01 && tx < ne00) {
|
if (ty + j < ne01 && tx < ne00) {
|
||||||
const int col = (threadIdx.y+j)*sizeof(float)/sizeof(T);
|
const int col = (threadIdx.y+j)*sizeof(float)/sizeof(T);
|
||||||
const T *tile2 = reinterpret_cast<const T*>(tile[threadIdx.x]);
|
const T *tile2 = reinterpret_cast<const T*>(tile[cur_tile_buf][threadIdx.x]);
|
||||||
dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
|
dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cur_tile_buf = (cur_tile_buf + 1) % 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_UNUSED_VARS(ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11,
|
GGML_UNUSED_VARS(ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11,
|
||||||
|
|
|
||||||
|
|
@ -892,7 +892,7 @@ void launch_fattn(
|
||||||
const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1);
|
const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1);
|
||||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||||
const int ntiles_z_gqa = ((gqa_ratio + ncols2 - 1) / ncols2);
|
const int ntiles_z_gqa = ((gqa_ratio + ncols2 - 1) / ncols2);
|
||||||
const int ntiles_total = ntiles_x * ntiles_z_gqa * K->ne[2] * Q->ne[3];
|
const int ntiles_dst = ntiles_x * ntiles_z_gqa * K->ne[2] * Q->ne[3];
|
||||||
|
|
||||||
// Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped.
|
// Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped.
|
||||||
// Only worth the overhead if there is at lease one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
|
// Only worth the overhead if there is at lease one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
|
||||||
|
|
@ -919,37 +919,37 @@ void launch_fattn(
|
||||||
GGML_ASSERT(max_blocks_per_sm > 0);
|
GGML_ASSERT(max_blocks_per_sm > 0);
|
||||||
int parallel_blocks = max_blocks_per_sm;
|
int parallel_blocks = max_blocks_per_sm;
|
||||||
|
|
||||||
|
const int ntiles_KV = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by KV cache length.
|
||||||
|
|
||||||
dim3 blocks_num;
|
dim3 blocks_num;
|
||||||
if (stream_k) {
|
if (stream_k) {
|
||||||
// For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
|
// For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
|
||||||
const int max_blocks = max_blocks_per_sm*nsm;
|
const int max_blocks = max_blocks_per_sm*nsm;
|
||||||
const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
|
const int tiles_nwaves = (ntiles_dst + max_blocks - 1) / max_blocks;
|
||||||
const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves);
|
const int tiles_efficiency_percent = 100 * ntiles_dst / (max_blocks*tiles_nwaves);
|
||||||
|
|
||||||
const int nblocks_stream_k = max_blocks;
|
const int nblocks_stream_k = std::min(max_blocks, ntiles_KV*ntiles_dst);
|
||||||
|
|
||||||
const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || amd_wmma_available(cc) || tiles_efficiency_percent < 75;
|
const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || amd_wmma_available(cc) || tiles_efficiency_percent < 75;
|
||||||
|
|
||||||
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
|
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_dst;
|
||||||
blocks_num.y = 1;
|
blocks_num.y = 1;
|
||||||
blocks_num.z = 1;
|
blocks_num.z = 1;
|
||||||
|
|
||||||
if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
|
if (ntiles_dst % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
|
||||||
dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
|
dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.
|
|
||||||
|
|
||||||
// parallel_blocks must not be larger than what the tensor size allows:
|
// parallel_blocks must not be larger than what the tensor size allows:
|
||||||
parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
|
parallel_blocks = std::min(parallel_blocks, ntiles_KV);
|
||||||
|
|
||||||
// If ntiles_total % blocks_per_wave != 0 then some efficiency is lost due to tail effects.
|
// If ntiles_total % blocks_per_wave != 0 then some efficiency is lost due to tail effects.
|
||||||
// Test whether parallel_blocks can be set to a higher value for better efficiency.
|
// Test whether parallel_blocks can be set to a higher value for better efficiency.
|
||||||
const int blocks_per_wave = nsm * max_blocks_per_sm;
|
const int blocks_per_wave = nsm * max_blocks_per_sm;
|
||||||
int nwaves_best = 0;
|
int nwaves_best = 0;
|
||||||
int efficiency_percent_best = 0;
|
int efficiency_percent_best = 0;
|
||||||
for (int parallel_blocks_test = parallel_blocks; parallel_blocks_test <= ntiles_KQ; ++parallel_blocks_test) {
|
for (int parallel_blocks_test = parallel_blocks; parallel_blocks_test <= ntiles_KV; ++parallel_blocks_test) {
|
||||||
const int nblocks_total = ntiles_total * parallel_blocks_test;
|
const int nblocks_total = ntiles_dst * parallel_blocks_test;
|
||||||
const int nwaves = (nblocks_total + blocks_per_wave - 1) / blocks_per_wave;
|
const int nwaves = (nblocks_total + blocks_per_wave - 1) / blocks_per_wave;
|
||||||
const int efficiency_percent = 100 * nblocks_total / (nwaves*blocks_per_wave);
|
const int efficiency_percent = 100 * nblocks_total / (nwaves*blocks_per_wave);
|
||||||
|
|
||||||
|
|
@ -1015,7 +1015,7 @@ void launch_fattn(
|
||||||
CUDA_CHECK(cudaGetLastError());
|
CUDA_CHECK(cudaGetLastError());
|
||||||
|
|
||||||
if (stream_k) {
|
if (stream_k) {
|
||||||
if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
|
if (ntiles_dst % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
|
||||||
const dim3 block_dim_combine(DV, 1, 1);
|
const dim3 block_dim_combine(DV, 1, 1);
|
||||||
const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
|
const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,4 @@
|
||||||
#include "gated_delta_net.cuh"
|
#include "gated_delta_net.cuh"
|
||||||
#include "ggml-cuda/common.cuh"
|
|
||||||
|
|
||||||
template <int S_v, bool KDA>
|
template <int S_v, bool KDA>
|
||||||
__global__ void gated_delta_net_cuda(const float * q,
|
__global__ void gated_delta_net_cuda(const float * q,
|
||||||
|
|
@ -21,15 +20,17 @@ __global__ void gated_delta_net_cuda(const float * q,
|
||||||
int64_t sb1,
|
int64_t sb1,
|
||||||
int64_t sb2,
|
int64_t sb2,
|
||||||
int64_t sb3,
|
int64_t sb3,
|
||||||
int64_t rq1,
|
const uint3 neqk1_magic,
|
||||||
int64_t rq3,
|
const uint3 rq3_magic,
|
||||||
float scale) {
|
float scale) {
|
||||||
const int64_t h_idx = blockIdx.x;
|
const uint32_t h_idx = blockIdx.x;
|
||||||
const int64_t sequence = blockIdx.y;
|
const uint32_t sequence = blockIdx.y;
|
||||||
const int col = threadIdx.x; // each thread owns one column
|
// each warp owns one column, using warp-level primitives to reduce across rows
|
||||||
|
const int lane = threadIdx.x;
|
||||||
|
const int col = blockIdx.z * blockDim.y + threadIdx.y;
|
||||||
|
|
||||||
const int64_t iq1 = h_idx / rq1;
|
const uint32_t iq1 = fastmodulo(h_idx, neqk1_magic);
|
||||||
const int64_t iq3 = sequence / rq3;
|
const uint32_t iq3 = fastdiv(sequence, rq3_magic);
|
||||||
|
|
||||||
const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs;
|
const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs;
|
||||||
float * attn_data = dst;
|
float * attn_data = dst;
|
||||||
|
|
@ -40,11 +41,15 @@ __global__ void gated_delta_net_cuda(const float * q,
|
||||||
curr_state += state_offset;
|
curr_state += state_offset;
|
||||||
attn_data += (sequence * n_tokens * H + h_idx) * S_v;
|
attn_data += (sequence * n_tokens * H + h_idx) * S_v;
|
||||||
|
|
||||||
// Load state column into registers
|
constexpr int warp_size = ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v;
|
||||||
float s[S_v];
|
static_assert(S_v % warp_size == 0, "S_v must be a multiple of warp_size");
|
||||||
|
constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size;
|
||||||
|
float s_shard[rows_per_lane];
|
||||||
|
// state is stored transposed: M[col][i] = S[i][col], row col is contiguous
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < S_v; i++) {
|
for (int r = 0; r < rows_per_lane; r++) {
|
||||||
s[i] = curr_state[i * S_v + col];
|
const int i = r * warp_size + lane;
|
||||||
|
s_shard[r] = curr_state[col * S_v + i];
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int t = 0; t < n_tokens; t++) {
|
for (int t = 0; t < n_tokens; t++) {
|
||||||
|
|
@ -62,55 +67,71 @@ __global__ void gated_delta_net_cuda(const float * q,
|
||||||
const float g_val = expf(*g_t);
|
const float g_val = expf(*g_t);
|
||||||
|
|
||||||
// kv[col] = (S^T @ k)[col] = sum_i S[i][col] * k[i]
|
// kv[col] = (S^T @ k)[col] = sum_i S[i][col] * k[i]
|
||||||
float kv_col = 0.0f;
|
float kv_shard = 0.0f;
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < S_v; i++) {
|
for (int r = 0; r < rows_per_lane; r++) {
|
||||||
kv_col += s[i] * k_t[i];
|
const int i = r * warp_size + lane;
|
||||||
|
kv_shard += s_shard[r] * k_t[i];
|
||||||
}
|
}
|
||||||
|
float kv_col = warp_reduce_sum<warp_size>(kv_shard);
|
||||||
|
|
||||||
// delta[col] = (v[col] - g * kv[col]) * beta
|
// delta[col] = (v[col] - g * kv[col]) * beta
|
||||||
float delta_col = (v_t[col] - g_val * kv_col) * beta_val;
|
float delta_col = (v_t[col] - g_val * kv_col) * beta_val;
|
||||||
|
|
||||||
// fused: S[i][col] = g * S[i][col] + k[i] * delta[col]
|
// fused: S[i][col] = g * S[i][col] + k[i] * delta[col]
|
||||||
// attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i]
|
// attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i]
|
||||||
float attn_col = 0.0f;
|
float attn_partial = 0.0f;
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < S_v; i++) {
|
for (int r = 0; r < rows_per_lane; r++) {
|
||||||
s[i] = g_val * s[i] + k_t[i] * delta_col;
|
const int i = r * warp_size + lane;
|
||||||
attn_col += s[i] * q_t[i];
|
s_shard[r] = g_val * s_shard[r] + k_t[i] * delta_col;
|
||||||
|
attn_partial += s_shard[r] * q_t[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
attn_data[col] = attn_col * scale;
|
float attn_col = warp_reduce_sum<warp_size>(attn_partial);
|
||||||
|
|
||||||
|
if (lane == 0) {
|
||||||
|
attn_data[col] = attn_col * scale;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// kv[col] = sum_i g[i] * S[i][col] * k[i]
|
// kv[col] = sum_i g[i] * S[i][col] * k[i]
|
||||||
float kv_col = 0.0f;
|
float kv_shard = 0.0f;
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < S_v; i++) {
|
for (int r = 0; r < rows_per_lane; r++) {
|
||||||
kv_col += expf(g_t[i]) * s[i] * k_t[i];
|
const int i = r * warp_size + lane;
|
||||||
|
kv_shard += expf(g_t[i]) * s_shard[r] * k_t[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
float kv_col = warp_reduce_sum<warp_size>(kv_shard);
|
||||||
|
|
||||||
// delta[col] = (v[col] - kv[col]) * beta
|
// delta[col] = (v[col] - kv[col]) * beta
|
||||||
float delta_col = (v_t[col] - kv_col) * beta_val;
|
float delta_col = (v_t[col] - kv_col) * beta_val;
|
||||||
|
|
||||||
// fused: S[i][col] = g[i] * S[i][col] + k[i] * delta[col]
|
// fused: S[i][col] = g[i] * S[i][col] + k[i] * delta[col]
|
||||||
// attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i]
|
// attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i]
|
||||||
float attn_col = 0.0f;
|
float attn_partial = 0.0f;
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < S_v; i++) {
|
for (int r = 0; r < rows_per_lane; r++) {
|
||||||
s[i] = expf(g_t[i]) * s[i] + k_t[i] * delta_col;
|
const int i = r * warp_size + lane;
|
||||||
attn_col += s[i] * q_t[i];
|
s_shard[r] = expf(g_t[i]) * s_shard[r] + k_t[i] * delta_col;
|
||||||
|
attn_partial += s_shard[r] * q_t[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
attn_data[col] = attn_col * scale;
|
float attn_col = warp_reduce_sum<warp_size>(attn_partial);
|
||||||
|
|
||||||
|
if (lane == 0) {
|
||||||
|
attn_data[col] = attn_col * scale;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
attn_data += S_v * H;
|
attn_data += S_v * H;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write state back to global memory
|
// Write state back to global memory (transposed layout)
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < S_v; i++) {
|
for (int r = 0; r < rows_per_lane; r++) {
|
||||||
state[i * S_v + col] = s[i];
|
const int i = r * warp_size + lane;
|
||||||
|
state[col * S_v + i] = s_shard[r];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -119,35 +140,50 @@ static void launch_gated_delta_net(
|
||||||
const float * q_d, const float * k_d, const float * v_d,
|
const float * q_d, const float * k_d, const float * v_d,
|
||||||
const float * g_d, const float * b_d, const float * s_d,
|
const float * g_d, const float * b_d, const float * s_d,
|
||||||
float * dst_d,
|
float * dst_d,
|
||||||
int64_t S_v, int64_t H, int64_t n_tokens, int64_t n_seqs,
|
int64_t S_v, int64_t H, int64_t n_tokens, int64_t n_seqs,
|
||||||
int64_t sq1, int64_t sq2, int64_t sq3,
|
int64_t sq1, int64_t sq2, int64_t sq3,
|
||||||
int64_t sv1, int64_t sv2, int64_t sv3,
|
int64_t sv1, int64_t sv2, int64_t sv3,
|
||||||
int64_t sb1, int64_t sb2, int64_t sb3,
|
int64_t sb1, int64_t sb2, int64_t sb3,
|
||||||
int64_t rq1, int64_t rq3,
|
int64_t neqk1, int64_t rq3,
|
||||||
float scale, cudaStream_t stream) {
|
float scale, cudaStream_t stream) {
|
||||||
|
//TODO: Add chunked kernel for even faster pre-fill
|
||||||
|
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
|
||||||
|
const int num_warps = 4;
|
||||||
|
dim3 grid_dims(H, n_seqs, (S_v + num_warps - 1) / num_warps);
|
||||||
|
dim3 block_dims(warp_size <= S_v ? warp_size : S_v, num_warps, 1);
|
||||||
|
|
||||||
dim3 grid_dims(H, n_seqs, 1);
|
const uint3 neqk1_magic = init_fastdiv_values(neqk1);
|
||||||
dim3 block_dims(S_v, 1, 1);
|
const uint3 rq3_magic = init_fastdiv_values(rq3);
|
||||||
|
|
||||||
|
int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||||
|
|
||||||
switch (S_v) {
|
switch (S_v) {
|
||||||
|
case 16:
|
||||||
|
gated_delta_net_cuda<16, KDA><<<grid_dims, block_dims, 0, stream>>>(
|
||||||
|
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||||
|
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||||
|
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
|
||||||
|
break;
|
||||||
case 32:
|
case 32:
|
||||||
gated_delta_net_cuda<32, KDA><<<grid_dims, block_dims, 0, stream>>>(
|
gated_delta_net_cuda<32, KDA><<<grid_dims, block_dims, 0, stream>>>(
|
||||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||||
sb1, sb2, sb3, rq1, rq3, scale);
|
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
|
||||||
break;
|
break;
|
||||||
case 64:
|
case 64: {
|
||||||
gated_delta_net_cuda<64, KDA><<<grid_dims, block_dims, 0, stream>>>(
|
gated_delta_net_cuda<64, KDA><<<grid_dims, block_dims, 0, stream>>>(
|
||||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||||
sb1, sb2, sb3, rq1, rq3, scale);
|
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
|
||||||
break;
|
break;
|
||||||
case 128:
|
}
|
||||||
|
case 128: {
|
||||||
gated_delta_net_cuda<128, KDA><<<grid_dims, block_dims, 0, stream>>>(
|
gated_delta_net_cuda<128, KDA><<<grid_dims, block_dims, 0, stream>>>(
|
||||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||||
sb1, sb2, sb3, rq1, rq3, scale);
|
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
break;
|
break;
|
||||||
|
|
@ -163,10 +199,12 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||||
ggml_tensor * src_state = dst->src[5];
|
ggml_tensor * src_state = dst->src[5];
|
||||||
|
|
||||||
GGML_TENSOR_LOCALS(int64_t, neq, src_q, ne);
|
GGML_TENSOR_LOCALS(int64_t, neq, src_q, ne);
|
||||||
GGML_TENSOR_LOCALS(size_t, nbq, src_q, nb);
|
GGML_TENSOR_LOCALS(size_t , nbq, src_q, nb);
|
||||||
|
GGML_TENSOR_LOCALS(int64_t, nek, src_k, ne);
|
||||||
|
GGML_TENSOR_LOCALS(size_t , nbk, src_k, nb);
|
||||||
GGML_TENSOR_LOCALS(int64_t, nev, src_v, ne);
|
GGML_TENSOR_LOCALS(int64_t, nev, src_v, ne);
|
||||||
GGML_TENSOR_LOCALS(size_t, nbv, src_v, nb);
|
GGML_TENSOR_LOCALS(size_t, nbv, src_v, nb);
|
||||||
GGML_TENSOR_LOCALS(size_t, nbb, src_beta, nb);
|
GGML_TENSOR_LOCALS(size_t, nbb, src_beta, nb);
|
||||||
|
|
||||||
const int64_t S_v = nev0;
|
const int64_t S_v = nev0;
|
||||||
const int64_t H = nev1;
|
const int64_t H = nev1;
|
||||||
|
|
@ -175,7 +213,9 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||||
|
|
||||||
const bool kda = (src_g->ne[0] == S_v);
|
const bool kda = (src_g->ne[0] == S_v);
|
||||||
|
|
||||||
const int64_t rq1 = nev1 / neq1;
|
GGML_ASSERT(neq1 == nek1);
|
||||||
|
const int64_t neqk1 = neq1;
|
||||||
|
|
||||||
const int64_t rq3 = nev3 / neq3;
|
const int64_t rq3 = nev3 / neq3;
|
||||||
|
|
||||||
const float * q_d = (const float *) src_q->data;
|
const float * q_d = (const float *) src_q->data;
|
||||||
|
|
@ -214,10 +254,10 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||||
if (kda) {
|
if (kda) {
|
||||||
launch_gated_delta_net<true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
launch_gated_delta_net<true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||||
sb1, sb2, sb3, rq1, rq3, scale, stream);
|
sb1, sb2, sb3, neqk1, rq3, scale, stream);
|
||||||
} else {
|
} else {
|
||||||
launch_gated_delta_net<false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
launch_gated_delta_net<false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
|
||||||
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||||
sb1, sb2, sb3, rq1, rq3, scale, stream);
|
sb1, sb2, sb3, neqk1, rq3, scale, stream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -124,7 +124,10 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
||||||
err = cudaMallocManaged(ptr, size);
|
err = cudaMallocManaged(ptr, size);
|
||||||
#if defined(GGML_USE_HIP)
|
#if defined(GGML_USE_HIP)
|
||||||
if (err == hipSuccess) {
|
if (err == hipSuccess) {
|
||||||
CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
|
// hipMemAdviseSetCoarseGrain is an optional performance hint;
|
||||||
|
// ignore errors (e.g. hipErrorInvalidValue on some APU/iGPU configs).
|
||||||
|
cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
|
||||||
|
(void)hipGetLastError(); // clear any error
|
||||||
}
|
}
|
||||||
|
|
||||||
// fall back to cudaMalloc if not supported (e.g. on Windows)
|
// fall back to cudaMalloc if not supported (e.g. on Windows)
|
||||||
|
|
@ -205,7 +208,14 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
|
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
|
||||||
|
|
||||||
int64_t total_vram = 0;
|
int64_t total_vram = 0;
|
||||||
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
|
for (int id = 0; id < info.device_count; ++id) {
|
||||||
|
cudaDeviceProp prop;
|
||||||
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
||||||
|
total_vram += prop.totalGlobalMem;
|
||||||
|
}
|
||||||
|
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices (Total VRAM: %zu MiB):\n",
|
||||||
|
__func__, info.device_count, (size_t)(total_vram / (1024 * 1024)));
|
||||||
|
total_vram = 0;
|
||||||
|
|
||||||
std::vector<std::pair<int, std::string>> turing_devices_without_mma;
|
std::vector<std::pair<int, std::string>> turing_devices_without_mma;
|
||||||
for (int id = 0; id < info.device_count; ++id) {
|
for (int id = 0; id < info.device_count; ++id) {
|
||||||
|
|
@ -243,6 +253,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
#else
|
#else
|
||||||
info.devices[id].supports_cooperative_launch = false;
|
info.devices[id].supports_cooperative_launch = false;
|
||||||
#endif // !(GGML_USE_MUSA)
|
#endif // !(GGML_USE_MUSA)
|
||||||
|
|
||||||
#if defined(GGML_USE_HIP)
|
#if defined(GGML_USE_HIP)
|
||||||
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
||||||
|
|
||||||
|
|
@ -257,22 +268,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
info.devices[id].cc += prop.minor * 0x10;
|
info.devices[id].cc += prop.minor * 0x10;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
|
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB\n",
|
||||||
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
|
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
|
||||||
device_vmm ? "yes" : "no", prop.warpSize);
|
device_vmm ? "yes" : "no", prop.warpSize,
|
||||||
|
(size_t)(prop.totalGlobalMem / (1024 * 1024)));
|
||||||
#elif defined(GGML_USE_MUSA)
|
#elif defined(GGML_USE_MUSA)
|
||||||
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
|
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
|
||||||
info.devices[id].warp_size = 32;
|
info.devices[id].warp_size = 32;
|
||||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||||
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
|
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
|
||||||
info.devices[id].cc += prop.minor * 0x10;
|
info.devices[id].cc += prop.minor * 0x10;
|
||||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB\n",
|
||||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||||
|
(size_t)(prop.totalGlobalMem / (1024 * 1024)));
|
||||||
#else
|
#else
|
||||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB\n",
|
||||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||||
|
(size_t)(prop.totalGlobalMem / (1024 * 1024)));
|
||||||
std::string device_name(prop.name);
|
std::string device_name(prop.name);
|
||||||
if (device_name == "NVIDIA GeForce MX450") {
|
if (device_name == "NVIDIA GeForce MX450") {
|
||||||
turing_devices_without_mma.push_back({ id, device_name });
|
turing_devices_without_mma.push_back({ id, device_name });
|
||||||
|
|
@ -287,6 +301,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
// TODO: Check for future drivers the default scheduling strategy and
|
// TODO: Check for future drivers the default scheduling strategy and
|
||||||
// remove this call again when cudaDeviceScheduleSpin is default.
|
// remove this call again when cudaDeviceScheduleSpin is default.
|
||||||
if (prop.major == 12 && prop.minor == 1) {
|
if (prop.major == 12 && prop.minor == 1) {
|
||||||
|
CUDA_CHECK(cudaSetDevice(id));
|
||||||
CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
|
CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1226,6 +1241,34 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct cublas_force_compute_type {
|
||||||
|
bool fp32 = false;
|
||||||
|
bool fp16 = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
static const cublas_force_compute_type & ggml_cuda_cublas_get_force_compute_type() {
|
||||||
|
static const cublas_force_compute_type compute_type = [] {
|
||||||
|
cublas_force_compute_type result;
|
||||||
|
|
||||||
|
const bool ggml_cuda_force_cublas_compute_32f_env = getenv("GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F") != nullptr;
|
||||||
|
const bool ggml_cuda_force_cublas_compute_16f_env = getenv("GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F") != nullptr;
|
||||||
|
|
||||||
|
GGML_ASSERT(ggml_cuda_force_cublas_compute_16f_env == false || ggml_cuda_force_cublas_compute_32f_env == false);
|
||||||
|
|
||||||
|
if (ggml_cuda_force_cublas_compute_32f_env) {
|
||||||
|
GGML_LOG_INFO("Detected GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F\n");
|
||||||
|
result.fp32 = true;
|
||||||
|
} else if (ggml_cuda_force_cublas_compute_16f_env) {
|
||||||
|
GGML_LOG_INFO("Detected GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F\n");
|
||||||
|
result.fp16 = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}();
|
||||||
|
|
||||||
|
return compute_type;
|
||||||
|
}
|
||||||
|
|
||||||
static void ggml_cuda_op_mul_mat_cublas(
|
static void ggml_cuda_op_mul_mat_cublas(
|
||||||
ggml_backend_cuda_context & ctx,
|
ggml_backend_cuda_context & ctx,
|
||||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
||||||
|
|
@ -1308,7 +1351,13 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||||
|
|
||||||
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
||||||
|
|
||||||
if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
|
const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
|
||||||
|
|
||||||
|
if (!force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
|
||||||
|
|| GGML_CUDA_CC_IS_RDNA4(cc)
|
||||||
|
|| cc == GGML_CUDA_CC_VOLTA
|
||||||
|
|| force_compute_type.fp32))
|
||||||
|
{
|
||||||
const float alpha = 1.0f;
|
const float alpha = 1.0f;
|
||||||
const float beta = 0.0f;
|
const float beta = 0.0f;
|
||||||
CUBLAS_CHECK(
|
CUBLAS_CHECK(
|
||||||
|
|
@ -1907,10 +1956,23 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
|
||||||
cudaDataType_t cu_data_type_b = traits::data_type;
|
cudaDataType_t cu_data_type_b = traits::data_type;
|
||||||
const void * alpha = traits::get_alpha();
|
const void * alpha = traits::get_alpha();
|
||||||
const void * beta = traits::get_beta();
|
const void * beta = traits::get_beta();
|
||||||
const float alpha_f32 = 1.0f;
|
|
||||||
const float beta_f32 = 0.0f;
|
|
||||||
|
|
||||||
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
|
||||||
|
|
||||||
|
int id = ggml_cuda_get_device();
|
||||||
|
const int cc = ggml_cuda_info().devices[id].cc;
|
||||||
|
static constexpr bool is_src0_type_f16 = src0_type == GGML_TYPE_F16;
|
||||||
|
|
||||||
|
// bf16 and fp32 are already being computed in fp32 (ensure it using static_assert),
|
||||||
|
// so checking necessity of forced fp32 only for fp16 src0_type
|
||||||
|
static_assert(is_src0_type_f16 || traits::compute_type == CUBLAS_COMPUTE_32F);
|
||||||
|
|
||||||
|
const bool need_compute_32f = is_src0_type_f16 && !force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
|
||||||
|
|| GGML_CUDA_CC_IS_RDNA4(cc)
|
||||||
|
|| cc == GGML_CUDA_CC_VOLTA
|
||||||
|
|| force_compute_type.fp32);
|
||||||
|
|
||||||
|
if (dst->op_params[0] == GGML_PREC_DEFAULT && !need_compute_32f) {
|
||||||
if constexpr (src0_type == GGML_TYPE_F32) {
|
if constexpr (src0_type == GGML_TYPE_F32) {
|
||||||
dst_t = (char *) dst_ddf; // Direct F32 output
|
dst_t = (char *) dst_ddf; // Direct F32 output
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -1920,18 +1982,10 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
dst_t = (char *) dst_ddf;
|
dst_t = (char *) dst_ddf;
|
||||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
cu_compute_type = batched_mul_mat_traits<GGML_TYPE_F32>::compute_type;
|
||||||
cu_data_type = CUDA_R_32F;
|
cu_data_type = batched_mul_mat_traits<GGML_TYPE_F32>::data_type;
|
||||||
alpha = &alpha_f32;
|
alpha = batched_mul_mat_traits<GGML_TYPE_F32>::get_alpha();
|
||||||
beta = &beta_f32;
|
beta = batched_mul_mat_traits<GGML_TYPE_F32>::get_beta();
|
||||||
}
|
|
||||||
|
|
||||||
int id = ggml_cuda_get_device();
|
|
||||||
const int cc = ggml_cuda_info().devices[id].cc;
|
|
||||||
if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
|
|
||||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
|
||||||
alpha = &alpha_f32;
|
|
||||||
beta = &beta_f32;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_ASSERT(ne12 % ne02 == 0);
|
GGML_ASSERT(ne12 % ne02 == 0);
|
||||||
|
|
@ -2807,14 +2861,11 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
|
||||||
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
||||||
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
||||||
|
|
||||||
//enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
|
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
|
||||||
bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;
|
|
||||||
|
|
||||||
if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
|
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2825,17 +2876,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
|
||||||
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
|
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
|
||||||
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
|
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
|
||||||
|
|
||||||
if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
|
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
|
||||||
!copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
|
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (copy_from_host) {
|
if (backend_src != backend_dst) {
|
||||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
|
|
||||||
} else if (backend_src != backend_dst) {
|
|
||||||
// copy on src stream
|
// copy on src stream
|
||||||
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
||||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
||||||
|
|
@ -4976,9 +5024,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
case GGML_OP_RWKV_WKV6:
|
case GGML_OP_RWKV_WKV6:
|
||||||
case GGML_OP_GATED_LINEAR_ATTN:
|
case GGML_OP_GATED_LINEAR_ATTN:
|
||||||
case GGML_OP_GATED_DELTA_NET:
|
|
||||||
case GGML_OP_RWKV_WKV7:
|
case GGML_OP_RWKV_WKV7:
|
||||||
return true;
|
return true;
|
||||||
|
case GGML_OP_GATED_DELTA_NET:
|
||||||
|
//TODO: enable once MUSA compiler is solved https://github.com/ggml-org/llama.cpp/pull/19504#issuecomment-4018634327
|
||||||
|
#ifdef GGML_USE_MUSA
|
||||||
|
return false;
|
||||||
|
#else
|
||||||
|
return true;
|
||||||
|
#endif // GGML_USE_MUSA
|
||||||
case GGML_OP_FLASH_ATTN_EXT:
|
case GGML_OP_FLASH_ATTN_EXT:
|
||||||
return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op);
|
return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op);
|
||||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||||
|
|
|
||||||
|
|
@ -60,11 +60,17 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||||
enum mmvq_parameter_table_id {
|
enum mmvq_parameter_table_id {
|
||||||
MMVQ_PARAMETERS_GENERIC = 0,
|
MMVQ_PARAMETERS_GENERIC = 0,
|
||||||
MMVQ_PARAMETERS_GCN,
|
MMVQ_PARAMETERS_GCN,
|
||||||
MMVQ_PARAMETERS_RDNA2
|
MMVQ_PARAMETERS_RDNA2,
|
||||||
|
MMVQ_PARAMETERS_RDNA3_0,
|
||||||
|
MMVQ_PARAMETERS_RDNA4
|
||||||
};
|
};
|
||||||
|
|
||||||
static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
|
static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
|
||||||
#if defined(RDNA2) || defined(RDNA3) || defined(RDNA4)
|
#if defined(RDNA4)
|
||||||
|
return MMVQ_PARAMETERS_RDNA4;
|
||||||
|
#elif defined(RDNA3_0)
|
||||||
|
return MMVQ_PARAMETERS_RDNA3_0;
|
||||||
|
#elif defined(RDNA2) || defined(RDNA3_5)
|
||||||
return MMVQ_PARAMETERS_RDNA2;
|
return MMVQ_PARAMETERS_RDNA2;
|
||||||
#elif defined(GCN) || defined(CDNA)
|
#elif defined(GCN) || defined(CDNA)
|
||||||
return MMVQ_PARAMETERS_GCN;
|
return MMVQ_PARAMETERS_GCN;
|
||||||
|
|
@ -74,7 +80,13 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
|
||||||
}
|
}
|
||||||
|
|
||||||
static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
|
static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
|
||||||
if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
|
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||||
|
return MMVQ_PARAMETERS_RDNA4;
|
||||||
|
}
|
||||||
|
if (GGML_CUDA_CC_IS_RDNA3_0(cc)) {
|
||||||
|
return MMVQ_PARAMETERS_RDNA3_0;
|
||||||
|
}
|
||||||
|
if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) {
|
||||||
return MMVQ_PARAMETERS_RDNA2;
|
return MMVQ_PARAMETERS_RDNA2;
|
||||||
}
|
}
|
||||||
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
|
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||||
|
|
@ -83,7 +95,7 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
|
||||||
return MMVQ_PARAMETERS_GENERIC;
|
return MMVQ_PARAMETERS_GENERIC;
|
||||||
}
|
}
|
||||||
|
|
||||||
static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) {
|
static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_dst, mmvq_parameter_table_id table_id) {
|
||||||
if (table_id == MMVQ_PARAMETERS_GENERIC) {
|
if (table_id == MMVQ_PARAMETERS_GENERIC) {
|
||||||
switch (ncols_dst) {
|
switch (ncols_dst) {
|
||||||
case 1:
|
case 1:
|
||||||
|
|
@ -114,6 +126,50 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_paramet
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (table_id == MMVQ_PARAMETERS_RDNA4) {
|
||||||
|
// nwarps=8 benefits types with simple vec_dot on RDNA4 (ncols_dst=1).
|
||||||
|
// Types with complex vec_dot (Q3_K, IQ2_*, IQ3_*) regress due to register
|
||||||
|
// pressure and lookup table contention at higher thread counts.
|
||||||
|
if (ncols_dst == 1) {
|
||||||
|
switch (type) {
|
||||||
|
case GGML_TYPE_Q4_0:
|
||||||
|
case GGML_TYPE_Q4_1:
|
||||||
|
case GGML_TYPE_Q5_0:
|
||||||
|
case GGML_TYPE_Q5_1:
|
||||||
|
case GGML_TYPE_Q8_0:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_IQ4_NL:
|
||||||
|
case GGML_TYPE_IQ4_XS:
|
||||||
|
return 8;
|
||||||
|
default:
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (table_id == MMVQ_PARAMETERS_RDNA3_0) {
|
||||||
|
// RDNA3 (W7900): stricter whitelist than RDNA4.
|
||||||
|
// Q2_K / Q5_K / IQ4_XS regress in full quant sweeps.
|
||||||
|
if (ncols_dst == 1) {
|
||||||
|
switch (type) {
|
||||||
|
case GGML_TYPE_Q4_0:
|
||||||
|
case GGML_TYPE_Q4_1:
|
||||||
|
case GGML_TYPE_Q5_0:
|
||||||
|
case GGML_TYPE_Q5_1:
|
||||||
|
case GGML_TYPE_Q8_0:
|
||||||
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_IQ4_NL:
|
||||||
|
return 8;
|
||||||
|
default:
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -138,7 +194,7 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
|
||||||
}
|
}
|
||||||
|
|
||||||
template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false>
|
template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false>
|
||||||
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
__launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||||
static __global__ void mul_mat_vec_q(
|
static __global__ void mul_mat_vec_q(
|
||||||
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
|
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
|
||||||
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
|
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
|
||||||
|
|
@ -151,7 +207,7 @@ static __global__ void mul_mat_vec_q(
|
||||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||||
constexpr int vdr = get_vdr_mmvq(type);
|
constexpr int vdr = get_vdr_mmvq(type);
|
||||||
constexpr mmvq_parameter_table_id table_id = get_device_table_id();
|
constexpr mmvq_parameter_table_id table_id = get_device_table_id();
|
||||||
constexpr int nwarps = calc_nwarps(ncols_dst, table_id);
|
constexpr int nwarps = calc_nwarps(type, ncols_dst, table_id);
|
||||||
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id);
|
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id);
|
||||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||||
|
|
||||||
|
|
@ -355,12 +411,13 @@ static __global__ void mul_mat_vec_q(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<ggml_type type>
|
||||||
static std::pair<dim3, dim3> calc_launch_params(
|
static std::pair<dim3, dim3> calc_launch_params(
|
||||||
const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens,
|
const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens,
|
||||||
const int warp_size, const mmvq_parameter_table_id table_id) {
|
const int warp_size, const mmvq_parameter_table_id table_id) {
|
||||||
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id);
|
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id);
|
||||||
const dim3 block_nums(nblocks, nchannels_dst, nsamples_or_ntokens);
|
const dim3 block_nums(nblocks, nchannels_dst, nsamples_or_ntokens);
|
||||||
const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1);
|
const dim3 block_dims(warp_size, calc_nwarps(type, ncols_dst, table_id), 1);
|
||||||
return {block_nums, block_dims};
|
return {block_nums, block_dims};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -420,7 +477,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||||
if (has_ids && ncols_dst > 1) {
|
if (has_ids && ncols_dst > 1) {
|
||||||
// Multi-token MUL_MAT_ID path only - single-token goes through regular path below
|
// Multi-token MUL_MAT_ID path only - single-token goes through regular path below
|
||||||
constexpr int c_ncols_dst = 1;
|
constexpr int c_ncols_dst = 1;
|
||||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
|
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
|
||||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||||
|
|
@ -431,7 +488,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||||
switch (ncols_dst) {
|
switch (ncols_dst) {
|
||||||
case 1: {
|
case 1: {
|
||||||
constexpr int c_ncols_dst = 1;
|
constexpr int c_ncols_dst = 1;
|
||||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||||
|
|
@ -439,7 +496,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||||
} break;
|
} break;
|
||||||
case 2: {
|
case 2: {
|
||||||
constexpr int c_ncols_dst = 2;
|
constexpr int c_ncols_dst = 2;
|
||||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||||
|
|
@ -447,7 +504,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||||
} break;
|
} break;
|
||||||
case 3: {
|
case 3: {
|
||||||
constexpr int c_ncols_dst = 3;
|
constexpr int c_ncols_dst = 3;
|
||||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||||
|
|
@ -455,7 +512,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||||
} break;
|
} break;
|
||||||
case 4: {
|
case 4: {
|
||||||
constexpr int c_ncols_dst = 4;
|
constexpr int c_ncols_dst = 4;
|
||||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||||
|
|
@ -463,7 +520,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||||
} break;
|
} break;
|
||||||
case 5: {
|
case 5: {
|
||||||
constexpr int c_ncols_dst = 5;
|
constexpr int c_ncols_dst = 5;
|
||||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||||
|
|
@ -471,7 +528,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||||
} break;
|
} break;
|
||||||
case 6: {
|
case 6: {
|
||||||
constexpr int c_ncols_dst = 6;
|
constexpr int c_ncols_dst = 6;
|
||||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||||
|
|
@ -479,7 +536,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||||
} break;
|
} break;
|
||||||
case 7: {
|
case 7: {
|
||||||
constexpr int c_ncols_dst = 7;
|
constexpr int c_ncols_dst = 7;
|
||||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||||
|
|
@ -487,7 +544,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||||
} break;
|
} break;
|
||||||
case 8: {
|
case 8: {
|
||||||
constexpr int c_ncols_dst = 8;
|
constexpr int c_ncols_dst = 8;
|
||||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||||
|
|
|
||||||
|
|
@ -76,7 +76,7 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
|
||||||
int row = tid / load_cols;
|
int row = tid / load_cols;
|
||||||
int col = tid % load_cols;
|
int col = tid % load_cols;
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int idx = tid; idx < total_elems; idx += split_d_inner) {
|
for (int idx = 0; idx < total_elems; idx += split_d_inner) {
|
||||||
if (row < (int)split_d_inner) {
|
if (row < (int)split_d_inner) {
|
||||||
smem[row * n_cols + col] = x_block[row * stride_x + col];
|
smem[row * n_cols + col] = x_block[row * stride_x + col];
|
||||||
}
|
}
|
||||||
|
|
@ -84,6 +84,9 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
|
||||||
col += split_d_inner;
|
col += split_d_inner;
|
||||||
row += col / load_cols;
|
row += col / load_cols;
|
||||||
col = col % load_cols;
|
col = col % load_cols;
|
||||||
|
if (idx >= total_elems - tid - split_d_inner) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -207,6 +207,14 @@
|
||||||
#define RDNA3
|
#define RDNA3
|
||||||
#endif // defined(__GFX11__)
|
#endif // defined(__GFX11__)
|
||||||
|
|
||||||
|
#if defined(__gfx1150__) || defined(__gfx1151__)
|
||||||
|
#define RDNA3_5
|
||||||
|
#endif // defined(__gfx1150__) || defined(__gfx1151__)
|
||||||
|
|
||||||
|
#if defined(RDNA3) && !defined(RDNA3_5)
|
||||||
|
#define RDNA3_0
|
||||||
|
#endif // defined(RDNA3) && !defined(RDNA3_5)
|
||||||
|
|
||||||
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
|
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
|
||||||
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
|
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
|
||||||
#define RDNA2
|
#define RDNA2
|
||||||
|
|
|
||||||
|
|
@ -402,6 +402,7 @@ static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi
|
||||||
static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
||||||
static const int qk = QK_Q4_0x4x2;
|
static const int qk = QK_Q4_0x4x2;
|
||||||
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
||||||
|
const int nloe = k % qk; // leftovers
|
||||||
|
|
||||||
const int dblk_size = 8 * 2; // 8x __fp16
|
const int dblk_size = 8 * 2; // 8x __fp16
|
||||||
const int qblk_size = qk / 2; // int4
|
const int qblk_size = qk / 2; // int4
|
||||||
|
|
@ -435,9 +436,11 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
||||||
unpack_q4_0_quants(qs, &x[i * 8 + 6], 6);
|
unpack_q4_0_quants(qs, &x[i * 8 + 6], 6);
|
||||||
unpack_q4_0_quants(qs, &x[i * 8 + 7], 7);
|
unpack_q4_0_quants(qs, &x[i * 8 + 7], 7);
|
||||||
|
|
||||||
|
bool partial = (nloe && i == nb-1);
|
||||||
|
|
||||||
uint8_t * q = y_q + (i * qblk_size);
|
uint8_t * q = y_q + (i * qblk_size);
|
||||||
for (int j = 0; j < qk / 2; j++) {
|
for (int j = 0; j < qk / 2; j++) {
|
||||||
q[j] = (qs[j + 128] << 4) | qs[j];
|
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -467,6 +470,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
|
||||||
static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
||||||
static const int qk = QK_Q4_0x4x2;
|
static const int qk = QK_Q4_0x4x2;
|
||||||
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
||||||
|
const int nloe = k % qk; // leftovers
|
||||||
|
|
||||||
const int dblk_size = 8 * 2; // 8x __fp16
|
const int dblk_size = 8 * 2; // 8x __fp16
|
||||||
const int qblk_size = qk / 2; // int4
|
const int qblk_size = qk / 2; // int4
|
||||||
|
|
@ -485,10 +489,17 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
|
uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
|
||||||
|
|
||||||
|
bool partial = (nloe && i == nb-1);
|
||||||
|
|
||||||
const uint8_t * q = y_q + (i * qblk_size);
|
const uint8_t * q = y_q + (i * qblk_size);
|
||||||
for (int j = 0; j < qk / 2; j++) {
|
for (int j = 0; j < qk / 2; j++) {
|
||||||
qs[j] = q[j] & 0xf;
|
if (partial) {
|
||||||
qs[j + 128] = q[j] >> 4;
|
qs[j*2+0] = q[j] & 0xf;
|
||||||
|
qs[j*2+1] = q[j] >> 4;
|
||||||
|
} else {
|
||||||
|
qs[j+000] = q[j] & 0xf;
|
||||||
|
qs[j+128] = q[j] >> 4;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
|
pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
|
||||||
|
|
@ -1078,6 +1089,7 @@ static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int
|
||||||
static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) {
|
static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) {
|
||||||
static const int qk = QK_MXFP4x4x2;
|
static const int qk = QK_MXFP4x4x2;
|
||||||
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
||||||
|
const int nloe = k % qk; // leftovers
|
||||||
|
|
||||||
const int eblk_size = 8 * 1; // 8x E8M0
|
const int eblk_size = 8 * 1; // 8x E8M0
|
||||||
const int qblk_size = qk / 2; // int4
|
const int qblk_size = qk / 2; // int4
|
||||||
|
|
@ -1112,9 +1124,11 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
||||||
unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6);
|
unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6);
|
||||||
unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7);
|
unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7);
|
||||||
|
|
||||||
|
bool partial = (nloe && i == nb-1);
|
||||||
|
|
||||||
uint8_t * q = y_q + (i * qblk_size);
|
uint8_t * q = y_q + (i * qblk_size);
|
||||||
for (int j = 0; j < qk / 2; j++) {
|
for (int j = 0; j < qk / 2; j++) {
|
||||||
q[j] = (qs[j + 128] << 4) | qs[j];
|
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1144,6 +1158,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
|
||||||
static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) {
|
static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) {
|
||||||
static const int qk = QK_MXFP4x4x2;
|
static const int qk = QK_MXFP4x4x2;
|
||||||
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
||||||
|
const int nloe = k % qk; // leftovers
|
||||||
|
|
||||||
const int eblk_size = 8 * 1; // 8x E8M0
|
const int eblk_size = 8 * 1; // 8x E8M0
|
||||||
const int qblk_size = qk / 2; // int4
|
const int qblk_size = qk / 2; // int4
|
||||||
|
|
@ -1162,10 +1177,17 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
|
uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
|
||||||
|
|
||||||
|
bool partial = (nloe && i == nb-1);
|
||||||
|
|
||||||
const uint8_t * q = y_q + (i * qblk_size);
|
const uint8_t * q = y_q + (i * qblk_size);
|
||||||
for (int j = 0; j < qk / 2; j++) {
|
for (int j = 0; j < qk / 2; j++) {
|
||||||
qs[j] = q[j] & 0xf;
|
if (partial) {
|
||||||
qs[j + 128] = q[j] >> 4;
|
qs[j*2+0] = q[j] & 0xf;
|
||||||
|
qs[j*2+1] = q[j] >> 4;
|
||||||
|
} else {
|
||||||
|
qs[j+000] = q[j] & 0xf;
|
||||||
|
qs[j+128] = q[j] >> 4;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
|
pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
|
||||||
|
|
@ -1801,12 +1823,12 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src0->ne[1] > 16 * 1024) {
|
if (ggml_nrows(src0) > 16 * 1024) {
|
||||||
return false; // typically the lm-head which would be too large for VTCM
|
return false; // typically the lm-head which would be too large for VTCM
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((src1->ne[2] != 1 || src1->ne[3] != 1)) {
|
if (ggml_nrows(src1) > 1024 || src1->ne[2] != 1 || src1->ne[3] != 1) {
|
||||||
return false;
|
return false; // no huge batches or broadcasting (for now)
|
||||||
}
|
}
|
||||||
|
|
||||||
// src0 (weights) must be repacked
|
// src0 (weights) must be repacked
|
||||||
|
|
@ -1820,6 +1842,9 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
||||||
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
|
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (ggml_nrows(src1) > 1024) {
|
||||||
|
return false; // no huge batches (for now)
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
|
|
|
||||||
|
|
@ -77,7 +77,7 @@ static inline size_t q8x4x2_row_size(uint32_t ne) {
|
||||||
return hex_round_up(ne + nb * 8 * sizeof(__fp16), 128);
|
return hex_round_up(ne + nb * 8 * sizeof(__fp16), 128);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) {
|
static inline HVX_Vector_x8 hvx_vec_load_q4x4x8_full(const uint8_t * restrict ptr) {
|
||||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||||
|
|
||||||
HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
|
HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
|
||||||
|
|
@ -88,9 +88,9 @@ static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) {
|
||||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||||
const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
|
const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
|
||||||
|
|
||||||
HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F
|
HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F : first 128 elements
|
||||||
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4
|
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 : second 128 elements
|
||||||
HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F
|
HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F ...
|
||||||
HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4
|
HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4
|
||||||
HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F
|
HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F
|
||||||
HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4
|
HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4
|
||||||
|
|
@ -111,7 +111,41 @@ static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) {
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8(const uint8_t * restrict ptr) {
|
static HVX_Vector_x8 hvx_vec_load_q4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
|
||||||
|
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||||
|
|
||||||
|
const uint32_t qk = QK_Q4_0x4x2; // 256
|
||||||
|
const uint32_t nb = n / qk;
|
||||||
|
const uint32_t nloe = n % qk;
|
||||||
|
|
||||||
|
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||||
|
const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
|
||||||
|
|
||||||
|
HVX_Vector_x8 r;
|
||||||
|
uint32_t i = 0;
|
||||||
|
|
||||||
|
#pragma unroll(2)
|
||||||
|
for (i=0; i < nb; i++) {
|
||||||
|
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||||
|
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
|
||||||
|
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
|
||||||
|
r.v[i*2+0] = Q6_Vb_vsub_VbVb(v0, i8);
|
||||||
|
r.v[i*2+1] = Q6_Vb_vsub_VbVb(v1, i8);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nloe) {
|
||||||
|
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||||
|
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
|
||||||
|
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
|
||||||
|
HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
|
||||||
|
r.v[i*2+0] = Q6_Vb_vsub_VbVb(Q6_V_lo_W(v0_1_p), i8);
|
||||||
|
r.v[i*2+1] = Q6_Vb_vsub_VbVb(Q6_V_hi_W(v0_1_p), i8);
|
||||||
|
}
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_full(const uint8_t * restrict ptr) {
|
||||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||||
|
|
||||||
HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
|
HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
|
||||||
|
|
@ -144,7 +178,41 @@ static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8(const uint8_t * restrict ptr)
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) {
|
static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
|
||||||
|
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||||
|
|
||||||
|
const uint32_t qk = QK_Q4_0x4x2; // 256
|
||||||
|
const uint32_t nb = n / qk;
|
||||||
|
const uint32_t nloe = n % qk;
|
||||||
|
|
||||||
|
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||||
|
const HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut;
|
||||||
|
|
||||||
|
HVX_Vector_x8 r;
|
||||||
|
uint32_t i = 0;
|
||||||
|
|
||||||
|
#pragma unroll(2)
|
||||||
|
for (i=0; i < nb; i++) {
|
||||||
|
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||||
|
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
|
||||||
|
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
|
||||||
|
r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
|
||||||
|
r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nloe) {
|
||||||
|
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
|
||||||
|
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
|
||||||
|
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
|
||||||
|
HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
|
||||||
|
r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v0_1_p), lut, 0);
|
||||||
|
r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v0_1_p), lut, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_full(const uint8_t * restrict ptr) {
|
||||||
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
|
||||||
|
|
||||||
HVX_Vector v0 = vptr[0]; // first 128 vals
|
HVX_Vector v0 = vptr[0]; // first 128 vals
|
||||||
|
|
@ -160,6 +228,10 @@ static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) {
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_partial(const uint8_t * restrict ptr, uint32_t nloe) {
|
||||||
|
return hvx_vec_load_q8x4x8_full(ptr);
|
||||||
|
}
|
||||||
|
|
||||||
// Reduce multiply 1024 x 1024 int8 elements (32x q4/8 blocks in 8x HVX vectors).
|
// Reduce multiply 1024 x 1024 int8 elements (32x q4/8 blocks in 8x HVX vectors).
|
||||||
// Accumulate each block into a single int32 value.
|
// Accumulate each block into a single int32 value.
|
||||||
// Return a single HVX vector with 32x int32 accumulators.
|
// Return a single HVX vector with 32x int32 accumulators.
|
||||||
|
|
@ -167,14 +239,14 @@ static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) {
|
||||||
// if() checks are optimized out at compile time -- make sure to pass N as a constexpr.
|
// if() checks are optimized out at compile time -- make sure to pass N as a constexpr.
|
||||||
|
|
||||||
static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
|
static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
|
||||||
HVX_Vector r0 = Q6_V_vsplat_R(0);
|
HVX_Vector r0 = Q6_V_vzero();
|
||||||
HVX_Vector r1 = Q6_V_vsplat_R(0);
|
HVX_Vector r1 = Q6_V_vzero();
|
||||||
HVX_Vector r2 = Q6_V_vsplat_R(0);
|
HVX_Vector r2 = Q6_V_vzero();
|
||||||
HVX_Vector r3 = Q6_V_vsplat_R(0);
|
HVX_Vector r3 = Q6_V_vzero();
|
||||||
HVX_Vector r4 = Q6_V_vsplat_R(0);
|
HVX_Vector r4 = Q6_V_vzero();
|
||||||
HVX_Vector r5 = Q6_V_vsplat_R(0);
|
HVX_Vector r5 = Q6_V_vzero();
|
||||||
HVX_Vector r6 = Q6_V_vsplat_R(0);
|
HVX_Vector r6 = Q6_V_vzero();
|
||||||
HVX_Vector r7 = Q6_V_vsplat_R(0);
|
HVX_Vector r7 = Q6_V_vzero();
|
||||||
|
|
||||||
HVX_VectorPair p3;
|
HVX_VectorPair p3;
|
||||||
HVX_VectorPair p2;
|
HVX_VectorPair p2;
|
||||||
|
|
@ -213,15 +285,42 @@ static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, uns
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline HVX_Vector hvx_vec_rmpy_x8_full(HVX_Vector_x8 x, HVX_Vector_x8 y) {
|
static inline HVX_Vector hvx_vec_rmpy_x8_full(HVX_Vector_x8 x, HVX_Vector_x8 y) {
|
||||||
return hvx_vec_rmpy_x8_n(x, y, 1024);
|
HVX_Vector r0 = Q6_Vw_vrmpy_VbVb(x.v[0], y.v[0]);
|
||||||
|
HVX_Vector r1 = Q6_Vw_vrmpy_VbVb(x.v[1], y.v[1]);
|
||||||
|
HVX_Vector r2 = Q6_Vw_vrmpy_VbVb(x.v[2], y.v[2]);
|
||||||
|
HVX_Vector r3 = Q6_Vw_vrmpy_VbVb(x.v[3], y.v[3]);
|
||||||
|
HVX_Vector r4 = Q6_Vw_vrmpy_VbVb(x.v[4], y.v[4]);
|
||||||
|
HVX_Vector r5 = Q6_Vw_vrmpy_VbVb(x.v[5], y.v[5]);
|
||||||
|
HVX_Vector r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]);
|
||||||
|
HVX_Vector r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]);
|
||||||
|
|
||||||
|
HVX_VectorPair p0 = Q6_W_vdeal_VVR(r1, r0, -4);
|
||||||
|
HVX_VectorPair p1 = Q6_W_vdeal_VVR(r3, r2, -4);
|
||||||
|
HVX_VectorPair p2 = Q6_W_vdeal_VVR(r5, r4, -4);
|
||||||
|
HVX_VectorPair p3 = Q6_W_vdeal_VVR(r7, r6, -4);
|
||||||
|
|
||||||
|
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
|
||||||
|
r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1));
|
||||||
|
r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2));
|
||||||
|
r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3));
|
||||||
|
|
||||||
|
p0 = Q6_W_vdeal_VVR(r1, r0, -4);
|
||||||
|
p1 = Q6_W_vdeal_VVR(r3, r2, -4);
|
||||||
|
|
||||||
|
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
|
||||||
|
r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1));
|
||||||
|
|
||||||
|
p0 = Q6_W_vdeal_VVR(r1, r0, -4);
|
||||||
|
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
|
||||||
|
|
||||||
|
return r0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle most common cases of tensors not multiple of 1024.
|
static inline HVX_Vector hvx_vec_rmpy_x8_partial(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
|
||||||
static inline HVX_Vector hvx_vec_rmpy_x8_nloe(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
|
if (n >= 512)
|
||||||
if (n <= 256) { return hvx_vec_rmpy_x8_n(x, y, 256); };
|
return hvx_vec_rmpy_x8_full(x, y);
|
||||||
if (n <= 512) { return hvx_vec_rmpy_x8_n(x, y, 512); };
|
|
||||||
if (n <= 768) { return hvx_vec_rmpy_x8_n(x, y, 768); };
|
return hvx_vec_rmpy_x8_partial(x, y, 512);
|
||||||
return hvx_vec_rmpy_x8_n(x, y, 1024);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
|
static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
|
||||||
|
|
@ -246,7 +345,7 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
||||||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||||
|
|
||||||
// Row sum (sf)
|
// Row sum (sf)
|
||||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_sum = Q6_V_vzero();
|
||||||
|
|
||||||
// Multiply and accumulate into int32.
|
// Multiply and accumulate into int32.
|
||||||
// Compute combined scale (fp32).
|
// Compute combined scale (fp32).
|
||||||
|
|
@ -257,12 +356,12 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
for (; i < nb; i++) {
|
for (; i < nb; i++) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||||
|
|
||||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
|
|
||||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||||
|
|
@ -272,19 +371,19 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
||||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
// Process leftovers
|
||||||
if (nloe) {
|
if (nloe) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||||
|
|
||||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
|
|
||||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||||
|
|
||||||
// Zero out unused scales
|
// Zero out unused elements
|
||||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||||
|
|
@ -326,8 +425,8 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
||||||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||||
|
|
||||||
// Row sum (sf)
|
// Row sum (sf)
|
||||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_sum = Q6_V_vzero();
|
||||||
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r1_sum = Q6_V_vzero();
|
||||||
|
|
||||||
// Multiply and accumulate into int32.
|
// Multiply and accumulate into int32.
|
||||||
// Compute combined scale (fp32).
|
// Compute combined scale (fp32).
|
||||||
|
|
@ -338,14 +437,14 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
for (; i < nb; i++) {
|
for (; i < nb; i++) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
||||||
|
|
||||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||||
|
|
||||||
|
|
@ -359,23 +458,23 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
||||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
// Process leftovers
|
||||||
if (nloe) {
|
if (nloe) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe));
|
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
|
||||||
|
|
||||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||||
|
|
||||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||||
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
|
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
|
||||||
|
|
||||||
// Zero out unused scales
|
// Zero out unused elements
|
||||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||||
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
|
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
|
||||||
|
|
@ -423,10 +522,10 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
||||||
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
|
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
|
||||||
|
|
||||||
// Row sums (sf) - 4 accumulators for 2×2 tile
|
// Row sums (sf) - 4 accumulators for 2×2 tile
|
||||||
HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_c0_sum = Q6_V_vzero();
|
||||||
HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_c1_sum = Q6_V_vzero();
|
||||||
HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r1_c0_sum = Q6_V_vzero();
|
||||||
HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r1_c1_sum = Q6_V_vzero();
|
||||||
|
|
||||||
const uint32_t nb = n / qk; // num full blocks
|
const uint32_t nb = n / qk; // num full blocks
|
||||||
const uint32_t nloe = n % qk; // num leftover elements
|
const uint32_t nloe = n % qk; // num leftover elements
|
||||||
|
|
@ -434,12 +533,12 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
for (; i < nb; i++) {
|
for (; i < nb; i++) {
|
||||||
// Load src1 columns (reused across both src0 rows)
|
// Load src1 columns (reused across both src0 rows)
|
||||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
|
||||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
|
||||||
|
|
||||||
// Load src0 rows (reused across both src1 columns)
|
// Load src0 rows (reused across both src1 columns)
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
|
||||||
|
|
||||||
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
|
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
|
||||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
|
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
|
||||||
|
|
@ -448,8 +547,8 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
||||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
|
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
|
||||||
|
|
||||||
// Load scales
|
// Load scales
|
||||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||||
|
|
||||||
|
|
@ -473,18 +572,18 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
||||||
|
|
||||||
// Process leftovers
|
// Process leftovers
|
||||||
if (nloe) {
|
if (nloe) {
|
||||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||||
|
|
||||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
|
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
|
||||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
|
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
|
||||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
|
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
|
||||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
|
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
|
||||||
|
|
||||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||||
|
|
||||||
|
|
@ -545,7 +644,7 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
||||||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||||
|
|
||||||
// Row sum (sf)
|
// Row sum (sf)
|
||||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_sum = Q6_V_vzero();
|
||||||
|
|
||||||
// Multiply and accumulate into int32.
|
// Multiply and accumulate into int32.
|
||||||
// Compute combined scale (fp32).
|
// Compute combined scale (fp32).
|
||||||
|
|
@ -556,12 +655,12 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
for (; i < nb; i++) {
|
for (; i < nb; i++) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||||
|
|
||||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
|
|
||||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||||
|
|
@ -571,19 +670,19 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
|
||||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
// Process leftovers
|
||||||
if (nloe) {
|
if (nloe) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||||
|
|
||||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
|
|
||||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||||
|
|
||||||
// Zero out unused scales
|
// Zero out unused elements
|
||||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||||
|
|
@ -625,8 +724,8 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
||||||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||||
|
|
||||||
// Row sum (qf32)
|
// Row sum (qf32)
|
||||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_sum = Q6_V_vzero();
|
||||||
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r1_sum = Q6_V_vzero();
|
||||||
|
|
||||||
// Multiply and accumulate into int32.
|
// Multiply and accumulate into int32.
|
||||||
// Compute combined scale (fp32).
|
// Compute combined scale (fp32).
|
||||||
|
|
@ -637,14 +736,14 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
for (; i < nb; i++) {
|
for (; i < nb; i++) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
||||||
|
|
||||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||||
|
|
||||||
|
|
@ -658,14 +757,14 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
||||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
// Process leftovers
|
||||||
if (nloe) {
|
if (nloe) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe));
|
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
|
||||||
|
|
||||||
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
|
|
@ -674,7 +773,7 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
||||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
|
||||||
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
|
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
|
||||||
|
|
||||||
// Zero out unused scales
|
// Zero out unused elements
|
||||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||||
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
|
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
|
||||||
|
|
@ -722,10 +821,10 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
||||||
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
|
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
|
||||||
|
|
||||||
// Row sums (sf) - 4 accumulators for 2×2 tile
|
// Row sums (sf) - 4 accumulators for 2×2 tile
|
||||||
HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_c0_sum = Q6_V_vzero();
|
||||||
HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_c1_sum = Q6_V_vzero();
|
||||||
HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r1_c0_sum = Q6_V_vzero();
|
||||||
HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r1_c1_sum = Q6_V_vzero();
|
||||||
|
|
||||||
const uint32_t nb = n / qk; // num full blocks
|
const uint32_t nb = n / qk; // num full blocks
|
||||||
const uint32_t nloe = n % qk; // num leftover elements
|
const uint32_t nloe = n % qk; // num leftover elements
|
||||||
|
|
@ -733,12 +832,12 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
for (; i < nb; i++) {
|
for (; i < nb; i++) {
|
||||||
// Load src1 columns (reused across both src0 rows)
|
// Load src1 columns (reused across both src0 rows)
|
||||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
|
||||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
|
||||||
|
|
||||||
// Load src0 rows (reused across both src1 columns)
|
// Load src0 rows (reused across both src1 columns)
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size);
|
||||||
|
|
||||||
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
|
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
|
||||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
|
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
|
||||||
|
|
@ -747,8 +846,8 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
||||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
|
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
|
||||||
|
|
||||||
// Load scales
|
// Load scales
|
||||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||||
|
|
||||||
|
|
@ -772,18 +871,18 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
||||||
|
|
||||||
// Process leftovers
|
// Process leftovers
|
||||||
if (nloe) {
|
if (nloe) {
|
||||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||||
|
|
||||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
|
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
|
||||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
|
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
|
||||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
|
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
|
||||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
|
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
|
||||||
|
|
||||||
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
|
||||||
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
|
||||||
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
|
||||||
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
|
||||||
|
|
||||||
|
|
@ -792,7 +891,7 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
|
||||||
HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
|
HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
|
||||||
HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
|
HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
|
||||||
|
|
||||||
// Zero out unused scales
|
// Zero out unused elements
|
||||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||||
r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
|
r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
|
||||||
r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
|
r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
|
||||||
|
|
@ -844,7 +943,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
|
||||||
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
|
||||||
|
|
||||||
// Row sum (sf)
|
// Row sum (sf)
|
||||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_sum = Q6_V_vzero();
|
||||||
|
|
||||||
// Multiply and accumulate into int32.
|
// Multiply and accumulate into int32.
|
||||||
// Compute combined scale (fp32).
|
// Compute combined scale (fp32).
|
||||||
|
|
@ -855,8 +954,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
for (; i < nb; i++) {
|
for (; i < nb; i++) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||||
|
|
||||||
|
|
@ -887,12 +986,12 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
|
||||||
|
|
||||||
// Process leftovers
|
// Process leftovers
|
||||||
if (nloe) {
|
if (nloe) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
|
||||||
|
|
||||||
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
|
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
|
||||||
HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
|
HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
|
||||||
|
|
||||||
// Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
|
// Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
|
||||||
|
|
@ -954,8 +1053,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
||||||
const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales
|
const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales
|
||||||
|
|
||||||
// Row sum (sf)
|
// Row sum (sf)
|
||||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_sum = Q6_V_vzero();
|
||||||
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r1_sum = Q6_V_vzero();
|
||||||
|
|
||||||
// Multiply and accumulate into int32.
|
// Multiply and accumulate into int32.
|
||||||
// Compute combined scale (fp32).
|
// Compute combined scale (fp32).
|
||||||
|
|
@ -966,9 +1065,9 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
for (; i < nb; i++) {
|
for (; i < nb; i++) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
||||||
|
|
@ -1007,14 +1106,14 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
|
||||||
|
|
||||||
// Process leftovers
|
// Process leftovers
|
||||||
if (nloe) {
|
if (nloe) {
|
||||||
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
|
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||||
|
|
||||||
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
|
||||||
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
|
||||||
|
|
||||||
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
|
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
|
||||||
HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
|
HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
|
||||||
HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
|
HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
|
||||||
|
|
||||||
|
|
@ -1087,10 +1186,10 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float
|
||||||
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
|
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
|
||||||
|
|
||||||
// Row sums (sf) - 4 accumulators for 2×2 tile
|
// Row sums (sf) - 4 accumulators for 2×2 tile
|
||||||
HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_c0_sum = Q6_V_vzero();
|
||||||
HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r0_c1_sum = Q6_V_vzero();
|
||||||
HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r1_c0_sum = Q6_V_vzero();
|
||||||
HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
|
HVX_Vector r1_c1_sum = Q6_V_vzero();
|
||||||
|
|
||||||
const uint32_t nb = n / qk; // num full blocks
|
const uint32_t nb = n / qk; // num full blocks
|
||||||
const uint32_t nloe = n % qk; // num leftover elements
|
const uint32_t nloe = n % qk; // num leftover elements
|
||||||
|
|
@ -1098,12 +1197,12 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
for (; i < nb; i++) {
|
for (; i < nb; i++) {
|
||||||
// Load src1 columns (reused across both src0 rows)
|
// Load src1 columns (reused across both src0 rows)
|
||||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
|
||||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
|
||||||
|
|
||||||
// Load src0 rows (reused across both src1 columns)
|
// Load src0 rows (reused across both src1 columns)
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size);
|
||||||
|
|
||||||
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
|
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
|
||||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
|
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
|
||||||
|
|
@ -1157,15 +1256,15 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float
|
||||||
|
|
||||||
// Process leftovers
|
// Process leftovers
|
||||||
if (nloe) {
|
if (nloe) {
|
||||||
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
|
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial( y0_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
|
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial( y1_q + i * y_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
|
||||||
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
|
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
|
||||||
|
|
||||||
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
|
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
|
||||||
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
|
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
|
||||||
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
|
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
|
||||||
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
|
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
|
||||||
|
|
||||||
HVX_Vector vy0_d = *(const HVX_UVector *) (y0_d + i * y_dblk_size);
|
HVX_Vector vy0_d = *(const HVX_UVector *) (y0_d + i * y_dblk_size);
|
||||||
HVX_Vector vy1_d = *(const HVX_UVector *) (y1_d + i * y_dblk_size);
|
HVX_Vector vy1_d = *(const HVX_UVector *) (y1_d + i * y_dblk_size);
|
||||||
|
|
@ -1234,7 +1333,7 @@ static void vec_dot_f16_f16_aa_1x1(const int n, float * restrict s, const void *
|
||||||
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||||
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
||||||
|
|
||||||
HVX_VectorPair rsum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
HVX_VectorPair rsum_p = Q6_W_vzero();
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
|
|
||||||
|
|
@ -1264,8 +1363,8 @@ static void vec_dot_f16_f16_aa_2x1(const int n, float * restrict s0,
|
||||||
uint32_t nvec = n / VLEN_FP16;
|
uint32_t nvec = n / VLEN_FP16;
|
||||||
uint32_t nloe = n % VLEN_FP16;
|
uint32_t nloe = n % VLEN_FP16;
|
||||||
|
|
||||||
HVX_VectorPair rsum0_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
HVX_VectorPair rsum0_p = Q6_W_vzero();
|
||||||
HVX_VectorPair rsum1_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
HVX_VectorPair rsum1_p = Q6_W_vzero();
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
|
|
||||||
|
|
@ -1303,10 +1402,10 @@ static void vec_dot_f16_f16_aa_2x2(const int n, float * restrict s0, float * res
|
||||||
uint32_t nloe = n % VLEN_FP16;
|
uint32_t nloe = n % VLEN_FP16;
|
||||||
|
|
||||||
// Row sums (sf) - 4 accumulators for 2×2 tile
|
// Row sums (sf) - 4 accumulators for 2×2 tile
|
||||||
HVX_VectorPair r0_c0_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
HVX_VectorPair r0_c0_sum_p = Q6_W_vzero();
|
||||||
HVX_VectorPair r0_c1_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
HVX_VectorPair r0_c1_sum_p = Q6_W_vzero();
|
||||||
HVX_VectorPair r1_c0_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
HVX_VectorPair r1_c0_sum_p = Q6_W_vzero();
|
||||||
HVX_VectorPair r1_c1_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
|
HVX_VectorPair r1_c1_sum_p = Q6_W_vzero();
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
|
|
||||||
|
|
@ -1358,7 +1457,7 @@ static void vec_dot_f16_f16_uu_1x1(const int n, float * restrict s, const void *
|
||||||
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||||
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
||||||
|
|
||||||
HVX_Vector rsum = Q6_V_vsplat_R(0);
|
HVX_Vector rsum = Q6_V_vzero();
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
|
|
||||||
|
|
@ -1388,9 +1487,9 @@ static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void *
|
||||||
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||||
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
||||||
|
|
||||||
const HVX_Vector zero = Q6_V_vsplat_R(0);
|
const HVX_Vector zero = Q6_V_vzero();
|
||||||
|
|
||||||
HVX_Vector rsum = Q6_V_vsplat_R(0);
|
HVX_Vector rsum = Q6_V_vzero();
|
||||||
|
|
||||||
uint32_t i = 0;
|
uint32_t i = 0;
|
||||||
|
|
||||||
|
|
@ -1973,7 +2072,7 @@ static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restric
|
||||||
assert((unsigned long) y_q % 128 == 0);
|
assert((unsigned long) y_q % 128 == 0);
|
||||||
|
|
||||||
HVX_Vector * vx = (HVX_Vector *) x;
|
HVX_Vector * vx = (HVX_Vector *) x;
|
||||||
HVX_Vector zero = Q6_V_vsplat_R(0);
|
HVX_Vector zero = Q6_V_vzero();
|
||||||
|
|
||||||
// Use reduce max fp32 to find max(abs(e)) first
|
// Use reduce max fp32 to find max(abs(e)) first
|
||||||
HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0]));
|
HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0]));
|
||||||
|
|
@ -2034,7 +2133,7 @@ static inline void quantize_block_f32_q8x2(float * restrict x, uint8_t * restric
|
||||||
HVX_Vector * vx = (HVX_Vector *) x;
|
HVX_Vector * vx = (HVX_Vector *) x;
|
||||||
|
|
||||||
// Load and convert into QF32
|
// Load and convert into QF32
|
||||||
HVX_Vector zero = Q6_V_vsplat_R(0);
|
HVX_Vector zero = Q6_V_vzero();
|
||||||
HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
|
HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
|
||||||
HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
|
HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
|
||||||
HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
|
HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
|
||||||
|
|
@ -2077,7 +2176,7 @@ static inline void quantize_block_f32_q8x4(float * restrict x, uint8_t * restric
|
||||||
HVX_Vector * vx = (HVX_Vector *) x;
|
HVX_Vector * vx = (HVX_Vector *) x;
|
||||||
|
|
||||||
// Load and convert into QF32
|
// Load and convert into QF32
|
||||||
HVX_Vector zero = Q6_V_vsplat_R(0);
|
HVX_Vector zero = Q6_V_vzero();
|
||||||
HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
|
HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
|
||||||
HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
|
HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
|
||||||
HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
|
HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,10 @@ endif()
|
||||||
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
|
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
|
||||||
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
|
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
|
||||||
|
|
||||||
|
if (NOT DEFINED CMAKE_HIP_FLAGS_DEBUG)
|
||||||
|
set(CMAKE_HIP_FLAGS_DEBUG "-g -O2")
|
||||||
|
endif()
|
||||||
|
|
||||||
# CMake on Windows doesn't support the HIP language yet
|
# CMake on Windows doesn't support the HIP language yet
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
set(CXX_IS_HIPCC TRUE)
|
set(CXX_IS_HIPCC TRUE)
|
||||||
|
|
|
||||||
|
|
@ -491,6 +491,61 @@ static inline float ggml_e8m0_to_fp32_half(uint8_t x) {
|
||||||
#define GGML_E8M0_TO_FP32(x) ggml_e8m0_to_fp32(x)
|
#define GGML_E8M0_TO_FP32(x) ggml_e8m0_to_fp32(x)
|
||||||
#define GGML_E8M0_TO_FP32_HALF(x) ggml_e8m0_to_fp32_half(x)
|
#define GGML_E8M0_TO_FP32_HALF(x) ggml_e8m0_to_fp32_half(x)
|
||||||
|
|
||||||
|
// UE4M3: unsigned, 4 exp bits (bias=7), 3 mantissa bits
|
||||||
|
// Returns value * 0.5 to match kvalues_mxfp4 convention (kvalues = 2 * E2M1_float)
|
||||||
|
static inline float ggml_ue4m3_to_fp32(uint8_t x) {
|
||||||
|
if (x == 0 || x == 0x7F) {
|
||||||
|
return 0.0f;
|
||||||
|
}
|
||||||
|
int exp = (x >> 3) & 0xF;
|
||||||
|
int man = x & 0x7;
|
||||||
|
float raw;
|
||||||
|
if (exp == 0) {
|
||||||
|
raw = ldexpf((float) man, -9);
|
||||||
|
} else {
|
||||||
|
raw = ldexpf(1.0f + (float) man / 8.0f, exp - 7);
|
||||||
|
}
|
||||||
|
return raw * 0.5f;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline uint8_t ggml_fp32_to_ue4m3(float x) {
|
||||||
|
if (!(x > 0.0f)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (x > 448.0f) {
|
||||||
|
x = 448.0f;
|
||||||
|
}
|
||||||
|
uint32_t bits;
|
||||||
|
memcpy(&bits, &x, 4);
|
||||||
|
int fp32_exp = ((bits >> 23) & 0xFF) - 127;
|
||||||
|
int fp32_man = (bits >> 20) & 0x7;
|
||||||
|
int ue4m3_exp = fp32_exp + 7;
|
||||||
|
if (ue4m3_exp <= 0) {
|
||||||
|
// subnormal: value = man * 2^-9, man = round(x * 2^9)
|
||||||
|
int man = (int) (x * 512.0f + 0.5f);
|
||||||
|
if (man > 7) {
|
||||||
|
man = 7;
|
||||||
|
}
|
||||||
|
if (man < 1) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return (uint8_t) man;
|
||||||
|
}
|
||||||
|
if (ue4m3_exp >= 15) {
|
||||||
|
return 0x7E;
|
||||||
|
}
|
||||||
|
int round_bit = (bits >> 19) & 1;
|
||||||
|
int ue4m3_man = fp32_man + round_bit;
|
||||||
|
if (ue4m3_man > 7) {
|
||||||
|
ue4m3_man = 0;
|
||||||
|
ue4m3_exp++;
|
||||||
|
if (ue4m3_exp >= 15) {
|
||||||
|
return 0x7E;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return (uint8_t) ((ue4m3_exp << 3) | ue4m3_man);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts brain16 to float32.
|
* Converts brain16 to float32.
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -47,7 +47,7 @@ struct ggml_metal {
|
||||||
uint64_t fuse_cnt[GGML_OP_COUNT];
|
uint64_t fuse_cnt[GGML_OP_COUNT];
|
||||||
|
|
||||||
// capture state
|
// capture state
|
||||||
bool capture_next_compute;
|
int capture_compute;
|
||||||
bool capture_started;
|
bool capture_started;
|
||||||
|
|
||||||
id<MTLCaptureScope> capture_scope;
|
id<MTLCaptureScope> capture_scope;
|
||||||
|
|
@ -75,6 +75,10 @@ struct ggml_metal {
|
||||||
// abort ggml_metal_graph_compute if callback returns true
|
// abort ggml_metal_graph_compute if callback returns true
|
||||||
ggml_abort_callback abort_callback;
|
ggml_abort_callback abort_callback;
|
||||||
void * abort_callback_data;
|
void * abort_callback_data;
|
||||||
|
|
||||||
|
// error state - set when a command buffer fails during synchronize
|
||||||
|
// once set, graph_compute will return GGML_STATUS_FAILED until the backend is recreated
|
||||||
|
bool has_error;
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
||||||
|
|
@ -154,10 +158,19 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
||||||
GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
|
GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
|
||||||
GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
|
GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
|
||||||
|
|
||||||
res->capture_next_compute = false;
|
res->capture_compute = 0;
|
||||||
res->capture_started = false;
|
res->capture_started = false;
|
||||||
res->capture_scope = nil;
|
res->capture_scope = nil;
|
||||||
|
|
||||||
|
{
|
||||||
|
const char * val = getenv("GGML_METAL_CAPTURE_COMPUTE");
|
||||||
|
if (val) {
|
||||||
|
res->capture_compute = atoi(val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
res->has_error = false;
|
||||||
|
|
||||||
res->gf = nil;
|
res->gf = nil;
|
||||||
res->encode_async = nil;
|
res->encode_async = nil;
|
||||||
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
|
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
|
||||||
|
|
@ -246,7 +259,8 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
|
||||||
if (status == MTLCommandBufferStatusError) {
|
if (status == MTLCommandBufferStatusError) {
|
||||||
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||||
}
|
}
|
||||||
GGML_ABORT("fatal error");
|
ctx->has_error = true;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -262,7 +276,15 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
|
||||||
if (status == MTLCommandBufferStatusError) {
|
if (status == MTLCommandBufferStatusError) {
|
||||||
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
|
||||||
}
|
}
|
||||||
GGML_ABORT("fatal error");
|
|
||||||
|
// release this and all remaining command buffers before returning
|
||||||
|
for (size_t j = i; j < ctx->cmd_bufs_ext.count; ++j) {
|
||||||
|
[ctx->cmd_bufs_ext[j] release];
|
||||||
|
}
|
||||||
|
[ctx->cmd_bufs_ext removeAllObjects];
|
||||||
|
|
||||||
|
ctx->has_error = true;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
[cmd_buf release];
|
[cmd_buf release];
|
||||||
|
|
@ -414,6 +436,11 @@ bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, con
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
|
enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
|
||||||
|
if (ctx->has_error) {
|
||||||
|
GGML_LOG_ERROR("%s: backend is in error state from a previous command buffer failure - recreate the backend to recover\n", __func__);
|
||||||
|
return GGML_STATUS_FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
// number of nodes encoded by the main thread (empirically determined)
|
// number of nodes encoded by the main thread (empirically determined)
|
||||||
const int n_main = MAX(64, 0.1*gf->n_nodes);
|
const int n_main = MAX(64, 0.1*gf->n_nodes);
|
||||||
|
|
||||||
|
|
@ -438,9 +465,13 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
|
||||||
|
|
||||||
ctx->n_nodes_per_cb = (ctx->n_nodes_1 + ctx->n_cb - 1) / ctx->n_cb;
|
ctx->n_nodes_per_cb = (ctx->n_nodes_1 + ctx->n_cb - 1) / ctx->n_cb;
|
||||||
|
|
||||||
const bool use_capture = ctx->capture_next_compute;
|
if (ctx->capture_compute >= 0) {
|
||||||
|
ctx->capture_compute--;
|
||||||
|
}
|
||||||
|
|
||||||
|
const bool use_capture = ctx->capture_compute == 0;
|
||||||
if (use_capture) {
|
if (use_capture) {
|
||||||
ctx->capture_next_compute = false;
|
ctx->capture_compute = -1;
|
||||||
|
|
||||||
// make sure all previous computations have finished before starting the capture
|
// make sure all previous computations have finished before starting the capture
|
||||||
if (ctx->cmd_buf_last) {
|
if (ctx->cmd_buf_last) {
|
||||||
|
|
@ -449,6 +480,10 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!ctx->capture_started) {
|
if (!ctx->capture_started) {
|
||||||
|
NSString * path = [NSString stringWithFormat:@"/tmp/perf-metal-%d.gputrace", getpid()];
|
||||||
|
|
||||||
|
GGML_LOG_WARN("%s: capturing graph in %s\n", __func__, [path UTF8String]);
|
||||||
|
|
||||||
// create capture scope
|
// create capture scope
|
||||||
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
|
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
|
||||||
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];
|
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];
|
||||||
|
|
@ -456,7 +491,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
|
||||||
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
|
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
|
||||||
descriptor.captureObject = ctx->capture_scope;
|
descriptor.captureObject = ctx->capture_scope;
|
||||||
descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
|
descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
|
||||||
descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];
|
descriptor.outputURL = [NSURL fileURLWithPath:path];
|
||||||
|
|
||||||
NSError * error = nil;
|
NSError * error = nil;
|
||||||
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
|
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
|
||||||
|
|
@ -519,7 +554,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
|
||||||
|
|
||||||
// enter here only when capturing in order to wait for all computation to finish
|
// enter here only when capturing in order to wait for all computation to finish
|
||||||
// otherwise, we leave the graph to compute asynchronously
|
// otherwise, we leave the graph to compute asynchronously
|
||||||
if (!use_capture && ctx->capture_started) {
|
if (use_capture && ctx->capture_started) {
|
||||||
// wait for completion and check status of each command buffer
|
// wait for completion and check status of each command buffer
|
||||||
// needed to detect if the device ran out-of-memory for example (#1881)
|
// needed to detect if the device ran out-of-memory for example (#1881)
|
||||||
{
|
{
|
||||||
|
|
@ -571,6 +606,8 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
|
||||||
|
|
||||||
[ctx->capture_scope endScope];
|
[ctx->capture_scope endScope];
|
||||||
[[MTLCaptureManager sharedCaptureManager] stopCapture];
|
[[MTLCaptureManager sharedCaptureManager] stopCapture];
|
||||||
|
|
||||||
|
ctx->capture_started = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -663,7 +700,7 @@ void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
|
||||||
idx_end,
|
idx_end,
|
||||||
ctx->use_fusion,
|
ctx->use_fusion,
|
||||||
ctx->use_concurrency,
|
ctx->use_concurrency,
|
||||||
ctx->capture_next_compute,
|
ctx->capture_compute,
|
||||||
ctx->debug_graph,
|
ctx->debug_graph,
|
||||||
ctx->debug_fusion);
|
ctx->debug_fusion);
|
||||||
|
|
||||||
|
|
@ -698,5 +735,5 @@ bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_metal_capture_next_compute(ggml_metal_t ctx) {
|
void ggml_metal_capture_next_compute(ggml_metal_t ctx) {
|
||||||
ctx->capture_next_compute = true;
|
ctx->capture_compute = 1;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -577,6 +577,41 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv(ggml_metal_
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||||
|
char base[256];
|
||||||
|
char name[256];
|
||||||
|
|
||||||
|
// v is src[2], dimensions: S_v = ne[0], H = ne[1]
|
||||||
|
const int ne20 = op->src[2]->ne[0]; // S_v
|
||||||
|
const int ne21 = op->src[2]->ne[1]; // H
|
||||||
|
const int ne30 = op->src[3]->ne[0]; // G
|
||||||
|
|
||||||
|
const int nsg = op->src[2]->ne[0]/32;
|
||||||
|
|
||||||
|
GGML_ASSERT(op->src[5]->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT(op->ne[0] == ne20 * ne21);
|
||||||
|
GGML_ASSERT(ne20 % 32 == 0);
|
||||||
|
|
||||||
|
snprintf(base, 256, "kernel_gated_delta_net_%s_%d", ggml_type_name(op->src[0]->type), nsg);
|
||||||
|
snprintf(name, 256, "%s_ne20=%d_ne30=%d", base, ne20, ne30);
|
||||||
|
|
||||||
|
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||||
|
if (!res.pipeline) {
|
||||||
|
ggml_metal_cv_t cv = ggml_metal_cv_init();
|
||||||
|
|
||||||
|
ggml_metal_cv_set_int16(cv, ne20, FC_GATED_DELTA_NET + 0);
|
||||||
|
ggml_metal_cv_set_int16(cv, ne30, FC_GATED_DELTA_NET + 1);
|
||||||
|
|
||||||
|
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
|
||||||
|
|
||||||
|
ggml_metal_cv_free(cv);
|
||||||
|
}
|
||||||
|
|
||||||
|
res.nsg = nsg;
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri(ggml_metal_library_t lib, const ggml_tensor * op) {
|
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||||
char base[256];
|
char base[256];
|
||||||
char name[256];
|
char name[256];
|
||||||
|
|
@ -1435,10 +1470,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(ggml_metal_l
|
||||||
|
|
||||||
const bool is_c4 = (op->src[0]->ne[0] % 4 == 0) && (op->src[1]->ne[0] % 4 == 0);
|
const bool is_c4 = (op->src[0]->ne[0] % 4 == 0) && (op->src[1]->ne[0] % 4 == 0);
|
||||||
|
|
||||||
|
const bool is_cb = op->src[0]->ne[0] != op->src[1]->ne[0];
|
||||||
const bool is_rb = ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && (ggml_nrows(op->src[1]) == 1) && ggml_nelements(op) < 65536;
|
const bool is_rb = ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && (ggml_nrows(op->src[1]) == 1) && ggml_nelements(op) < 65536;
|
||||||
|
|
||||||
snprintf(base, 256, "kernel_bin_fuse_%s_%s_%s%s", t0_str, t1_str, t_str, is_c4 ? "_4" : "");
|
snprintf(base, 256, "kernel_bin_fuse_%s_%s_%s%s", t0_str, t1_str, t_str, is_c4 ? "_4" : "");
|
||||||
snprintf(name, 256, "%s_op=%d_nf=%d_rb=%d", base, op_num, n_fuse, is_rb);
|
snprintf(name, 256, "%s_op=%d_nf=%d_rb=%d_cb=%d", base, op_num, n_fuse, is_rb, is_cb);
|
||||||
|
|
||||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||||
if (!res.pipeline) {
|
if (!res.pipeline) {
|
||||||
|
|
@ -1447,6 +1483,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(ggml_metal_l
|
||||||
ggml_metal_cv_set_int16(cv, op_num, FC_BIN + 0);
|
ggml_metal_cv_set_int16(cv, op_num, FC_BIN + 0);
|
||||||
ggml_metal_cv_set_int16(cv, n_fuse, FC_BIN + 1);
|
ggml_metal_cv_set_int16(cv, n_fuse, FC_BIN + 1);
|
||||||
ggml_metal_cv_set_bool (cv, is_rb, FC_BIN + 2);
|
ggml_metal_cv_set_bool (cv, is_rb, FC_BIN + 2);
|
||||||
|
ggml_metal_cv_set_bool (cv, is_cb, FC_BIN + 3);
|
||||||
|
|
||||||
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
|
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
|
||||||
|
|
||||||
|
|
@ -1717,12 +1754,29 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale(ggml_met
|
||||||
char base[256];
|
char base[256];
|
||||||
char name[256];
|
char name[256];
|
||||||
|
|
||||||
snprintf(base, 256, "kernel_upscale_%s", ggml_type_name(op->src[0]->type));
|
const int32_t mode_flags = ggml_get_op_params_i32(op, 0);
|
||||||
snprintf(name, 256, "%s", base);
|
const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
|
||||||
|
|
||||||
|
const bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS);
|
||||||
|
|
||||||
|
if (mode == GGML_SCALE_MODE_BILINEAR) {
|
||||||
|
snprintf(base, 256, "kernel_upscale_bilinear_%s", ggml_type_name(op->src[0]->type));
|
||||||
|
} else if (mode == GGML_SCALE_MODE_BICUBIC) {
|
||||||
|
snprintf(base, 256, "kernel_upscale_bicubic_%s", ggml_type_name(op->src[0]->type));
|
||||||
|
} else {
|
||||||
|
snprintf(base, 256, "kernel_upscale_nearest_%s", ggml_type_name(op->src[0]->type));
|
||||||
|
}
|
||||||
|
snprintf(name, 256, "%s_aa=%d", base, antialias);
|
||||||
|
|
||||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||||
if (!res.pipeline) {
|
if (!res.pipeline) {
|
||||||
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
|
ggml_metal_cv_t cv = ggml_metal_cv_init();
|
||||||
|
|
||||||
|
ggml_metal_cv_set_bool(cv, antialias, FC_UPSCALE + 0);
|
||||||
|
|
||||||
|
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
|
||||||
|
|
||||||
|
ggml_metal_cv_free(cv);
|
||||||
}
|
}
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
|
|
|
||||||
|
|
@ -125,6 +125,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv
|
||||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs);
|
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs);
|
||||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||||
|
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
|
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
|
||||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||||
|
|
|
||||||
|
|
@ -1108,7 +1108,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||||
op->type == GGML_TYPE_F32 &&
|
op->type == GGML_TYPE_F32 &&
|
||||||
(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
|
(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
|
||||||
case GGML_OP_UPSCALE:
|
case GGML_OP_UPSCALE:
|
||||||
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
|
return op->src[0]->type == GGML_TYPE_F32;
|
||||||
case GGML_OP_POOL_1D:
|
case GGML_OP_POOL_1D:
|
||||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
|
|
@ -1142,6 +1142,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||||
op->src[0]->ne[0] != 128 &&
|
op->src[0]->ne[0] != 128 &&
|
||||||
op->src[0]->ne[0] != 192 &&
|
op->src[0]->ne[0] != 192 &&
|
||||||
op->src[0]->ne[0] != 256 &&
|
op->src[0]->ne[0] != 256 &&
|
||||||
|
op->src[0]->ne[0] != 320 &&
|
||||||
op->src[0]->ne[0] != 576) {
|
op->src[0]->ne[0] != 576) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
@ -1155,10 +1156,12 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||||
case GGML_OP_RWKV_WKV6:
|
case GGML_OP_RWKV_WKV6:
|
||||||
case GGML_OP_RWKV_WKV7:
|
case GGML_OP_RWKV_WKV7:
|
||||||
return true;
|
return true;
|
||||||
|
case GGML_OP_GATED_DELTA_NET:
|
||||||
|
return has_simdgroup_reduction && op->src[2]->ne[0] % 32 == 0;
|
||||||
case GGML_OP_SOLVE_TRI:
|
case GGML_OP_SOLVE_TRI:
|
||||||
case GGML_OP_MUL_MAT:
|
case GGML_OP_MUL_MAT:
|
||||||
case GGML_OP_MUL_MAT_ID:
|
case GGML_OP_MUL_MAT_ID:
|
||||||
return has_simdgroup_reduction;
|
return has_simdgroup_reduction && op->src[0]->type != GGML_TYPE_NVFP4;
|
||||||
case GGML_OP_SET:
|
case GGML_OP_SET:
|
||||||
case GGML_OP_CPY:
|
case GGML_OP_CPY:
|
||||||
case GGML_OP_DUP:
|
case GGML_OP_DUP:
|
||||||
|
|
@ -1216,7 +1219,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
case GGML_OP_GET_ROWS:
|
case GGML_OP_GET_ROWS:
|
||||||
return true;
|
return op->src[0]->type != GGML_TYPE_NVFP4;
|
||||||
case GGML_OP_SET_ROWS:
|
case GGML_OP_SET_ROWS:
|
||||||
{
|
{
|
||||||
if (op->src[0]->type != GGML_TYPE_F32) {
|
if (op->src[0]->type != GGML_TYPE_F32) {
|
||||||
|
|
|
||||||
|
|
@ -35,7 +35,7 @@
|
||||||
#define N_R0_Q4_K 2
|
#define N_R0_Q4_K 2
|
||||||
#define N_SG_Q4_K 2
|
#define N_SG_Q4_K 2
|
||||||
|
|
||||||
#define N_R0_Q5_K 2
|
#define N_R0_Q5_K 1
|
||||||
#define N_SG_Q5_K 2
|
#define N_SG_Q5_K 2
|
||||||
|
|
||||||
#define N_R0_Q6_K 2
|
#define N_R0_Q6_K 2
|
||||||
|
|
@ -83,6 +83,8 @@
|
||||||
#define FC_UNARY 1200
|
#define FC_UNARY 1200
|
||||||
#define FC_BIN 1300
|
#define FC_BIN 1300
|
||||||
#define FC_SUM_ROWS 1400
|
#define FC_SUM_ROWS 1400
|
||||||
|
#define FC_UPSCALE 1500
|
||||||
|
#define FC_GATED_DELTA_NET 1600
|
||||||
|
|
||||||
// op-specific constants
|
// op-specific constants
|
||||||
#define OP_FLASH_ATTN_EXT_NQPSG 8
|
#define OP_FLASH_ATTN_EXT_NQPSG 8
|
||||||
|
|
@ -792,6 +794,44 @@ typedef struct {
|
||||||
uint64_t nb0;
|
uint64_t nb0;
|
||||||
} ggml_metal_kargs_ssm_scan;
|
} ggml_metal_kargs_ssm_scan;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int32_t ne00;
|
||||||
|
int32_t ne01;
|
||||||
|
int32_t ne02;
|
||||||
|
int32_t ne03;
|
||||||
|
uint64_t nb00;
|
||||||
|
uint64_t nb01;
|
||||||
|
uint64_t nb02;
|
||||||
|
uint64_t nb03;
|
||||||
|
int32_t ne10;
|
||||||
|
int32_t ne11;
|
||||||
|
int32_t ne12;
|
||||||
|
int32_t ne13;
|
||||||
|
uint64_t nb10;
|
||||||
|
uint64_t nb11;
|
||||||
|
uint64_t nb12;
|
||||||
|
uint64_t nb13;
|
||||||
|
int32_t ne20;
|
||||||
|
int32_t ne21;
|
||||||
|
int32_t ne22;
|
||||||
|
int32_t ne23;
|
||||||
|
uint64_t nb20;
|
||||||
|
uint64_t nb21;
|
||||||
|
uint64_t nb22;
|
||||||
|
uint64_t nb23;
|
||||||
|
int32_t ns02;
|
||||||
|
int32_t ns12;
|
||||||
|
int32_t ns22;
|
||||||
|
int32_t ne0;
|
||||||
|
int32_t ne1;
|
||||||
|
int32_t ne2;
|
||||||
|
int32_t ne3;
|
||||||
|
uint64_t nb0;
|
||||||
|
uint64_t nb1;
|
||||||
|
uint64_t nb2;
|
||||||
|
uint64_t nb3;
|
||||||
|
} ggml_metal_kargs_gated_delta_net;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int32_t ne00;
|
int32_t ne00;
|
||||||
int32_t ne01;
|
int32_t ne01;
|
||||||
|
|
@ -890,6 +930,7 @@ typedef struct {
|
||||||
float sf1;
|
float sf1;
|
||||||
float sf2;
|
float sf2;
|
||||||
float sf3;
|
float sf3;
|
||||||
|
float poffs;
|
||||||
} ggml_metal_kargs_upscale;
|
} ggml_metal_kargs_upscale;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
|
|
||||||
|
|
@ -333,6 +333,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
||||||
{
|
{
|
||||||
n_fuse = ggml_metal_op_rwkv(ctx, idx);
|
n_fuse = ggml_metal_op_rwkv(ctx, idx);
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_GATED_DELTA_NET:
|
||||||
|
{
|
||||||
|
n_fuse = ggml_metal_op_gated_delta_net(ctx, idx);
|
||||||
|
} break;
|
||||||
case GGML_OP_SOLVE_TRI:
|
case GGML_OP_SOLVE_TRI:
|
||||||
{
|
{
|
||||||
n_fuse = ggml_metal_op_solve_tri(ctx, idx);
|
n_fuse = ggml_metal_op_solve_tri(ctx, idx);
|
||||||
|
|
@ -1562,6 +1566,81 @@ int ggml_metal_op_rwkv(ggml_metal_op_t ctx, int idx) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ggml_metal_op_gated_delta_net(ggml_metal_op_t ctx, int idx) {
|
||||||
|
ggml_tensor * op = ctx->node(idx);
|
||||||
|
|
||||||
|
ggml_metal_library_t lib = ctx->lib;
|
||||||
|
ggml_metal_encoder_t enc = ctx->enc;
|
||||||
|
|
||||||
|
|
||||||
|
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
|
||||||
|
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
|
||||||
|
GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
|
||||||
|
GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
|
||||||
|
GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
|
||||||
|
GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
|
||||||
|
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
|
||||||
|
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
|
||||||
|
|
||||||
|
auto pipeline = ggml_metal_library_get_pipeline_gated_delta_net(lib, op);
|
||||||
|
|
||||||
|
int ida = 0;
|
||||||
|
|
||||||
|
ggml_metal_kargs_gated_delta_net args = {
|
||||||
|
/*.ne00 =*/ ne00,
|
||||||
|
/*.ne01 =*/ ne01,
|
||||||
|
/*.ne02 =*/ ne02,
|
||||||
|
/*.ne03 =*/ ne03,
|
||||||
|
/*.nb00 =*/ nb00,
|
||||||
|
/*.nb01 =*/ nb01,
|
||||||
|
/*.nb02 =*/ nb02,
|
||||||
|
/*.nb03 =*/ nb03,
|
||||||
|
/*.ne10 =*/ ne10,
|
||||||
|
/*.ne11 =*/ ne11,
|
||||||
|
/*.ne12 =*/ ne12,
|
||||||
|
/*.ne13 =*/ ne13,
|
||||||
|
/*.nb10 =*/ nb10,
|
||||||
|
/*.nb11 =*/ nb11,
|
||||||
|
/*.nb12 =*/ nb12,
|
||||||
|
/*.nb13 =*/ nb13,
|
||||||
|
/*.ne20 =*/ ne20,
|
||||||
|
/*.ne21 =*/ ne21,
|
||||||
|
/*.ne22 =*/ ne22,
|
||||||
|
/*.ne23 =*/ ne23,
|
||||||
|
/*.nb20 =*/ nb20,
|
||||||
|
/*.nb21 =*/ nb21,
|
||||||
|
/*.nb22 =*/ nb22,
|
||||||
|
/*.nb23 =*/ nb23,
|
||||||
|
/*.ns02 =*/ (int32_t) (nb02/sizeof(float)),
|
||||||
|
/*.ns12 =*/ (int32_t) (nb12/sizeof(float)),
|
||||||
|
/*.ns22 =*/ (int32_t) (nb22/sizeof(float)),
|
||||||
|
/*.ne0 =*/ ne0,
|
||||||
|
/*.ne1 =*/ ne1,
|
||||||
|
/*.ne2 =*/ ne2,
|
||||||
|
/*.ne3 =*/ ne3,
|
||||||
|
/*.nb0 =*/ nb0,
|
||||||
|
/*.nb1 =*/ nb1,
|
||||||
|
/*.nb2 =*/ nb2,
|
||||||
|
/*.nb3 =*/ nb3,
|
||||||
|
};
|
||||||
|
|
||||||
|
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||||
|
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), ida++);
|
||||||
|
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), ida++); // q
|
||||||
|
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), ida++); // k
|
||||||
|
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), ida++); // v
|
||||||
|
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[3]), ida++); // gate
|
||||||
|
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[4]), ida++); // beta
|
||||||
|
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[5]), ida++); // state
|
||||||
|
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), ida++); // dst
|
||||||
|
|
||||||
|
const int nsg = pipeline.nsg;
|
||||||
|
|
||||||
|
ggml_metal_encoder_dispatch_threadgroups(enc, op->src[2]->ne[0]/nsg, op->src[2]->ne[1], op->src[2]->ne[3], 32, nsg, 1);
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
int ggml_metal_op_solve_tri(ggml_metal_op_t ctx, int idx) {
|
int ggml_metal_op_solve_tri(ggml_metal_op_t ctx, int idx) {
|
||||||
ggml_tensor * op = ctx->node(idx);
|
ggml_tensor * op = ctx->node(idx);
|
||||||
|
|
||||||
|
|
@ -1963,6 +2042,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
|
||||||
(
|
(
|
||||||
op->src[0]->type == GGML_TYPE_F32 || // TODO: helper function
|
op->src[0]->type == GGML_TYPE_F32 || // TODO: helper function
|
||||||
op->src[0]->type == GGML_TYPE_F16 ||
|
op->src[0]->type == GGML_TYPE_F16 ||
|
||||||
|
op->src[0]->type == GGML_TYPE_BF16 ||
|
||||||
op->src[0]->type == GGML_TYPE_Q4_0 ||
|
op->src[0]->type == GGML_TYPE_Q4_0 ||
|
||||||
op->src[0]->type == GGML_TYPE_Q4_1 ||
|
op->src[0]->type == GGML_TYPE_Q4_1 ||
|
||||||
op->src[0]->type == GGML_TYPE_Q5_0 ||
|
op->src[0]->type == GGML_TYPE_Q5_0 ||
|
||||||
|
|
@ -1977,6 +2057,8 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
|
||||||
op->src[0]->type == GGML_TYPE_Q4_K ||
|
op->src[0]->type == GGML_TYPE_Q4_K ||
|
||||||
op->src[0]->type == GGML_TYPE_Q5_K ||
|
op->src[0]->type == GGML_TYPE_Q5_K ||
|
||||||
op->src[0]->type == GGML_TYPE_Q6_K ||
|
op->src[0]->type == GGML_TYPE_Q6_K ||
|
||||||
|
op->src[0]->type == GGML_TYPE_Q2_K ||
|
||||||
|
op->src[0]->type == GGML_TYPE_Q3_K ||
|
||||||
false) && (ne11 >= 4 && ne11 <= 8)
|
false) && (ne11 >= 4 && ne11 <= 8)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
@ -3098,9 +3180,7 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
|
||||||
ggml_metal_encoder_set_buffer (enc, bid_dst, 3);
|
ggml_metal_encoder_set_buffer (enc, bid_dst, 3);
|
||||||
|
|
||||||
if (pipeline.cnt) {
|
if (pipeline.cnt) {
|
||||||
const int n = pipeline.c4 ? ggml_nelements(op)/4 : ggml_nelements(op);
|
ggml_metal_encoder_dispatch_threadgroups(enc, args.ne0, ggml_nrows(op), 1, 1, 1, 1);
|
||||||
|
|
||||||
ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
|
|
||||||
} else {
|
} else {
|
||||||
const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||||
|
|
||||||
|
|
@ -3729,32 +3809,43 @@ int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) {
|
||||||
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
|
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
|
||||||
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
|
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
|
||||||
|
|
||||||
const float sf0 = (float)ne0/op->src[0]->ne[0];
|
float sf0 = (float)ne0/op->src[0]->ne[0];
|
||||||
const float sf1 = (float)ne1/op->src[0]->ne[1];
|
float sf1 = (float)ne1/op->src[0]->ne[1];
|
||||||
const float sf2 = (float)ne2/op->src[0]->ne[2];
|
float sf2 = (float)ne2/op->src[0]->ne[2];
|
||||||
const float sf3 = (float)ne3/op->src[0]->ne[3];
|
float sf3 = (float)ne3/op->src[0]->ne[3];
|
||||||
|
|
||||||
|
const int32_t mode_flags = ggml_get_op_params_i32(op, 0);
|
||||||
|
|
||||||
|
float poffs = 0.5f;
|
||||||
|
|
||||||
|
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
|
||||||
|
poffs = 0.0f;
|
||||||
|
sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
|
||||||
|
sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_metal_kargs_upscale args = {
|
ggml_metal_kargs_upscale args = {
|
||||||
/*.ne00 =*/ ne00,
|
/*.ne00 =*/ ne00,
|
||||||
/*.ne01 =*/ ne01,
|
/*.ne01 =*/ ne01,
|
||||||
/*.ne02 =*/ ne02,
|
/*.ne02 =*/ ne02,
|
||||||
/*.ne03 =*/ ne03,
|
/*.ne03 =*/ ne03,
|
||||||
/*.nb00 =*/ nb00,
|
/*.nb00 =*/ nb00,
|
||||||
/*.nb01 =*/ nb01,
|
/*.nb01 =*/ nb01,
|
||||||
/*.nb02 =*/ nb02,
|
/*.nb02 =*/ nb02,
|
||||||
/*.nb03 =*/ nb03,
|
/*.nb03 =*/ nb03,
|
||||||
/*.ne0 =*/ ne0,
|
/*.ne0 =*/ ne0,
|
||||||
/*.ne1 =*/ ne1,
|
/*.ne1 =*/ ne1,
|
||||||
/*.ne2 =*/ ne2,
|
/*.ne2 =*/ ne2,
|
||||||
/*.ne3 =*/ ne3,
|
/*.ne3 =*/ ne3,
|
||||||
/*.nb0 =*/ nb0,
|
/*.nb0 =*/ nb0,
|
||||||
/*.nb1 =*/ nb1,
|
/*.nb1 =*/ nb1,
|
||||||
/*.nb2 =*/ nb2,
|
/*.nb2 =*/ nb2,
|
||||||
/*.nb3 =*/ nb3,
|
/*.nb3 =*/ nb3,
|
||||||
/*.sf0 =*/ sf0,
|
/*.sf0 =*/ sf0,
|
||||||
/*.sf1 =*/ sf1,
|
/*.sf1 =*/ sf1,
|
||||||
/*.sf2 =*/ sf2,
|
/*.sf2 =*/ sf2,
|
||||||
/*.sf3 =*/ sf3
|
/*.sf3 =*/ sf3,
|
||||||
|
/*.poffs =*/ poffs,
|
||||||
};
|
};
|
||||||
|
|
||||||
auto pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);
|
auto pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,7 @@ int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx);
|
||||||
int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx);
|
int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx);
|
||||||
int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx);
|
int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx);
|
||||||
int ggml_metal_op_rwkv (ggml_metal_op_t ctx, int idx);
|
int ggml_metal_op_rwkv (ggml_metal_op_t ctx, int idx);
|
||||||
|
int ggml_metal_op_gated_delta_net (ggml_metal_op_t ctx, int idx);
|
||||||
int ggml_metal_op_solve_tri (ggml_metal_op_t ctx, int idx);
|
int ggml_metal_op_solve_tri (ggml_metal_op_t ctx, int idx);
|
||||||
int ggml_metal_op_set (ggml_metal_op_t ctx, int idx);
|
int ggml_metal_op_set (ggml_metal_op_t ctx, int idx);
|
||||||
int ggml_metal_op_cpy (ggml_metal_op_t ctx, int idx);
|
int ggml_metal_op_cpy (ggml_metal_op_t ctx, int idx);
|
||||||
|
|
|
||||||
|
|
@ -1111,6 +1111,7 @@ template [[host_name("kernel_unary_f16_f16_4")]] kernel kernel_unary_t kernel_un
|
||||||
constant short FC_bin_op [[function_constant(FC_BIN + 0)]];
|
constant short FC_bin_op [[function_constant(FC_BIN + 0)]];
|
||||||
constant short FC_bin_f [[function_constant(FC_BIN + 1)]];
|
constant short FC_bin_f [[function_constant(FC_BIN + 1)]];
|
||||||
constant bool FC_bin_rb [[function_constant(FC_BIN + 2)]];
|
constant bool FC_bin_rb [[function_constant(FC_BIN + 2)]];
|
||||||
|
constant bool FC_bin_cb [[function_constant(FC_BIN + 3)]];
|
||||||
|
|
||||||
template <typename T0, typename T1, typename T>
|
template <typename T0, typename T1, typename T>
|
||||||
kernel void kernel_bin_fuse_impl(
|
kernel void kernel_bin_fuse_impl(
|
||||||
|
|
@ -1124,11 +1125,12 @@ kernel void kernel_bin_fuse_impl(
|
||||||
#define FC_OP FC_bin_op
|
#define FC_OP FC_bin_op
|
||||||
#define FC_F FC_bin_f
|
#define FC_F FC_bin_f
|
||||||
#define FC_RB FC_bin_rb
|
#define FC_RB FC_bin_rb
|
||||||
|
#define FC_CB FC_bin_cb
|
||||||
|
|
||||||
if (FC_RB) {
|
if (FC_RB) {
|
||||||
// row broadcast
|
// row broadcast
|
||||||
const uint i0 = tgpig.x;
|
const uint i0 = tgpig.y*args.ne00 + tgpig.x;
|
||||||
const uint i1 = i0%args.ne10;
|
const uint i1 = FC_CB ? tgpig.x%args.ne10 : tgpig.x;
|
||||||
|
|
||||||
device const T0 * src0_row = (device const T0 *) (src0);
|
device const T0 * src0_row = (device const T0 *) (src0);
|
||||||
device T * dst_row = (device T *) (dst);
|
device T * dst_row = (device T *) (dst);
|
||||||
|
|
@ -1200,7 +1202,7 @@ kernel void kernel_bin_fuse_impl(
|
||||||
device const T1 * src1_ptr = (device const T1 *) (src1 + args.o1[0] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
|
device const T1 * src1_ptr = (device const T1 *) (src1 + args.o1[0] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
|
||||||
|
|
||||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||||
const int i10 = i0%args.ne10;
|
const int i10 = FC_CB ? i0%args.ne10 : i0;
|
||||||
|
|
||||||
if (FC_OP == 0) {
|
if (FC_OP == 0) {
|
||||||
dst_ptr[i0] = src0_ptr[i0] + src1_ptr[i10];
|
dst_ptr[i0] = src0_ptr[i0] + src1_ptr[i10];
|
||||||
|
|
@ -1225,7 +1227,7 @@ kernel void kernel_bin_fuse_impl(
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||||
const int i10 = i0%args.ne10;
|
const int i10 = FC_CB ? i0%args.ne10 : i0;
|
||||||
|
|
||||||
T res = src0_ptr[i0];
|
T res = src0_ptr[i0];
|
||||||
|
|
||||||
|
|
@ -1261,6 +1263,7 @@ kernel void kernel_bin_fuse_impl(
|
||||||
#undef FC_OP
|
#undef FC_OP
|
||||||
#undef FC_F
|
#undef FC_F
|
||||||
#undef FC_RB
|
#undef FC_RB
|
||||||
|
#undef FC_CB
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef decltype(kernel_bin_fuse_impl<float, float, float>) kernel_bin_fuse_t;
|
typedef decltype(kernel_bin_fuse_impl<float, float, float>) kernel_bin_fuse_t;
|
||||||
|
|
@ -2434,6 +2437,228 @@ kernel void kernel_rwkv_wkv7_f32(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
constant short FC_gated_delta_net_ne20 [[function_constant(FC_GATED_DELTA_NET + 0)]];
|
||||||
|
constant short FC_gated_delta_net_ne30 [[function_constant(FC_GATED_DELTA_NET + 1)]];
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
template<short NSG>
|
||||||
|
kernel void kernel_gated_delta_net_impl(
|
||||||
|
constant ggml_metal_kargs_gated_delta_net & args,
|
||||||
|
device const char * q,
|
||||||
|
device const char * k,
|
||||||
|
device const char * v,
|
||||||
|
device const char * g,
|
||||||
|
device const char * b,
|
||||||
|
device const char * s,
|
||||||
|
device char * dst,
|
||||||
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
#define S_v FC_gated_delta_net_ne20
|
||||||
|
#define G FC_gated_delta_net_ne30
|
||||||
|
|
||||||
|
const uint tx = tpitg.x;
|
||||||
|
const uint ty = tpitg.y;
|
||||||
|
|
||||||
|
const uint i23 = tgpig.z; // B
|
||||||
|
const uint i21 = tgpig.y; // H
|
||||||
|
const uint i20 = tgpig.x*NSG + ty;
|
||||||
|
|
||||||
|
const uint i01 = i21 % args.ne01;
|
||||||
|
const uint i11 = i21 % args.ne11;
|
||||||
|
|
||||||
|
const float scale = 1.0f / sqrt((float)S_v);
|
||||||
|
|
||||||
|
// state is stored transposed: M[i20][is] = S[is][i20], so row i20 is contiguous
|
||||||
|
device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;
|
||||||
|
|
||||||
|
float ls[NSG];
|
||||||
|
|
||||||
|
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||||
|
const short is = tx*NSG + j;
|
||||||
|
ls[j] = s_ptr[is];
|
||||||
|
}
|
||||||
|
|
||||||
|
device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20;
|
||||||
|
|
||||||
|
device const float * q_ptr = (device const float *) (q + i23*args.nb03 + i01*args.nb01);
|
||||||
|
device const float * k_ptr = (device const float *) (k + i23*args.nb13 + i11*args.nb11);
|
||||||
|
device const float * v_ptr = (device const float *) (v + i23*args.nb23 + i21*args.nb21);
|
||||||
|
|
||||||
|
device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21);
|
||||||
|
device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G;
|
||||||
|
|
||||||
|
for (short t = 0; t < args.ne22; t++) {
|
||||||
|
float s_k = 0.0f;
|
||||||
|
|
||||||
|
if (G == 1) {
|
||||||
|
const float g_exp = exp(g_ptr[0]);
|
||||||
|
|
||||||
|
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||||
|
const short is = tx*NSG + j;
|
||||||
|
ls[j] *= g_exp;
|
||||||
|
|
||||||
|
s_k += ls[j]*k_ptr[is];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// KDA
|
||||||
|
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||||
|
const short is = tx*NSG + j;
|
||||||
|
ls[j] *= exp(g_ptr[is]);
|
||||||
|
|
||||||
|
s_k += ls[j]*k_ptr[is];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
s_k = simd_sum(s_k);
|
||||||
|
|
||||||
|
const float d = (v_ptr[i20] - s_k)*b_ptr[0];
|
||||||
|
|
||||||
|
float y = 0.0f;
|
||||||
|
|
||||||
|
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||||
|
const short is = tx*NSG + j;
|
||||||
|
ls[j] += k_ptr[is]*d;
|
||||||
|
|
||||||
|
y += ls[j]*q_ptr[is];
|
||||||
|
}
|
||||||
|
|
||||||
|
y = simd_sum(y);
|
||||||
|
|
||||||
|
if (tx == 0) {
|
||||||
|
dst_attn[t*args.ne21*S_v] = y*scale;
|
||||||
|
}
|
||||||
|
|
||||||
|
q_ptr += args.ns02;
|
||||||
|
k_ptr += args.ns12;
|
||||||
|
v_ptr += args.ns22;
|
||||||
|
|
||||||
|
b_ptr += args.ne21;
|
||||||
|
g_ptr += args.ne21*G;
|
||||||
|
}
|
||||||
|
|
||||||
|
device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;
|
||||||
|
|
||||||
|
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||||
|
const short is = tx*NSG + j;
|
||||||
|
dst_state[is] = ls[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef S_v
|
||||||
|
#undef G
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef decltype(kernel_gated_delta_net_impl<4>) kernel_gated_delta_net_t;
|
||||||
|
|
||||||
|
template [[host_name("kernel_gated_delta_net_f32_1")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<1>;
|
||||||
|
template [[host_name("kernel_gated_delta_net_f32_2")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<2>;
|
||||||
|
template [[host_name("kernel_gated_delta_net_f32_4")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<4>;
|
||||||
|
|
||||||
|
#else
|
||||||
|
// a simplified version of the above
|
||||||
|
// no performance improvement, so keep the above version for now
|
||||||
|
|
||||||
|
template<typename T, short NSG>
|
||||||
|
kernel void kernel_gated_delta_net_impl(
|
||||||
|
constant ggml_metal_kargs_gated_delta_net & args,
|
||||||
|
device const char * q,
|
||||||
|
device const char * k,
|
||||||
|
device const char * v,
|
||||||
|
device const char * g,
|
||||||
|
device const char * b,
|
||||||
|
device const char * s,
|
||||||
|
device char * dst,
|
||||||
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
#define S_v FC_gated_delta_net_ne20
|
||||||
|
#define G FC_gated_delta_net_ne30
|
||||||
|
|
||||||
|
const uint tx = tpitg.x;
|
||||||
|
const uint ty = tpitg.y;
|
||||||
|
|
||||||
|
const uint i23 = tgpig.z; // B
|
||||||
|
const uint i21 = tgpig.y; // H
|
||||||
|
const uint i20 = tgpig.x*NSG + ty;
|
||||||
|
|
||||||
|
const uint i01 = i21 % args.ne01;
|
||||||
|
const uint i11 = i21 % args.ne11;
|
||||||
|
|
||||||
|
const float scale = 1.0f / sqrt((float)S_v);
|
||||||
|
|
||||||
|
device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20;
|
||||||
|
|
||||||
|
float lsf[NSG];
|
||||||
|
|
||||||
|
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||||
|
const short is = tx*NSG + j;
|
||||||
|
lsf[j] = s_ptr[is*S_v];
|
||||||
|
}
|
||||||
|
|
||||||
|
thread T * ls = (thread T *) (lsf);
|
||||||
|
|
||||||
|
device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20;
|
||||||
|
|
||||||
|
device const float * q_ptr = (device const float *) (q + i23*args.nb03 + i01*args.nb01);
|
||||||
|
device const float * k_ptr = (device const float *) (k + i23*args.nb13 + i11*args.nb11);
|
||||||
|
device const float * v_ptr = (device const float *) (v + i23*args.nb23 + i21*args.nb21);
|
||||||
|
|
||||||
|
device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21);
|
||||||
|
device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G;
|
||||||
|
|
||||||
|
for (short t = 0; t < args.ne22; t++) {
|
||||||
|
device const T * qt_ptr = (device const T *) (q_ptr);
|
||||||
|
device const T * kt_ptr = (device const T *) (k_ptr);
|
||||||
|
device const T * gt_ptr = (device const T *) (g_ptr);
|
||||||
|
|
||||||
|
if (G == 1) {
|
||||||
|
*ls *= exp(g_ptr[0]);
|
||||||
|
} else {
|
||||||
|
// KDA
|
||||||
|
*ls *= exp(gt_ptr[tx]);
|
||||||
|
}
|
||||||
|
|
||||||
|
const float s_k = simd_sum(dot(*ls, kt_ptr[tx]));
|
||||||
|
|
||||||
|
const float d = (v_ptr[i20] - s_k)*b_ptr[0];
|
||||||
|
|
||||||
|
*ls += kt_ptr[tx]*d;
|
||||||
|
|
||||||
|
const float y = simd_sum(dot(*ls, qt_ptr[tx]));
|
||||||
|
|
||||||
|
if (tx == 0) {
|
||||||
|
*dst_attn = y*scale;
|
||||||
|
}
|
||||||
|
|
||||||
|
q_ptr += args.ns02;
|
||||||
|
k_ptr += args.ns12;
|
||||||
|
v_ptr += args.ns22;
|
||||||
|
|
||||||
|
b_ptr += args.ne21;
|
||||||
|
g_ptr += args.ne21*G;
|
||||||
|
|
||||||
|
dst_attn += args.ne21*S_v;
|
||||||
|
}
|
||||||
|
|
||||||
|
device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20;
|
||||||
|
device T * dstt_state = (device T *) (dst_state);
|
||||||
|
|
||||||
|
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||||
|
const short is = tx*NSG + j;
|
||||||
|
dst_state[is*S_v] = lsf[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef S_v
|
||||||
|
#undef G
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef decltype(kernel_gated_delta_net_impl<float4, 4>) kernel_gated_delta_net_t;
|
||||||
|
|
||||||
|
template [[host_name("kernel_gated_delta_net_f32_1")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<float, 1>;
|
||||||
|
template [[host_name("kernel_gated_delta_net_f32_2")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<float2, 2>;
|
||||||
|
template [[host_name("kernel_gated_delta_net_f32_4")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<float4, 4>;
|
||||||
|
#endif
|
||||||
|
|
||||||
constant short FC_solve_tri_nsg [[function_constant(FC_SOLVE_TRI + 0)]];
|
constant short FC_solve_tri_nsg [[function_constant(FC_SOLVE_TRI + 0)]];
|
||||||
constant short FC_solve_tri_n [[function_constant(FC_SOLVE_TRI + 1)]];
|
constant short FC_solve_tri_n [[function_constant(FC_SOLVE_TRI + 1)]];
|
||||||
constant short FC_solve_tri_k [[function_constant(FC_SOLVE_TRI + 2)]];
|
constant short FC_solve_tri_k [[function_constant(FC_SOLVE_TRI + 2)]];
|
||||||
|
|
@ -2782,7 +3007,7 @@ kernel void kernel_l2_norm_impl(
|
||||||
sumf = shmem_f32[tiisg];
|
sumf = shmem_f32[tiisg];
|
||||||
sumf = simd_sum(sumf);
|
sumf = simd_sum(sumf);
|
||||||
|
|
||||||
const float scale = 1.0f/sqrt(max(sumf, args.eps));
|
const float scale = 1.0f/max(sqrt(sumf), args.eps);
|
||||||
|
|
||||||
for (int i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) {
|
for (int i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) {
|
||||||
y[i00] = x[i00] * scale;
|
y[i00] = x[i00] * scale;
|
||||||
|
|
@ -3481,6 +3706,13 @@ template [[host_name("kernel_mul_mv_ext_f16_f32_r1_3")]] kernel mul_mv_ext_q4
|
||||||
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, half4, 4, dequantize_f16_t4>;
|
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, half4, 4, dequantize_f16_t4>;
|
||||||
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, half4, 4, dequantize_f16_t4>;
|
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, half4, 4, dequantize_f16_t4>;
|
||||||
|
|
||||||
|
#if defined(GGML_METAL_HAS_BF16)
|
||||||
|
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, bfloat4, 4, dequantize_bf16_t4>;
|
||||||
|
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, bfloat4, 4, dequantize_bf16_t4>;
|
||||||
|
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, bfloat4, 4, dequantize_bf16_t4>;
|
||||||
|
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, bfloat4, 4, dequantize_bf16_t4>;
|
||||||
|
#endif
|
||||||
|
|
||||||
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_0, 32, dequantize_q4_0_t4>;
|
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_0, 32, dequantize_q4_0_t4>;
|
||||||
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_0, 32, dequantize_q4_0_t4>;
|
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_0, 32, dequantize_q4_0_t4>;
|
||||||
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_0, 32, dequantize_q4_0_t4>;
|
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_0, 32, dequantize_q4_0_t4>;
|
||||||
|
|
@ -3531,6 +3763,16 @@ template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_3")]] kernel mul_mv_ext_q4x4
|
||||||
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q6_K, 256, dequantize_q6_K>;
|
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q6_K, 256, dequantize_q6_K>;
|
||||||
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q6_K, 256, dequantize_q6_K>;
|
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q6_K, 256, dequantize_q6_K>;
|
||||||
|
|
||||||
|
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q2_K, 256, dequantize_q2_K>;
|
||||||
|
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q2_K, 256, dequantize_q2_K>;
|
||||||
|
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q2_K, 256, dequantize_q2_K>;
|
||||||
|
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q2_K, 256, dequantize_q2_K>;
|
||||||
|
|
||||||
|
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q3_K, 256, dequantize_q3_K>;
|
||||||
|
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q3_K, 256, dequantize_q3_K>;
|
||||||
|
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q3_K, 256, dequantize_q3_K>;
|
||||||
|
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q3_K, 256, dequantize_q3_K>;
|
||||||
|
|
||||||
template<typename T0, typename T1, short NR0, typename args_t>
|
template<typename T0, typename T1, short NR0, typename args_t>
|
||||||
void kernel_mul_mv_t_t_impl(
|
void kernel_mul_mv_t_t_impl(
|
||||||
args_t args,
|
args_t args,
|
||||||
|
|
@ -4530,7 +4772,9 @@ kernel void kernel_conv_transpose_2d<half>(
|
||||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
uint3 ntg[[threads_per_threadgroup]]);
|
uint3 ntg[[threads_per_threadgroup]]);
|
||||||
|
|
||||||
kernel void kernel_upscale_f32(
|
constant bool FC_upscale_aa [[function_constant(FC_UPSCALE + 0)]];
|
||||||
|
|
||||||
|
kernel void kernel_upscale_nearest_f32(
|
||||||
constant ggml_metal_kargs_upscale & args,
|
constant ggml_metal_kargs_upscale & args,
|
||||||
device const char * src0,
|
device const char * src0,
|
||||||
device char * dst,
|
device char * dst,
|
||||||
|
|
@ -4556,6 +4800,156 @@ kernel void kernel_upscale_f32(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline float bilinear_tri(float x) {
|
||||||
|
return MAX(0.0f, 1.0f - fabs(x));
|
||||||
|
}
|
||||||
|
|
||||||
|
kernel void kernel_upscale_bilinear_f32(
|
||||||
|
constant ggml_metal_kargs_upscale & args,
|
||||||
|
device const char * src0,
|
||||||
|
device char * dst,
|
||||||
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
|
||||||
|
const int64_t i3 = tgpig.z;
|
||||||
|
const int64_t i2 = tgpig.y;
|
||||||
|
const int64_t i1 = tgpig.x;
|
||||||
|
|
||||||
|
const int64_t i03 = i3 / args.sf3;
|
||||||
|
const int64_t i02 = i2 / args.sf2;
|
||||||
|
|
||||||
|
const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs;
|
||||||
|
const int64_t i01 = MAX(0, MIN(args.ne01 - 1, (int64_t)floor(f01)));
|
||||||
|
const int64_t i01p = MAX(0, MIN(args.ne01 - 1, i01 + 1));
|
||||||
|
const float fd1 = MAX(0.0f, MIN(1.0f, f01 - (float)i01));
|
||||||
|
|
||||||
|
src0 += i03*args.nb03 + i02*args.nb02;
|
||||||
|
|
||||||
|
device float * dst_ptr = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1);
|
||||||
|
|
||||||
|
if (FC_upscale_aa) {
|
||||||
|
const float support0 = MAX(1.0f, 1.0f / args.sf0);
|
||||||
|
const float invscale0 = 1.0f / support0;
|
||||||
|
const float support1 = MAX(1.0f, 1.0f / args.sf1);
|
||||||
|
const float invscale1 = 1.0f / support1;
|
||||||
|
|
||||||
|
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||||
|
const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;
|
||||||
|
|
||||||
|
int64_t x_min = MAX((int64_t)0, (int64_t)floor(f00 - support0 + args.poffs));
|
||||||
|
int64_t x_max = MIN(args.ne00, (int64_t)ceil (f00 + support0 + args.poffs));
|
||||||
|
|
||||||
|
int64_t y_min = MAX((int64_t)0, (int64_t)floor(f01 - support1 + args.poffs));
|
||||||
|
int64_t y_max = MIN(args.ne01, (int64_t)ceil (f01 + support1 + args.poffs));
|
||||||
|
|
||||||
|
float sum = 0.0f;
|
||||||
|
float wsum = 0.0f;
|
||||||
|
|
||||||
|
for (int64_t sy = y_min; sy < y_max; ++sy) {
|
||||||
|
const float wy = MAX(0.0f, 1.0f - fabs((float)sy - f01) * invscale1);
|
||||||
|
for (int64_t sx = x_min; sx < x_max; ++sx) {
|
||||||
|
const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0);
|
||||||
|
const float w = wx * wy;
|
||||||
|
const device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00);
|
||||||
|
sum += (*src_ptr) * w;
|
||||||
|
wsum += w;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const float v = (wsum > 0.0f) ? (sum / wsum) : 0.0f;
|
||||||
|
dst_ptr[i0] = v;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||||
|
const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;
|
||||||
|
const int64_t i00 = MAX(0, MIN(args.ne00 - 1, (int64_t)floor(f00)));
|
||||||
|
const int64_t i00p = MAX(0, MIN(args.ne00 - 1, i00 + 1));
|
||||||
|
const float fd0 = MAX(0.0f, MIN(1.0f, f00 - (float)i00));
|
||||||
|
|
||||||
|
device const float * src00 = (device const float *)(src0 + i01*args.nb01 + i00*args.nb00);
|
||||||
|
device const float * src10 = (device const float *)(src0 + i01*args.nb01 + i00p*args.nb00);
|
||||||
|
device const float * src01 = (device const float *)(src0 + i01p*args.nb01 + i00*args.nb00);
|
||||||
|
device const float * src11 = (device const float *)(src0 + i01p*args.nb01 + i00p*args.nb00);
|
||||||
|
|
||||||
|
const float v =
|
||||||
|
(*src00) * (1.0f - fd0) * (1.0f - fd1) +
|
||||||
|
(*src10) * fd0 * (1.0f - fd1) +
|
||||||
|
(*src01) * (1.0f - fd0) * fd1 +
|
||||||
|
(*src11) * fd0 * fd1;
|
||||||
|
|
||||||
|
dst_ptr[i0] = v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline float bicubic_weight1(float x) {
|
||||||
|
const float a = -0.75f;
|
||||||
|
return ((a + 2) * x - (a + 3)) * x * x + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline float bicubic_weight2(float x) {
|
||||||
|
const float a = -0.75f;
|
||||||
|
return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a;
|
||||||
|
}
|
||||||
|
|
||||||
|
kernel void kernel_upscale_bicubic_f32(
|
||||||
|
constant ggml_metal_kargs_upscale & args,
|
||||||
|
device const char * src0,
|
||||||
|
device char * dst,
|
||||||
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||||
|
uint3 ntg[[threads_per_threadgroup]]) {
|
||||||
|
|
||||||
|
const int64_t i3 = tgpig.z;
|
||||||
|
const int64_t i2 = tgpig.y;
|
||||||
|
const int64_t i1 = tgpig.x;
|
||||||
|
|
||||||
|
const int64_t i03 = i3 / args.sf3;
|
||||||
|
const int64_t i02 = i2 / args.sf2;
|
||||||
|
|
||||||
|
const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs;
|
||||||
|
const int64_t i01 = (int64_t)floor(f01);
|
||||||
|
const float fd1 = f01 - (float)i01;
|
||||||
|
|
||||||
|
const float w_y0 = bicubic_weight2(fd1 + 1.0f);
|
||||||
|
const float w_y1 = bicubic_weight1(fd1);
|
||||||
|
const float w_y2 = bicubic_weight1(1.0f - fd1);
|
||||||
|
const float w_y3 = bicubic_weight2(2.0f - fd1);
|
||||||
|
|
||||||
|
const device const char * src_slice = src0 + i03 * args.nb03 + i02 * args.nb02;
|
||||||
|
|
||||||
|
device float * dst_ptr = (device float *)(dst + i3 * args.nb3 + i2 * args.nb2 + i1 * args.nb1);
|
||||||
|
|
||||||
|
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||||
|
const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;
|
||||||
|
const int64_t i00 = (int64_t)floor(f00);
|
||||||
|
const float fd0 = f00 - (float)i00;
|
||||||
|
|
||||||
|
const float w_x0 = bicubic_weight2(fd0 + 1.0f);
|
||||||
|
const float w_x1 = bicubic_weight1(fd0);
|
||||||
|
const float w_x2 = bicubic_weight1(1.0f - fd0);
|
||||||
|
const float w_x3 = bicubic_weight2(2.0f - fd0);
|
||||||
|
|
||||||
|
float sum = 0.0f;
|
||||||
|
|
||||||
|
for (int dy = -1; dy <= 2; ++dy) {
|
||||||
|
const int64_t iy = MAX(0, MIN(args.ne01 - 1, i01 + dy));
|
||||||
|
const float wy = (dy == -1) ? w_y0 : (dy == 0) ? w_y1 : (dy == 1) ? w_y2 : w_y3;
|
||||||
|
|
||||||
|
for (int dx = -1; dx <= 2; ++dx) {
|
||||||
|
const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx));
|
||||||
|
const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? w_x2 : w_x3;
|
||||||
|
|
||||||
|
const device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00);
|
||||||
|
sum += (*src_ptr) * wx * wy;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dst_ptr[i0] = sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
kernel void kernel_pad_f32(
|
kernel void kernel_pad_f32(
|
||||||
constant ggml_metal_kargs_pad & args,
|
constant ggml_metal_kargs_pad & args,
|
||||||
device const char * src0,
|
device const char * src0,
|
||||||
|
|
@ -5782,6 +6176,7 @@ template [[host_name("kernel_flash_attn_ext_f32_dk128_dv128")]] kernel flash_at
|
||||||
template [[host_name("kernel_flash_attn_ext_f32_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 192, 192>;
|
template [[host_name("kernel_flash_attn_ext_f32_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 192, 192>;
|
||||||
template [[host_name("kernel_flash_attn_ext_f32_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 192, 128>;
|
template [[host_name("kernel_flash_attn_ext_f32_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 192, 128>;
|
||||||
template [[host_name("kernel_flash_attn_ext_f32_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 256, 256>;
|
template [[host_name("kernel_flash_attn_ext_f32_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 256, 256>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_f32_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 320, 256>;
|
||||||
template [[host_name("kernel_flash_attn_ext_f32_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 576, 512>;
|
template [[host_name("kernel_flash_attn_ext_f32_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 576, 512>;
|
||||||
|
|
||||||
template [[host_name("kernel_flash_attn_ext_f16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 32, 32>;
|
template [[host_name("kernel_flash_attn_ext_f16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 32, 32>;
|
||||||
|
|
@ -5796,6 +6191,7 @@ template [[host_name("kernel_flash_attn_ext_f16_dk128_dv128")]] kernel flash_at
|
||||||
template [[host_name("kernel_flash_attn_ext_f16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 192>;
|
template [[host_name("kernel_flash_attn_ext_f16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 192>;
|
||||||
template [[host_name("kernel_flash_attn_ext_f16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 128>;
|
template [[host_name("kernel_flash_attn_ext_f16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 128>;
|
||||||
template [[host_name("kernel_flash_attn_ext_f16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 256, 256>;
|
template [[host_name("kernel_flash_attn_ext_f16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 256, 256>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_f16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 320, 256>;
|
||||||
template [[host_name("kernel_flash_attn_ext_f16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 576, 512>;
|
template [[host_name("kernel_flash_attn_ext_f16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 576, 512>;
|
||||||
|
|
||||||
#if defined(GGML_METAL_HAS_BF16)
|
#if defined(GGML_METAL_HAS_BF16)
|
||||||
|
|
@ -5811,6 +6207,7 @@ template [[host_name("kernel_flash_attn_ext_bf16_dk128_dv128")]] kernel flash_at
|
||||||
template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 192>;
|
template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 192>;
|
||||||
template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 128>;
|
template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 128>;
|
||||||
template [[host_name("kernel_flash_attn_ext_bf16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256, 256>;
|
template [[host_name("kernel_flash_attn_ext_bf16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256, 256>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_bf16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 320, 256>;
|
||||||
template [[host_name("kernel_flash_attn_ext_bf16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 576, 512>;
|
template [[host_name("kernel_flash_attn_ext_bf16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 576, 512>;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
@ -5826,6 +6223,7 @@ template [[host_name("kernel_flash_attn_ext_q4_0_dk128_dv128")]] kernel flash_at
|
||||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 192>;
|
template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 192>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 128>;
|
template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 128>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256, 256>;
|
template [[host_name("kernel_flash_attn_ext_q4_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256, 256>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_q4_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 320, 256>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 576, 512>;
|
template [[host_name("kernel_flash_attn_ext_q4_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 576, 512>;
|
||||||
|
|
||||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 32, 32>;
|
template [[host_name("kernel_flash_attn_ext_q4_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 32, 32>;
|
||||||
|
|
@ -5840,6 +6238,7 @@ template [[host_name("kernel_flash_attn_ext_q4_1_dk128_dv128")]] kernel flash_at
|
||||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 192>;
|
template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 192>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 128>;
|
template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 128>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256, 256>;
|
template [[host_name("kernel_flash_attn_ext_q4_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256, 256>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_q4_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 320, 256>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 576, 512>;
|
template [[host_name("kernel_flash_attn_ext_q4_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 576, 512>;
|
||||||
|
|
||||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 32, 32>;
|
template [[host_name("kernel_flash_attn_ext_q5_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 32, 32>;
|
||||||
|
|
@ -5854,6 +6253,7 @@ template [[host_name("kernel_flash_attn_ext_q5_0_dk128_dv128")]] kernel flash_at
|
||||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 192>;
|
template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 192>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 128>;
|
template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 128>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256, 256>;
|
template [[host_name("kernel_flash_attn_ext_q5_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256, 256>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_q5_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 320, 256>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 576, 512>;
|
template [[host_name("kernel_flash_attn_ext_q5_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 576, 512>;
|
||||||
|
|
||||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 32, 32>;
|
template [[host_name("kernel_flash_attn_ext_q5_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 32, 32>;
|
||||||
|
|
@ -5868,6 +6268,7 @@ template [[host_name("kernel_flash_attn_ext_q5_1_dk128_dv128")]] kernel flash_at
|
||||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 192>;
|
template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 192>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 128>;
|
template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 128>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256, 256>;
|
template [[host_name("kernel_flash_attn_ext_q5_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256, 256>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_q5_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 320, 256>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 576, 512>;
|
template [[host_name("kernel_flash_attn_ext_q5_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 576, 512>;
|
||||||
|
|
||||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 32, 32>;
|
template [[host_name("kernel_flash_attn_ext_q8_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 32, 32>;
|
||||||
|
|
@ -5882,6 +6283,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_dk128_dv128")]] kernel flash_at
|
||||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 192>;
|
template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 192>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 128>;
|
template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 128>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256, 256>;
|
template [[host_name("kernel_flash_attn_ext_q8_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256, 256>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_q8_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 320, 256>;
|
||||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 576, 512>;
|
template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 576, 512>;
|
||||||
|
|
||||||
#undef FA_TYPES
|
#undef FA_TYPES
|
||||||
|
|
@ -6452,6 +6854,17 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk256_dv256")]] kernel flas
|
||||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 256, 256, 1>;
|
template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 256, 256, 1>;
|
||||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 256, 256, 1>;
|
template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 256, 256, 1>;
|
||||||
|
|
||||||
|
template [[host_name("kernel_flash_attn_ext_vec_f32_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES_F32, float4, 1, dequantize_f32_t4, float4, 1, dequantize_f32_t4, 320, 256, 2>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_vec_f16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 320, 256, 2>;
|
||||||
|
#if defined(GGML_METAL_HAS_BF16)
|
||||||
|
template [[host_name("kernel_flash_attn_ext_vec_bf16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 320, 256, 2>;
|
||||||
|
#endif
|
||||||
|
template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 320, 256, 2>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 320, 256, 2>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 320, 256, 2>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 320, 256, 2>;
|
||||||
|
template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 320, 256, 2>;
|
||||||
|
|
||||||
template [[host_name("kernel_flash_attn_ext_vec_f32_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES_F32, float4, 1, dequantize_f32_t4, float4, 1, dequantize_f32_t4, 576, 512, 2>;
|
template [[host_name("kernel_flash_attn_ext_vec_f32_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES_F32, float4, 1, dequantize_f32_t4, float4, 1, dequantize_f32_t4, 576, 512, 2>;
|
||||||
template [[host_name("kernel_flash_attn_ext_vec_f16_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 576, 512, 2>;
|
template [[host_name("kernel_flash_attn_ext_vec_f16_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 576, 512, 2>;
|
||||||
#if defined(GGML_METAL_HAS_BF16)
|
#if defined(GGML_METAL_HAS_BF16)
|
||||||
|
|
@ -8912,6 +9325,7 @@ template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_
|
||||||
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
|
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
|
||||||
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
|
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
|
||||||
template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
|
template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
|
||||||
|
template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>;
|
||||||
|
|
||||||
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
|
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
|
||||||
kernel void kernel_mul_mm_id(
|
kernel void kernel_mul_mm_id(
|
||||||
|
|
|
||||||
|
|
@ -132,6 +132,7 @@ set(GGML_OPENCL_KERNELS
|
||||||
ssm_conv
|
ssm_conv
|
||||||
sub
|
sub
|
||||||
sum_rows
|
sum_rows
|
||||||
|
cumsum
|
||||||
transpose
|
transpose
|
||||||
concat
|
concat
|
||||||
tsembd
|
tsembd
|
||||||
|
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue