Merge remote-tracking branch 'origin/master' into allozaur/20677-webui-server-tools

This commit is contained in:
Aleksander Grygier 2026-03-31 15:57:32 +02:00
commit b22ae1411c
189 changed files with 6816 additions and 2137 deletions

View File

@ -1,11 +1,13 @@
ARG UBUNTU_VERSION=22.04 ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH ARG TARGETARCH
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y build-essential git cmake libssl-dev apt-get install -y gcc-14 g++-14 build-essential git cmake libssl-dev
ENV CC=gcc-14 CXX=g++-14
WORKDIR /app WORKDIR /app
@ -34,7 +36,7 @@ RUN mkdir -p /app/full \
FROM ubuntu:$UBUNTU_VERSION AS base FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y libgomp1 curl\ && apt-get install -y libgomp1 curl \
&& apt autoremove -y \ && apt autoremove -y \
&& apt clean -y \ && apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \ && rm -rf /tmp/* /var/tmp/* \
@ -55,8 +57,9 @@ RUN apt-get update \
git \ git \
python3 \ python3 \
python3-pip \ python3-pip \
&& pip install --upgrade pip setuptools wheel \ python3-wheel \
&& pip install -r requirements.txt \ && pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \ && apt autoremove -y \
&& apt clean -y \ && apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \ && rm -rf /tmp/* /var/tmp/* \

View File

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=24.04 ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment. # This needs to generally match the container host's environment.
ARG CUDA_VERSION=13.1.0 ARG CUDA_VERSION=13.1.1
# Target the CUDA build image # Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
@ -12,7 +12,9 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1 apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
WORKDIR /app WORKDIR /app
@ -39,7 +41,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_CUDA_RUN_CONTAINER} AS base FROM ${BASE_CUDA_RUN_CONTAINER} AS base
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y libgomp1 curl\ && apt-get install -y libgomp1 curl \
&& apt autoremove -y \ && apt autoremove -y \
&& apt clean -y \ && apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \ && rm -rf /tmp/* /var/tmp/* \

View File

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04 ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment. # This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.4.0 ARG CUDA_VERSION=12.8.1
# Target the CUDA build image # Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
@ -12,7 +12,9 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1 apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
WORKDIR /app WORKDIR /app
@ -39,7 +41,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_CUDA_RUN_CONTAINER} AS base FROM ${BASE_CUDA_RUN_CONTAINER} AS base
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y libgomp1 curl\ && apt-get install -y libgomp1 curl \
&& apt autoremove -y \ && apt autoremove -y \
&& apt clean -y \ && apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \ && rm -rf /tmp/* /var/tmp/* \
@ -60,7 +62,8 @@ RUN apt-get update \
git \ git \
python3 \ python3 \
python3-pip \ python3-pip \
&& pip install --upgrade pip setuptools wheel \ python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \ && pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \ && apt autoremove -y \
&& apt clean -y \ && apt clean -y \

View File

@ -33,8 +33,25 @@ RUN mkdir -p /app/full \
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGDGMM_VERSION=22.9.0
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& dpkg --install *.deb
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y libgomp1 curl\ && apt-get install -y libgomp1 curl \
&& apt autoremove -y \ && apt autoremove -y \
&& apt clean -y \ && apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \ && rm -rf /tmp/* /var/tmp/* \

View File

@ -46,7 +46,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_MUSA_RUN_CONTAINER} AS base FROM ${BASE_MUSA_RUN_CONTAINER} AS base
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y libgomp1 curl\ && apt-get install -y libgomp1 curl \
&& apt autoremove -y \ && apt autoremove -y \
&& apt clean -y \ && apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \ && rm -rf /tmp/* /var/tmp/* \

View File

@ -41,6 +41,7 @@
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv, effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
enableStatic ? effectiveStdenv.hostPlatform.isStatic, enableStatic ? effectiveStdenv.hostPlatform.isStatic,
precompileMetalShaders ? false, precompileMetalShaders ? false,
useWebUi ? true,
}: }:
let let
@ -164,6 +165,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
cmakeFlags = cmakeFlags =
[ [
(cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "LLAMA_BUILD_WEBUI" useWebUi)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_NATIVE" false)

View File

@ -78,7 +78,7 @@ ARG http_proxy
ARG https_proxy ARG https_proxy
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y libgomp1 libtbb12 curl\ && apt-get install -y libgomp1 libtbb12 curl \
&& apt autoremove -y \ && apt autoremove -y \
&& apt clean -y \ && apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \ && rm -rf /tmp/* /var/tmp/* \

View File

@ -58,7 +58,7 @@ RUN mkdir -p /app/full \
FROM ${BASE_ROCM_DEV_CONTAINER} AS base FROM ${BASE_ROCM_DEV_CONTAINER} AS base
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y libgomp1 curl\ && apt-get install -y libgomp1 curl \
&& apt autoremove -y \ && apt autoremove -y \
&& apt clean -y \ && apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \ && rm -rf /tmp/* /var/tmp/* \
@ -79,7 +79,7 @@ RUN apt-get update \
git \ git \
python3-pip \ python3-pip \
python3 \ python3 \
python3-wheel\ python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \ && pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \ && pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \ && apt autoremove -y \

View File

@ -49,17 +49,20 @@ COPY --from=build /app/full /app
WORKDIR /app WORKDIR /app
ENV PATH="/root/.venv/bin:/root/.local/bin:${PATH}"
# Flag for compatibility with pip
ARG UV_INDEX_STRATEGY="unsafe-best-match"
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y \ && apt-get install -y \
build-essential \ build-essential \
curl \
git \ git \
python3.13 \ ca-certificates \
python3.13-dev \ && curl -LsSf https://astral.sh/uv/install.sh | sh \
python3-pip \ && uv python install 3.13 \
python3-wheel \ && uv venv --python 3.13 /root/.venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.13 100 \ && uv pip install --python /root/.venv/bin/python -r requirements.txt \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \ && apt autoremove -y \
&& apt clean -y \ && apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \ && rm -rf /tmp/* /var/tmp/* \

View File

@ -21,14 +21,6 @@ indent_style = tab
[prompts/*.txt] [prompts/*.txt]
insert_final_newline = unset insert_final_newline = unset
[tools/server/public/*]
indent_size = 2
[tools/server/public/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
[tools/server/deps_*] [tools/server/deps_*]
trim_trailing_whitespace = unset trim_trailing_whitespace = unset
indent_style = unset indent_style = unset
@ -61,6 +53,14 @@ charset = unset
trim_trailing_whitespace = unset trim_trailing_whitespace = unset
insert_final_newline = unset insert_final_newline = unset
[tools/server/public/**]
indent_style = unset
indent_size = unset
end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[benches/**] [benches/**]
indent_style = unset indent_style = unset
indent_size = unset indent_size = unset

4
.gitattributes vendored Normal file
View File

@ -0,0 +1,4 @@
# Treat the generated single-file WebUI build as binary for diff purposes.
# Git's pack-file delta compression still works (byte-level), but this prevents
# git diff from printing the entire minified file on every change.
tools/server/public/index.html -diff

View File

@ -181,7 +181,7 @@ jobs:
- build: 'x64' - build: 'x64'
os: ubuntu-22.04 os: ubuntu-22.04
- build: 'arm64' - build: 'arm64'
os: ubuntu-22.04-arm os: ubuntu-24.04-arm
- build: 's390x' - build: 's390x'
os: ubuntu-24.04-s390x os: ubuntu-24.04-s390x
- build: 'ppc64le' - build: 'ppc64le'
@ -207,14 +207,22 @@ jobs:
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install -y --no-install-recommends \ sudo apt-get install -y --no-install-recommends \
python3 python3-pip python3-dev \ python3 python3-pip python3-dev python3-wheel \
libjpeg-dev build-essential libssl-dev \ libjpeg-dev build-essential libssl-dev \
git-lfs git-lfs
- name: Toolchain workaround (GCC 14)
if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
run: |
sudo apt-get install -y gcc-14 g++-14
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
- name: Python Dependencies - name: Python Dependencies
id: python_depends id: python_depends
run: | run: |
python3 -m pip install --upgrade pip export PIP_BREAK_SYSTEM_PACKAGES="1"
python3 -m pip install --upgrade pip setuptools
pip3 install ./gguf-py pip3 install ./gguf-py
- name: Swap Endianness - name: Swap Endianness
@ -292,7 +300,15 @@ jobs:
ctest -L main --verbose ctest -L main --verbose
ubuntu-24-vulkan: ubuntu-24-vulkan:
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-24.04
- build: 'arm64'
os: ubuntu-24.04-arm
runs-on: ${{ matrix.os }}
steps: steps:
- name: Clone - name: Clone
@ -302,7 +318,10 @@ jobs:
- name: Dependencies - name: Dependencies
id: depends id: depends
run: | run: |
sudo apt-get install -y glslc libvulkan-dev libssl-dev ninja-build sudo apt-get update
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
- name: Configure - name: Configure
id: cmake_configure id: cmake_configure

View File

@ -25,186 +25,13 @@ permissions:
packages: write packages: write
jobs: jobs:
push_to_registry:
name: Push Docker image to Docker Hub
runs-on: ${{ matrix.config.runs_on }}
env:
COMMIT_SHA: ${{ github.sha }}
strategy:
fail-fast: false
matrix:
config:
# Multi-stage build
# Note: the arm64 images are failing, which prevents the amd64 images from being built
# https://github.com/ggml-org/llama.cpp/issues/11888
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "12.4.0", ubuntu_version: "22.04" }
- { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "13.1.0", ubuntu_version: "24.04" }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
steps:
- name: Check out the repo
uses: actions/checkout@v6
with:
fetch-depth: 0 # preserve git history, so we can determine the build number
- name: Set up QEMU
if: ${{ matrix.config.tag != 's390x' }}
uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
- name: Log in to Docker Hub
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Determine source tag name
id: srctag
uses: ./.github/actions/get-tag-name
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
- name: Determine image tag name
id: tag
shell: bash
run: |
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
# list all tags possible
tags="${{ matrix.config.tag }}"
for tag in $tags; do
if [[ "$tag" == "cpu" ]]; then
TYPE=""
else
TYPE="-$tag"
fi
CACHETAGS="${PREFIX}buildcache${TYPE}"
FULLTAGS="${FULLTAGS:+$FULLTAGS,}${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
LIGHTTAGS="${LIGHTTAGS:+$LIGHTTAGS,}${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
SERVERTAGS="${SERVERTAGS:+$SERVERTAGS,}${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
done
echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
echo "cache_output_tags=$CACHETAGS" # print out for debugging
echo "full_output_tags=$FULLTAGS" # print out for debugging
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
echo "server_output_tags=$SERVERTAGS" # print out for debugging
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Free Disk Space (Ubuntu)
if: ${{ matrix.config.free_disk_space == true }}
uses: ggml-org/free-disk-space@v1.3.1
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Build and push Full Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above
tags: ${{ steps.tag.outputs.full_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: full
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
- name: Build and push Light Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above
tags: ${{ steps.tag.outputs.light_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: light
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
- name: Build and push Server Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
# tag list is generated from step above
tags: ${{ steps.tag.outputs.server_output_tags }}
file: ${{ matrix.config.dockerfile }}
target: server
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
create_tag: create_tag:
name: Create and push git tag name: Create and push git tag
runs-on: ubuntu-22.04 runs-on: ubuntu-slim
permissions: permissions:
contents: write contents: write
outputs:
source_tag: ${{ steps.srctag.outputs.name }}
steps: steps:
- name: Clone - name: Clone
@ -225,3 +52,391 @@ jobs:
run: | run: |
git tag ${{ steps.srctag.outputs.name }} || exit 0 git tag ${{ steps.srctag.outputs.name }} || exit 0
git push origin ${{ steps.srctag.outputs.name }} || exit 0 git push origin ${{ steps.srctag.outputs.name }} || exit 0
prepare_matrices:
name: Prepare Docker matrices
runs-on: ubuntu-24.04
outputs:
build_matrix: ${{ steps.matrices.outputs.build_matrix }}
merge_matrix: ${{ steps.matrices.outputs.merge_matrix }}
steps:
- name: Generate build and merge matrices
id: matrices
shell: bash
run: |
set -euo pipefail
# Keep all build targets in one place and derive merge targets from it.
cat > build-matrix.json <<'JSON'
[
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "rocm", "dockerfile": ".devops/rocm.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "openvino", "dockerfile": ".devops/openvino.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }
]
JSON
BUILD_MATRIX="$(jq -c . build-matrix.json)"
MERGE_MATRIX="$(jq -c '
reduce .[] as $entry ({}; .[$entry.tag] |= (
. // {
tag: $entry.tag,
arches: [],
full: false,
light: false,
server: false
}
| .full = (.full or ($entry.full // false))
| .light = (.light or ($entry.light // false))
| .server = (.server or ($entry.server // false))
| .arches += [($entry.platforms | sub("^linux/"; ""))]
))
# Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
| if (has("cpu") and (((.cpu.arches // []) | index("s390x")) != null)) then
. + {
s390x: {
tag: "s390x",
arches: ["s390x"],
full: .cpu.full,
light: .cpu.light,
server: .cpu.server
}
}
else
.
end
| [.[] | .arches = (.arches | unique | sort | join(" "))]
' build-matrix.json)"
echo "build_matrix=$BUILD_MATRIX" >> "$GITHUB_OUTPUT"
echo "merge_matrix=$MERGE_MATRIX" >> "$GITHUB_OUTPUT"
push_to_registry:
name: Push Docker image to Docker Registry
needs: [prepare_matrices, create_tag]
runs-on: ${{ matrix.config.runs_on }}
strategy:
fail-fast: false
matrix:
config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
steps:
- name: Check out the repo
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ needs.create_tag.outputs.source_tag }}
- name: Set up QEMU
if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
with:
image: tonistiigi/binfmt:qemu-v10.2.1
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
- name: Log in to Docker Registry
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Determine image metadata
id: meta
shell: bash
run: |
set -euo pipefail
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
PREFIX="${IMAGE_REPO}:"
PLATFORM="${{ matrix.config.platforms }}"
ARCH_SUFFIX="${PLATFORM#linux/}"
# list all tags possible
tags="${{ matrix.config.tag }}"
for tag in $tags; do
if [[ "$tag" == "cpu" ]]; then
TYPE=""
else
TYPE="-$tag"
fi
CACHETAG="${PREFIX}buildcache${TYPE}-${ARCH_SUFFIX}"
done
SAFE_TAGS="$(echo "$tags" | tr ' ' '_')"
echo "image_repo=$IMAGE_REPO" >> $GITHUB_OUTPUT
echo "arch_suffix=$ARCH_SUFFIX" >> $GITHUB_OUTPUT
echo "cache_output_tag=$CACHETAG" >> $GITHUB_OUTPUT
echo "digest_artifact_suffix=${SAFE_TAGS}-${ARCH_SUFFIX}" >> $GITHUB_OUTPUT
echo "cache_output_tag=$CACHETAG" # print out for debugging
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Free Disk Space (Ubuntu)
if: ${{ matrix.config.free_disk_space == true }}
uses: ggml-org/free-disk-space@v1.3.1
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Build and push Full Docker image by digest
id: build_full
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
with:
context: .
platforms: ${{ matrix.config.platforms }}
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
file: ${{ matrix.config.dockerfile }}
target: full
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
- name: Build and push Light Docker image by digest
id: build_light
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
with:
context: .
platforms: ${{ matrix.config.platforms }}
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
file: ${{ matrix.config.dockerfile }}
target: light
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
- name: Build and push Server Docker image by digest
id: build_server
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
with:
context: .
platforms: ${{ matrix.config.platforms }}
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
file: ${{ matrix.config.dockerfile }}
target: server
provenance: false
build-args: |
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
# using github experimental cache
#cache-from: type=gha
#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
- name: Export digest metadata
shell: bash
run: |
set -euo pipefail
TAGS="${{ matrix.config.tag }}"
ARCH_SUFFIX="${{ steps.meta.outputs.arch_suffix }}"
DIGEST_FILE="/tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv"
mkdir -p /tmp/digests
add_digest_rows() {
local image_type="$1"
local digest="$2"
if [[ -z "$digest" ]]; then
echo "Missing digest for image_type=${image_type}" >&2
exit 1
fi
for tag in $TAGS; do
printf '%s\t%s\t%s\t%s\n' "$tag" "$ARCH_SUFFIX" "$image_type" "$digest" >> "$DIGEST_FILE"
done
}
if [[ "${{ matrix.config.full }}" == "true" ]]; then
add_digest_rows "full" "${{ steps.build_full.outputs.digest }}"
fi
if [[ "${{ matrix.config.light }}" == "true" ]]; then
add_digest_rows "light" "${{ steps.build_light.outputs.digest }}"
fi
if [[ "${{ matrix.config.server }}" == "true" ]]; then
add_digest_rows "server" "${{ steps.build_server.outputs.digest }}"
fi
- name: Upload digest metadata
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
with:
name: digests-${{ steps.meta.outputs.digest_artifact_suffix }}
path: /tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv
if-no-files-found: error
merge_arch_tags:
name: Create shared tags from digests
needs: [prepare_matrices, push_to_registry, create_tag]
runs-on: ubuntu-24.04
strategy:
fail-fast: false
matrix:
config: ${{ fromJSON(needs.prepare_matrices.outputs.merge_matrix) }}
steps:
- name: Check out the repo
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Download digest metadata
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
with:
pattern: digests-*
path: /tmp/digests
merge-multiple: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
- name: Log in to Docker Registry
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Create tags from digests
shell: bash
run: |
set -euo pipefail
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
PREFIX="${IMAGE_REPO}:"
SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
TAGS="${{ matrix.config.tag }}"
ARCHES="${{ matrix.config.arches }}"
DIGEST_GLOB="/tmp/digests/*.tsv"
if ! ls ${DIGEST_GLOB} >/dev/null 2>&1; then
echo "No digest metadata found in /tmp/digests" >&2
exit 1
fi
if [[ -z "$SRC_TAG" ]]; then
echo "Missing source tag from create_tag" >&2
exit 1
fi
find_digest() {
local tag_name="$1"
local arch="$2"
local image_type="$3"
local digest
digest="$(awk -F '\t' -v t="$tag_name" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
# Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
if [[ -z "$digest" && "$tag_name" == "s390x" && "$arch" == "s390x" ]]; then
digest="$(awk -F '\t' -v t="cpu" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
fi
if [[ -z "$digest" ]]; then
echo "Missing digest for tag=${tag_name} arch=${arch} image_type=${image_type}" >&2
exit 1
fi
echo "$digest"
}
create_manifest_tags() {
local image_type="$1"
local tag_name="$2"
local suffix="$3"
local merged_tag="${PREFIX}${image_type}${suffix}"
local merged_versioned_tag="${merged_tag}-${SRC_TAG}"
local refs=()
for arch in $ARCHES; do
local digest
digest="$(find_digest "$tag_name" "$arch" "$image_type")"
refs+=("${IMAGE_REPO}@${digest}")
done
echo "Creating ${merged_tag} from ${refs[*]}"
docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}"
echo "Creating ${merged_versioned_tag} from ${refs[*]}"
docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}"
}
for tag in $TAGS; do
if [[ "$tag" == "cpu" ]]; then
TYPE=""
else
TYPE="-$tag"
fi
if [[ "${{ matrix.config.full }}" == "true" ]]; then
create_manifest_tags "full" "$tag" "$TYPE"
fi
if [[ "${{ matrix.config.light }}" == "true" ]]; then
create_manifest_tags "light" "$tag" "$TYPE"
fi
if [[ "${{ matrix.config.server }}" == "true" ]]; then
create_manifest_tags "server" "$tag" "$TYPE"
fi
done
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

View File

@ -31,7 +31,7 @@ jobs:
uses: actions/setup-python@v6 uses: actions/setup-python@v6
with: with:
python-version: "3.11" python-version: "3.11"
pip-install: -r requirements/requirements-all.txt ty==0.0.24 pip-install: -r requirements/requirements-all.txt ty==0.0.26
# - name: Type-check with Pyright # - name: Type-check with Pyright
# uses: jakebailey/pyright-action@v2 # uses: jakebailey/pyright-action@v2
# with: # with:

View File

@ -131,17 +131,16 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
name: llama-bin-macos-x64.tar.gz name: llama-bin-macos-x64.tar.gz
ubuntu-22-cpu: ubuntu-cpu:
strategy: strategy:
matrix: matrix:
include: include:
- build: 'x64' - build: 'x64'
os: ubuntu-22.04 os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-24.04-arm
- build: 's390x' - build: 's390x'
os: ubuntu-24.04-s390x os: ubuntu-24.04-s390x
# GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
# - build: 'arm64'
# os: ubuntu-22.04-arm
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
@ -165,6 +164,13 @@ jobs:
sudo apt-get update sudo apt-get update
sudo apt-get install build-essential libssl-dev sudo apt-get install build-essential libssl-dev
- name: Toolchain workaround (GCC 14)
if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
run: |
sudo apt-get install -y gcc-14 g++-14
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
@ -194,8 +200,16 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
ubuntu-22-vulkan: ubuntu-vulkan:
runs-on: ubuntu-22.04 strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-24.04-arm
runs-on: ${{ matrix.os }}
steps: steps:
- name: Clone - name: Clone
@ -207,16 +221,23 @@ jobs:
- name: ccache - name: ccache
uses: ggml-org/ccache-action@v1.2.21 uses: ggml-org/ccache-action@v1.2.21
with: with:
key: ubuntu-22-vulkan key: ubuntu-vulkan-${{ matrix.build }}
evict-old-files: 1d evict-old-files: 1d
- name: Dependencies - name: Dependencies
id: depends id: depends
run: | run: |
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - if [[ "${{ matrix.os }}" =~ "ubuntu-22.04" ]]; then
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo apt-get update -y sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
else
sudo apt-get update -y
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
fi
- name: Build - name: Build
id: cmake_build id: cmake_build
@ -239,13 +260,13 @@ jobs:
id: pack_artifacts id: pack_artifacts
run: | run: |
cp LICENSE ./build/bin/ cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts - name: Upload artifacts
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v6
with: with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-vulkan-x64.tar.gz name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
ubuntu-24-openvino: ubuntu-24-openvino:
runs-on: ubuntu-24.04 runs-on: ubuntu-24.04
@ -977,8 +998,8 @@ jobs:
- windows-sycl - windows-sycl
- windows-hip - windows-hip
- ubuntu-22-rocm - ubuntu-22-rocm
- ubuntu-22-cpu - ubuntu-cpu
- ubuntu-22-vulkan - ubuntu-vulkan
- ubuntu-24-openvino - ubuntu-24-openvino
- macOS-arm64 - macOS-arm64
- macOS-x64 - macOS-x64
@ -1061,9 +1082,11 @@ jobs:
**Linux:** **Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz) - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz) - [Ubuntu arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-arm64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz) - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz) - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
**Windows:** **Windows:**

2
.gitignore vendored
View File

@ -95,6 +95,8 @@
# Server Web UI temporary files # Server Web UI temporary files
/tools/server/webui/node_modules /tools/server/webui/node_modules
/tools/server/webui/dist /tools/server/webui/dist
# we no longer use gz for index.html
/tools/server/public/index.html.gz
# Python # Python

View File

@ -108,6 +108,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON)
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
option(LLAMA_TESTS_INSTALL "llama: install tests" ON) option(LLAMA_TESTS_INSTALL "llama: install tests" ON)

View File

@ -2807,6 +2807,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.port = value; params.port = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
add_opt(common_arg(
{"--reuse-port"},
string_format("allow multiple sockets to bind to the same port (default: %s)", params.reuse_port ? "enabled" : "disabled"),
[](common_params & params) {
params.reuse_port = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_REUSE_PORT"));
add_opt(common_arg( add_opt(common_arg(
{"--path"}, "PATH", {"--path"}, "PATH",
string_format("path to serve static files from (default: %s)", params.public_path.c_str()), string_format("path to serve static files from (default: %s)", params.public_path.c_str()),

View File

@ -65,7 +65,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
data.grammar = build_grammar([&](const common_grammar_builder & builder) { data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) { foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function"); const auto & function = tool.at("function");
auto schema = function.at("parameters"); auto schema = function.contains("parameters") ? function.at("parameters") : json::object();
builder.resolve_refs(schema); builder.resolve_refs(schema);
}); });
parser.build_grammar(builder, data.grammar_lazy); parser.build_grammar(builder, data.grammar_lazy);
@ -221,7 +221,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
foreach_function(inputs.tools, [&](const json & tool) { foreach_function(inputs.tools, [&](const json & tool) {
const auto & func = tool.at("function"); const auto & func = tool.at("function");
std::string name = func.at("name"); std::string name = func.at("name");
const auto & schema = func.at("parameters"); const auto & schema = func.contains("parameters") ? func.at("parameters") : json::object();
// Build call_id parser based on position (if supported) // Build call_id parser based on position (if supported)
common_peg_parser call_id_section = p.eps(); common_peg_parser call_id_section = p.eps();
@ -282,19 +282,11 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
common_peg_parser tool_choice = p.choice(); common_peg_parser tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) { foreach_function(inputs.tools, [&](const json & tool) {
const auto & func = tool.at("function"); const auto & func = tool.at("function");
std::string name = func.at("name"); std::string name = func.at("name");
const auto & params = func.at("parameters"); const auto & params = func.contains("parameters") ? func.at("parameters") : json::object();
const auto & properties = params.contains("properties") ? params.at("properties") : json::object();
if (!params.contains("properties") || !params.at("properties").is_object()) {
return;
}
const auto & properties = params.at("properties");
std::set<std::string> required; std::set<std::string> required;
if (params.contains("required") && params.at("required").is_array()) {
params.at("required").get_to(required);
}
// Build parser for each argument, separating required and optional // Build parser for each argument, separating required and optional
std::vector<common_peg_parser> required_parsers; std::vector<common_peg_parser> required_parsers;
@ -311,17 +303,18 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
} }
} }
auto arg = p.tool_arg( auto arg =
p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) + p.tool_arg(p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
arguments.name_suffix) + arguments.name_suffix) +
arguments.value_prefix + arguments.value_prefix +
(type == "string" ? p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix), (type == "string" ?
"tool-" + name + "-arg-" + param_name + "-schema", p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
param_schema, true)) : "tool-" + name + "-arg-" + param_name + "-schema",
p.tool_arg_json_value(p.schema( param_schema, true)) :
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) + p.tool_arg_json_value(p.schema(
p.space()) + p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
p.tool_arg_close(p.literal(arguments.value_suffix))); p.space()) +
p.tool_arg_close(p.literal(arguments.value_suffix)));
auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg); auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
if (is_required) { if (is_required) {

View File

@ -287,7 +287,7 @@ void analyze_reasoning::compare_reasoning_presence() {
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest()); return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
}); });
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) { auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest(); return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
}); });
// try the more aggressive parse first, if it fails, fall back to the delimiter one // try the more aggressive parse first, if it fails, fall back to the delimiter one
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B); auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
@ -297,7 +297,7 @@ void analyze_reasoning::compare_reasoning_presence() {
if (result.result.success()) { if (result.result.success()) {
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) { if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED; mode = reasoning_mode::TAG_BASED;
start = trim_whitespace(result.tags["pre"]); start = trim_leading_whitespace(result.tags["pre"]);
end = trim_trailing_whitespace(result.tags["post"]); end = trim_trailing_whitespace(result.tags["post"]);
} else if (!result.tags["post"].empty()) { } else if (!result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED; mode = reasoning_mode::TAG_BASED;
@ -333,7 +333,7 @@ void analyze_reasoning::compare_thinking_enabled() {
if (left_trimmed.empty() && !diff.right.empty()) { if (left_trimmed.empty() && !diff.right.empty()) {
if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) { if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
if (start.empty()) { if (start.empty()) {
start = right_trimmed; start = trim_leading_whitespace(diff.right);
mode = reasoning_mode::TAG_BASED; mode = reasoning_mode::TAG_BASED;
} }
} }
@ -344,7 +344,7 @@ void analyze_reasoning::compare_thinking_enabled() {
if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) { if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
start = seg[seg.size() - 2].value; start = seg[seg.size() - 2].value;
} }
end = left_trimmed; end = trim_trailing_whitespace(diff.left);
mode = reasoning_mode::TAG_BASED; mode = reasoning_mode::TAG_BASED;
} }
} }
@ -363,15 +363,23 @@ void analyze_reasoning::compare_thinking_enabled() {
size_t len = std::min(base.size(), anchor_len); size_t len = std::min(base.size(), anchor_len);
std::string anchor = base.substr(base.size() - len); std::string anchor = base.substr(base.size() - len);
auto pos = extended.rfind(anchor); auto pos = extended.rfind(anchor);
if (pos == std::string::npos || pos + len >= extended.size()) continue; if (pos == std::string::npos || pos + len >= extended.size()) {
continue;
}
std::string extra = trim_whitespace(extended.substr(pos + len)); std::string extra = trim_whitespace(extended.substr(pos + len));
if (extra.empty()) continue; if (extra.empty()) {
continue;
}
auto seg = prune_whitespace_segments(segmentize_markers(extra)); auto seg = prune_whitespace_segments(segmentize_markers(extra));
if (seg.size() == 2 && seg[0].type == segment_type::MARKER && seg[1].type == segment_type::MARKER) { if (seg.size() == 2 && seg[0].type == segment_type::MARKER && seg[1].type == segment_type::MARKER) {
if (start.empty()) start = seg[0].value; if (start.empty()) {
if (end.empty()) end = seg[1].value; start = seg[0].value;
}
if (end.empty()) {
end = seg[1].value;
}
mode = reasoning_mode::TAG_BASED; mode = reasoning_mode::TAG_BASED;
break; break;
} }
@ -423,7 +431,7 @@ void analyze_reasoning::compare_reasoning_scope() {
LOG_DBG(ANSI_ORANGE "%s: Detected TOOLS_ONLY reasoning mode\n" ANSI_RESET, __func__); LOG_DBG(ANSI_ORANGE "%s: Detected TOOLS_ONLY reasoning mode\n" ANSI_RESET, __func__);
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) { auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())); return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
}); });
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B); auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
if (result.result.success()) { if (result.result.success()) {
@ -516,7 +524,7 @@ analyze_content::analyze_content(const common_chat_template & tmpl, const analyz
// Take the more promising diff // Take the more promising diff
std::string pure_content = rdiff.length() > diff_tools.left.length() ? rdiff : diff_tools.left; std::string pure_content = rdiff.length() > diff_tools.left.length() ? rdiff : diff_tools.left;
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) { auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker()) + p.space() + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest(); return p.tag("pre", p.marker() + p.space()) + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
}); });
auto result = parser_wrapped.parse_anywhere_and_extract(pure_content); auto result = parser_wrapped.parse_anywhere_and_extract(pure_content);
start = result.tags["pre"]; start = result.tags["pre"];

View File

@ -221,7 +221,7 @@ using chat_template_caps = jinja::caps;
struct common_chat_templates { struct common_chat_templates {
bool add_bos; bool add_bos;
bool add_eos; bool add_eos;
bool has_explicit_template; // Model had builtin template or template overridde was specified. bool has_explicit_template; // Model had builtin template or template overridden was specified.
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml) std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
std::unique_ptr<common_chat_template> template_tool_use; std::unique_ptr<common_chat_template> template_tool_use;
}; };
@ -971,6 +971,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object(); auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE); auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto start = p.rule("start", p.literal("<|start|>assistant")); auto start = p.rule("start", p.literal("<|start|>assistant"));
@ -979,9 +980,19 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
auto channel = p.literal("<|channel|>") + (p.literal("commentary") | p.literal("analysis")); auto channel = p.literal("<|channel|>") + (p.literal("commentary") | p.literal("analysis"));
auto constrain_type = p.chars("[A-Za-z0-9_-]", 1, -1); auto constrain_type = p.chars("[A-Za-z0-9_-]", 1, -1);
auto analysis = p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end); if (extract_reasoning) {
p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end);
} else {
p.rule("analysis", p.content(p.literal("<|channel|>analysis<|message|>") + content + end));
}
auto analysis = p.ref("analysis");
auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end); auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end);
auto final_msg = p.rule("final", p.literal("<|channel|>final<|message|>") + p.content(content)); auto final_msg = p.rule("final", p.literal("<|channel|>final<|message|>") + p.content(content));
// Consume any unsolicited tool calls, e.g. builtin functions
auto unsolicited = p.rule("unsolicited", p.atomic(p.optional(channel) + p.literal(" to=") + content + end));
auto any = p.rule("any", preamble | analysis); auto any = p.rule("any", preamble | analysis);
if (has_response_format) { if (has_response_format) {
@ -1025,7 +1036,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
return p.zero_or_more(start + any) + start + (tool_call | final_msg); return p.zero_or_more(start + any) + start + (tool_call | final_msg);
} }
return p.zero_or_more(start + any) + start + final_msg; return p.zero_or_more(start + any) + start + (final_msg | unsolicited);
}); });
data.parser = parser.save(); data.parser = parser.save();

View File

@ -359,6 +359,11 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
} }
void common_init() { void common_init() {
#if defined(_WIN32)
SetConsoleOutputCP(CP_UTF8);
SetConsoleCP(CP_UTF8);
#endif
llama_log_set(common_log_default_callback, NULL); llama_log_set(common_log_default_callback, NULL);
#ifdef NDEBUG #ifdef NDEBUG
@ -367,7 +372,7 @@ void common_init() {
const char * build_type = " (debug)"; const char * build_type = " (debug)";
#endif #endif
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); LOG_DBG("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
} }
std::string common_params_get_system_info(const common_params & params) { std::string common_params_get_system_info(const common_params & params) {
@ -656,6 +661,97 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
return true; return true;
} }
static inline bool glob_class_match(const char c, const char * pattern, const char * class_end) {
const char * class_start = pattern;
bool negated = false;
if (*class_start == '!') {
negated = true;
class_start++;
}
// If first character after negation is ']' or '-', treat it as literal
if (*class_start == ']' || *class_start == '-') {
if (class_start < class_end && *class_start == c) {
return !negated;
}
class_start++;
}
bool matched = false;
while (class_start < class_end) {
if (class_start + 2 < class_end && class_start[1] == '-' && class_start[2] != ']') {
char start_char = *class_start;
char end_char = class_start[2];
if (c >= start_char && c <= end_char) {
matched = true;
break;
}
class_start += 3;
} else {
if (*class_start == c) {
matched = true;
break;
}
class_start++;
}
}
return negated ? !matched : matched;
}
// simple glob: * matches non-/ chars, ** matches anything including /, [] matches character class
static inline bool glob_match(const char * pattern, const char * str) {
if (*pattern == '\0') {
return *str == '\0';
}
if (pattern[0] == '*' && pattern[1] == '*') {
const char * p = pattern + 2;
if (glob_match(p, str)) return true;
if (*str != '\0') return glob_match(pattern, str + 1);
return false;
}
if (*pattern == '*') {
const char * p = pattern + 1;
for (; *str != '\0' && *str != '/'; str++) {
if (glob_match(p, str)) return true;
}
return glob_match(p, str);
}
if (*pattern == '?' && *str != '\0' && *str != '/') {
return glob_match(pattern + 1, str + 1);
}
if (*pattern == '[') {
const char * class_end = pattern + 1;
// If first character after '[' is ']' or '-', treat it as literal
if (*class_end == ']' || *class_end == '-') {
class_end++;
}
while (*class_end != '\0' && *class_end != ']') {
class_end++;
}
if (*class_end == ']') {
if (*str == '\0') return false;
bool matched = glob_class_match(*str, pattern + 1, class_end);
return matched && glob_match(class_end + 1, str + 1);
} else {
if (*str == '[') {
return glob_match(pattern + 1, str + 1);
}
return false;
}
}
if (*pattern == *str) {
return glob_match(pattern + 1, str + 1);
}
return false;
}
bool glob_match(const std::string & pattern, const std::string & str) {
return glob_match(pattern.c_str(), str.c_str());
}
// //
// Filesystem utils // Filesystem utils
// //
@ -1152,6 +1248,9 @@ llama_context * common_init_result::context() {
} }
common_sampler * common_init_result::sampler(llama_seq_id seq_id) { common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
if (seq_id < 0 || seq_id >= (int) pimpl->samplers.size()) {
return nullptr;
}
return pimpl->samplers[seq_id].get(); return pimpl->samplers[seq_id].get();
} }

View File

@ -573,6 +573,7 @@ struct common_params {
// server params // server params
int32_t port = 8080; // server listens on this network port int32_t port = 8080; // server listens on this network port
bool reuse_port = false; // allow multiple sockets to bind to the same port
int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
@ -793,6 +794,8 @@ std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens); std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch); std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
bool glob_match(const std::string & pattern, const std::string & str);
// //
// Filesystem utils // Filesystem utils
// //

View File

@ -539,6 +539,9 @@ private:
statement_ptr step = slices.size() > 2 ? std::move(slices[2]) : nullptr; statement_ptr step = slices.size() > 2 ? std::move(slices[2]) : nullptr;
return mk_stmt<slice_expression>(start_pos, std::move(start), std::move(stop), std::move(step)); return mk_stmt<slice_expression>(start_pos, std::move(start), std::move(stop), std::move(step));
} }
if (slices.empty()) {
return mk_stmt<blank_expression>(start_pos);
}
return std::move(slices[0]); return std::move(slices[0]);
} }

View File

@ -771,10 +771,15 @@ value member_expression::execute_impl(context & ctx) {
} }
JJ_DEBUG("Member expression on object type %s, property type %s", object->type().c_str(), property->type().c_str()); JJ_DEBUG("Member expression on object type %s, property type %s", object->type().c_str(), property->type().c_str());
ensure_key_type_allowed(property);
value val = mk_val<value_undefined>("object_property"); value val = mk_val<value_undefined>("object_property");
if (property->is_undefined()) {
JJ_DEBUG("%s", "Member expression property is undefined, returning undefined");
return val;
}
ensure_key_type_allowed(property);
if (is_val<value_undefined>(object)) { if (is_val<value_undefined>(object)) {
JJ_DEBUG("%s", "Accessing property on undefined object, returning undefined"); JJ_DEBUG("%s", "Accessing property on undefined object, returning undefined");
return val; return val;

View File

@ -263,6 +263,14 @@ struct comment_statement : public statement {
// Expressions // Expressions
// Represents an omitted expression in a computed member, e.g. `a[]`.
struct blank_expression : public expression {
std::string type() const override { return "BlankExpression"; }
value execute_impl(context &) override {
return mk_val<value_undefined>();
}
};
struct member_expression : public expression { struct member_expression : public expression {
statement_ptr object; statement_ptr object;
statement_ptr property; statement_ptr property;

View File

@ -416,15 +416,30 @@ private:
i++; i++;
} else if (c == '(') { } else if (c == '(') {
i++; i++;
if (i < length) { if (i < length && sub_pattern[i] == '?') {
if (sub_pattern[i] == '?') { if (i + 1 < length && sub_pattern[i + 1] == ':') {
i += 2; // skip "?:" for non-capturing group, treat as regular group
} else {
// lookahead/lookbehind (?=, ?!, ?<=, ?<!) - not supported
_warnings.push_back("Unsupported pattern syntax"); _warnings.push_back("Unsupported pattern syntax");
// skip to matching ')' to avoid UB on empty seq
int depth = 1;
while (i < length && depth > 0) {
if (sub_pattern[i] == '\\' && i + 1 < length) {
i += 2; // skip escaped character
} else {
if (sub_pattern[i] == '(') depth++;
else if (sub_pattern[i] == ')') depth--;
i++;
}
}
continue;
} }
} }
seq.emplace_back("(" + to_rule(transform()) + ")", false); seq.emplace_back("(" + to_rule(transform()) + ")", false);
} else if (c == ')') { } else if (c == ')') {
i++; i++;
if (start > 0 && sub_pattern[start - 1] != '(') { if (start > 0 && sub_pattern[start - 1] != '(' && (start < 2 || sub_pattern[start - 2] != '?' || sub_pattern[start - 1] != ':')) {
_errors.push_back("Unbalanced parentheses"); _errors.push_back("Unbalanced parentheses");
} }
return join_seq(); return join_seq();

View File

@ -51,7 +51,7 @@ struct common_ngram_map_value {
// statistics of a n-gram // statistics of a n-gram
struct common_ngram_map_key { struct common_ngram_map_key {
size_t key_idx; // index of key n-gram in token-history size_t key_idx; // index of key n-gram in token-history
size_t stat_idx; // index of last token of stastistics computation (key_num, values) size_t stat_idx; // index of last token of statistics computation (key_num, values)
uint16_t key_num; // number of occurrences of this key n-gram in token-history uint16_t key_num; // number of occurrences of this key n-gram in token-history
common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key

View File

@ -115,9 +115,11 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
break; break;
} }
case REASONING_BUDGET_FORCING: case REASONING_BUDGET_FORCING:
// force_pos is advanced in apply(), not here. ctx->force_pos++;
// This ensures the first forced token isn't skipped when the sampler if (ctx->force_pos >= ctx->forced_tokens.size()) {
// is initialized directly in FORCING state (e.g. COUNTING + budget=0) ctx->state = REASONING_BUDGET_DONE;
LOG_INF("reasoning-budget: forced sequence complete, done\n");
}
break; break;
case REASONING_BUDGET_DONE: case REASONING_BUDGET_DONE:
break; break;
@ -144,14 +146,6 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
cur_p->data[i].logit = -INFINITY; cur_p->data[i].logit = -INFINITY;
} }
} }
// advance to next forced token (done here rather than in accept so that
// the first forced token isn't skipped when starting in FORCING state)
ctx->force_pos++;
if (ctx->force_pos >= ctx->forced_tokens.size()) {
ctx->state = REASONING_BUDGET_DONE;
LOG_INF("reasoning-budget: forced sequence complete, done\n");
}
} }
static void common_reasoning_budget_reset(struct llama_sampler * smpl) { static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
@ -261,3 +255,10 @@ struct llama_sampler * common_reasoning_budget_init(
common_reasoning_budget_state initial_state) { common_reasoning_budget_state initial_state) {
return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state); return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
} }
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl) {
if (!smpl) {
return REASONING_BUDGET_IDLE;
}
return ((const common_reasoning_budget_ctx *)smpl->ctx)->state;
}

View File

@ -51,3 +51,5 @@ struct llama_sampler * common_reasoning_budget_init(
const std::vector<llama_token> & forced_tokens, const std::vector<llama_token> & forced_tokens,
int32_t budget, int32_t budget,
common_reasoning_budget_state initial_state); common_reasoning_budget_state initial_state);
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);

View File

@ -7,6 +7,7 @@
#include <algorithm> #include <algorithm>
#include <cctype> #include <cctype>
#include <climits>
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
#include <unordered_map> #include <unordered_map>
@ -109,6 +110,7 @@ struct common_sampler {
common_params_sampling params; common_params_sampling params;
struct llama_sampler * grmr; struct llama_sampler * grmr;
struct llama_sampler * rbudget;
struct llama_sampler * chain; struct llama_sampler * chain;
ring_buffer<llama_token> prev; ring_buffer<llama_token> prev;
@ -188,6 +190,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
lparams.no_perf = params.no_perf; lparams.no_perf = params.no_perf;
llama_sampler * grmr = nullptr; llama_sampler * grmr = nullptr;
llama_sampler * rbudget = nullptr;
llama_sampler * chain = llama_sampler_chain_init(lparams); llama_sampler * chain = llama_sampler_chain_init(lparams);
std::vector<llama_sampler *> samplers; std::vector<llama_sampler *> samplers;
@ -270,7 +273,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
} }
} }
if (grmr) { if (grmr && !params.grammar_lazy) {
try { try {
for (const auto & token : prefill_tokens) { for (const auto & token : prefill_tokens) {
llama_sampler_accept(grmr, token); llama_sampler_accept(grmr, token);
@ -284,15 +287,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
} }
} }
// reasoning budget sampler — added first so it can force tokens before other samplers // reasoning budget sampler
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) { if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty()) {
samplers.push_back(common_reasoning_budget_init( rbudget = common_reasoning_budget_init(
vocab, vocab,
params.reasoning_budget_start, params.reasoning_budget_start,
params.reasoning_budget_end, params.reasoning_budget_end,
params.reasoning_budget_forced, params.reasoning_budget_forced,
params.reasoning_budget_tokens, params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
prefill_tokens)); prefill_tokens);
} }
if (params.has_logit_bias()) { if (params.has_logit_bias()) {
@ -380,9 +383,16 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
params.backend_sampling = false; params.backend_sampling = false;
} }
if (rbudget && params.backend_sampling) {
LOG_WRN("%s: backend sampling is not compatible with reasoning budget, disabling\n", __func__);
params.backend_sampling = false;
}
auto * result = new common_sampler { auto * result = new common_sampler {
/* .params = */ params, /* .params = */ params,
/* .grmr = */ grmr, /* .grmr = */ grmr,
/* .rbudget = */ rbudget,
/* .chain = */ chain, /* .chain = */ chain,
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)), /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {}, /* .cur = */ {},
@ -398,11 +408,27 @@ void common_sampler_free(struct common_sampler * gsmpl) {
} }
llama_sampler_free(gsmpl->grmr); llama_sampler_free(gsmpl->grmr);
llama_sampler_free(gsmpl->rbudget);
llama_sampler_free(gsmpl->chain); llama_sampler_free(gsmpl->chain);
delete gsmpl; delete gsmpl;
} }
static bool grammar_should_apply(struct common_sampler * gsmpl) {
if (!gsmpl->grmr) {
return false;
}
if (!gsmpl->rbudget) {
return true;
}
if (gsmpl->params.grammar_lazy) {
// if grammar is lazy, only apply when reasoning budget is not active
const auto state = common_reasoning_budget_get_state(gsmpl->rbudget);
return state == REASONING_BUDGET_IDLE || state == REASONING_BUDGET_DONE;
}
return true;
}
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
if (!gsmpl) { if (!gsmpl) {
return; return;
@ -410,6 +436,11 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
const auto tm = gsmpl->tm(); const auto tm = gsmpl->tm();
// grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
llama_sampler_accept(gsmpl->rbudget, token);
if (gsmpl->grmr && accept_grammar) { if (gsmpl->grmr && accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token); llama_sampler_accept(gsmpl->grmr, token);
} }
@ -431,6 +462,7 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler { return new common_sampler {
/* .params = */ gsmpl->params, /* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr), /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .rbudget = */ llama_sampler_clone(gsmpl->rbudget),
/* .chain = */ llama_sampler_clone(gsmpl->chain), /* .chain = */ llama_sampler_clone(gsmpl->chain),
/* .prev = */ gsmpl->prev, /* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur, /* .cur = */ gsmpl->cur,
@ -500,6 +532,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
llama_token id = LLAMA_TOKEN_NULL; llama_token id = LLAMA_TOKEN_NULL;
auto & grmr = gsmpl->grmr; auto & grmr = gsmpl->grmr;
auto & rbudget = gsmpl->rbudget;
auto & chain = gsmpl->chain; auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits auto & cur_p = gsmpl->cur_p; // initialized by set_logits
@ -511,7 +544,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
if (id != LLAMA_TOKEN_NULL) { if (id != LLAMA_TOKEN_NULL) {
LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id); LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported"); GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");
// TODO: simplify // TODO: simplify
gsmpl->cur.resize(1); gsmpl->cur.resize(1);
@ -524,7 +558,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
gsmpl->set_logits(ctx, idx); gsmpl->set_logits(ctx, idx);
if (grammar_first) { // apply reasoning budget first
llama_sampler_apply(rbudget, &cur_p);
if (grammar_first && grammar_should_apply(gsmpl)) {
llama_sampler_apply(grmr, &cur_p); llama_sampler_apply(grmr, &cur_p);
} }
@ -532,7 +569,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
id = cur_p.data[cur_p.selected].id; id = cur_p.data[cur_p.selected].id;
if (grammar_first) { if (grammar_first || !grammar_should_apply(gsmpl)) {
return id; return id;
} }
@ -553,7 +590,12 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx); gsmpl->set_logits(ctx, idx);
llama_sampler_apply(grmr, &cur_p); llama_sampler_apply(rbudget, &cur_p);
if (grammar_should_apply(gsmpl)) {
llama_sampler_apply(grmr, &cur_p);
}
llama_sampler_apply(chain, &cur_p); llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

View File

@ -31,10 +31,10 @@ import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab from gguf.vocab import MistralTokenizerType, MistralVocab
try: try:
from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found] from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found] from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found] from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found] from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import]
SentencePieceTokenizer, SentencePieceTokenizer,
) )

View File

@ -13,24 +13,30 @@ We have three Docker images available for this project:
Additionally, there the following images, similar to the above: Additionally, there the following images, similar to the above:
- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:full-cuda13`: Same as `full` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggml-org/llama.cpp:light-cuda13`: Same as `light` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggml-org/llama.cpp:server-cuda13`: Same as `server` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:full-openvino`: Same as `full` but compiled with OpenVino support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:full-openvino`: Same as `full` but compiled with OpenVino support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-openvino`: Same as `light` but compiled with OpenVino support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:light-openvino`: Same as `light` but compiled with OpenVino support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-openvino`: Same as `server` but compiled with OpenVino support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:server-openvino`: Same as `server` but compiled with OpenVino support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-s390x`: Identical to `full`, an alias for the `s390x` platform. (platforms: `linux/s390x`)
- `ghcr.io/ggml-org/llama.cpp:light-s390x`: Identical to `light`, an alias for the `s390x` platform. (platforms: `linux/s390x`)
- `ghcr.io/ggml-org/llama.cpp:server-s390x`: Identical to `server`, an alias for the `s390x` platform. (platforms: `linux/s390x`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now). The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).
@ -82,7 +88,7 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment
The defaults are: The defaults are:
- `CUDA_VERSION` set to `12.4.0` - `CUDA_VERSION` set to `12.8.1`
- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures - `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures
The resulting images, are essentially the same as the non-CUDA images: The resulting images, are essentially the same as the non-CUDA images:

View File

@ -24,12 +24,12 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is"; params.prompt = "Hello my name is";
params.n_predict = 32; params.n_predict = 32;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BATCHED, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BATCHED, print_usage)) {
return 1; return 1;
} }
common_init();
// number of parallel batches // number of parallel batches
int n_parallel = params.n_parallel; int n_parallel = params.n_parallel;

View File

@ -213,12 +213,12 @@ static bool run(llama_context * ctx, const common_params & params) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
common_params params; common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DEBUG, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DEBUG, print_usage)) {
return 1; return 1;
} }
common_init();
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);

View File

@ -545,11 +545,12 @@ int main(int argc, char ** argv) {
common_params params; common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) {
return 1; return 1;
} }
common_init();
llama_backend_init(); llama_backend_init();
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_default_params();

View File

@ -99,12 +99,12 @@ int main(int argc, char ** argv) {
common_params params; common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
return 1; return 1;
} }
common_init();
params.embedding = true; params.embedding = true;
// get max number of sequences per batch // get max number of sequences per batch

View File

@ -37,12 +37,12 @@ int main(int argc, char ** argv) {
common_params params; common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1; return 1;
} }
common_init();
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);

View File

@ -19,12 +19,12 @@ static void print_usage(int /*argc*/, char ** argv) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
common_params params; common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1; return 1;
} }
common_init();
// init LLM // init LLM
llama_backend_init(); llama_backend_init();

View File

@ -43,12 +43,12 @@ int main(int argc, char ** argv) {
common_params params; common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1; return 1;
} }
common_init();
const int W = 15; // lookahead window const int W = 15; // lookahead window
const int N = 5; // n-gram size const int N = 5; // n-gram size
const int G = 15; // max verification n-grams const int G = 15; // max verification n-grams

View File

@ -12,6 +12,8 @@ int main(int argc, char ** argv){
common_params params; common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1; return 1;
} }

View File

@ -18,12 +18,12 @@ int main(int argc, char ** argv){
common_params params; common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1; return 1;
} }
common_init();
const int n_draft = params.speculative.n_max; const int n_draft = params.speculative.n_max;
// init llama.cpp // init llama.cpp

View File

@ -18,12 +18,12 @@ int main(int argc, char ** argv){
common_params params; common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1; return 1;
} }
common_init();
// max. number of additional tokens to draft if match is found // max. number of additional tokens to draft if match is found
const int n_draft = params.speculative.n_max; const int n_draft = params.speculative.n_max;

View File

@ -7,7 +7,7 @@ import os
# Add utils directory to path for direct script execution # Add utils directory to path for direct script execution
sys.path.insert(0, str(Path(__file__).parent.parent / "utils")) sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found] from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
def quick_logits_check(pytorch_file, llamacpp_file): def quick_logits_check(pytorch_file, llamacpp_file):
"""Lightweight sanity check before NMSE""" """Lightweight sanity check before NMSE"""

View File

@ -5,7 +5,7 @@ import sys
import os import os
import argparse import argparse
from pathlib import Path from pathlib import Path
from common import get_model_name_from_env_path # type: ignore[import-not-found] from common import get_model_name_from_env_path # type: ignore[import-not-found, ty:unresolved-import]
def calculate_nmse(reference, test): def calculate_nmse(reference, test):
mse = np.mean((test - reference) ** 2) mse = np.mean((test - reference) ** 2)

View File

@ -2,7 +2,7 @@
import argparse import argparse
import sys import sys
from common import compare_tokens # type: ignore[import-not-found] from common import compare_tokens # type: ignore[import-not-found, ty:unresolved-import]
def parse_arguments(): def parse_arguments():

View File

@ -7,7 +7,7 @@ import importlib
from pathlib import Path from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found] from common import compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME') unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

View File

@ -163,12 +163,12 @@ int main(int argc, char ** argv) {
params.n_predict = 128; params.n_predict = 128;
params.n_junk = 1; params.n_junk = 1;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1; return 1;
} }
common_init();
// number of simultaneous "clients" to simulate // number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel; const int32_t n_clients = params.n_parallel;

View File

@ -25,12 +25,12 @@ int main(int argc, char ** argv) {
params.n_keep = 32; params.n_keep = 32;
params.i_pos = -1; params.i_pos = -1;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
return 1; return 1;
} }
common_init();
int n_junk = params.n_junk; int n_junk = params.n_junk;
int n_keep = params.n_keep; int n_keep = params.n_keep;
int n_grp = params.grp_attn_n; int n_grp = params.grp_attn_n;

View File

@ -117,12 +117,12 @@ int main(int argc, char ** argv) {
common_params params; common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
return 1; return 1;
} }
common_init();
// For BERT models, batch size must be equal to ubatch size // For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch; params.n_ubatch = params.n_batch;
params.embedding = true; params.embedding = true;

View File

@ -17,6 +17,8 @@ int main(int argc, char ** argv) {
const std::string_view state_file = "dump_state.bin"; const std::string_view state_file = "dump_state.bin";
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1; return 1;
} }
@ -27,8 +29,6 @@ int main(int argc, char ** argv) {
params.kv_unified = true; params.kv_unified = true;
} }
common_init();
if (params.n_predict < 0) { if (params.n_predict < 0) {
params.n_predict = 16; params.n_predict = 16;
} }

View File

@ -16,6 +16,8 @@ int main(int argc, char ** argv) {
common_params params; common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
return 1; return 1;
} }
@ -25,8 +27,6 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
common_init();
if (params.speculative.mparams_dft.path.empty()) { if (params.speculative.mparams_dft.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__); LOG_ERR("%s: --model-draft is required\n", __func__);
return 1; return 1;

View File

@ -38,6 +38,8 @@ int main(int argc, char ** argv) {
// needed to get candidate probs even for temp <= 0.0 // needed to get candidate probs even for temp <= 0.0
params.sampling.n_probs = 128; params.sampling.n_probs = 128;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
return 1; return 1;
} }
@ -47,8 +49,6 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
common_init();
if (params.speculative.mparams_dft.path.empty()) { if (params.speculative.mparams_dft.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__); LOG_ERR("%s: --model-draft is required\n", __func__);
return 1; return 1;

View File

@ -20,4 +20,4 @@ cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA
#cmake --build . --config Release --target llama-bench #cmake --build . --config Release --target llama-bench
#build all binary #build all binary
cmake --build . --config Release -j -v cmake --build . --config Release -j$((($(nproc)+1)/2)) -v

View File

@ -23,9 +23,9 @@ if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1 GGML_SYCL_DEVICE=$1
echo "use $GGML_SYCL_DEVICE as main GPU" echo "use $GGML_SYCL_DEVICE as main GPU"
#use signle GPU only #use signle GPU only
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE} ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
else else
#use multiple GPUs with same max compute units #use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE} ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
fi fi

View File

@ -20,6 +20,8 @@ int main(int argc, char ** argv) {
common_params params; common_params params;
params.escape = false; params.escape = false;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
return 1; return 1;
} }
@ -38,7 +40,6 @@ int main(int argc, char ** argv) {
params.cache_type_v = GGML_TYPE_F32; params.cache_type_v = GGML_TYPE_F32;
} }
common_init();
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any

View File

@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
### GGML Version ### GGML Version
set(GGML_VERSION_MAJOR 0) set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 9) set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 8) set(GGML_VERSION_PATCH 9)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}") set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH) find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)

View File

@ -47,9 +47,11 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
#ifdef STRIDED_ITERATOR_AVAILABLE #ifdef STRIDED_ITERATOR_AVAILABLE
auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols); auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols);
#else #else
ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1); // offset_iterator needs to populate nrows + 1 elements, so we also have to ceildiv nrows + 1 by block_size
const int nrows_offset = nrows + 1;
ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows_offset);
int * offset_iterator = offsets_alloc.get(); int * offset_iterator = offsets_alloc.get();
const dim3 offset_grid((nrows + block_size - 1) / block_size); const dim3 offset_grid((nrows_offset + block_size - 1) / block_size);
init_offsets<<<offset_grid, block_size, 0, stream>>>(offset_iterator, ncols, nrows); init_offsets<<<offset_grid, block_size, 0, stream>>>(offset_iterator, ncols, nrows);
#endif #endif
CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream)); CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream));

View File

@ -2343,7 +2343,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE); static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
if (ne2 <= MMVQ_MAX_BATCH_SIZE) { if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
if (ggml_is_quantized(src0->type)) { if (ggml_is_quantized(src0->type)) {
if (ne2 <= MMVQ_MMID_MAX_BATCH_SIZE) { const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc);
if (ne2 <= mmvq_mmid_max) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst); ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
return; return;
} }
@ -2946,14 +2947,18 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
} }
// [TAG_MUL_MAT_ID_CUDA_GRAPHS] // [TAG_MUL_MAT_ID_CUDA_GRAPHS]
if (node->op == GGML_OP_MUL_MAT_ID && (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > MMVQ_MMID_MAX_BATCH_SIZE)) { if (node->op == GGML_OP_MUL_MAT_ID) {
// under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
// TODO: figure out a way to enable for larger batch sizes, without hurting performance const int mmvq_mmid_max = get_mmvq_mmid_max_batch(node->src[0]->type, cc);
// ref: https://github.com/ggml-org/llama.cpp/pull/18958 if (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > mmvq_mmid_max) {
use_cuda_graph = false; // under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
// TODO: figure out a way to enable for larger batch sizes, without hurting performance
// ref: https://github.com/ggml-org/llama.cpp/pull/18958
use_cuda_graph = false;
#ifndef NDEBUG #ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__); GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
#endif #endif
}
} }
if (!use_cuda_graph) { if (!use_cuda_graph) {

View File

@ -97,6 +97,194 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
return MMVQ_PARAMETERS_GENERIC; return MMVQ_PARAMETERS_GENERIC;
} }
// Per-architecture maximum batch size for which MMVQ should be used for MUL_MAT_ID.
// Returns a value <= MMVQ_MAX_BATCH_SIZE. Default is MMVQ_MAX_BATCH_SIZE.
// Check https://github.com/ggml-org/llama.cpp/pull/20905#issuecomment-4145835627 for details
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_pascal_older(ggml_type type) {
switch (type) {
case GGML_TYPE_IQ1_S: return 6;
case GGML_TYPE_IQ1_M: return 6;
case GGML_TYPE_IQ2_S: return 4;
case GGML_TYPE_IQ2_XS: return 5;
case GGML_TYPE_IQ2_XXS: return 5;
case GGML_TYPE_IQ3_S: return 4;
case GGML_TYPE_IQ3_XXS: return 4;
case GGML_TYPE_IQ4_NL: return 6;
case GGML_TYPE_IQ4_XS: return 5;
case GGML_TYPE_MXFP4: return 4;
case GGML_TYPE_Q2_K: return 4;
case GGML_TYPE_Q3_K: return 4;
case GGML_TYPE_Q4_0: return 6;
case GGML_TYPE_Q4_1: return 6;
case GGML_TYPE_Q4_K: return 5;
case GGML_TYPE_Q5_0: return 6;
case GGML_TYPE_Q5_1: return 6;
case GGML_TYPE_Q5_K: return 5;
case GGML_TYPE_Q6_K: return 4;
case GGML_TYPE_Q8_0: return 4;
default: return MMVQ_MAX_BATCH_SIZE;
}
}
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_turing_plus(ggml_type type) {
switch (type) {
case GGML_TYPE_IQ2_S: return 7;
case GGML_TYPE_IQ3_S: return 6;
case GGML_TYPE_IQ3_XXS: return 7;
case GGML_TYPE_MXFP4: return 7;
case GGML_TYPE_Q2_K: return 7;
case GGML_TYPE_Q3_K: return 5;
default: return MMVQ_MAX_BATCH_SIZE;
}
}
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_gcn(ggml_type type) {
switch (type) {
case GGML_TYPE_IQ1_S: return 5;
case GGML_TYPE_IQ1_M: return 5;
case GGML_TYPE_IQ2_S: return 4;
case GGML_TYPE_IQ2_XS: return 4;
case GGML_TYPE_IQ2_XXS: return 4;
case GGML_TYPE_IQ3_S: return 4;
case GGML_TYPE_IQ3_XXS: return 4;
case GGML_TYPE_IQ4_NL: return 6;
case GGML_TYPE_IQ4_XS: return 4;
case GGML_TYPE_Q2_K: return 4;
case GGML_TYPE_Q3_K: return 4;
case GGML_TYPE_Q4_0: return 5;
case GGML_TYPE_Q4_1: return 5;
case GGML_TYPE_Q4_K: return 4;
case GGML_TYPE_Q5_K: return 4;
case GGML_TYPE_Q6_K: return 4;
case GGML_TYPE_Q8_0: return 4;
default: return MMVQ_MAX_BATCH_SIZE;
}
}
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_cdna(ggml_type type) {
switch (type) {
case GGML_TYPE_IQ2_S: return 5;
case GGML_TYPE_IQ2_XS: return 5;
case GGML_TYPE_IQ2_XXS: return 5;
case GGML_TYPE_IQ3_S: return 4;
case GGML_TYPE_IQ3_XXS: return 5;
default: return MMVQ_MAX_BATCH_SIZE;
}
}
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna1_rdna2(ggml_type type) {
switch (type) {
case GGML_TYPE_IQ2_S: return 4;
case GGML_TYPE_IQ2_XS: return 4;
case GGML_TYPE_IQ2_XXS: return 4;
case GGML_TYPE_IQ3_S: return 4;
case GGML_TYPE_IQ3_XXS: return 4;
case GGML_TYPE_Q2_K: return 7;
case GGML_TYPE_Q3_K: return 4;
case GGML_TYPE_Q4_K: return 5;
case GGML_TYPE_Q5_K: return 6;
case GGML_TYPE_Q6_K: return 5;
default: return MMVQ_MAX_BATCH_SIZE;
}
}
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna3(ggml_type type) {
switch (type) {
case GGML_TYPE_IQ1_S: return 6;
case GGML_TYPE_IQ1_M: return 6;
case GGML_TYPE_IQ2_S: return 4;
case GGML_TYPE_IQ2_XS: return 4;
case GGML_TYPE_IQ2_XXS: return 4;
case GGML_TYPE_IQ3_S: return 4;
case GGML_TYPE_IQ3_XXS: return 4;
case GGML_TYPE_IQ4_NL: return 6;
case GGML_TYPE_IQ4_XS: return 6;
case GGML_TYPE_Q4_K: return 4;
case GGML_TYPE_Q5_K: return 4;
case GGML_TYPE_Q6_K: return 4;
default: return MMVQ_MAX_BATCH_SIZE;
}
}
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna4(ggml_type type) {
switch (type) {
case GGML_TYPE_IQ1_S: return 7;
case GGML_TYPE_IQ1_M: return 7;
case GGML_TYPE_IQ2_S: return 4;
case GGML_TYPE_IQ2_XS: return 4;
case GGML_TYPE_IQ2_XXS: return 4;
case GGML_TYPE_IQ3_S: return 4;
case GGML_TYPE_IQ3_XXS: return 4;
case GGML_TYPE_IQ4_NL: return 7;
case GGML_TYPE_IQ4_XS: return 5;
case GGML_TYPE_MXFP4: return 5;
case GGML_TYPE_Q3_K: return 4;
case GGML_TYPE_Q4_0: return 7;
case GGML_TYPE_Q4_1: return 7;
case GGML_TYPE_Q4_K: return 4;
case GGML_TYPE_Q5_0: return 7;
case GGML_TYPE_Q5_1: return 7;
case GGML_TYPE_Q5_K: return 5;
case GGML_TYPE_Q6_K: return 5;
case GGML_TYPE_Q8_0: return 7;
default: return MMVQ_MAX_BATCH_SIZE;
}
}
// Host function: returns the max batch size for the current arch+type at runtime.
int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
// NVIDIA: Volta, Ada Lovelace, and Blackwell always use MMVQ for MUL_MAT_ID.
if (cc == GGML_CUDA_CC_VOLTA || cc >= GGML_CUDA_CC_ADA_LOVELACE) {
return MMVQ_MAX_BATCH_SIZE;
}
if (cc >= GGML_CUDA_CC_TURING) {
return get_mmvq_mmid_max_batch_turing_plus(type);
}
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
return get_mmvq_mmid_max_batch_pascal_older(type);
}
// AMD
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
return get_mmvq_mmid_max_batch_rdna4(type);
}
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
return get_mmvq_mmid_max_batch_rdna3(type);
}
if (GGML_CUDA_CC_IS_RDNA1(cc) || GGML_CUDA_CC_IS_RDNA2(cc)) {
return get_mmvq_mmid_max_batch_rdna1_rdna2(type);
}
if (GGML_CUDA_CC_IS_CDNA(cc)) {
return get_mmvq_mmid_max_batch_cdna(type);
}
if (GGML_CUDA_CC_IS_GCN(cc)) {
return get_mmvq_mmid_max_batch_gcn(type);
}
return MMVQ_MAX_BATCH_SIZE;
}
// Device constexpr: returns the max batch size for the current arch+type at compile time.
template <ggml_type type>
static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {
#if defined(RDNA4)
return get_mmvq_mmid_max_batch_rdna4(type);
#elif defined(RDNA3)
return get_mmvq_mmid_max_batch_rdna3(type);
#elif defined(RDNA2) || defined(RDNA1)
return get_mmvq_mmid_max_batch_rdna1_rdna2(type);
#elif defined(CDNA)
return get_mmvq_mmid_max_batch_cdna(type);
#elif defined(GCN)
return get_mmvq_mmid_max_batch_gcn(type);
#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || __CUDA_ARCH__ >= GGML_CUDA_CC_ADA_LOVELACE)
return MMVQ_MAX_BATCH_SIZE;
#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
return get_mmvq_mmid_max_batch_turing_plus(type);
#else
return get_mmvq_mmid_max_batch_pascal_older(type);
#endif
}
static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_dst, mmvq_parameter_table_id table_id) { static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_dst, mmvq_parameter_table_id table_id) {
if (table_id == MMVQ_PARAMETERS_GENERIC) { if (table_id == MMVQ_PARAMETERS_GENERIC) {
switch (ncols_dst) { switch (ncols_dst) {
@ -195,7 +383,7 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
return 1; return 1;
} }
template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false, bool small_k = false> template <ggml_type type, int ncols_dst, bool has_fusion, bool small_k = false>
__launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) __launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void mul_mat_vec_q( static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst, const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
@ -222,22 +410,13 @@ static __global__ void mul_mat_vec_q(
const uint32_t channel_dst = blockIdx.y; const uint32_t channel_dst = blockIdx.y;
uint32_t token_idx = 0;
uint32_t channel_x; uint32_t channel_x;
uint32_t channel_y; uint32_t channel_y;
uint32_t sample_dst; uint32_t sample_dst;
if constexpr (is_multi_token_id) { channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
// Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
token_idx = blockIdx.z; sample_dst = blockIdx.z;
channel_x = ids[channel_dst + token_idx * ids_stride];
channel_y = fastmodulo(channel_dst, nchannels_y);
sample_dst = 0;
} else {
channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
sample_dst = blockIdx.z;
}
const uint32_t sample_x = fastdiv(sample_dst, sample_ratio); const uint32_t sample_x = fastdiv(sample_dst, sample_ratio);
const uint32_t sample_y = sample_dst; const uint32_t sample_y = sample_dst;
@ -294,9 +473,6 @@ static __global__ void mul_mat_vec_q(
float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}}; float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}};
const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y; const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y;
if constexpr (is_multi_token_id) {
y += token_idx*stride_col_y;
}
const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x; const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x;
for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
@ -350,10 +526,6 @@ static __global__ void mul_mat_vec_q(
dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0; dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0;
if constexpr (is_multi_token_id) {
dst += token_idx*stride_col_dst;
}
// sum up partial sums and write back result // sum up partial sums and write back result
#pragma unroll #pragma unroll
for (int j = 0; j < ncols_dst; ++j) { for (int j = 0; j < ncols_dst; ++j) {
@ -413,6 +585,69 @@ static __global__ void mul_mat_vec_q(
} }
} }
// Dedicated MoE multi-token kernel.
// Grid: (ceil(nrows_x / c_rows_per_block), nchannels_dst)
// Block: (warp_size, ncols_dst) - each warp handles one token independently.
// No shared memory reduction needed since each warp works alone.
template <ggml_type type, int c_rows_per_block>
__launch_bounds__(get_mmvq_mmid_max_batch_for_device<type>()*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void mul_mat_vec_q_moe(
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids,
float * __restrict__ dst,
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t nrows_x,
const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst,
const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst,
const uint32_t ncols_dst, const uint32_t ids_stride) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
constexpr int vdr = get_vdr_mmvq(type);
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
const uint32_t token_idx = threadIdx.y;
const int row0 = c_rows_per_block*blockIdx.x;
const int blocks_per_row_x = ncols_x / qk;
constexpr int blocks_per_iter = vdr * warp_size / qi;
const uint32_t channel_dst = blockIdx.y;
if (token_idx >= ncols_dst) {
return;
}
const uint32_t channel_x = ids[channel_dst + token_idx * ids_stride];
const uint32_t channel_y = fastmodulo(channel_dst, nchannels_y);
const block_q8_1 * y = ((const block_q8_1 *) vy) + channel_y*stride_channel_y + token_idx*stride_col_y;
const int kbx_offset = channel_x*stride_channel_x + row0*stride_row_x;
// partial sum for each thread
float tmp[c_rows_per_block] = {0.0f};
for (int kbx = threadIdx.x / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
const int kby = kbx * (qk/QK8_1);
const int kqs = vdr * (threadIdx.x % (qi/vdr));
#pragma unroll
for (int i = 0; i < c_rows_per_block; ++i) {
tmp[i] += vec_dot_q_cuda(vx, &y[kby], kbx_offset + i*stride_row_x + kbx, kqs);
}
}
// Warp-level reduction only - no shared memory needed
#pragma unroll
for (int i = 0; i < c_rows_per_block; ++i) {
tmp[i] = warp_reduce_sum<warp_size>(tmp[i]);
}
// Write results
if (threadIdx.x < c_rows_per_block && (c_rows_per_block == 1 || uint32_t(row0 + threadIdx.x) < nrows_x)) {
dst[channel_dst*stride_channel_dst + token_idx*stride_col_dst + row0 + threadIdx.x] = tmp[threadIdx.x];
}
}
template<ggml_type type> template<ggml_type type>
static std::pair<dim3, dim3> calc_launch_params( static std::pair<dim3, dim3> calc_launch_params(
const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens, const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens,
@ -425,7 +660,7 @@ static std::pair<dim3, dim3> calc_launch_params(
return {block_nums, block_dims}; return {block_nums, block_dims};
} }
template<ggml_type type, int c_ncols_dst, bool is_multi_token_id = false, bool small_k = false> template<ggml_type type, int c_ncols_dst, bool small_k = false>
static void mul_mat_vec_q_switch_fusion( static void mul_mat_vec_q_switch_fusion(
const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst, const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
@ -438,7 +673,7 @@ static void mul_mat_vec_q_switch_fusion(
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
if constexpr (c_ncols_dst == 1) { if constexpr (c_ncols_dst == 1) {
if (has_fusion) { if (has_fusion) {
mul_mat_vec_q<type, c_ncols_dst, true, is_multi_token_id, small_k><<<block_nums, block_dims, nbytes_shared, stream>>> mul_mat_vec_q<type, c_ncols_dst, true, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
@ -448,12 +683,33 @@ static void mul_mat_vec_q_switch_fusion(
GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
mul_mat_vec_q<type, c_ncols_dst, false, is_multi_token_id, small_k><<<block_nums, block_dims, nbytes_shared, stream>>> mul_mat_vec_q<type, c_ncols_dst, false, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
} }
template <ggml_type type>
static void mul_mat_vec_q_moe_launch(
const void * vx, const void * vy, const int32_t * ids, float * dst,
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t nrows_x,
const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst,
const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst,
const uint32_t ncols_dst, const uint32_t ids_stride,
const int warp_size, const int nchannels_dst, cudaStream_t stream) {
constexpr int rows_per_block = 2; // 2 gives best perf based on tuning
const int64_t nblocks_rows = (nrows_x + rows_per_block - 1) / rows_per_block;
const dim3 block_nums(nblocks_rows, nchannels_dst);
const dim3 block_dims(warp_size, ncols_dst);
mul_mat_vec_q_moe<type, rows_per_block><<<block_nums, block_dims, 0, stream>>>(
vx, vy, ids, dst, ncols_x, nchannels_y, nrows_x,
stride_row_x, stride_col_y, stride_col_dst,
stride_channel_x, stride_channel_y, stride_channel_dst,
ncols_dst, ids_stride);
}
template <ggml_type type> template <ggml_type type>
static void mul_mat_vec_q_switch_ncols_dst( static void mul_mat_vec_q_switch_ncols_dst(
const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst, const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
@ -472,20 +728,62 @@ static void mul_mat_vec_q_switch_ncols_dst(
const uint3 sample_ratio_fd = init_fastdiv_values(nsamples_dst / nsamples_x); const uint3 sample_ratio_fd = init_fastdiv_values(nsamples_dst / nsamples_x);
const int device = ggml_cuda_get_device(); const int device = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[device].cc;
const int warp_size = ggml_cuda_info().devices[device].warp_size; const int warp_size = ggml_cuda_info().devices[device].warp_size;
const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc); const mmvq_parameter_table_id table_id = get_device_table_id(cc);
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
const bool has_ids = ids != nullptr; const bool has_ids = ids != nullptr;
const auto should_use_small_k = [&](int c_ncols_dst) {
// When K is small, increase rows_per_block to match nwarps so each warp has more work to do
// Trigger when the full thread block covers all K blocks in a single loop iteration and few threads remain idle.
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
constexpr int vdr = get_vdr_mmvq(type);
const int blocks_per_row_x = ncols_x / qk;
const int blocks_per_iter_1warp = vdr * warp_size / qi;
const int nwarps = calc_nwarps(type, c_ncols_dst, table_id);
bool use = nwarps > 1 && blocks_per_row_x < nwarps * blocks_per_iter_1warp;
constexpr std::array<ggml_type, 2> iq_slow_turing = {
GGML_TYPE_IQ3_XXS,
GGML_TYPE_IQ3_S,
};
constexpr std::array<ggml_type, 8> iq_slow_other = {
GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
GGML_TYPE_IQ2_S, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
};
constexpr std::array<ggml_type, 3> slow_pascal = {
GGML_TYPE_IQ3_S,
GGML_TYPE_Q2_K,
GGML_TYPE_Q3_K,
};
const bool is_nvidia_turing_plus = GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_TURING;
const bool is_nvidia_pascal_older = GGML_CUDA_CC_IS_NVIDIA(cc) && cc < GGML_CUDA_CC_VOLTA;
if (is_nvidia_turing_plus) {
if (ncols_dst == 1 &&
std::find(iq_slow_turing.begin(), iq_slow_turing.end(), type) != iq_slow_turing.end()) {
use = false;
}
} else if ((ncols_dst == 1 && std::find(iq_slow_other.begin(), iq_slow_other.end(), type) != iq_slow_other.end()) ||
(is_nvidia_pascal_older && std::find(slow_pascal.begin(), slow_pascal.end(), type) != slow_pascal.end()) ||
GGML_CUDA_CC_IS_RDNA(cc)) {
use = false;
}
return use;
};
if (has_ids && ncols_dst > 1) { if (has_ids && ncols_dst > 1) {
// Multi-token MUL_MAT_ID path only - single-token goes through regular path below // Multi-token MUL_MAT_ID path - dedicated MoE kernel
constexpr int c_ncols_dst = 1; mul_mat_vec_q_moe_launch<type>(
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id); vx, vy, ids, dst, ncols_x, nchannels_y_fd, nrows_x,
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, ncols_dst, ids_stride, warp_size, nchannels_dst, stream);
dims.first, dims.second, 0, ids_stride, stream);
return; return;
} }
@ -493,31 +791,24 @@ static void mul_mat_vec_q_switch_ncols_dst(
case 1: { case 1: {
constexpr int c_ncols_dst = 1; constexpr int c_ncols_dst = 1;
// When K is small, increase rows_per_block to match nwarps so each warp has more work to do bool use_small_k = should_use_small_k(c_ncols_dst);
// Trigger when the full thread block covers all K blocks in a single loop iteration and few threads remain idle.
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
constexpr int vdr = get_vdr_mmvq(type);
const int blocks_per_row_x = ncols_x / qk;
const int blocks_per_iter_1warp = vdr * warp_size / qi;
const int nwarps = calc_nwarps(type, c_ncols_dst, table_id);
const bool use_small_k = nwarps > 1 && blocks_per_row_x < nwarps * blocks_per_iter_1warp;
if (use_small_k) { if (use_small_k) {
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst,
warp_size, table_id, true); nsamples_dst, warp_size, table_id, true);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, false, true>( mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(
vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, stride_sample_x, stride_sample_y, stride_sample_dst, dims.first, dims.second, 0, ids_stride,
dims.first, dims.second, 0, ids_stride, stream); stream);
} else { } else {
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst,
warp_size, table_id); nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>( mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(
vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, stride_sample_x, stride_sample_y, stride_sample_dst, dims.first, dims.second, 0, ids_stride,
dims.first, dims.second, 0, ids_stride, stream); stream);
} }
} break; } break;
case 2: { case 2: {

View File

@ -1,7 +1,10 @@
#include "common.cuh" #include "common.cuh"
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
#define MMVQ_MMID_MAX_BATCH_SIZE 4 // Max. batch size for which to use MMVQ kernels for MUL_MAT_ID
// Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID,
// based on the quantization type and GPU architecture (compute capability).
int get_mmvq_mmid_max_batch(ggml_type type, int cc);
void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx, void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_cuda_mm_fusion_args_host * fusion = nullptr); const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_cuda_mm_fusion_args_host * fusion = nullptr);

View File

@ -1406,6 +1406,13 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
repack_q8_0_q8x4x2(tensor, data, size); repack_q8_0_q8x4x2(tensor, data, size);
break; break;
case GGML_TYPE_IQ4_NL:
GGML_ASSERT(offset == 0);
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
// IQ4_NL has identical block layout to Q4_0 (ggml_half d + uint8_t qs[16])
repack_q4_0_q4x4x2(tensor, data, size);
break;
case GGML_TYPE_MXFP4: case GGML_TYPE_MXFP4:
GGML_ASSERT(offset == 0); GGML_ASSERT(offset == 0);
GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
@ -1442,6 +1449,12 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
repack_q8x4x2_q8_0(data, tensor, size); repack_q8x4x2_q8_0(data, tensor, size);
break; break;
case GGML_TYPE_IQ4_NL:
GGML_ASSERT(offset == 0);
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
repack_q4x4x2_q4_0(data, tensor, size);
break;
case GGML_TYPE_MXFP4: case GGML_TYPE_MXFP4:
GGML_ASSERT(offset == 0); GGML_ASSERT(offset == 0);
GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
@ -1819,6 +1832,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_MXFP4: case GGML_TYPE_MXFP4:
if (src0->ne[0] % 32) { if (src0->ne[0] % 32) {
return false; return false;
@ -1868,6 +1882,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_MXFP4: case GGML_TYPE_MXFP4:
if ((src0->ne[0] % 32)) { if ((src0->ne[0] % 32)) {
return false; return false;
@ -2596,8 +2611,26 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
delete backend; delete backend;
} }
// Map weight type to its activation quantization family.
// Types in the same family produce identical Q8 formats in VTCM and can
// safely share quantized activation data via SKIP_QUANTIZE.
// When adding a new quantized type, assign it the correct family here.
static inline int act_quant_family(enum ggml_type wtype) {
switch (wtype) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_MXFP4:
return 1; // Q8x4x2
default:
return 0; // unknown / not quantized
}
}
static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) { static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type)); return (op0 && op0->src[1] == op1->src[1] &&
act_quant_family(op0->src[0]->type) == act_quant_family(op1->src[0]->type) &&
act_quant_family(op0->src[0]->type) != 0);
} }
static inline bool is_compute_op(ggml_tensor *node) static inline bool is_compute_op(ggml_tensor *node)
@ -3364,6 +3397,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
"please update hexagon_type to match ggml_type"); "please update hexagon_type to match ggml_type");
static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4, static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
"please update hexagon_type to match ggml_type"); "please update hexagon_type to match ggml_type");
static_assert((unsigned int) HTP_TYPE_IQ4_NL == (unsigned int) GGML_TYPE_IQ4_NL,
"please update hexagon_type to match ggml_type");
const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL"); const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL");
const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE"); const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");

View File

@ -346,6 +346,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
const HVX_Vector logit_cap = hvx_vec_splat_f32(factx->logit_softcap); const HVX_Vector logit_cap = hvx_vec_splat_f32(factx->logit_softcap);
dma_cache m_cache;
dma_cache_init(&m_cache, spad_m, factx->size_m_block, DMA_CACHE_MAX_SIZE);
for (uint32_t ir = ir0; ir < ir1; ++ir) { for (uint32_t ir = ir0; ir < ir1; ++ir) {
const uint32_t iq3 = fastdiv(ir, &factx->src0_div21); const uint32_t iq3 = fastdiv(ir, &factx->src0_div21);
const uint32_t iq2 = fastdiv(ir - iq3*neq2*neq1, &factx->src0_div1); const uint32_t iq2 = fastdiv(ir - iq3*neq2*neq1, &factx->src0_div1);
@ -389,9 +392,8 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
// Mask // Mask
if (mask) { if (mask) {
const uint8_t * m_src = (const uint8_t *) (mp_base + ic_start); const uint8_t * m_src = (const uint8_t *) (mp_base + ic_start);
uint8_t * m_dst = spad_m + (ib % 2) * factx->size_m_block;
// Mask is 1D contiguous for this row // Mask is 1D contiguous for this row
dma_queue_push(dma, dma_make_ptr(m_dst, m_src), current_block_size * 2, current_block_size * 2, current_block_size * 2, 1); dma_cache_push(dma, &m_cache, m_src, current_block_size * 2, current_block_size * 2, current_block_size * 2, 1);
} }
// FARF(HIGH, "fa %u: prefetch KVM: ir %u ib %u iq1 %u iq2 %u iq3 %u : size_k_row %u size_v_row %u bs %u: usec %u", // FARF(HIGH, "fa %u: prefetch KVM: ir %u ib %u iq1 %u iq2 %u iq3 %u : size_k_row %u size_v_row %u bs %u: usec %u",
@ -554,7 +556,7 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
// Mask // Mask
if (mask) { if (mask) {
const uint8_t * m_src = (const uint8_t *) (mp_base + next_ic_start); const uint8_t * m_src = (const uint8_t *) (mp_base + next_ic_start);
dma_queue_push(dma, dma_make_ptr(m_base, m_src), next_block_size * 2, next_block_size * 2, next_block_size * 2, 1); dma_cache_push(dma, &m_cache, m_src, next_block_size * 2, next_block_size * 2, next_block_size * 2, 1);
} }
// FARF(HIGH, "fa %u: prefetch KVM: ir %u ib %u : iq1 %u iq2 %u iq3 %u : size_k_row %u size_v_row %u bs %u: usec %u", // FARF(HIGH, "fa %u: prefetch KVM: ir %u ib %u : iq1 %u iq2 %u iq3 %u : size_k_row %u size_v_row %u bs %u: usec %u",
@ -684,7 +686,7 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
octx->src0_spad.size_per_thread = size_q_block * 1; octx->src0_spad.size_per_thread = size_q_block * 1;
octx->src1_spad.size_per_thread = factx.size_k_block * 2; octx->src1_spad.size_per_thread = factx.size_k_block * 2;
octx->src2_spad.size_per_thread = factx.size_v_block * 2; octx->src2_spad.size_per_thread = factx.size_v_block * 2;
octx->src3_spad.size_per_thread = mask ? factx.size_m_block * 2 : 0; octx->src3_spad.size_per_thread = mask ? factx.size_m_block * DMA_CACHE_MAX_SIZE : 0;
octx->dst_spad.size_per_thread = size_vkq_acc; octx->dst_spad.size_per_thread = size_vkq_acc;
octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
@ -705,6 +707,8 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
octx->src3_spad.data = octx->src2_spad.data + octx->src2_spad.size; octx->src3_spad.data = octx->src2_spad.data + octx->src2_spad.size;
octx->dst_spad.data = octx->src3_spad.data + octx->src3_spad.size; octx->dst_spad.data = octx->src3_spad.data + octx->src3_spad.size;
// FARF(ERROR, "fa: qrows-per-thread %u", factx.qrows_per_thread);
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
worker_pool_run_func(octx->ctx->worker_pool, flash_attn_ext_f16_thread, &factx, octx->n_threads); worker_pool_run_func(octx->ctx->worker_pool, flash_attn_ext_f16_thread, &factx, octx->n_threads);
} }

View File

@ -143,7 +143,7 @@ static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t
desc->desc_size = 0; // 1D mode desc->desc_size = 0; // 1D mode
desc->src_bypass = dma_src_l2_bypass_on; desc->src_bypass = dma_src_l2_bypass_on;
desc->dst_bypass = dma_dst_l2_bypass_on; desc->dst_bypass = dma_dst_l2_bypass_on;
desc->order = 1; desc->order = 0;
desc->done = 0; desc->done = 0;
desc->src = (void *) dptr.src; desc->src = (void *) dptr.src;
desc->dst = (void *) dptr.dst; desc->dst = (void *) dptr.dst;
@ -151,8 +151,12 @@ static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t
q->dptr[q->push_idx] = dptr; q->dptr[q->push_idx] = dptr;
dmlink(q->tail, desc); if (size) {
q->tail = (dma_descriptor_2d *) desc; dmlink(q->tail, desc);
q->tail = (dma_descriptor_2d *) desc;
} else {
desc->done = 1;
}
// FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src); // FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src);
q->push_idx = (q->push_idx + 1) & q->idx_mask; q->push_idx = (q->push_idx + 1) & q->idx_mask;
@ -175,7 +179,7 @@ static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t
desc->dst_bypass = dma_dst_l2_bypass_on; desc->dst_bypass = dma_dst_l2_bypass_on;
desc->src_comp = 0; desc->src_comp = 0;
desc->dst_comp = 0; desc->dst_comp = 0;
desc->order = 1; desc->order = 0;
desc->done = 0; desc->done = 0;
desc->src_stride = src_stride; desc->src_stride = src_stride;
desc->dst_stride = dst_stride; desc->dst_stride = dst_stride;
@ -197,8 +201,12 @@ static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t
q->dptr[q->push_idx] = dptr; q->dptr[q->push_idx] = dptr;
dmlink(q->tail, desc); if (nrows) {
q->tail = desc; dmlink(q->tail, desc);
q->tail = desc;
} else {
desc->done = 1;
}
// FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src); // FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src);
q->push_idx = (q->push_idx + 1) & q->idx_mask; q->push_idx = (q->push_idx + 1) & q->idx_mask;
@ -215,12 +223,9 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) {
dma_descriptor_2d * desc = &q->desc[q->pop_idx]; dma_descriptor_2d * desc = &q->desc[q->pop_idx];
// Wait for desc to complete // Wait for desc to complete
while (1) { while (!desc->done) {
dmpoll();
if (desc->done) {
break;
}
// FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx); // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
dmpoll();
} }
dptr = q->dptr[q->pop_idx]; dptr = q->dptr[q->pop_idx];
@ -312,6 +317,54 @@ static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, dma_ptr dptr, size_
return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows); return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
} }
#define DMA_CACHE_MAX_SIZE 64U
typedef struct {
uint8_t *base;
uint32_t line_size;
uint32_t capacity;
uint32_t src[DMA_CACHE_MAX_SIZE];
uint16_t age[DMA_CACHE_MAX_SIZE];
} dma_cache;
static inline void dma_cache_init(dma_cache *c, uint8_t *base, uint32_t line_size, uint32_t capacity)
{
c->capacity = (capacity > DMA_CACHE_MAX_SIZE) ? DMA_CACHE_MAX_SIZE : capacity;
c->base = base;
c->line_size = line_size;
for (unsigned i=0; i < c->capacity; i++) {
c->src[i] = 0;
c->age[i] = 0;
}
}
static inline bool dma_cache_push(dma_queue *q, dma_cache *c, const uint8_t * src, uint32_t dst_stride, uint32_t src_stride, uint32_t row_size, uint32_t nrows)
{
uint32_t o_idx = 0;
uint16_t o_age = 0;
uint8_t * dst = 0;
for (unsigned i=0; i < c->capacity; i++) {
if (c->src[i] == (uint32_t) src) {
c->age[i] = 0;
dst = c->base + (i * c->line_size); nrows = 0; // dummy dma
// FARF(ERROR, "dma-cache: found %p", src);
} else {
c->age[i]++;
if (c->age[i] > o_age) { o_age = c->age[i]; o_idx = i; }
}
}
if (!dst) {
// FARF(ERROR, "dma-cache: replacing #%u : age %u %p -> %p", o_idx, c->age[o_idx], (void *) c->src[o_idx], src);
c->age[o_idx] = 0;
c->src[o_idx] = (uint32_t) src;
dst = c->base + o_idx * c->line_size; // normal nrows dma
}
return dma_queue_push(q, dma_make_ptr(dst, src), dst_stride, src_stride, row_size, nrows);
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
#endif #endif

View File

@ -30,6 +30,12 @@ static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
-8, 0, -7, 0, -6, 0, -5, 0, -4, 0, -3, 0, -2, 0, -1, 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, -8, 0, -7, 0, -6, 0, -5, 0, -4, 0, -3, 0, -2, 0, -1, 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0,
}; };
// MXFP4 dequantization LUT: maps 4-bit index to fp16 mantissa value
// kvalues: 0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1, -1.5, -2, -3, -4, -6
static const __fp16 mxfp4_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
0, 0, 0.5, 0, 1, 0, 1.5, 0, 2, 0, 3, 0, 4, 0, 6, 0, 0, 0, -0.5, 0, -1, 0, -1.5, 0, -2, 0, -3, 0, -4, 0, -6, 0,
};
static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
-127, 0, -104, 0, -83, 0, -65, 0, -49, 0, -35, 0, -22, 0, -10, 0, -127, 0, -104, 0, -83, 0, -65, 0, -49, 0, -35, 0, -22, 0, -10, 0,
1, 0, 13, 0, 25, 0, 38, 0, 53, 0, 69, 0, 89, 0, 113, 0, 1, 0, 13, 0, 25, 0, 38, 0, 53, 0, 69, 0, 89, 0, 113, 0,
@ -46,7 +52,8 @@ static const int32_t weight_transpose_scatter_offsets[32] __attribute__((aligned
// Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes // Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes
#define HMX_X4X2_SCALES_PER_BLK 8 #define HMX_X4X2_SCALES_PER_BLK 8
#define HMX_X4X2_DBLK_SIZE 16 // 8 * 2 bytes #define HMX_X4X2_DBLK_SIZE 16 // 8 * 2 bytes (fp16 scales for Q4_0/Q8_0/IQ4_NL)
#define HMX_X4X2_MXFP4_EBLK_SIZE 8 // 8 * 1 byte (E8M0 scales for MXFP4)
static inline void swap_ptr(void **p1, void **p2) { static inline void swap_ptr(void **p1, void **p2) {
void *t = *p1; void *t = *p1;
@ -78,9 +85,11 @@ static inline size_t get_x4x2_row_stride(int weight_type, int k) {
switch (weight_type) { switch (weight_type) {
case HTP_TYPE_Q4_0: case HTP_TYPE_Q4_0:
case HTP_TYPE_IQ4_NL: case HTP_TYPE_IQ4_NL:
return (size_t)nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE); // 144 * nb return (size_t) nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE); // 144 * nb
case HTP_TYPE_Q8_0: case HTP_TYPE_Q8_0:
return (size_t)nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE); // 272 * nb return (size_t) nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE); // 272 * nb
case HTP_TYPE_MXFP4:
return (size_t) nb * (QK_MXFP4x4x2 / 2 + HMX_X4X2_MXFP4_EBLK_SIZE); // 136 * nb
default: default:
return 0; return 0;
} }
@ -284,6 +293,87 @@ static inline HVX_Vector dequantize_x4x2_q8_0_group_hvx(
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales)); return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
} }
// --- MXFP4 E8M0 scale conversion and dequantization ---
//
// HVX batch-convert 8 E8M0 bytes (one x4x2 block's scales) to __fp16[8] on stack.
// Scalar loads from the stack array execute on the scalar pipeline, in parallel
// with HVX vlut16/vmpy/vscatter — freeing HVX slots in the hot loop.
// Arithmetic: fp16_bits = clamp(e - 112, 0, 30) << 10
// e=0..112 -> 0 (underflow), e=113..142 -> valid fp16, e>=143 -> clamped to 2^15.
typedef struct {
__fp16 v[8] __attribute__((aligned(16)));
} mxfp4_scales_t;
static inline mxfp4_scales_t mxfp4_convert_scales(const uint8_t * e8m0_8) {
mxfp4_scales_t s;
HVX_Vector v = hvx_vmemu(e8m0_8);
HVX_Vector vh = Q6_V_lo_W(Q6_Wuh_vunpack_Vub(v));
vh = Q6_Vh_vsub_VhVh(vh, Q6_Vh_vsplat_R(112));
vh = Q6_Vh_vmax_VhVh(vh, Q6_V_vzero());
vh = Q6_Vh_vmin_VhVh(vh, Q6_Vh_vsplat_R(30));
vh = Q6_Vh_vasl_VhR(vh, 10);
hvx_vec_store_u(s.v, 16, vh);
return s;
}
static inline HVX_Vector mxfp4_extract_splat(mxfp4_scales_t scales, int idx) {
return hvx_vec_splat_f16(scales.v[idx]);
}
// Dequantize one x4x2 MXFP4 group (32 elements from 32 packed bytes) -> 32 FP16.
static inline HVX_Vector dequantize_x4x2_mxfp4_group_hvx(const uint8_t * packed_32,
bool upper_nibbles,
int sub_blk,
const HVX_Vector vlut_cvt,
mxfp4_scales_t scales) {
HVX_Vector vq = hvx_vmemu(packed_32);
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
HVX_Vector v_sc = mxfp4_extract_splat(scales, sub_blk);
v_quants = Q6_Vb_vshuff_Vb(v_quants);
HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
HVX_Vector v_hf = Q6_V_lo_W(vp);
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_sc));
}
// Batch-dequantize 4 contiguous x4x2 MXFP4 groups (4x32 = 128 packed bytes).
static inline void dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t * packed_128,
bool upper_nibbles,
int sub_blk_base,
const HVX_Vector vlut_cvt,
mxfp4_scales_t scales,
HVX_Vector out[4]) {
HVX_Vector vq = hvx_vmemu(packed_128);
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
v_quants = Q6_Vb_vshuff_Vb(v_quants);
HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
HVX_Vector v_lo = Q6_V_lo_W(vp);
HVX_Vector v_hi = Q6_V_hi_W(vp);
HVX_VectorPred q64 = Q6_Q_vsetq_R(64);
HVX_Vector v_sc01 = Q6_V_vmux_QVV(q64, mxfp4_extract_splat(scales, sub_blk_base + 0),
mxfp4_extract_splat(scales, sub_blk_base + 1));
HVX_Vector v_sc23 = Q6_V_vmux_QVV(q64, mxfp4_extract_splat(scales, sub_blk_base + 2),
mxfp4_extract_splat(scales, sub_blk_base + 3));
v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
out[0] = v_lo;
out[1] = Q6_V_vror_VR(v_lo, 64);
out[2] = v_hi;
out[3] = Q6_V_vror_VR(v_hi, 64);
}
// Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16. // Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
// Input: vtcm_src has n_cols rows of x4x2 data, each row_stride bytes. // Input: vtcm_src has n_cols rows of x4x2 data, each row_stride bytes.
// Output: vtcm_dst in tile-major FP16 layout. // Output: vtcm_dst in tile-major FP16 layout.
@ -295,11 +385,11 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
int start_tile, int end_tile) { int start_tile, int end_tile) {
const int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS; const int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS;
const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL); const int qrow_size = (weight_type == HTP_TYPE_Q8_0) ? k_block : (k_block / 2);
const int qrow_size = is_q4 ? (k_block / 2) : k_block;
const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
? hvx_vmem(iq4_nl_to_fp16_lut) : hvx_vmem(q4_0_to_fp16_lut); (weight_type == HTP_TYPE_MXFP4) ? hvx_vmem(mxfp4_to_fp16_lut) :
hvx_vmem(q4_0_to_fp16_lut);
// vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions. // vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions.
// Each int32 element holds a K-row-pair (2 adjacent fp16 values). word[i] at offset i*128 // Each int32 element holds a K-row-pair (2 adjacent fp16 values). word[i] at offset i*128
@ -312,8 +402,9 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
int ct = t / n_k_tiles; // column tile index int ct = t / n_k_tiles; // column tile index
int kt = t % n_k_tiles; // K tile index int kt = t % n_k_tiles; // K tile index
// --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row --- // --- Batch-4 fast path for Q4_0/IQ4_NL: process 4 contiguous K-tiles with one vlut16 per row ---
if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) { if ((weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) && (kt % 4 == 0) && (t + 4 <= end_tile) &&
((t + 3) / n_k_tiles == ct)) {
int blk_idx = (kt * 32) / QK_Q4_0x4x2; int blk_idx = (kt * 32) / QK_Q4_0x4x2;
int sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4 int sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4
bool upper = (sub_blk_base >= 4); bool upper = (sub_blk_base >= 4);
@ -351,10 +442,60 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
continue; continue;
} }
// --- Batch-4 fast path for MXFP4: same nibble layout but E8M0 scales ---
if (weight_type == HTP_TYPE_MXFP4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
int blk_idx = (kt * 32) / QK_MXFP4x4x2;
int sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32; // 0 or 4
bool upper = (sub_blk_base >= 4);
int packed_off = blk_idx * (QK_MXFP4x4x2 / 2); // 128 contiguous packed bytes
int e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE; // all 8 E8M0 scales
__fp16 * tile_bases[4];
for (int g = 0; g < 4; g++) {
tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS;
}
HVX_Vector v_off = v_scat_base;
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
int row0 = ct * HMX_FP16_TILE_N_COLS + r;
int row1 = row0 + 1;
const uint8_t * r0 = vtcm_src + row0 * row_stride;
const uint8_t * r1 = vtcm_src + row1 * row_stride;
// Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
HVX_Vector v0[4], v1[4];
dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8, v0);
if (row1 < n_cols) {
mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8, v1);
} else {
v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero();
}
for (int g = 0; g < 4; g++) {
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]);
}
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
for (int g = 0; g < 4; g++) {
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]);
}
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
}
for (int g = 0; g < 4; g++) {
(void) *(volatile HVX_Vector *) (tile_bases[g]);
}
t += 4;
continue;
}
// --- Single-tile fallback --- // --- Single-tile fallback ---
__fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS; __fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;
if (is_q4) { if (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) {
int blk_idx = (kt * 32) / QK_Q4_0x4x2; int blk_idx = (kt * 32) / QK_Q4_0x4x2;
int sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32; int sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32;
bool upper = (sub_blk >= 4); bool upper = (sub_blk >= 4);
@ -382,6 +523,39 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
} }
(void) *(volatile HVX_Vector *)(tile_base); (void) *(volatile HVX_Vector *)(tile_base);
} else if (weight_type == HTP_TYPE_MXFP4) {
int blk_idx = (kt * 32) / QK_MXFP4x4x2;
int sub_blk = ((kt * 32) % QK_MXFP4x4x2) / 32;
bool upper = (sub_blk >= 4);
int byte_off = blk_idx * (QK_MXFP4x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
int e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE;
HVX_Vector v_off = v_scat_base;
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
int row0 = ct * HMX_FP16_TILE_N_COLS + r;
int row1 = row0 + 1;
const uint8_t * r0 = vtcm_src + row0 * row_stride;
const uint8_t * r1 = vtcm_src + row1 * row_stride;
// Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
HVX_Vector v0 = dequantize_x4x2_mxfp4_group_hvx(r0 + byte_off, upper, sub_blk, vlut_cvt, r0_e8);
HVX_Vector v1;
if (row1 < n_cols) {
mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
v1 = dequantize_x4x2_mxfp4_group_hvx(r1 + byte_off, upper, sub_blk, vlut_cvt, r1_e8);
} else {
v1 = Q6_V_vzero();
}
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
}
(void) *(volatile HVX_Vector *) (tile_base);
} else { } else {
// Q8_0 // Q8_0
int blk_idx = (kt * 32) / QK_Q8_0x4x2; int blk_idx = (kt * 32) / QK_Q8_0x4x2;
@ -1455,21 +1629,24 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
{ {
qweight_fetch_task_state_t s; qweight_fetch_task_state_t s;
const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
const int blk_start = kk / QK_Q4_0x4x2; const int blk_start = kk / QK_Q4_0x4x2;
const int nb_sub = (k_blk_sz + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2; const int nb_sub = (k_blk_sz + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2;
const int full_qrow = is_q4 ? (k / 2) : k; const int full_qrow = (weight_type == HTP_TYPE_Q8_0) ? k : (k / 2);
const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz); const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
const int scale_blk_size =
(weight_type == HTP_TYPE_MXFP4) ? HMX_X4X2_MXFP4_EBLK_SIZE : HMX_X4X2_DBLK_SIZE;
s.dst = vtcm_scratch0; s.dst = vtcm_scratch0;
s.src = w + nc * row_stride; s.src = w + nc * row_stride;
s.n_rows = n_blk_sz; s.n_rows = n_blk_sz;
s.src_stride = row_stride; s.src_stride = row_stride;
s.dst_stride = sub_row_stride; s.dst_stride = sub_row_stride;
s.quant_off = is_q4 ? (blk_start * (QK_Q4_0x4x2 / 2)) : (blk_start * QK_Q8_0x4x2); s.quant_off =
s.quant_width = is_q4 ? (nb_sub * (QK_Q4_0x4x2 / 2)) : (nb_sub * QK_Q8_0x4x2); (weight_type == HTP_TYPE_Q8_0) ? (blk_start * QK_Q8_0x4x2) : (blk_start * (QK_Q4_0x4x2 / 2));
s.scale_off = full_qrow + blk_start * HMX_X4X2_DBLK_SIZE; s.quant_width =
s.scale_width = nb_sub * HMX_X4X2_DBLK_SIZE; (weight_type == HTP_TYPE_Q8_0) ? (nb_sub * QK_Q8_0x4x2) : (nb_sub * (QK_Q4_0x4x2 / 2));
s.scale_off = full_qrow + blk_start * scale_blk_size;
s.scale_width = nb_sub * scale_blk_size;
// 2D DMA: quants sub-range // 2D DMA: quants sub-range
dma_queue_push(ctx->dma[0], dma_make_ptr(s.dst, s.src + s.quant_off), dma_queue_push(ctx->dma[0], dma_make_ptr(s.dst, s.src + s.quant_off),

View File

@ -31,6 +31,12 @@ struct htp_context {
uint32_t opmask; uint32_t opmask;
// Cached src1 spad position from the last quantize pass.
// When SKIP_QUANTIZE is set the Q8 activation data is already in VTCM
// at this address; the matmul must read from here instead of recomputing
// the offset (which depends on the current op's src0 size).
uint8_t * prev_src1_spad;
// HMX acceleration fields (v73+, enabled by compile-time HTP_HAS_HMX) // HMX acceleration fields (v73+, enabled by compile-time HTP_HAS_HMX)
#ifdef HTP_HAS_HMX #ifdef HTP_HAS_HMX
int hmx_enabled; // Runtime flag: HMX initialisation succeeded int hmx_enabled; // Runtime flag: HMX initialisation succeeded

View File

@ -1114,14 +1114,12 @@ static void proc_hmx_matmul_req(struct htp_context * ctx,
return; return;
} }
// HMX only supports F16, Q4_0, Q8_0, IQ4_NL weights. // HMX supports F16, Q4_0, Q8_0, IQ4_NL, MXFP4 weights.
// Other types (e.g. MXFP4) fall back to HVX. // Other types fall back to HVX.
{ {
uint32_t wtype = req->src0.type; uint32_t wtype = req->src0.type;
if (wtype != HTP_TYPE_F16 && if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL &&
wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_MXFP4) {
wtype != HTP_TYPE_Q8_0 &&
wtype != HTP_TYPE_IQ4_NL) {
proc_matmul_req(ctx, req, bufs, n_bufs); proc_matmul_req(ctx, req, bufs, n_bufs);
return; return;
} }

View File

@ -60,6 +60,16 @@ static const uint8_t __attribute__((aligned(128))) expand_x32_e8m0[128] = {
0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x22, 0x20, 0x24, 0x20, 0x21, 0x22, 0x20, 0x20, 0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x22, 0x20, 0x24, 0x20, 0x21, 0x22, 0x20, 0x20,
}; };
// IQ4_NL dequantization LUT: maps 4-bit index (0-15) to int8 kvalue
// kvalues: -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113
static const uint8_t __attribute__((aligned(VLEN))) kvalues_iq4nl_lut[] = {
0x81, 0, 0x98, 0, 0xAD, 0, 0xBF, 0, 0xCF, 0, 0xDD, 0, 0xEA, 0, 0xF6, 0, 0x01, 0, 0x0D, 0, 0x19, 0, 0x26, 0,
0x35, 0, 0x45, 0, 0x59, 0, 0x71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static const uint8_t __attribute__((aligned(VLEN))) kvalues_mxfp4_lut[] = { static const uint8_t __attribute__((aligned(VLEN))) kvalues_mxfp4_lut[] = {
0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 6, 0, 8, 0, 12, 0, 0, 0, 0xff, 0, 0xfe, 0, 0xfd, 0, 0xfc, 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 6, 0, 8, 0, 12, 0, 0, 0, 0xff, 0, 0xfe, 0, 0xfd, 0, 0xfc, 0,
0xfa, 0, 0xf8, 0, 0xf4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xfa, 0, 0xf8, 0, 0xf4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -68,6 +78,73 @@ static const uint8_t __attribute__((aligned(VLEN))) kvalues_mxfp4_lut[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}; };
static inline HVX_Vector_x8 hvx_vec_load_iq4nlx4x8_full(const uint8_t * restrict ptr) {
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
HVX_Vector v2_3 = vptr[1]; // ...
HVX_Vector v4_5 = vptr[2]; // ...
HVX_Vector v6_7 = vptr[3]; // ...
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
const HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut;
HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4
HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F
HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4
HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F
HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4
HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F
HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4
v0 = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
v1 = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
v2 = Q6_Vb_vlut32_VbVbI(v2, lut, 0);
v3 = Q6_Vb_vlut32_VbVbI(v3, lut, 0);
v4 = Q6_Vb_vlut32_VbVbI(v4, lut, 0);
v5 = Q6_Vb_vlut32_VbVbI(v5, lut, 0);
v6 = Q6_Vb_vlut32_VbVbI(v6, lut, 0);
v7 = Q6_Vb_vlut32_VbVbI(v7, lut, 0);
HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 };
return r;
}
static inline HVX_Vector_x8 hvx_vec_load_iq4nlx4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
const uint32_t qk = QK_Q4_0x4x2; // 256
const uint32_t nb = n / qk;
const uint32_t nloe = n % qk;
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
const HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut;
HVX_Vector_x8 r;
uint32_t i = 0;
#pragma unroll(2)
for (i = 0; i < nb; i++) {
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
r.v[i * 2 + 0] = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
r.v[i * 2 + 1] = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
}
if (nloe) {
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
r.v[i * 2 + 0] = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v0_1_p), lut, 0);
r.v[i * 2 + 1] = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v0_1_p), lut, 0);
}
return r;
}
// q4x4x2 and q8x4x2 are the flat q4/8_0 formats where all quants are stored first followed by all scales // q4x4x2 and q8x4x2 are the flat q4/8_0 formats where all quants are stored first followed by all scales
static inline size_t q8x4x2_row_size(uint32_t ne) { static inline size_t q8x4x2_row_size(uint32_t ne) {
@ -921,6 +998,293 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); // row0,col1 row1,col1 hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); // row0,col1 row1,col1
} }
// ======== IQ4_NL x Q8_0 vec_dot kernels ========
// Same structure as Q4_0 vec_dot but uses IQ4_NL LUT-based load (4-bit index -> int8 kvalue).
// Scale format is identical to Q4_0 (fp16 scales).
static void vec_dot_iq4nlx4x2_q8x4x2_1x1(const int n,
float * restrict s0,
const void * restrict vx0,
const void * restrict vy0) {
assert(n % 32 == 0);
assert((unsigned long) vx0 % 128 == 0);
assert((unsigned long) vy0 % 128 == 0);
const uint32_t qk = QK_Q4_0x4x2 * 4;
const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
const uint32_t x_qblk_size = qk / 2; // int4
const uint32_t x_qrow_size = n / 2; // int4 (not padded)
const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
const uint32_t y_qblk_size = qk; // int8
const uint32_t y_qrow_size = n; // int8 (not padded)
const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first
const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
HVX_Vector r0_sum = Q6_V_vzero();
const uint32_t nb = n / qk;
const uint32_t nloe = n % qk;
uint32_t i = 0;
for (; i < nb; i++) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
if (nloe) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
hvx_vec_store_u(s0, 4, r0_sum);
}
static void vec_dot_iq4nlx4x2_q8x4x2_2x1(const int n,
float * restrict s0,
const void * restrict vx0,
const void * restrict vx1,
const void * restrict vy0) {
assert(n % 32 == 0);
assert((unsigned long) vx0 % 128 == 0);
assert((unsigned long) vx1 % 128 == 0);
assert((unsigned long) vy0 % 128 == 0);
const uint32_t qk = QK_Q4_0x4x2 * 4;
const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
const uint32_t x_qblk_size = qk / 2; // int4
const uint32_t x_qrow_size = n / 2; // int4 (not padded)
const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
const uint32_t y_qblk_size = qk; // int8
const uint32_t y_qrow_size = n; // int8 (not padded)
const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first
const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales
const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first
const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales
const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
HVX_Vector r0_sum = Q6_V_vzero();
HVX_Vector r1_sum = Q6_V_vzero();
const uint32_t nb = n / qk;
const uint32_t nloe = n % qk;
uint32_t i = 0;
for (; i < nb; i++) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
if (nloe) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
hvx_vec_store_u(s0, 8, rsum);
}
static void vec_dot_iq4nlx4x2_q8x4x2_2x2(const int n,
float * restrict s0,
float * restrict s1,
const void * restrict vx0,
const void * restrict vx1,
const void * restrict vy0,
const void * restrict vy1) {
assert(n % 32 == 0);
assert((unsigned long) vx0 % 128 == 0);
assert((unsigned long) vx1 % 128 == 0);
assert((unsigned long) vy0 % 128 == 0);
assert((unsigned long) vy1 % 128 == 0);
const uint32_t qk = QK_Q4_0x4x2 * 4;
const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
const uint32_t x_qblk_size = qk / 2; // int4
const uint32_t x_qrow_size = n / 2; // int4 (not padded)
const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
const uint32_t y_qblk_size = qk; // int8
const uint32_t y_qrow_size = n; // int8 (not padded)
const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;
const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;
const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;
const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;
const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0;
const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size;
const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0;
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size;
HVX_Vector r0_c0_sum = Q6_V_vzero();
HVX_Vector r0_c1_sum = Q6_V_vzero();
HVX_Vector r1_c0_sum = Q6_V_vzero();
HVX_Vector r1_c1_sum = Q6_V_vzero();
const uint32_t nb = n / qk;
const uint32_t nloe = n % qk;
uint32_t i = 0;
for (; i < nb; i++) {
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size);
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q));
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q));
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
}
if (nloe) {
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia);
HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
}
HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum);
hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum);
}
static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
assert(n % 32 == 0); // min sub-block size assert(n % 32 == 0); // min sub-block size
assert((unsigned long) vx0 % 128 == 0); assert((unsigned long) vx0 % 128 == 0);
@ -2393,6 +2757,12 @@ static int htp_mminit_vec_dot(struct htp_matmul_context * mmctx, enum htp_data_t
mmctx->vec_dot_2x1 = vec_dot_q8x4x2_q8x4x2_2x1; mmctx->vec_dot_2x1 = vec_dot_q8x4x2_q8x4x2_2x1;
mmctx->vec_dot_2x2 = vec_dot_q8x4x2_q8x4x2_2x2; mmctx->vec_dot_2x2 = vec_dot_q8x4x2_q8x4x2_2x2;
return 0; return 0;
case HTP_TYPE_IQ4_NL:
mmctx->type = "iq4nlx4x2-f32";
mmctx->vec_dot_1x1 = vec_dot_iq4nlx4x2_q8x4x2_1x1;
mmctx->vec_dot_2x1 = vec_dot_iq4nlx4x2_q8x4x2_2x1;
mmctx->vec_dot_2x2 = vec_dot_iq4nlx4x2_q8x4x2_2x2;
return 0;
case HTP_TYPE_MXFP4: case HTP_TYPE_MXFP4:
mmctx->type = "mxfp4x4x2-f32"; mmctx->type = "mxfp4x4x2-f32";
mmctx->vec_dot_1x1 = vec_dot_mxfp4x4x2_q8x4x2_1x1; mmctx->vec_dot_1x1 = vec_dot_mxfp4x4x2_q8x4x2_1x1;
@ -2556,6 +2926,13 @@ int op_matmul(struct htp_ops_context * octx) {
const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads); const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs); worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
// Cache where src1 was written so subsequent SKIP_QUANTIZE ops can find it
octx->ctx->prev_src1_spad = octx->src1_spad.data;
} else {
// SKIP_QUANTIZE: Q8 data lives at the address written by the previous
// quantize pass. The current op may have a different src0 size (e.g.
// IQ4_NL vs MXFP4), so src1_spad.data computed above could be wrong.
octx->src1_spad.data = octx->ctx->prev_src1_spad;
} }
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
@ -2659,6 +3036,9 @@ int op_matmul_id(struct htp_ops_context * octx) {
const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads); const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs); worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
octx->ctx->prev_src1_spad = octx->src1_spad.data;
} else {
octx->src1_spad.data = octx->ctx->prev_src1_spad;
} }
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {

View File

@ -333,8 +333,8 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
// (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start)); // (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start));
} }
// Skip DMA transactions from prev block (if any) // Skip output DMA transactions from prev block (if any)
// No need to wait for these since the DMA is setup for in-order processing // No need to wait for those here since we're explicitly waiting for the latest prefecthes below.
for (uint32_t d=0; d < dma_depth; d++) { dma_queue_pop_nowait(dma_queue); } for (uint32_t d=0; d < dma_depth; d++) { dma_queue_pop_nowait(dma_queue); }
// Compute loop // Compute loop

View File

@ -114,6 +114,8 @@ set(GGML_OPENCL_KERNELS
gemv_noshuffle_q4_1_f32 gemv_noshuffle_q4_1_f32
gemm_noshuffle_q4_1_f32 gemm_noshuffle_q4_1_f32
gemv_noshuffle_general_q8_0_f32 gemv_noshuffle_general_q8_0_f32
gemv_noshuffle_q4_k_f32
gemm_noshuffle_q4_k_f32
gemv_noshuffle_q6_k_f32 gemv_noshuffle_q6_k_f32
gemm_noshuffle_q6_k_f32 gemm_noshuffle_q6_k_f32
mul mul

View File

@ -538,6 +538,8 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_restore_block_q4_0_noshuffle; cl_kernel kernel_restore_block_q4_0_noshuffle;
cl_kernel kernel_convert_block_q4_1_noshuffle; cl_kernel kernel_convert_block_q4_1_noshuffle;
cl_kernel kernel_restore_block_q4_1_noshuffle; cl_kernel kernel_restore_block_q4_1_noshuffle;
cl_kernel kernel_convert_block_q4_K_noshuffle;
cl_kernel kernel_restore_block_q4_K_noshuffle;
cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K; cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K;
cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K; cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat; cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
@ -720,6 +722,8 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_gemm_noshuffle_q4_1_f32; cl_kernel kernel_gemm_noshuffle_q4_1_f32;
cl_kernel kernel_mul_mm_q8_0_f32_8x4; cl_kernel kernel_mul_mm_q8_0_f32_8x4;
cl_kernel CL_mul_mat_vec_q8_0_f32; cl_kernel CL_mul_mat_vec_q8_0_f32;
cl_kernel kernel_gemv_noshuffle_q4_k_f32;
cl_kernel kernel_gemm_noshuffle_q4_k_f32;
cl_kernel kernel_gemv_noshuffle_q6_K_f32; cl_kernel kernel_gemv_noshuffle_q6_K_f32;
cl_kernel kernel_gemm_noshuffle_q6_K_f32; cl_kernel kernel_gemm_noshuffle_q6_K_f32;
#endif // GGML_OPENCL_USE_ADRENO_KERNELS #endif // GGML_OPENCL_USE_ADRENO_KERNELS
@ -932,6 +936,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K_noshuffle", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K_noshuffle", &err), err));
@ -2619,6 +2625,45 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT("."); GGML_LOG_CONT(".");
} }
// gemm_noshuffle_q4_k_f32
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "gemm_noshuffle_q4_k_f32.cl.h"
};
#else
const std::string kernel_src = read_file("gemm_noshuffle_q4_k_f32.cl");
#endif
cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q4_k_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q4_k_f32", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// gemv_noshuffle_q4_k_f32
{
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable ";
if (backend_ctx->has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
}
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "gemv_noshuffle_q4_k_f32.cl.h"
};
#else
const std::string kernel_src = read_file("gemv_noshuffle_q4_k_f32.cl");
#endif
cl_program prog = build_program_from_source(
backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q4_k_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q4_k_f32", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std + std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable " " -cl-mad-enable "
" -cl-fast-relaxed-math"; " -cl-fast-relaxed-math";
@ -5060,12 +5105,25 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err); CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err); CL_CHECK(err);
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K; cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
if (use_adreno_kernels(backend_ctx, tensor)) {
kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle;
}
#else
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
#endif
cl_uchar mask_0F = 0x0F;
cl_uchar mask_F0 = 0xF0;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {64, 1, 1}; size_t local_work_size[] = {64, 1, 1};
@ -5076,6 +5134,20 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
CL_CHECK(clReleaseMemObject(data_device)); CL_CHECK(clReleaseMemObject(data_device));
tensor->extra = extra; tensor->extra = extra;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (use_adreno_kernels(backend_ctx, tensor)) {
int M = tensor->ne[1];
int K = tensor->ne[0];
GGML_ASSERT(K % 32 == 0);
// Transpose q, d, dm as ushort
transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/256, M);
transpose_2d_as_16b(backend_ctx, extra->dm, extra->dm, size_dm, K/256, M);
}
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
return; return;
} }
if (tensor->type == GGML_TYPE_Q6_K) { if (tensor->type == GGML_TYPE_Q6_K) {
@ -5516,12 +5588,60 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
ggml_nbytes(tensor), NULL, &err); ggml_nbytes(tensor), NULL, &err);
CL_CHECK(err); CL_CHECK(err);
cl_uchar mask_0F = 0x0F;
cl_uchar mask_F0 = 0xF0;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (use_adreno_kernels(backend_ctx, tensor)) {
int M = tensor->ne[1];
int K = tensor->ne[0];
size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
size_t size_dm = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
static ggml_cl_buffer buf_trans_q;
static ggml_cl_buffer buf_trans_d;
static ggml_cl_buffer buf_trans_dm;
buf_trans_q.allocate(backend_ctx->context, size_q);
buf_trans_d.allocate(backend_ctx->context, size_d);
buf_trans_dm.allocate(backend_ctx->context, size_dm);
// Transpose q, d, dm back
transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4);
transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/256);
transpose_2d_as_16b(backend_ctx, extra->dm, buf_trans_dm.buffer, size_dm, M, K/256);
cl_kernel kernel = backend_ctx->kernel_restore_block_q4_K_noshuffle;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_d.buffer));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_trans_dm.buffer));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {1, 1, 1};
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_work_size, local_work_size, 0, NULL, NULL));
CL_CHECK(clEnqueueReadBuffer(queue, data_device, CL_TRUE, offset,
size, data, 0, NULL, NULL));
CL_CHECK(clReleaseMemObject(data_device));
return;
}
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
cl_kernel kernel = backend_ctx->kernel_restore_block_q4_K; cl_kernel kernel = backend_ctx->kernel_restore_block_q4_K;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {1, 1, 1}; size_t local_work_size[] = {1, 1, 1};
@ -9688,6 +9808,192 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
#endif #endif
} }
static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(src1);
GGML_ASSERT(src1->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
ggml_tensor_extra_cl_q4_K * extra0_q4_k = (ggml_tensor_extra_cl_q4_K *)src0->extra;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne1 = dst->ne[1];
GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
cl_context context = backend_ctx->context;
cl_kernel kernel;
cl_int err;
cl_image_format img_fmt;
cl_image_desc img_desc;
cl_buffer_region region;
int M = ne01;
int N = ne1;
int K = ne00;
cl_uchar mask_d6 = 0x3F;
cl_uchar mask_d4 = 0x0F;
cl_uchar mask_hi2 = 0xC0;
if (ne1 == 1) {
cl_mem q_img = nullptr;
cl_mem b_sub_buf = nullptr;
cl_mem b_img = nullptr;
// image for q
img_fmt = { CL_R, CL_UNSIGNED_INT32};
memset(&img_desc, 0, sizeof(img_desc));
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc.image_width = M * K / 2 / 4;
img_desc.buffer = extra0_q4_k->q;
CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
// subbuffer for activations
region.origin = offset1;
region.size = K * N * sizeof(float);
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
// image for activations
img_fmt = {CL_RGBA, CL_FLOAT};
memset(&img_desc, 0, sizeof(img_desc));
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc.image_width = K * N / 4;
img_desc.buffer = b_sub_buf;
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
kernel = backend_ctx->kernel_gemv_noshuffle_q4_k_f32;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_k->d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_k->dm));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_k->s));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uchar), &mask_d6));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_uchar), &mask_d4));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_uchar), &mask_hi2));
size_t local_work_size[3] = {64, 4, 1};
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
CL_CHECK(clReleaseMemObject(q_img));
CL_CHECK(clReleaseMemObject(b_sub_buf));
CL_CHECK(clReleaseMemObject(b_img));
} else {
cl_mem b_sub_buf = nullptr;
cl_mem b_sub_buf_trans = nullptr;
cl_mem b_img = nullptr;
cl_mem b_img_trans = nullptr;
// subbuffer for activations
region.origin = offset1;
region.size = K * N * sizeof(float);
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
// image for activations
img_fmt = {CL_RGBA, CL_FLOAT};
memset(&img_desc, 0, sizeof(img_desc));
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc.image_width = K * N / 4;
img_desc.buffer = b_sub_buf;
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
// pad N to multiple of 8
int extra_elements = N % 8;
int padding = 0;
if (extra_elements > 0){
padding = 8 - extra_elements;
}
// subbuffer for transposed activations
region.origin = 0;
region.size = K * (N + padding) * sizeof(float)/2;
backend_ctx->prealloc_act_trans.allocate(context, region.size);
CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
// image for transposed activations
img_fmt = {CL_RGBA, CL_HALF_FLOAT};
memset(&img_desc, 0, sizeof(img_desc));
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc.image_width = K * (N + padding) / 4;
img_desc.buffer = b_sub_buf_trans;
CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
// transpose activations
int height_B = N/4;
if (height_B == 0) {
height_B = 1;
}
int width_B = K/4;
int padded_height_B = (N + padding)/4;
kernel = backend_ctx->kernel_transpose_32_16;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
size_t local_work_size_t[2] = { 1, 16 };
size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
// gemm
kernel = backend_ctx->kernel_gemm_noshuffle_q4_k_f32;
int padded_N = N + padding;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_k->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_k->s));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_k->d));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_k->dm));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img_trans));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &padded_N));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_uchar), &mask_d6));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_uchar), &mask_d4));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_uchar), &mask_hi2));
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
size_t local_work_size[3] = {1, 128, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
CL_CHECK(clReleaseMemObject(b_sub_buf));
CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
CL_CHECK(clReleaseMemObject(b_img));
CL_CHECK(clReleaseMemObject(b_img_trans));
}
#else
GGML_UNUSED(backend);
GGML_UNUSED(src0);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
#endif
}
static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
GGML_ASSERT(src0); GGML_ASSERT(src0);
@ -10014,6 +10320,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
return; return;
} }
// q4_k x fp32
if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) {
ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst);
return;
}
// q6_K x fp32 // q6_K x fp32
if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) { if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) {
ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst); ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst);

View File

@ -424,13 +424,17 @@ kernel void kernel_restore_block_q8_0_trans(
// Convert the block_q4_K format to 4 separate arrays (AOS -> SOA). // Convert the block_q4_K format to 4 separate arrays (AOS -> SOA).
// This kernel does not deshuffle the bits. // This kernel does not deshuffle the bits.
// Each thread processes a super block. // Each thread processes a super block.
// Mask args are just to keep the signature consistent with the no-shuffle
// version and they are not used in this kernel.
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
kernel void kernel_convert_block_q4_K( kernel void kernel_convert_block_q4_K(
global struct block_q4_K * src0, global struct block_q4_K * src0,
global uchar * dst_q, global uchar * dst_q,
global uchar * dst_s, global uchar * dst_s,
global half * dst_d, global half * dst_d,
global half * dst_dm global half * dst_dm,
uchar mask_0F,
uchar mask_F0
) { ) {
global struct block_q4_K * b = (global struct block_q4_K *) src0 + get_global_id(0); global struct block_q4_K * b = (global struct block_q4_K *) src0 + get_global_id(0);
global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0); global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0);
@ -451,12 +455,15 @@ kernel void kernel_convert_block_q4_K(
// Restore block_q4_K from flattened arrays. // Restore block_q4_K from flattened arrays.
// Each thread processes a super block. // Each thread processes a super block.
// Mask args are just to keep the signature consistent with the no-shuffle ones.
kernel void kernel_restore_block_q4_K( kernel void kernel_restore_block_q4_K(
global uchar * src_q, global uchar * src_q,
global uchar * src_s, global uchar * src_s,
global half * src_d, global half * src_d,
global half * src_dm, global half * src_dm,
global struct block_q4_K * dst global struct block_q4_K * dst,
uchar mask_0F,
uchar mask_F0
) { ) {
global struct block_q4_K * b = (global struct block_q4_K *) dst + get_global_id(0); global struct block_q4_K * b = (global struct block_q4_K *) dst + get_global_id(0);
global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0); global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0);
@ -475,6 +482,70 @@ kernel void kernel_restore_block_q4_K(
} }
} }
kernel void kernel_convert_block_q4_K_noshuffle(
global struct block_q4_K * src0,
global uchar * dst_q,
global uchar * dst_s,
global half * dst_d,
global half * dst_dm,
uchar mask_0F,
uchar mask_F0
) {
global struct block_q4_K * b = (global struct block_q4_K *) src0 + get_global_id(0);
global uchar * q = (global uchar *) dst_q + QK_K/2 * get_global_id(0);
global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE * get_global_id(0);
global half * d = (global half *) dst_d + get_global_id(0);
global half * dm = (global half *) dst_dm + get_global_id(0);
*d = b->d;
*dm = b->dm;
for (int i = 0; i < QK_K / 64; ++i) {
for (int j = 0; j < 16; ++j) {
uchar x0 = b->q[i*32 + 2*j];
uchar x1 = b->q[i*32 + 2*j + 1];
q[i*32 + j] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
q[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
}
}
for (int i = 0; i < K_SCALE_SIZE; ++i) {
s[i] = b->s[i];
}
}
kernel void kernel_restore_block_q4_K_noshuffle(
global uchar * src_q,
global uchar * src_s,
global half * src_d,
global half * src_dm,
global struct block_q4_K * dst,
uchar mask_0F,
uchar mask_F0
) {
global struct block_q4_K * b = (global struct block_q4_K *) dst + get_global_id(0);
global uchar * q = (global uchar *) src_q + QK_K/2 * get_global_id(0);
global uchar * s = (global uchar *) src_s + K_SCALE_SIZE * get_global_id(0);
global half * d = (global half *) src_d + get_global_id(0);
global half * dm = (global half *) src_dm + get_global_id(0);
b->d = *d;
b->dm = *dm;
for (int i = 0; i < QK_K / 64; ++i) {
for (int j = 0; j < 16; ++j) {
uchar lo = q[i*32 + j];
uchar hi = q[i*32 + j + 16];
b->q[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4));
b->q[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0));
}
}
for (int i = 0; i < K_SCALE_SIZE; ++i) {
b->s[i] = s[i];
}
}
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// kernel_convert_block_q6_K // kernel_convert_block_q6_K
// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA). // Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).

View File

@ -0,0 +1,172 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_qcom_reqd_sub_group_size
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK_K 256
#define K_SCALE_SIZE 12
inline void get_scale_min_k4(
int j,
global const uchar * q,
uchar * d,
uchar * m,
uchar mask_d6,
uchar mask_d4,
uchar mask_hi2
) {
if (j < 4) {
*d = q[j] & mask_d6;
*m = q[j+4] & mask_d6;
} else {
*d = (q[j+4] & mask_d4) | ((q[j-4] & mask_hi2) >> 2);
*m = ((q[j+4] >> 4) & mask_d4) | ((q[j] & mask_hi2) >> 2);
}
}
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_128
#endif
kernel void kernel_gemm_noshuffle_q4_k_f32(
global const ushort * src0_q,
global const uchar * src0_s,
global const half * src0_d,
global const half * src0_dm,
read_only image1d_buffer_t src1,
global float * dst,
ulong offsetd,
int m,
int n,
int k,
int n_no_padding,
uchar mask_d6,
uchar mask_d4,
uchar mask_hi2
) {
dst = (global float *)((global char *)dst + offsetd);
int n_4 = n >> 2;
int gy = get_global_id(0);
int gx = get_global_id(1);
int gx_2 = gx << 2;
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
half8 B;
half4 dequantized_weights;
int num_blocks_K = k / QK_K;
global const ushort * weight_ptr = src0_q + gx_2;
global const half * d_ptr = src0_d + gx_2;
global const half * dm_ptr = src0_dm + gx_2;
for (int i = 0; i < k; i += 32) {
int sb_idx = i / QK_K;
int sub_idx = (i / 32) % 8;
half4 d = vload4(0, d_ptr + sb_idx * m);
half4 dm = vload4(0, dm_ptr + sb_idx * m);
global const uchar * sc0 = src0_s + (gx_2+0) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
global const uchar * sc1 = src0_s + (gx_2+1) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
global const uchar * sc2 = src0_s + (gx_2+2) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
global const uchar * sc3 = src0_s + (gx_2+3) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
uchar sv0, mn0, sv1, mn1, sv2, mn2, sv3, mn3;
get_scale_min_k4(sub_idx, sc0, &sv0, &mn0, mask_d6, mask_d4, mask_hi2);
get_scale_min_k4(sub_idx, sc1, &sv1, &mn1, mask_d6, mask_d4, mask_hi2);
get_scale_min_k4(sub_idx, sc2, &sv2, &mn2, mask_d6, mask_d4, mask_hi2);
get_scale_min_k4(sub_idx, sc3, &sv3, &mn3, mask_d6, mask_d4, mask_hi2);
half4 scale = convert_half4(convert_float4(d) * convert_float4((uchar4)(sv0, sv1, sv2, sv3)));
half4 mval = convert_half4(convert_float4(dm) * convert_float4((uchar4)(mn0, mn1, mn2, mn3)));
for (int l = 0; l < 32; l += 4) {
int ki = i + l;
ushort4 bits4 = vload4(0, weight_ptr + (ki/4) * m);
// j=0
B.s0123 = read_imageh(src1, gy*2 + (ki+0) * n_4);
B.s4567 = read_imageh(src1, gy*2+1 + (ki+0) * n_4);
dequantized_weights.s0 = (bits4.s0 & 0x000F) * scale.s0 - mval.s0;
dequantized_weights.s1 = (bits4.s1 & 0x000F) * scale.s1 - mval.s1;
dequantized_weights.s2 = (bits4.s2 & 0x000F) * scale.s2 - mval.s2;
dequantized_weights.s3 = (bits4.s3 & 0x000F) * scale.s3 - mval.s3;
c0 += B * dequantized_weights.s0;
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=1
B.s0123 = read_imageh(src1, gy*2 + (ki+1) * n_4);
B.s4567 = read_imageh(src1, gy*2+1 + (ki+1) * n_4);
dequantized_weights.s0 = ((bits4.s0 & 0x00F0) >> 4) * scale.s0 - mval.s0;
dequantized_weights.s1 = ((bits4.s1 & 0x00F0) >> 4) * scale.s1 - mval.s1;
dequantized_weights.s2 = ((bits4.s2 & 0x00F0) >> 4) * scale.s2 - mval.s2;
dequantized_weights.s3 = ((bits4.s3 & 0x00F0) >> 4) * scale.s3 - mval.s3;
c0 += B * dequantized_weights.s0;
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=2
B.s0123 = read_imageh(src1, gy*2 + (ki+2) * n_4);
B.s4567 = read_imageh(src1, gy*2+1 + (ki+2) * n_4);
dequantized_weights.s0 = ((bits4.s0 & 0x0F00) >> 8) * scale.s0 - mval.s0;
dequantized_weights.s1 = ((bits4.s1 & 0x0F00) >> 8) * scale.s1 - mval.s1;
dequantized_weights.s2 = ((bits4.s2 & 0x0F00) >> 8) * scale.s2 - mval.s2;
dequantized_weights.s3 = ((bits4.s3 & 0x0F00) >> 8) * scale.s3 - mval.s3;
c0 += B * dequantized_weights.s0;
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=3
B.s0123 = read_imageh(src1, gy*2 + (ki+3) * n_4);
B.s4567 = read_imageh(src1, gy*2+1 + (ki+3) * n_4);
dequantized_weights.s0 = ((bits4.s0 & 0xF000) >> 12) * scale.s0 - mval.s0;
dequantized_weights.s1 = ((bits4.s1 & 0xF000) >> 12) * scale.s1 - mval.s1;
dequantized_weights.s2 = ((bits4.s2 & 0xF000) >> 12) * scale.s2 - mval.s2;
dequantized_weights.s3 = ((bits4.s3 & 0xF000) >> 12) * scale.s3 - mval.s3;
c0 += B * dequantized_weights.s0;
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
}
}
int idx = (gy<<3)*m + (gx<<2);
if (idx+3 < m*n_no_padding) {
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
idx += m;
}
if (idx+3 < m*n_no_padding) {
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
idx += m;
}
if (idx+3 < m*n_no_padding) {
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
idx += m;
}
if (idx+3 < m*n_no_padding) {
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
idx += m;
}
if (idx+3 < m*n_no_padding) {
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
idx += m;
}
if (idx+3 < m*n_no_padding) {
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
idx += m;
}
if (idx+3 < m*n_no_padding) {
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
idx += m;
}
if (idx+3 < m*n_no_padding) {
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
}
}

View File

@ -0,0 +1,318 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#ifdef cl_qcom_reqd_sub_group_size
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#endif
#define QK_K 256
#define NSUBGROUPS 4
#define SUBGROUP_SIZE 64
inline void get_scale_min_k4(
int j,
global const uchar * q,
uchar * d,
uchar * m,
uchar mask_d6,
uchar mask_d4,
uchar mask_hi2
) {
if (j < 4) {
*d = q[j] & mask_d6;
*m = q[j+4] & mask_d6;
} else {
*d = (q[j+4] & mask_d4) | ((q[j-4] & mask_hi2) >> 2);
*m = ((q[j+4] >> 4) & mask_d4) | ((q[j] & mask_hi2) >> 2);
}
}
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, minv, y) \
float shared_y; \
shared_y = sub_group_broadcast(y.s0, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s1, 0); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s2, 0); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s3, 0); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s4, 0); \
total_sums.s0 += ((bits4.s2 & 0x000F) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s5, 0); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s6, 0); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s7, 0); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s0, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s1, 1); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s2, 1); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s3, 1); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s4, 1); \
total_sums.s0 += ((bits4.s6 & 0x000F) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s5, 1); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s6, 1); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s7, 1); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, minv, y) \
shared_y = sub_group_broadcast(y.s0, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s1, 2); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s2, 2); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s3, 2); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s4, 2); \
total_sums.s0 += ((bits4.s2 & 0x000F) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s5, 2); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s6, 2); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s7, 2); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s0, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s1, 3); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s2, 3); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s3, 3); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s4, 3); \
total_sums.s0 += ((bits4.s6 & 0x000F) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s5, 3); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s6, 3); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y; \
shared_y = sub_group_broadcast(y.s7, 3); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, minv, y) \
float8 shared_y; \
shared_y = sub_group_broadcast(y, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) * scale.s0 - minv.s0) * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) * scale.s0 - minv.s0) * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) * scale.s1 - minv.s1) * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) * scale.s1 - minv.s1) * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s7; \
shared_y = sub_group_broadcast(y, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) * scale.s0 - minv.s0) * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) * scale.s0 - minv.s0) * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) * scale.s1 - minv.s1) * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) * scale.s1 - minv.s1) * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s7; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, minv, y) \
shared_y = sub_group_broadcast(y, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) * scale.s0 - minv.s0) * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) * scale.s0 - minv.s0) * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) * scale.s1 - minv.s1) * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) * scale.s1 - minv.s1) * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s7; \
shared_y = sub_group_broadcast(y, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) * scale.s0 - minv.s0) * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) * scale.s0 - minv.s0) * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) * scale.s0 - minv.s0) * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) * scale.s0 - minv.s0) * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) * scale.s0 - minv.s0) * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) * scale.s1 - minv.s1) * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) * scale.s1 - minv.s1) * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) * scale.s1 - minv.s1) * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) * scale.s1 - minv.s1) * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) * scale.s1 - minv.s1) * shared_y.s7; \
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_gemv_noshuffle_q4_k_f32(
read_only image1d_buffer_t src0_q,
global half2 * src0_d,
global half2 * src0_m,
global uchar * src0_s,
read_only image1d_buffer_t src1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
uchar mask_d6,
uchar mask_d4,
uchar mask_hi2)
{
uint groupId = get_local_id(1);
uint gid = get_global_id(0);
ushort slid = get_sub_group_local_id();
uint K = ne00;
uint M = ne01;
uint LINE_STRIDE_A = M / 2;
uint BLOCK_STRIDE_A = NSUBGROUPS * M;
uint scales_per_row = (K / QK_K) * 12;
private uint4 regA;
private half2 regS;
private half2 regM;
private float8 regB;
private float2 totalSum = (float2)(0.0f);
for (uint k = groupId; k < (K / 32); k += NSUBGROUPS) {
uint sb = k / 8;
uint j = k % 8;
half2 d = src0_d[gid + sb * LINE_STRIDE_A];
half2 dm = src0_m[gid + sb * LINE_STRIDE_A];
global const uchar * sc0 = src0_s + 2 * gid * scales_per_row + sb * 12;
global const uchar * sc1 = src0_s + (2 * gid + 1) * scales_per_row + sb * 12;
uchar sv0, mn0, sv1, mn1;
get_scale_min_k4(j, sc0, &sv0, &mn0, mask_d6, mask_d4, mask_hi2);
get_scale_min_k4(j, sc1, &sv1, &mn1, mask_d6, mask_d4, mask_hi2);
regS = convert_half2(convert_float2(d) * convert_float2((uchar2)(sv0, sv1)));
regM = convert_half2(convert_float2(dm) * convert_float2((uchar2)(mn0, mn1)));
if (slid < 4) {
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
}
// load half weights for two blocks in consecutive rows
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAST
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regM, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regM, regB);
#endif // VECTOR_SUB_GROUP_BROADCAST
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAST
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regM, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regM, regB);
#endif // VECTOR_SUB_GROUP_BROADCAST
}
// reduction in local memory, assumes #wave=4
local float2 reduceLM[SUBGROUP_SIZE * 3];
if (groupId == 1) {
reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum;
}
if (groupId == 2) {
reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum;
}
if (groupId == 3) {
reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (groupId == 0) {
totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid];
}
if (groupId == 0) {
totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid];
}
if (groupId == 0) {
totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid];
}
// 2 outputs per fiber in wave 0
if (groupId == 0) {
dst = (global float*)((global char*)dst + offsetd);
vstore2(totalSum, 0, &(dst[gid * 2]));
}
}

View File

@ -1340,7 +1340,9 @@ bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
if (buffer && buffer->iface.init_tensor) { if (buffer && buffer->iface.init_tensor) {
buffer->iface.init_tensor(buffer, tensor); buffer->iface.init_tensor(buffer, tensor);
} else { } else {
GGML_LOG_ERROR("Null buffer for tensor passed to init_tensor function\n"); if (!buffer) {
GGML_LOG_ERROR("Tensor with null buffer passed to init_tensor function\n");
}
} }
if (tensor->extra != nullptr) { if (tensor->extra != nullptr) {

View File

@ -70,6 +70,7 @@ static constexpr uint32_t ggml_sycl_fattn_tile_get_config_fp16(const int DKQ, co
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64) GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64) GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64) GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 64, 64)
return 0; return 0;
} }
@ -310,11 +311,11 @@ static __dpct_inline__ void flash_attn_tile_load_tile(const sycl::half2 * const
sycl::half2 * const __restrict__ tile_KV, sycl::half2 * const __restrict__ tile_KV,
const int stride_KV, const int stride_KV,
const int i_sup) { const int i_sup) {
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
constexpr int cpy_nb = ggml_sycl_get_max_cpy_bytes(); constexpr int cpy_nb = ggml_sycl_get_max_cpy_bytes();
constexpr int cpy_ne = cpy_nb / 4; constexpr int cpy_ne = cpy_nb / 4;
auto load = [&] (const int n) { auto load = [&] (const int n) {
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
const int stride_j = warp_size >> n; const int stride_j = warp_size >> n;
if (stride_j == 0) { if (stride_j == 0) {
@ -455,7 +456,7 @@ static __dpct_inline__ void flash_attn_tile_iter_KQ(T_vec_dot * const Q_tmp,
flash_attn_tile_load_tile<warp_size, nwarps, nbatch_fa, nbatch_K, cpy_ne, oob_check> flash_attn_tile_load_tile<warp_size, nwarps, nbatch_fa, nbatch_K, cpy_ne, oob_check>
(K_h2 + int64_t(k_VKQ_0)*stride_K2 + k_KQ_0/2, KV_tmp, stride_K2, k_VKQ_sup); (K_h2 + int64_t(k_VKQ_0)*stride_K2 + k_KQ_0/2, KV_tmp, stride_K2, k_VKQ_sup);
item_ct1.barrier(); item_ct1.barrier(sycl::access::fence_space::local_space);
#ifdef SYCL_FAST_FP16 #ifdef SYCL_FAST_FP16
static_assert((nbatch_K/2) % cpy_ne == 0, "bad nbatch_K"); static_assert((nbatch_K/2) % cpy_ne == 0, "bad nbatch_K");
@ -505,7 +506,7 @@ static __dpct_inline__ void flash_attn_tile_iter_KQ(T_vec_dot * const Q_tmp,
} }
if (k_KQ_0 + nbatch_K < DKQ) { if (k_KQ_0 + nbatch_K < DKQ) {
item_ct1.barrier(); // Sync not needed on last iteration. item_ct1.barrier(sycl::access::fence_space::local_space); // Sync not needed on last iteration.
} }
} }
@ -545,7 +546,7 @@ static __dpct_inline__ void flash_attn_tile_iter(T_vec_dot * const Q_tmp,
const int k_VKQ_max, const int k_VKQ_max,
const int col_Q_0, const int col_Q_0,
float * KQ_max_new_shared) { float * KQ_max_new_shared) {
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
constexpr int cpy_nb = ggml_sycl_get_max_cpy_bytes(); constexpr int cpy_nb = ggml_sycl_get_max_cpy_bytes();
constexpr int cpy_ne = cpy_nb / 4; constexpr int cpy_ne = cpy_nb / 4;
@ -620,14 +621,14 @@ static __dpct_inline__ void flash_attn_tile_iter(T_vec_dot * const Q_tmp,
} }
if constexpr (np == 1) { if constexpr (np == 1) {
item_ct1.barrier(); item_ct1.barrier(sycl::access::fence_space::local_space);
} else { } else {
static_assert(cpw == 1, "bad cpw"); static_assert(cpw == 1, "bad cpw");
if (item_ct1.get_local_id(2) == 0) { if (item_ct1.get_local_id(2) == 0) {
KQ_max_new_shared[item_ct1.get_local_id(1)] = KQ_max_new[0]; KQ_max_new_shared[item_ct1.get_local_id(1)] = KQ_max_new[0];
} }
item_ct1.barrier(); item_ct1.barrier(sycl::access::fence_space::local_space);
KQ_max_new[0] = KQ_max_new_shared[(item_ct1.get_local_id(1) & ~(np - 1)) + item_ct1.get_local_id(2) % np]; KQ_max_new[0] = KQ_max_new_shared[(item_ct1.get_local_id(1) & ~(np - 1)) + item_ct1.get_local_id(2) % np];
KQ_max_new[0] = warp_reduce_max<np>(KQ_max_new[0]); KQ_max_new[0] = warp_reduce_max<np>(KQ_max_new[0]);
} }
@ -697,7 +698,7 @@ static __dpct_inline__ void flash_attn_tile_iter(T_vec_dot * const Q_tmp,
for (int k0 = 0; k0 < nbatch_fa; k0 += nbatch_V) { for (int k0 = 0; k0 < nbatch_fa; k0 += nbatch_V) {
flash_attn_tile_load_tile<warp_size, nwarps, nbatch_V, DV, 0, oob_check> flash_attn_tile_load_tile<warp_size, nwarps, nbatch_V, DV, 0, oob_check>
(V_h2 + int64_t(k_VKQ_0 + k0)*stride_V2, KV_tmp, stride_V2, k_VKQ_sup - k0); (V_h2 + int64_t(k_VKQ_0 + k0)*stride_V2, KV_tmp, stride_V2, k_VKQ_sup - k0);
item_ct1.barrier(); item_ct1.barrier(sycl::access::fence_space::local_space);
#ifdef SYCL_FAST_FP16 #ifdef SYCL_FAST_FP16
#pragma unroll #pragma unroll
@ -765,7 +766,7 @@ static __dpct_inline__ void flash_attn_tile_iter(T_vec_dot * const Q_tmp,
} }
} }
#endif // SYCL_FAST_FP16 #endif // SYCL_FAST_FP16
item_ct1.barrier(); item_ct1.barrier(sycl::access::fence_space::local_space);
} }
} }
@ -972,7 +973,7 @@ static void flash_attn_tile(const char * Q,
} }
} }
item_ct1.barrier(); item_ct1.barrier(sycl::access::fence_space::local_space);
// Main loop over KV cache: // Main loop over KV cache:
const int k_VKQ_max = KV_max ? KV_max[sequence * item_ct1.get_group_range(2) + item_ct1.get_group(2)] : ne11; const int k_VKQ_max = KV_max ? KV_max[sequence * item_ct1.get_group_range(2) + item_ct1.get_group(2)] : ne11;
@ -1051,7 +1052,7 @@ static void flash_attn_tile(const char * Q,
return; return;
} }
item_ct1.barrier(); item_ct1.barrier(sycl::access::fence_space::local_space);
#pragma unroll #pragma unroll
for (int ip = 1; ip < np; ++ip) { for (int ip = 1; ip < np; ++ip) {
@ -1193,37 +1194,39 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_sycl_context & ctx, ggm
constexpr size_t nbytes_shared = 0; constexpr size_t nbytes_shared = 0;
if constexpr (DV <= 256) { if (DV < 512 && Q->ne[1] < 32) {
if (Q->ne[1] > 16/ncols2) { if constexpr (ncols2 <= 32) {
constexpr int cols_per_block = 32; if (Q->ne[1] > 16/ncols2) {
const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size; constexpr int cols_per_block = 32;
const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc); const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
launch_fattn<DV, cols_per_block/ncols2, ncols2, const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
flash_attn_tile<DKQ, DV, cols_per_block / ncols2, ncols2, use_logit_softcap, warp_size>, warp_size> launch_fattn<DV, cols_per_block/ncols2, ncols2,
(ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false); flash_attn_tile<DKQ, DV, cols_per_block / ncols2, ncols2, use_logit_softcap, warp_size>, warp_size>
return; (ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false);
return;
}
} }
} if constexpr (ncols2 <= 16) {
if (Q->ne[1] > 8/ncols2) {
if (Q->ne[1] > 8/ncols2) { constexpr int cols_per_block = 16;
constexpr int cols_per_block = 16; const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size; const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc); launch_fattn<DV, cols_per_block/ncols2, ncols2,
launch_fattn<DV, cols_per_block/ncols2, ncols2, flash_attn_tile<DKQ, DV, cols_per_block / ncols2, ncols2, use_logit_softcap, warp_size>, warp_size>
flash_attn_tile<DKQ, DV, cols_per_block / ncols2, ncols2, use_logit_softcap, warp_size>, warp_size> (ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false);
(ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false); return;
return; }
} }
if constexpr (ncols2 <= 8) {
if constexpr (ncols2 <= 8) { if (Q->ne[1] > 4/ncols2) {
if (Q->ne[1] > 4/ncols2) { constexpr int cols_per_block = 8;
constexpr int cols_per_block = 8; const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size; const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc); launch_fattn<DV, cols_per_block/ncols2, ncols2,
launch_fattn<DV, cols_per_block/ncols2, ncols2, flash_attn_tile<DKQ, DV, cols_per_block / ncols2, ncols2, use_logit_softcap, warp_size>, warp_size>
flash_attn_tile<DKQ, DV, cols_per_block / ncols2, ncols2, use_logit_softcap, warp_size>, warp_size> (ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false);
(ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false); return;
return; }
} }
} }

View File

@ -1112,6 +1112,16 @@ struct vk_op_glu_push_constants {
uint32_t mode; // 0: default, 1: swapped, 2: split uint32_t mode; // 0: default, 1: swapped, 2: split
float alpha; // for swiglu_oai float alpha; // for swiglu_oai
float limit; float limit;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
uint32_t ne01;
uint32_t ne02;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
uint32_t ne11;
uint32_t ne12;
}; };
struct vk_op_unary_push_constants { struct vk_op_unary_push_constants {
@ -5044,7 +5054,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
} else { } else {
device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
} }
vk::DeviceCreateInfo device_create_info; vk::DeviceCreateInfo device_create_info{};
std::vector<const char *> device_extensions; std::vector<const char *> device_extensions;
vk::PhysicalDeviceFeatures device_features = device->physical_device.getFeatures(); vk::PhysicalDeviceFeatures device_features = device->physical_device.getFeatures();
@ -5413,12 +5423,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
#endif #endif
device->name = GGML_VK_NAME + std::to_string(idx); device->name = GGML_VK_NAME + std::to_string(idx);
device_create_info = { device_create_info
vk::DeviceCreateFlags(), .setFlags(vk::DeviceCreateFlags())
device_queue_create_infos, .setQueueCreateInfos(device_queue_create_infos)
{}, .setPEnabledExtensionNames(device_extensions);
device_extensions
};
device_create_info.setPNext(&device_features2); device_create_info.setPNext(&device_features2);
device->device = device->physical_device.createDevice(device_create_info); device->device = device->physical_device.createDevice(device_create_info);
@ -11048,8 +11056,6 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
const float alpha = op_params_f[2]; const float alpha = op_params_f[2];
const float limit = op_params_f[3]; const float limit = op_params_f[3];
GGML_ASSERT(ggml_is_contiguous(src0));
if (!split) { if (!split) {
GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]); GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]);
} else { } else {
@ -11067,7 +11073,17 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
(uint32_t)dst->ne[0], (uint32_t)dst->ne[0],
mode, mode,
alpha, alpha,
limit limit,
(uint32_t)(src0->nb[1] / src0->nb[0]),
(uint32_t)(src0->nb[2] / src0->nb[0]),
(uint32_t)(src0->nb[3] / src0->nb[0]),
(uint32_t)src0->ne[1],
(uint32_t)src0->ne[2],
(uint32_t)(dst->nb[1] / dst->nb[0]),
(uint32_t)(dst->nb[2] / dst->nb[0]),
(uint32_t)(dst->nb[3] / dst->nb[0]),
(uint32_t)dst->ne[1],
(uint32_t)dst->ne[2]
}); });
} }
@ -15217,8 +15233,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_GLU_OP_SWIGLU_OAI: case GGML_GLU_OP_SWIGLU_OAI:
case GGML_GLU_OP_GEGLU_ERF: case GGML_GLU_OP_GEGLU_ERF:
case GGML_GLU_OP_GEGLU_QUICK: case GGML_GLU_OP_GEGLU_QUICK:
return ggml_is_contiguous(op->src[0]) && return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
(op->src[0]->type == op->type); (op->src[0]->type == op->type);
default: default:

View File

@ -16,4 +16,14 @@ layout (push_constant) uniform parameter
uint mode; uint mode;
float alpha; float alpha;
float limit; float limit;
uint nb01;
uint nb02;
uint nb03;
uint ne01;
uint ne02;
uint nb11;
uint nb12;
uint nb13;
uint ne11;
uint ne12;
} p; } p;

View File

@ -8,22 +8,32 @@ void main() {
const uint row = i / p.ne20; const uint row = i / p.ne20;
const uint col = i - row * p.ne20; const uint col = i - row * p.ne20;
const uint i3 = row / (p.ne01 * p.ne02);
const uint i2 = (row % (p.ne01 * p.ne02)) / p.ne01;
const uint i1 = row % p.ne01;
const uint src_idx = i3 * p.nb03 + i2 * p.nb02 + i1 * p.nb01 + col;
const uint dst_i3 = row / (p.ne11 * p.ne12);
const uint dst_i2 = (row % (p.ne11 * p.ne12)) / p.ne11;
const uint dst_i1 = row % p.ne11;
const uint dst_idx = dst_i3 * p.nb13 + dst_i2 * p.nb12 + dst_i1 * p.nb11 + col;
if (p.mode == 0) { if (p.mode == 0) {
// Default // Default
const uint offset = p.ne00 / 2; const uint offset = p.ne00 / 2;
const uint idx = row * p.ne00 + col; const uint idx = src_idx;
data_d[row * offset + col] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset]))); data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset])));
} else if (p.mode == 1) { } else if (p.mode == 1) {
// Swapped // Swapped
const uint offset = p.ne00 / 2; const uint offset = p.ne00 / 2;
const uint idx = row * p.ne00 + col; const uint idx = src_idx;
data_d[row * offset + col] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx]))); data_d[dst_idx] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx])));
} else { } else {
// Split // Split
const uint idx = row * p.ne00 + col; const uint idx = src_idx;
data_d[idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx]))); data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx])));
} }
} }

View File

@ -14,12 +14,12 @@ except ImportError:
SentencePieceProcessor: Any = None SentencePieceProcessor: Any = None
try: try:
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found] from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found] from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import]
from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found] from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found, ty:unresolved-import]
_filter_valid_tokenizer_files, _filter_valid_tokenizer_files,
) )
from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found] from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import]
SentencePieceTokenizer, SentencePieceTokenizer,
) )
except ImportError: except ImportError:
@ -32,7 +32,7 @@ else:
_mistral_common_installed = True _mistral_common_installed = True
try: try:
from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found] from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found, ty:unresolved-import]
get_one_valid_tokenizer_file, get_one_valid_tokenizer_file,
) )
except ImportError: except ImportError:

View File

@ -0,0 +1,154 @@
{%- set image_count = namespace(value=0) %}
{%- set video_count = namespace(value=0) %}
{%- macro render_content(content, do_vision_count, is_system_content=false) %}
{%- if content is string %}
{{- content }}
{%- elif content is iterable and content is not mapping %}
{%- for item in content %}
{%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
{%- if is_system_content %}
{{- raise_exception('System message cannot contain images.') }}
{%- endif %}
{%- if do_vision_count %}
{%- set image_count.value = image_count.value + 1 %}
{%- endif %}
{%- if add_vision_id %}
{{- 'Picture ' ~ image_count.value ~ ': ' }}
{%- endif %}
{{- '<|vision_start|><|image_pad|><|vision_end|>' }}
{%- elif 'video' in item or item.type == 'video' %}
{%- if is_system_content %}
{{- raise_exception('System message cannot contain videos.') }}
{%- endif %}
{%- if do_vision_count %}
{%- set video_count.value = video_count.value + 1 %}
{%- endif %}
{%- if add_vision_id %}
{{- 'Video ' ~ video_count.value ~ ': ' }}
{%- endif %}
{{- '<|vision_start|><|video_pad|><|vision_end|>' }}
{%- elif 'text' in item %}
{{- item.text }}
{%- else %}
{{- raise_exception('Unexpected item type in content.') }}
{%- endif %}
{%- endfor %}
{%- elif content is none or content is undefined %}
{{- '' }}
{%- else %}
{{- raise_exception('Unexpected content type.') }}
{%- endif %}
{%- endmacro %}
{%- if not messages %}
{{- raise_exception('No messages provided.') }}
{%- endif %}
{%- if tools and tools is iterable and tools is not mapping %}
{{- '<|im_start|>system\n' }}
{{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>" }}
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
{%- if messages[0].role == 'system' %}
{%- set content = render_content(messages[0].content, false, true)|trim %}
{%- if content %}
{{- '\n\n' + content }}
{%- endif %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- else %}
{%- if messages[0].role == 'system' %}
{%- set content = render_content(messages[0].content, false, true)|trim %}
{{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" %}
{%- set content = render_content(message.content, false)|trim %}
{%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if ns.multi_step_tool %}
{{- raise_exception('No user query found in messages.') }}
{%- endif %}
{%- for message in messages %}
{%- set content = render_content(message.content, true)|trim %}
{%- if message.role == "system" %}
{%- if not loop.first %}
{{- raise_exception('System message must be at the beginning.') }}
{%- endif %}
{%- elif message.role == "user" %}
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is string %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in content %}
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- set reasoning_content = reasoning_content|trim %}
{%- if loop.index0 > ns.last_query_index %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{%- if loop.first %}
{%- if content|trim %}
{{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
{%- else %}
{{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
{%- endif %}
{%- else %}
{{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
{%- endif %}
{%- if tool_call.arguments is defined %}
{%- for args_name, args_value in tool_call.arguments|items %}
{{- '<parameter=' + args_name + '>\n' }}
{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
{{- args_value }}
{{- '\n</parameter>\n' }}
{%- endfor %}
{%- endif %}
{{- '</function>\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.previtem and loop.previtem.role != "tool" %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- content }}
{{- '\n</tool_response>' }}
{%- if not loop.last and loop.nextitem.role != "tool" %}
{{- '<|im_end|>\n' }}
{%- elif loop.last %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- else %}
{{- raise_exception('Unexpected message role.') }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- else %}
{{- '<think>\n' }}
{%- endif %}
{%- endif %}

View File

@ -147,7 +147,7 @@ ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)] # start, last, nfd
for codepoint, norm in table_nfd: for codepoint, norm in table_nfd:
start = ranges_nfd[-1][0] start = ranges_nfd[-1][0]
if ranges_nfd[-1] != (start, codepoint - 1, norm): if ranges_nfd[-1] != (start, codepoint - 1, norm):
ranges_nfd.append(None) # type: ignore[arg-type] # dummy, will be replaced below ranges_nfd.append((0, 0, 0)) # dummy, will be replaced below
start = codepoint start = codepoint
ranges_nfd[-1] = (start, codepoint, norm) ranges_nfd[-1] = (start, codepoint, norm)

View File

@ -1 +1 @@
c044a8eeae2591faa0950c8b5e514cbc4bbfc4ca a04eea0761a85d18f3f504d6ab970c5c9dce705f

View File

@ -5,7 +5,7 @@ import os
import sys import sys
import subprocess import subprocess
HTTPLIB_VERSION = "refs/tags/v0.39.0" HTTPLIB_VERSION = "refs/tags/v0.40.0"
vendor = { vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp", "https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",

View File

@ -294,7 +294,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
} }
// get extra buffer types of the CPU // get extra buffer types of the CPU
// TODO: a more general solution for non-CPU extra buft should be imlpemented in the future // TODO: a more general solution for non-CPU extra buft should be implemented in the future
// ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948 // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
std::vector<ggml_backend_buffer_type_t> buft_extra; std::vector<ggml_backend_buffer_type_t> buft_extra;
{ {

View File

@ -557,6 +557,8 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ROPE_FACTORS_LONG,
LLM_TENSOR_ROPE_FACTORS_SHORT,
LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K, LLM_TENSOR_ATTN_K,

View File

@ -18,7 +18,7 @@ struct llama_ubatch {
} }
// typical for M-RoPE cases: // typical for M-RoPE cases:
// 0 - sequantial position of the tokens/embeddings in the sequence // 0 - sequential position of the tokens/embeddings in the sequence
// 1 - y position in the image // 1 - y position in the image
// 2 - x position in the image // 2 - x position in the image
// 3 - other // 3 - other

View File

@ -586,7 +586,7 @@ void llama_context::sched_reserve() {
// reserve again with pp graph to avoid ggml-alloc reallocations during inference // reserve again with pp graph to avoid ggml-alloc reallocations during inference
{ {
// TODO: not sure if the following graph would be worster case for multi-stream KV caches: // TODO: not sure if the following graph would be worst case for multi-stream KV caches:
// //
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get()); // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
// //

View File

@ -1665,7 +1665,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
ggml_tensor * llm_graph_context::build_inp_out_ids() const { ggml_tensor * llm_graph_context::build_inp_out_ids() const {
// note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls, // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
// but this would make the graph topology depend on the number of output tokens, which can interere with // but this would make the graph topology depend on the number of output tokens, which can interfere with
// features that require constant topology such as pipeline parallelism // features that require constant topology such as pipeline parallelism
// ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471 // ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
//if (n_outputs < n_tokens) { //if (n_outputs < n_tokens) {

View File

@ -333,7 +333,7 @@ public:
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
// store k_cur and v_cur in the cache based on the provided head location // store k_cur and v_cur in the cache based on the provided head location
// note: the heads in k_cur and v_cur should be layed out contiguously in memory // note: the heads in k_cur and v_cur should be laid out contiguously in memory
// - k_cur [n_embd_head_k, n_head_k, n_tokens] // - k_cur [n_embd_head_k, n_head_k, n_tokens]
// - k_idxs [n_tokens] // - k_idxs [n_tokens]
// - v_cur [n_embd_head_v, n_head_v, n_tokens] // - v_cur [n_embd_head_v, n_head_v, n_tokens]

View File

@ -1158,6 +1158,12 @@ struct ggml_tensor * llama_model_loader::create_tensor(
if (overrides->buft == ggml_backend_cpu_buffer_type()) { if (overrides->buft == ggml_backend_cpu_buffer_type()) {
// when overriding to a CPU buffer, consider the extra buffer types // when overriding to a CPU buffer, consider the extra buffer types
buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu); buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
if (use_mmap) {
static std::once_flag once;
std::call_once(once, [] {
LLAMA_LOG_WARN("llama_model_loader: tensor overrides to CPU are used with mmap enabled - consider using --no-mmap for better performance\n");
});
}
} else { } else {
buft = overrides->buft; buft = overrides->buft;
} }

View File

@ -9,7 +9,7 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1); cb(inpL, "inp_scaled", -1);

View File

@ -9,7 +9,7 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1); cb(inpL, "inp_scaled", -1);

View File

@ -12,7 +12,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1); cb(inpL, "inp_scaled", -1);

View File

@ -118,12 +118,12 @@ int main(int argc, char ** argv) {
common_params params; common_params params;
params.out_file = "tests.txt"; params.out_file = "tests.txt";
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
return 1; return 1;
} }
common_init();
// Load CPU-only // Load CPU-only
ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
params.devices = { cpu_device, nullptr }; params.devices = { cpu_device, nullptr };

View File

@ -8424,6 +8424,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1023, 2, 1, 3}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1023, 2, 1, 3}, order));
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1024, 2, 1, 3}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1024, 2, 1, 3}, order));
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1025, 2, 1, 3}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1025, 2, 1, 3}, order));
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1025, 256, 1, 1}, order)); // test ceildiv in CUDA's CUB's DeviceSegmentedSort
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2047, 2, 1, 3}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2047, 2, 1, 3}, order));
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2048, 2, 1, 3}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2048, 2, 1, 3}, order));
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2049, 2, 1, 3}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2049, 2, 1, 3}, order));

View File

@ -1330,7 +1330,7 @@ static void test_nemotron_reasoning_detection(testing & t) {
analysis.analyze_template(tmpl); analysis.analyze_template(tmpl);
// Check reasoning markers // Check reasoning markers
t.assert_equal("reasoning_start should be '<think>'", "<think>", analysis.reasoning.start); t.assert_equal("reasoning_start should be '<think>\\n'", "<think>\n", analysis.reasoning.start);
t.assert_equal("reasoning_end should be '</think>'", "</think>", analysis.reasoning.end); t.assert_equal("reasoning_end should be '</think>'", "</think>", analysis.reasoning.end);
// Check reasoning mode detection // Check reasoning mode detection

File diff suppressed because it is too large Load Diff

View File

@ -387,6 +387,24 @@ static void test_expressions(testing & t) {
"Bob" "Bob"
); );
test_template(t, "empty computed member defaults to undefined",
"{{ a[]|default('fallback') }}",
{{"a", {{"name", "Bob"}}}},
"fallback"
);
test_template(t, "empty computed member is undefined",
"{{ a[] is undefined }}",
{{"a", {{"name", "Bob"}}}},
"True"
);
test_template(t, "undefined computed member is undefined",
"{{ a[undefined] is undefined }}",
{{"a", {{"name", "Bob"}}}},
"True"
);
test_template(t, "array access", test_template(t, "array access",
"{{ items[1] }}", "{{ items[1] }}",
{{"items", json::array({"a", "b", "c"})}}, {{"items", json::array({"a", "b", "c"})}},

View File

@ -1525,6 +1525,47 @@ int main() {
} }
}); });
// C++ only tests (features not yet supported in JS/Python implementations)
{
fprintf(stderr, "#\n# Testing C++ only features\n#\n");
auto run = [](const TestCase & tc) {
fprintf(stderr, "- %s\n", tc.name.c_str());
try {
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
tc.verify_status(SUCCESS);
} catch (const std::invalid_argument & ex) {
fprintf(stderr, "Error: %s\n", ex.what());
tc.verify_status(FAILURE);
}
};
run({
SUCCESS,
"regexp with non-capturing group",
R"""({
"type": "string",
"pattern": "^(?:foo|bar)baz$"
})""",
R"""(
root ::= "\"" (("foo" | "bar") "baz") "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});
run({
SUCCESS,
"regexp with nested non-capturing groups",
R"""({
"type": "string",
"pattern": "^(?:(?:ab)+c)?d$"
})""",
R"""(
root ::= "\"" ((("ab")+ "c")? "d") "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});
}
if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) { if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) {
fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m"); fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m");
} else { } else {

Some files were not shown because too many files have changed in this diff Show More