Merge branch 'master' into quantize

This commit is contained in:
Ed Addario 2026-03-01 09:27:37 +00:00
commit 53cff1ed33
No known key found for this signature in database
GPG Key ID: E7875815A3230993
164 changed files with 10188 additions and 4191 deletions

View File

@ -1,8 +1,8 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=7.0
ARG AMDGPU_VERSION=7.0
ARG ROCM_VERSION=7.2
ARG AMDGPU_VERSION=7.2
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
@ -11,13 +11,12 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx906, gfx1032, gfx1101, gfx1102,not officialy supported
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.0/reference/system-requirements.html
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html
ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
#ARG ROCM_DOCKER_ARCH='gfx1151'
ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201'
# Set ROCm architectures
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}

View File

@ -11,5 +11,5 @@ runs:
- name: Setup ROCm
uses: ./.github/actions/install-exe
with:
url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-WinSvr2022-For-HIP.exe
url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-Win11-For-HIP.exe
args: -install

View File

@ -68,7 +68,7 @@ jobs:
env:
# Make sure this is in sync with build.yml
HIPSDK_INSTALLER_VERSION: "25.Q3"
HIPSDK_INSTALLER_VERSION: "26.Q1"
steps:
- name: Clone

View File

@ -1175,10 +1175,8 @@ jobs:
runs-on: windows-2022
env:
# The ROCm version must correspond to the version used in the HIP SDK.
ROCM_VERSION: "6.4.2"
# Make sure this is in sync with build-cache.yml
HIPSDK_INSTALLER_VERSION: "25.Q3"
HIPSDK_INSTALLER_VERSION: "26.Q1"
steps:
- name: Clone
@ -1188,7 +1186,7 @@ jobs:
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/${{ env.ROCM_VERSION }}/pool/main/r/rocwmma-dev/rocwmma-dev_1.7.0.60402-120~24.04_amd64.deb"
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
@ -1231,7 +1229,7 @@ jobs:
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `

View File

@ -21,7 +21,7 @@ on:
jobs:
deploy:
runs-on: ubuntu-slim
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6

View File

@ -516,17 +516,113 @@ jobs:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
ubuntu-22-rocm:
runs-on: ubuntu-22.04
strategy:
matrix:
include:
- ROCM_VERSION: "7.2"
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201"
build: 'x64'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt install -y build-essential git cmake wget
- name: Setup Legacy ROCm
if: matrix.ROCM_VERSION == '7.2'
id: legacy_env
run: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
sudo tee /etc/apt/sources.list.d/rocm.list << EOF
deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ matrix.ROCM_VERSION }} jammy main
EOF
sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF
Package: *
Pin: release o=repo.radeon.com
Pin-Priority: 600
EOF
sudo apt update
sudo apt-get install -y libssl-dev rocm-hip-sdk
- name: Setup TheRock
if: matrix.ROCM_VERSION != '7.2'
id: therock_env
run: |
wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
mkdir install
tar -xf *.tar.gz -C install
export ROCM_PATH=$(pwd)/install
echo ROCM_PATH=$ROCM_PATH >> $GITHUB_ENV
echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV
echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGPU_TARGETS="${{ matrix.gpu_targets }}" \
-DGGML_HIP=ON \
-DHIP_PLATFORM=amd \
-DGGML_HIP_ROCWMMA_FATTN=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
windows-hip:
runs-on: windows-2022
env:
HIPSDK_INSTALLER_VERSION: "25.Q3"
HIPSDK_INSTALLER_VERSION: "26.Q1"
strategy:
matrix:
include:
- name: "radeon"
gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps:
- name: Clone
@ -536,7 +632,7 @@ jobs:
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.0.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.0.0.70001-42~24.04_amd64.deb"
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
@ -559,7 +655,7 @@ jobs:
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$completed = $proc.WaitForExit(600000)
@ -593,20 +689,20 @@ jobs:
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.0.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_BACKEND_DL=ON `
-DGGML_NATIVE=OFF `
-DGGML_CPU=OFF `
-DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
@ -784,6 +880,7 @@ jobs:
- windows-cuda
- windows-sycl
- windows-hip
- ubuntu-22-rocm
- ubuntu-22-cpu
- ubuntu-22-vulkan
- macOS-arm64
@ -868,6 +965,7 @@ jobs:
**Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
**Windows:**

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
include(CheckIncludeFileCXX)

View File

@ -1578,7 +1578,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--temp"}, "N",
{"--temp", "--temperature"}, "N",
string_format("temperature (default: %.2f)", (double)params.sampling.temp),
[](common_params & params, const std::string & value) {
params.sampling.temp = std::stof(value);
@ -1611,7 +1611,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--top-nsigma"}, "N",
{"--top-nsigma", "--top-n-sigma"}, "N",
string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma),
[](common_params & params, const std::string & value) {
params.sampling.top_n_sigma = std::stof(value);
@ -1634,7 +1634,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--typical"}, "N",
{"--typical", "--typical-p"}, "N",
string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p),
[](common_params & params, const std::string & value) {
params.sampling.typ_p = std::stof(value);
@ -2520,11 +2520,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
));
add_opt(common_arg(
{"-a", "--alias"}, "STRING",
"set alias for model name (to be used by REST API)",
"set model name aliases, comma-separated (to be used by API)",
[](common_params & params, const std::string & value) {
params.model_alias = value;
for (auto & alias : string_split<std::string>(value, ',')) {
alias = string_strip(alias);
if (!alias.empty()) {
params.model_alias.insert(alias);
}
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
add_opt(common_arg(
{"--tags"}, "STRING",
"set model tags, comma-separated (informational, not used for routing)",
[](common_params & params, const std::string & value) {
for (auto & tag : string_split<std::string>(value, ',')) {
tag = string_strip(tag);
if (!tag.empty()) {
params.model_tags.insert(tag);
}
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS"));
add_opt(common_arg(
{"-m", "--model"}, "FNAME",
ex == LLAMA_EXAMPLE_EXPORT_LORA

View File

@ -803,7 +803,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
}
// remove potential partial suffix
if (builder.pos() == builder.input().size()) {
if (builder.pos() == builder.input().size() && builder.is_partial()) {
if (unclosed_reasoning_content.empty()) {
rstrip(content);
trim_potential_partial_word(content);

View File

@ -893,23 +893,6 @@ static void common_chat_parse_minimax_m2(common_chat_msg_parser & builder) {
builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
}
static void common_chat_parse_qwen3_coder_xml(common_chat_msg_parser & builder) {
static const xml_tool_call_format form = ([]() {
xml_tool_call_format form {};
form.scope_start = "<tool_call>";
form.tool_start = "<function=";
form.tool_sep = ">";
form.key_start = "<parameter=";
form.key_val_sep = ">";
form.val_end = "</parameter>";
form.tool_end = "</function>";
form.scope_end = "</tool_call>";
form.trim_raw_argval = true;
return form;
})();
builder.consume_reasoning_with_xml_tool_calls(form);
}
static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
static const xml_tool_call_format form = ([]() {
xml_tool_call_format form {};
@ -1590,9 +1573,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_KIMI_K2:
common_chat_parse_kimi_k2(builder);
break;
case COMMON_CHAT_FORMAT_QWEN3_CODER_XML:
common_chat_parse_qwen3_coder_xml(builder);
break;
case COMMON_CHAT_FORMAT_APRIEL_1_5:
common_chat_parse_apriel_1_5(builder);
break;

View File

@ -736,7 +736,6 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_MINIMAX_M2: return "MiniMax-M2";
case COMMON_CHAT_FORMAT_GLM_4_5: return "GLM 4.5";
case COMMON_CHAT_FORMAT_KIMI_K2: return "Kimi K2";
case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
@ -1522,14 +1521,17 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
return data;
}
static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) {
static common_chat_params common_chat_params_init_qwen3_coder(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED;
// Nemotron Nano 3 and Step-3.5-Flash use the Qwen3 Coder tool calling with thinking
bool supports_reasoning = (tmpl.source().find("<think>") != std::string::npos);
// Handle thinking tags appropriately based on inputs.enable_thinking
if (string_ends_with(data.prompt, "<think>\n")) {
if (supports_reasoning && string_ends_with(data.prompt, "<think>\n")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
@ -1538,19 +1540,21 @@ static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_
}
data.preserved_tokens = {
"<think>",
"</think>",
"<tool_call>",
"</tool_call>",
};
if (supports_reasoning) {
data.preserved_tokens.insert(data.preserved_tokens.end(), {"<think>", "</think>"});
}
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = true;
auto parser = build_chat_peg_constructed_parser([&](auto & p) {
auto reasoning = p.eps();
if (inputs.enable_thinking && extract_reasoning) {
if (supports_reasoning && inputs.enable_thinking && extract_reasoning) {
auto reasoning_content = p.reasoning(p.until("</think>")) + ("</think>" | p.end());
if (data.thinking_forced_open) {
reasoning = reasoning_content;
@ -1888,38 +1892,6 @@ static common_chat_params common_chat_params_init_minimax_m2(const common_chat_t
return data;
}
static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_chat_template & tmpl, const struct templates_params & params) {
common_chat_params data;
data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.prompt = apply(tmpl, params);
data.format = COMMON_CHAT_FORMAT_QWEN3_CODER_XML;
data.preserved_tokens = {
"<tool_call>",
"</tool_call>",
"<function=",
"</function>",
"<parameter=",
"</parameter>",
};
// build grammar for tool call
static const xml_tool_call_format form {
/* form.scope_start = */ "<tool_call>\n",
/* form.tool_start = */ "<function=",
/* form.tool_sep = */ ">\n",
/* form.key_start = */ "<parameter=",
/* form.key_val_sep = */ ">\n",
/* form.val_end = */ "\n</parameter>\n",
/* form.tool_end = */ "</function>\n",
/* form.scope_end = */ "</tool_call>",
};
build_grammar_xml_tool_call(data, params.tools, form);
return data;
}
static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl, const struct templates_params & params) {
common_chat_params data;
data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@ -3147,13 +3119,7 @@ static common_chat_params common_chat_templates_apply_jinja(
src.find("<function=") != std::string::npos &&
src.find("<parameter=") != std::string::npos) {
workaround::func_args_not_string(params.messages);
// Models with <think> support (Step-3.5-Flash, Nemotron 3 Nano) use the
// Nemotron v3 PEG parser for streaming and schema-aware parameter parsing.
// Qwen3-Coder has no <think> in its template.
if (src.find("<think>") != std::string::npos) {
return common_chat_params_init_nemotron_v3(tmpl, params);
}
return common_chat_params_init_qwen3_coder_xml(tmpl, params);
return common_chat_params_init_qwen3_coder(tmpl, params);
}
// Xiaomi MiMo format detection (must come before Hermes 2 Pro)

View File

@ -128,7 +128,6 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_GLM_4_5,
COMMON_CHAT_FORMAT_MINIMAX_M2,
COMMON_CHAT_FORMAT_KIMI_K2,
COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
COMMON_CHAT_FORMAT_APRIEL_1_5,
COMMON_CHAT_FORMAT_XIAOMI_MIMO,
COMMON_CHAT_FORMAT_SOLAR_OPEN,

View File

@ -1760,3 +1760,65 @@ float lr_opt::get_lr(float epoch) const {
LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
return r;
}
bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos) {
llama_batch batch = llama_batch_get_one(&last_token, 1);
batch.pos = &pos;
if (llama_decode(ctx, batch)) {
LOG_ERR("%s: failed to replay last token\n", __func__);
return false;
}
return true;
}
bool common_prompt_batch_decode(
struct llama_context * ctx,
const std::vector<llama_token> & tokens,
int & n_past,
int n_batch,
std::string_view state_path,
bool save_state) {
const int n_eval = tokens.size();
if (n_eval == 0) {
return true;
}
if (save_state && n_eval > 1) {
const int n_tokens_before_last = n_eval - 1;
GGML_ASSERT(n_eval <= n_batch);
// Decode all but the last token so we can save the memory state before decoding the last token.
// This is done so we can restore the session state later and replay the last token.
// Memory implementations in recurrent/hybrid models don't support removing tokens from their
// memory, so we can't just remove the last token from the memory and replay the last token which
// is the reason for this logic.
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_tokens_before_last))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
n_past += n_tokens_before_last;
llama_state_save_file(ctx, state_path.data(), tokens.data(), n_tokens_before_last);
LOG_INF("saved session before last token to %s, n_tokens = %d\n", state_path.data(), n_tokens_before_last);
llama_token last_token = tokens.back();
llama_batch batch = llama_batch_get_one(&last_token, 1);
int32_t pos = n_past;
batch.pos = &pos;
if (llama_decode(ctx, batch)) {
LOG_ERR("%s : failed to eval last token\n", __func__);
return false;
}
n_past++;
} else {
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_eval))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
return true;
}

View File

@ -410,7 +410,8 @@ struct common_params {
struct common_params_model model;
std::string model_alias = ""; // model alias // NOLINT
std::set<std::string> model_alias; // model aliases // NOLINT
std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT
std::string hf_token = ""; // HF token // NOLINT
std::string prompt = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
@ -804,6 +805,23 @@ void common_batch_add(
const std::vector<llama_seq_id> & seq_ids,
bool logits);
// decodes a single batch of tokens for a prompt and manages session tokens
//
// Note: We save state before the last token so that we can replay it to ensure
// compatibility with all memory types. Recurrent/hybrid models cannot remove
// tokens from memory, so this approach works across all model architectures.
bool common_prompt_batch_decode(
struct llama_context * ctx,
const std::vector<llama_token> & embd,
int & n_past,
int n_batch,
std::string_view state_path,
bool save_state);
// replays the last token after loading state to regenerate logits
// used after loading session state to ensure the sampling context has valid logits
bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos);
//
// Vocab utils
//

View File

@ -85,7 +85,7 @@ value identifier::execute_impl(context & ctx) {
auto builtins = global_builtins();
if (!it->is_undefined()) {
if (ctx.is_get_stats) {
it->stats.used = true;
value_t::stats_t::mark_used(it);
}
JJ_DEBUG("Identifier '%s' found, type = %s", val.c_str(), it->type().c_str());
return it;
@ -277,7 +277,7 @@ value binary_expression::execute_impl(context & ctx) {
static value try_builtin_func(context & ctx, const std::string & name, value & input, bool undef_on_missing = false) {
JJ_DEBUG("Trying built-in function '%s' for type %s", name.c_str(), input->type().c_str());
if (ctx.is_get_stats) {
input->stats.used = true;
value_t::stats_t::mark_used(input);
input->stats.ops.insert(name);
}
auto builtins = input->get_builtins();
@ -448,7 +448,7 @@ value for_statement::execute_impl(context & ctx) {
// mark the variable being iterated as used for stats
if (ctx.is_get_stats) {
iterable_val->stats.used = true;
value_t::stats_t::mark_used(iterable_val);
iterable_val->stats.ops.insert("array_access");
}
@ -470,7 +470,7 @@ value for_statement::execute_impl(context & ctx) {
items.push_back(std::move(tuple));
}
if (ctx.is_get_stats) {
iterable_val->stats.used = true;
value_t::stats_t::mark_used(iterable_val);
iterable_val->stats.ops.insert("object_access");
}
} else {
@ -480,7 +480,7 @@ value for_statement::execute_impl(context & ctx) {
items.push_back(item);
}
if (ctx.is_get_stats) {
iterable_val->stats.used = true;
value_t::stats_t::mark_used(iterable_val);
iterable_val->stats.ops.insert("array_access");
}
}
@ -721,6 +721,8 @@ value member_expression::execute_impl(context & ctx) {
int64_t arr_size = 0;
if (is_val<value_array>(object)) {
arr_size = object->as_array().size();
} else if (is_val<value_string>(object)) {
arr_size = object->as_string().length();
}
if (is_stmt<slice_expression>(this->property)) {
@ -817,8 +819,9 @@ value member_expression::execute_impl(context & ctx) {
}
if (ctx.is_get_stats && val && object && property) {
val->stats.used = true;
object->stats.used = true;
value_t::stats_t::mark_used(val);
value_t::stats_t::mark_used(object);
value_t::stats_t::mark_used(property);
if (is_val<value_int>(property)) {
object->stats.ops.insert("array_access");
} else if (is_val<value_string>(property)) {

View File

@ -161,6 +161,11 @@ static value tojson(const func_args & args) {
value val_separators = args.get_kwarg_or_pos("separators", 3);
value val_sort = args.get_kwarg_or_pos("sort_keys", 4);
int indent = -1;
if (args.ctx.is_get_stats) {
// mark as used (recursively) for stats
auto val_input = args.get_pos(0);
value_t::stats_t::mark_used(const_cast<value&>(val_input), true);
}
if (is_val<value_int>(val_indent)) {
indent = static_cast<int>(val_indent->as_int());
}
@ -891,6 +896,11 @@ const func_builtins & value_array_t::get_builtins() const {
}},
{"string", [](const func_args & args) -> value {
args.ensure_vals<value_array>();
if (args.ctx.is_get_stats) {
// mark as used (recursively) for stats
auto val_input = args.get_pos(0);
value_t::stats_t::mark_used(const_cast<value&>(val_input), true);
}
return mk_val<value_string>(args.get_pos(0)->as_string());
}},
{"tojson", tojson},
@ -1046,6 +1056,11 @@ const func_builtins & value_object_t::get_builtins() const {
{"tojson", tojson},
{"string", [](const func_args & args) -> value {
args.ensure_vals<value_object>();
if (args.ctx.is_get_stats) {
// mark as used (recursively) for stats
auto val_input = args.get_pos(0);
value_t::stats_t::mark_used(const_cast<value&>(val_input), true);
}
return mk_val<value_string>(args.get_pos(0)->as_string());
}},
{"length", [](const func_args & args) -> value {
@ -1358,4 +1373,21 @@ std::string value_to_string_repr(const value & val) {
}
}
// stats utility
void value_t::stats_t::mark_used(value & val, bool deep) {
val->stats.used = true;
if (deep) {
if (is_val<value_array>(val)) {
for (auto & item : val->val_arr) {
mark_used(item, deep);
}
} else if (is_val<value_object>(val)) {
for (auto & pair : val->val_obj) {
mark_used(pair.first, deep);
mark_used(pair.second, deep);
}
}
}
}
} // namespace jinja

View File

@ -118,6 +118,8 @@ struct value_t {
bool used = false;
// ops can be builtin calls or operators: "array_access", "object_access"
std::set<std::string> ops;
// utility to recursively mark value and its children as used
static void mark_used(value & val, bool deep = false);
} stats;
value_t() = default;

View File

@ -116,7 +116,8 @@ class ModelBase:
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
disable_mistral_community_chat_template: bool = False,
sentence_transformers_dense_modules: bool = False):
sentence_transformers_dense_modules: bool = False,
fuse_gate_up_exps: bool = False):
if type(self) is ModelBase or \
type(self) is TextModel or \
type(self) is MmprojModel:
@ -135,6 +136,9 @@ class ModelBase:
self.dry_run = dry_run
self.remote_hf_model_id = remote_hf_model_id
self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
self.fuse_gate_up_exps = fuse_gate_up_exps
self._gate_exp_buffer: dict[int, Tensor] = {}
self._up_exp_buffer: dict[int, Tensor] = {}
self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
self.metadata_override = metadata_override
@ -512,8 +516,31 @@ class ModelBase:
raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
return [(self.map_tensor_name(name), data_torch)]
new_name = self.map_tensor_name(name)
# Handle gate/up expert tensor fusion if enabled
if self.fuse_gate_up_exps and bid is not None:
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid):
self._gate_exp_buffer[bid] = data_torch
elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid):
self._up_exp_buffer[bid] = data_torch
# Check if both gate and up are buffered for this layer
if bid in self._gate_exp_buffer and bid in self._up_exp_buffer:
gate_data = self._gate_exp_buffer.pop(bid)
up_data = self._up_exp_buffer.pop(bid)
# gate/up shape: (n_expert, n_ff, n_embd), concatenate to (n_expert, n_ff*2, n_embd)
fused_data = torch.cat([gate_data, up_data], dim=1)
fused_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, bid)
logger.info(f"Fused gate_exps and up_exps for layer {bid}")
return [(fused_name, fused_data)]
# If we buffered a gate/up tensor, wait for the other
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid) or \
self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid):
return []
return [(new_name, data_torch)]
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
del name, new_name, bid, n_dims # unused
@ -1148,6 +1175,9 @@ class TextModel(ModelBase):
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-v2-de"
if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d":
# ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano
res = "jina-v5-nano"
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
@ -1274,6 +1304,9 @@ class TextModel(ModelBase):
if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d":
# ref: https://huggingface.co/jdopensource/JoyAI-LLM-Flash
res = "joyai-llm"
if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869":
# ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601
res = "kanana2"
if res is None:
logger.warning("\n")
@ -6122,6 +6155,32 @@ class NeoBert(BertModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model")
class EuroBertModel(TextModel):
model_arch = gguf.MODEL_ARCH.EUROBERT
def set_vocab(self):
self.gguf_writer.add_add_bos_token(False)
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
# EuroBert is bidirectional (encoder)
self.gguf_writer.add_causal_attention(False)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self._try_set_pooling_type()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Strip "model." prefix from tensor names
if name.startswith("model."):
name = name[6:]
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
class XLMRobertaModel(BertModel):
model_arch = gguf.MODEL_ARCH.BERT
@ -11910,6 +11969,11 @@ def parse_args() -> argparse.Namespace:
"Default these modules are not included.")
)
parser.add_argument(
"--fuse-gate-up-exps", action="store_true",
help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.",
)
args = parser.parse_args()
if not args.print_supported_models and args.model is None:
parser.error("the following arguments are required: model")
@ -12047,7 +12111,8 @@ def main() -> None:
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
small_first_shard=args.no_tensor_first_split,
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
fuse_gate_up_exps=args.fuse_gate_up_exps
)
if args.vocab_only:

View File

@ -107,6 +107,7 @@ models = [
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "jina-v5-nano", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v5-text-nano", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
@ -152,6 +153,7 @@ models = [
{"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", },
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
]
# some models are known to be broken upstream, so we will skip them as exceptions

View File

@ -152,7 +152,9 @@ Commands and data are serialized using a custom binary protocol with:
- **VM-specific**: Only works in virtual machines with virtio-gpu support
- **Host dependency**: Requires properly configured host-side backend
- **Latency**: Small overhead from VM escaping for each operation
- **Shared-memory size**: with the `libkrun` hypervisor, the RAM + VRAM
addressable memory is limited to 64 GB. So the maximum GPU memory
will be `64GB - RAM`, regardless of the hardware VRAM size.
* This work is pending upstream changes in the VirglRenderer
project.

View File

@ -22,7 +22,7 @@
**Llama.cpp + ZenDNN**
The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL BLIS, LibXSMM, OneDNN).
The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL DLP, LibXSMM, OneDNN).
For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendnn.html
@ -32,7 +32,7 @@ For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendn
|:-------:|:-------:|:----------------------------------------------:|
| Linux | Support | Ubuntu 20.04, 22.04, 24.04 |
For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/zendnnl/README.md#15-supported-os).
For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md#15-supported-os).
## Hardware
@ -44,9 +44,9 @@ ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based
| CPU Family | Status | Notes |
|:-----------------------------:|:-------:|:----------------------------------:|
| AMD EPYC™ 9005 Series (Turin)| Support | 5th Gen - Zen 5 architecture |
| AMD EPYC™ 9004 Series (Genoa)| Support | 4th Gen - Zen 4 architecture |
| AMD EPYC™ 7003 Series (Milan)| Support | 3rd Gen - Zen 3 architecture |
| AMD EPYC™ 9005 Series (Turin) | Support | 5th Gen - Zen 5 architecture |
| AMD EPYC™ 9004 Series (Genoa) | Support | 4th Gen - Zen 4 architecture |
| AMD EPYC™ 7003 Series (Milan) | Support | 3rd Gen - Zen 3 architecture |
| AMD Ryzen™ AI MAX (Strix Halo)| Support | High-performance mobile processors |
*Notes:*
@ -61,7 +61,7 @@ The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** ope
| Operation | Status | Notes |
|:-------------|:-------:|:----------------------------------------------:|
| MUL_MAT | | Accelerated via ZenDNN LowOHA MatMul |
| MUL_MAT | Support | Accelerated via ZenDNN LowOHA MatMul |
*Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).
@ -104,7 +104,6 @@ If you want to build ZenDNN yourself or use a specific version:
# Clone ZenDNN repository
git clone https://github.com/amd/ZenDNN.git
cd ZenDNN
git checkout zendnnl
# Build and install (requires CMake >= 3.25)
mkdir build && cd build
@ -114,7 +113,7 @@ cmake --build . --target all
Default installation path: `ZenDNN/build/install`
**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/zendnnl/README.md).
**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md).
**Step 2: Build llama.cpp with custom ZenDNN path**
@ -146,8 +145,7 @@ Run llama.cpp server with ZenDNN acceleration:
```sh
# Set optimal configuration
export OMP_NUM_THREADS=64 # Adjust to your CPU core count
export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS for best performance
export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo for best performance
# Start server
./build/bin/llama-server \
@ -160,62 +158,26 @@ export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS for best performance
Access the server at `http://localhost:8080`.
**Performance tips**:
- Set `OMP_NUM_THREADS` to match your physical core count
- Use `ZENDNNL_MATMUL_ALGO=2` for optimal performance
- Use `ZENDNNL_MATMUL_ALGO=1` for optimal performance
- For NUMA systems: `numactl --cpunodebind=0 --membind=0 ./build/bin/llama-server ...`
## Environment Variable
### Build Time
For environment variables related to ZenDNN, refer to the [ZenDNN Environment Variables Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md).
| Name | Value | Function |
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_ZENDNN | ON/OFF | Enable ZenDNN backend support |
| ZENDNN_ROOT | Path to ZenDNN installation | Set ZenDNN installation directory |
| GGML_OPENMP | ON/OFF (recommended: ON) | Enable OpenMP for multi-threading |
### Performance Optimization
### Runtime
| Name | Value | Function |
|-------------------------|--------------------------|-------------------------------------------------------------------|
| OMP_NUM_THREADS | Number (e.g., 64) | Set number of OpenMP threads (recommended: physical core count) |
| ZENDNNL_MATMUL_ALGO | 0-5 | Select MatMul backend algorithm (see Performance Optimization) |
| ZENDNNL_PROFILE_LOG_LEVEL | 0-4 | Profiling log level (0=disabled, 4=verbose) |
| ZENDNNL_ENABLE_PROFILER | 0 or 1 | Enable detailed profiling (1=enabled) |
| ZENDNNL_API_LOG_LEVEL | 0-4 | API log level (0=disabled, 4=verbose) |
**Example**:
ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL DLP** algorithm:
```sh
export OMP_NUM_THREADS=64
export ZENDNNL_MATMUL_ALGO=2 # Use Blocked AOCL BLIS for best performance
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Test" -n 100
export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo (recommended)
```
## Performance Optimization
### MatMul Algorithm Selection
ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL BLIS** algorithm:
```sh
export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS (recommended)
```
**Available algorithms**:
| Value | Algorithm | Description |
|:-----:|:-----------------------|:----------------------------------------------|
| 0 | Dynamic Dispatch | Automatic backend selection (default) |
| 1 | AOCL BLIS | AOCL BLIS backend |
| 2 | AOCL BLIS Blocked | **Blocked AOCL BLIS (recommended)** |
| 3 | OneDNN | OneDNN backend |
| 4 | OneDNN Blocked | Blocked OneDNN |
| 5 | LibXSMM | LibXSMM backend |
For more details on available algorithms, see the [ZenDNN MatMul Algorithm Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md#algorithm-details).
### Profiling and Debugging
For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/zendnnl/docs/logging.md).
For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/logging.md).
## Known Issues
@ -245,10 +207,9 @@ A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized mode
A: Ensure:
1. You're using an AMD EPYC or Ryzen processor (Zen 2 or newer)
2. `OMP_NUM_THREADS` is set appropriately (physical core count)
3. `ZENDNNL_MATMUL_ALGO=2` is set for best performance (Blocked AOCL BLIS)
4. You're using a sufficiently large model (small models may not benefit as much)
5. Enable profiling to verify ZenDNN MatMul is being called
2. `ZENDNNL_MATMUL_ALGO=1` is set for best performance (Blocked AOCL DLP)
3. You're using a sufficiently large model (small models may not benefit as much)
4. Enable profiling to verify ZenDNN MatMul is being called
### **GitHub Contribution**:
Please add the **[ZenDNN]** prefix/tag in issues/PRs titles to help the ZenDNN-team check/address them without delay.

View File

@ -77,7 +77,10 @@ causal-verify-embeddings: causal-run-original-embeddings causal-run-converted-em
@./scripts/causal/compare-embeddings-logits.sh
causal-inspect-original-model:
@./scripts/utils/inspect-org-model.py
@./scripts/utils/inspect-org-model.py --list-all -s
causal-list-original-model-tensors:
@./scripts/utils/inspect-org-model.py --list-all-short -s
causal-inspect-converted-model:
@./scripts/utils/inspect-converted-model.sh
@ -153,7 +156,7 @@ embedding-verify-logits-st: embedding-run-original-model-st embedding-run-conver
embedding-inspect-original-model:
$(call validate_embedding_model_path,embedding-inspect-original-model)
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH}
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH} --list-all -s
embedding-inspect-converted-model:
@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/utils/inspect-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}

View File

@ -1,67 +1,290 @@
#!/usr/bin/env python3
import argparse
import os
import json
import os
import re
import struct
import sys
from pathlib import Path
from typing import Optional
from safetensors import safe_open
from collections import defaultdict
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
model_path = os.environ.get('MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
MODEL_SAFETENSORS_FILE = "model.safetensors"
MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json"
# Check if there's an index file (multi-file model)
index_path = os.path.join(model_path, "model.safetensors.index.json")
single_file_path = os.path.join(model_path, "model.safetensors")
DTYPE_SIZES = {
"F64": 8, "I64": 8, "U64": 8,
"F32": 4, "I32": 4, "U32": 4,
"F16": 2, "BF16": 2, "I16": 2, "U16": 2,
"I8": 1, "U8": 1, "BOOL": 1,
"F8_E4M3": 1, "F8_E5M2": 1,
}
if os.path.exists(index_path):
# Multi-file model
print("Multi-file model detected")
SIZE_UNITS = ['B', 'KB', 'MB', 'GB', 'TB']
with open(index_path, 'r') as f:
index_data = json.load(f)
# Get the weight map (tensor_name -> file_name)
weight_map = index_data.get("weight_map", {})
def get_weight_map(model_path: Path) -> Optional[dict[str, str]]:
index_file = model_path / MODEL_SAFETENSORS_INDEX
# Group tensors by file for efficient processing
file_tensors = defaultdict(list)
for tensor_name, file_name in weight_map.items():
file_tensors[file_name].append(tensor_name)
if index_file.exists():
with open(index_file, 'r') as f:
index = json.load(f)
return index.get("weight_map", {})
print("Tensors in model:")
return None
# Process each shard file
for file_name, tensor_names in file_tensors.items():
file_path = os.path.join(model_path, file_name)
print(f"\n--- From {file_name} ---")
with safe_open(file_path, framework="pt") as f:
for tensor_name in sorted(tensor_names):
tensor = f.get_tensor(tensor_name)
print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
def get_all_tensor_names(model_path: Path) -> list[str]:
weight_map = get_weight_map(model_path)
elif os.path.exists(single_file_path):
# Single file model (original behavior)
print("Single-file model detected")
if weight_map is not None:
return list(weight_map.keys())
with safe_open(single_file_path, framework="pt") as f:
keys = f.keys()
print("Tensors in model:")
for key in sorted(keys):
tensor = f.get_tensor(key)
print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}")
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
try:
with safe_open(single_file, framework="pt", device="cpu") as f:
return list(f.keys())
except Exception as e:
print(f"Error reading {single_file}: {e}")
sys.exit(1)
else:
print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}")
print("Available files:")
if os.path.exists(model_path):
for item in sorted(os.listdir(model_path)):
print(f" {item}")
print(f"Error: No safetensors files found in {model_path}")
sys.exit(1)
def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
return weight_map.get(tensor_name)
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
return single_file.name
return None
def read_safetensors_header(file_path: Path) -> dict:
with open(file_path, 'rb') as f:
header_size = struct.unpack('<Q', f.read(8))[0]
return json.loads(f.read(header_size))
def get_tensor_size_bytes(tensor_meta: dict) -> int:
offsets = tensor_meta.get("data_offsets")
if offsets and len(offsets) == 2:
return offsets[1] - offsets[0]
n_elements = 1
for d in tensor_meta.get("shape", []):
n_elements *= d
return n_elements * DTYPE_SIZES.get(tensor_meta.get("dtype", "F32"), 4)
def format_size(size_bytes: int) -> str:
val = float(size_bytes)
for unit in SIZE_UNITS[:-1]:
if val < 1024.0:
return f"{val:.2f} {unit}"
val /= 1024.0
return f"{val:.2f} {SIZE_UNITS[-1]}"
def get_all_tensor_metadata(model_path: Path) -> dict[str, dict]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
file_to_tensors: dict[str, list[str]] = {}
for tensor_name, file_name in weight_map.items():
file_to_tensors.setdefault(file_name, []).append(tensor_name)
all_metadata: dict[str, dict] = {}
for file_name, tensor_names in file_to_tensors.items():
try:
header = read_safetensors_header(model_path / file_name)
for tensor_name in tensor_names:
if tensor_name in header:
all_metadata[tensor_name] = header[tensor_name]
except Exception as e:
print(f"Warning: Could not read header from {file_name}: {e}", file=sys.stderr)
return all_metadata
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
try:
header = read_safetensors_header(single_file)
return {k: v for k, v in header.items() if k != "__metadata__"}
except Exception as e:
print(f"Error reading {single_file}: {e}")
sys.exit(1)
print(f"Error: No safetensors files found in {model_path}")
sys.exit(1)
def normalize_tensor_name(tensor_name: str) -> str:
normalized = re.sub(r'\.\d+\.', '.#.', tensor_name)
normalized = re.sub(r'\.\d+$', '.#', normalized)
return normalized
def list_all_tensors(
model_path: Path,
short: bool = False,
show_sizes: bool = False,
):
tensor_names = get_all_tensor_names(model_path)
metadata: Optional[dict[str, dict]] = None
if show_sizes:
metadata = get_all_tensor_metadata(model_path)
total_bytes = 0
if short:
seen: dict[str, str] = {}
for tensor_name in sorted(tensor_names):
normalized = normalize_tensor_name(tensor_name)
if normalized not in seen:
seen[normalized] = tensor_name
display_pairs = list(sorted(seen.items()))
name_width = max((len(n) for n, _ in display_pairs), default=0)
for normalized, first_name in display_pairs:
if metadata and first_name in metadata:
m = metadata[first_name]
size = get_tensor_size_bytes(m)
total_bytes += size
print(f"{normalized:{name_width}} {m.get('dtype', '?'):6s} {str(m.get('shape', '')):30s} {format_size(size)}")
else:
print(normalized)
else:
print(f" Directory {model_path} does not exist")
exit(1)
name_width = max((len(n) for n in tensor_names), default=0)
for tensor_name in sorted(tensor_names):
if metadata and tensor_name in metadata:
m = metadata[tensor_name]
size = get_tensor_size_bytes(m)
total_bytes += size
print(f"{tensor_name:{name_width}} {m.get('dtype', '?'):6s} {str(m.get('shape', '')):30s} {format_size(size)}")
else:
print(tensor_name)
if show_sizes:
print(f"\nTotal: {format_size(total_bytes)}")
def print_tensor_info(model_path: Path, tensor_name: str, num_values: Optional[int] = None):
tensor_file = find_tensor_file(model_path, tensor_name)
if tensor_file is None:
print(f"Error: Could not find tensor '{tensor_name}' in model index")
print(f"Model path: {model_path}")
sys.exit(1)
file_path = model_path / tensor_file
try:
header = read_safetensors_header(file_path)
tensor_meta = header.get(tensor_name, {})
dtype_str = tensor_meta.get("dtype")
with safe_open(file_path, framework="pt", device="cpu") as f:
if tensor_name in f.keys():
tensor_slice = f.get_slice(tensor_name)
shape = tensor_slice.get_shape()
print(f"Tensor: {tensor_name}")
print(f"File: {tensor_file}")
print(f"Shape: {shape}")
if dtype_str:
print(f"Dtype: {dtype_str}")
if tensor_meta:
print(f"Size: {format_size(get_tensor_size_bytes(tensor_meta))}")
if num_values is not None:
tensor = f.get_tensor(tensor_name)
if not dtype_str:
print(f"Dtype: {tensor.dtype}")
flat = tensor.flatten()
n = min(num_values, flat.numel())
print(f"Values: {flat[:n].tolist()}")
else:
print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
sys.exit(1)
except FileNotFoundError:
print(f"Error: The file '{file_path}' was not found.")
sys.exit(1)
except Exception as e:
print(f"An error occurred: {e}")
sys.exit(1)
def main():
parser = argparse.ArgumentParser(
description="Print tensor information from a safetensors model"
)
parser.add_argument(
"tensor_name",
nargs="?",
help="Name of the tensor to inspect"
)
parser.add_argument(
"-m", "--model-path",
type=Path,
help="Path to the model directory (default: MODEL_PATH environment variable)"
)
parser.add_argument(
"-l", "--list-all-short",
action="store_true",
help="List unique tensor patterns (layer numbers replaced with #)"
)
parser.add_argument(
"-la", "--list-all",
action="store_true",
help="List all tensor names with actual layer numbers"
)
parser.add_argument(
"-n", "--num-values",
nargs="?",
const=10,
default=None,
type=int,
metavar="N",
help="Print the first N values of the tensor flattened (default: 10 if flag is given without a number)"
)
parser.add_argument(
"-s", "--sizes",
action="store_true",
help="Show dtype, shape, and size for each tensor when listing"
)
args = parser.parse_args()
model_path = args.model_path
if model_path is None:
model_path_str = os.environ.get("MODEL_PATH")
if model_path_str is None:
print("Error: --model-path not provided and MODEL_PATH environment variable not set")
sys.exit(1)
model_path = Path(model_path_str)
if not model_path.exists():
print(f"Error: Model path does not exist: {model_path}")
sys.exit(1)
if not model_path.is_dir():
print(f"Error: Model path is not a directory: {model_path}")
sys.exit(1)
if args.list_all_short or args.list_all:
list_all_tensors(model_path, short=args.list_all_short, show_sizes=args.sizes)
else:
if args.tensor_name is None:
print("Error: tensor_name is required when not using --list-all-short or --list-all")
sys.exit(1)
print_tensor_info(model_path, args.tensor_name, args.num_values)
if __name__ == "__main__":
main()

View File

@ -1,174 +0,0 @@
#!/usr/bin/env python3
import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Optional
from safetensors import safe_open
MODEL_SAFETENSORS_FILE = "model.safetensors"
MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json"
def get_weight_map(model_path: Path) -> Optional[dict[str, str]]:
index_file = model_path / MODEL_SAFETENSORS_INDEX
if index_file.exists():
with open(index_file, 'r') as f:
index = json.load(f)
return index.get("weight_map", {})
return None
def get_all_tensor_names(model_path: Path) -> list[str]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
return list(weight_map.keys())
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
try:
with safe_open(single_file, framework="pt", device="cpu") as f:
return list(f.keys())
except Exception as e:
print(f"Error reading {single_file}: {e}")
sys.exit(1)
print(f"Error: No safetensors files found in {model_path}")
sys.exit(1)
def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
return weight_map.get(tensor_name)
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
return single_file.name
return None
def normalize_tensor_name(tensor_name: str) -> str:
normalized = re.sub(r'\.\d+\.', '.#.', tensor_name)
normalized = re.sub(r'\.\d+$', '.#', normalized)
return normalized
def list_all_tensors(model_path: Path, unique: bool = False):
tensor_names = get_all_tensor_names(model_path)
if unique:
seen = set()
for tensor_name in sorted(tensor_names):
normalized = normalize_tensor_name(tensor_name)
if normalized not in seen:
seen.add(normalized)
print(normalized)
else:
for tensor_name in sorted(tensor_names):
print(tensor_name)
def print_tensor_info(model_path: Path, tensor_name: str, num_values: Optional[int] = None):
tensor_file = find_tensor_file(model_path, tensor_name)
if tensor_file is None:
print(f"Error: Could not find tensor '{tensor_name}' in model index")
print(f"Model path: {model_path}")
sys.exit(1)
file_path = model_path / tensor_file
try:
with safe_open(file_path, framework="pt", device="cpu") as f:
if tensor_name in f.keys():
tensor_slice = f.get_slice(tensor_name)
shape = tensor_slice.get_shape()
print(f"Tensor: {tensor_name}")
print(f"File: {tensor_file}")
print(f"Shape: {shape}")
if num_values is not None:
tensor = f.get_tensor(tensor_name)
print(f"Dtype: {tensor.dtype}")
flat = tensor.flatten()
n = min(num_values, flat.numel())
print(f"Values: {flat[:n].tolist()}")
else:
print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
sys.exit(1)
except FileNotFoundError:
print(f"Error: The file '{file_path}' was not found.")
sys.exit(1)
except Exception as e:
print(f"An error occurred: {e}")
sys.exit(1)
def main():
parser = argparse.ArgumentParser(
description="Print tensor information from a safetensors model"
)
parser.add_argument(
"tensor_name",
nargs="?", # optional (if --list is used for example)
help="Name of the tensor to inspect"
)
parser.add_argument(
"-m", "--model-path",
type=Path,
help="Path to the model directory (default: MODEL_PATH environment variable)"
)
parser.add_argument(
"-l", "--list",
action="store_true",
help="List unique tensor patterns in the model (layer numbers replaced with #)"
)
parser.add_argument(
"-n", "--num-values",
nargs="?",
const=10,
default=None,
type=int,
metavar="N",
help="Print the first N values of the tensor flattened (default: 10 if flag is given without a number)"
)
args = parser.parse_args()
model_path = args.model_path
if model_path is None:
model_path_str = os.environ.get("MODEL_PATH")
if model_path_str is None:
print("Error: --model-path not provided and MODEL_PATH environment variable not set")
sys.exit(1)
model_path = Path(model_path_str)
if not model_path.exists():
print(f"Error: Model path does not exist: {model_path}")
sys.exit(1)
if not model_path.is_dir():
print(f"Error: Model path is not a directory: {model_path}")
sys.exit(1)
if args.list:
list_all_tensors(model_path, unique=True)
else:
if args.tensor_name is None:
print("Error: tensor_name is required when not using --list")
sys.exit(1)
print_tensor_info(model_path, args.tensor_name, args.num_values)
if __name__ == "__main__":
main()

View File

@ -5,12 +5,15 @@
#include <vector>
#include <cstdio>
int main(int argc, char ** argv) {
common_params params;
params.prompt = "The quick brown fox";
params.sampling.seed = 1234;
const std::string_view state_file = "dump_state.bin";
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
@ -53,35 +56,16 @@ int main(int argc, char ** argv) {
// tokenize prompt
auto tokens = common_tokenize(ctx, params.prompt, true);
// prepare the batch
llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
for (size_t i = 0; i < tokens.size(); i++) {
common_batch_add(batch, tokens[i], i, {0}, false);
const bool save_state = true;
if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) {
return 1;
}
batch.logits[batch.n_tokens - 1] = true; // generate next token
// evaluate prompt
llama_decode(ctx, batch);
n_past += batch.n_tokens;
// save state (rng, logits, embedding and kv_cache) to file
{
std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size());
FILE *fp_write = fopen("dump_state.bin", "wb");
fwrite(state_mem.data(), 1, written, fp_write);
fclose(fp_write);
fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size());
}
// save state (last tokens)
const auto n_past_saved = n_past;
// first run
printf("\nfirst run: %s", params.prompt.c_str());
llama_batch batch = llama_batch_init(1, 0, 1);
for (auto i = 0; i < params.n_predict; i++) {
auto next_token = llama_sampler_sample(smpl, ctx, -1);
auto next_token_str = common_token_to_piece(ctx, next_token);
@ -111,27 +95,23 @@ int main(int argc, char ** argv) {
printf("\nsecond run: %s", params.prompt.c_str());
// load state (rng, logits, embedding and kv_cache) from file
{
std::vector<uint8_t> state_mem;
// load state from file
std::vector<llama_token> unused_sts(tokens.size()); // unused session tokens.
size_t n_token_count_out = 0;
FILE * fp_read = fopen("dump_state.bin", "rb");
fseek(fp_read, 0, SEEK_END);
state_mem.resize(ftell(fp_read));
fseek(fp_read, 0, SEEK_SET);
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read);
if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
return 1;
}
fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
fprintf(stderr, "\n%s : failed to load state\n", __func__);
return 1;
}
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
// restore state (last tokens)
n_past = n_past_saved;
n_past = n_token_count_out;
if (!common_replay_last_token(ctx2, tokens.back(), n_past)) {
return 1;
}
++n_past;
// second run
for (auto i = 0; i < params.n_predict; i++) {
@ -160,7 +140,9 @@ int main(int argc, char ** argv) {
}
// make new context
llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params));
auto params_ctx3 = common_context_params_to_llama(params);
params_ctx3.n_seq_max = 2;
llama_context * ctx3 = llama_init_from_model(model, params_ctx3);
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
@ -169,26 +151,21 @@ int main(int argc, char ** argv) {
printf("\nsingle seq run: %s", params.prompt.c_str());
// load state (rng, logits, embedding and kv_cache) from file
{
std::vector<uint8_t> state_mem;
n_token_count_out = 0;
FILE * fp_read = fopen("dump_state.bin", "rb");
fseek(fp_read, 0, SEEK_END);
state_mem.resize(ftell(fp_read));
fseek(fp_read, 0, SEEK_SET);
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read);
if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
return 1;
}
fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
fprintf(stderr, "\n%s : failed to load state\n", __func__);
return 1;
}
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
// restore state (last tokens)
n_past = n_past_saved;
n_past = n_token_count_out;
if (!common_replay_last_token(ctx3, tokens.back(), n_past)) {
return 1;
}
++n_past;
// save seq 0 and load into seq 1
{

View File

@ -730,10 +730,6 @@ extern "C" {
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
GGML_DEPRECATED(
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
"use ggml_row_size() instead");
GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API const char * ggml_op_symbol(enum ggml_op op);

View File

@ -141,27 +141,50 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
namespace ggml::cpu::amx {
class extra_buffer_type : ggml::cpu::extra_buffer_type {
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
is_contiguous_2d(op->src[1]) && // src1 must be contiguous
op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
// src1 must be host buffer
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false;
}
// src1 must be float32
if (op->src[1]->type == GGML_TYPE_F32) {
return true;
}
if (op->op != GGML_OP_MUL_MAT) {
return false;
}
return false;
auto * src0 = op->src[0];
auto * src1 = op->src[1];
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
return false;
}
if (!src0->buffer || src0->buffer->buft != ggml_backend_amx_buffer_type()) {
return false;
}
if (src1->buffer && !ggml_backend_buft_is_host(src1->buffer->buft)) {
return false;
}
if (op->ne[0] % (TILE_N * 2)) {
return false;
}
int alignment;
switch (src0->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q8_0:
alignment = TILE_K;
break;
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_XS:
alignment = 256; // QK_K
break;
case GGML_TYPE_F16:
alignment = 16;
break;
default:
return false;
}
if (src0->ne[0] % alignment) {
return false;
}
if (src1->type != GGML_TYPE_F32) {
return false;
}
return true;
}
ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {

View File

@ -1,4 +1,3 @@
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpedantic"
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
@ -202,35 +201,27 @@ struct tile_config_t{
// advanced-matrix-extensions-intrinsics-functions.html
//
#define TC_CONFIG_TILE(i, r, cb) tc.rows[i] = r; tc.colsb[i] = cb
void ggml_tile_config_init(void) {
static thread_local bool is_first_time = true;
inline void ggml_tile_config_init(void) {
static thread_local bool done = false;
if (!is_first_time) {
if (done) {
return;
}
static thread_local tile_config_t tc;
tile_config_t current_tc;
_tile_storeconfig(&current_tc);
alignas(64) tile_config_t tc = {};
tc.palette_id = 1;
tc.start_row = 0;
tc.rows[0] = 8; tc.colsb[0] = 64;
tc.rows[1] = 8; tc.colsb[1] = 64;
tc.rows[2] = 16; tc.colsb[2] = 32;
tc.rows[3] = 16; tc.colsb[3] = 32;
tc.rows[4] = 16; tc.colsb[4] = 64;
tc.rows[5] = 16; tc.colsb[5] = 64;
tc.rows[6] = 16; tc.colsb[6] = 64;
tc.rows[7] = 16; tc.colsb[7] = 64;
// load only when config changes
if (tc.palette_id == 0 || (memcmp(&current_tc.colsb, &tc.colsb, sizeof(uint16_t) * 8) != 0 &&
memcmp(&current_tc.rows, &tc.rows, sizeof(uint8_t) * 8) != 0)) {
tc.palette_id = 1;
tc.start_row = 0;
TC_CONFIG_TILE(TMM0, 8, 64);
TC_CONFIG_TILE(TMM1, 8, 64);
TC_CONFIG_TILE(TMM2, 16, 32);
TC_CONFIG_TILE(TMM3, 16, 32);
TC_CONFIG_TILE(TMM4, 16, 64);
TC_CONFIG_TILE(TMM5, 16, 64);
TC_CONFIG_TILE(TMM6, 16, 64);
TC_CONFIG_TILE(TMM7, 16, 64);
_tile_loadconfig(&tc);
}
is_first_time = false;
_tile_loadconfig(&tc);
done = true;
}
// we need an extra 16 * 4B (TILE_N * int32_t) for each NB/KB block for compensation.
@ -268,33 +259,6 @@ int get_row_size(int K) {
return row_size;
}
// vectorized dtype conversion
inline float FP16_TO_FP32(ggml_half val) {
__m256i v = _mm256_setr_epi16(
val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 o = _mm512_cvtph_ps(v);
return _mm512_cvtss_f32(o);
}
inline __m512 FP16_TO_FP32_VEC(ggml_half val) {
__m256i v = _mm256_set1_epi16(val);
return _mm512_cvtph_ps(v);
}
// horizontal reduce
inline float _mm512_reduce_max_ps(const __m512 x) {
__m512 v = x;
__m512 v1 = _mm512_shuffle_f32x4(v, v, 0x4E);
v = _mm512_max_ps(v, v1);
v1 = _mm512_shuffle_f32x4(v, v, 0xB1);
v = _mm512_max_ps(v, v1);
v1 = _mm512_shuffle_ps(v, v, 0x4E);
v = _mm512_max_ps(v, v1);
v1 = _mm512_shuffle_ps(v, v, 0xB1);
v = _mm512_max_ps(v, v1);
return _mm512_cvtss_f32(v);
}
// transpose utils
#define SHUFFLE_EPI32(a, b, mask) \
_mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), mask))
@ -1370,9 +1334,9 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
#define LAUNCH_TINYGEMM_KERNEL_AVX(MB_SIZE, NB_SIZE) \
tinygemm_kernel_avx<float, type, float, MB_SIZE, NB_SIZE, blck_size>::apply( \
K, (const float *)src1->data + mb_start * K, \
(const type *)src0->data + nb_start * K, \
(float *)dst->data + mb_start * ldc + nb_start, ldc);
K, (const float *)src1->data + src1_offset + mb_start * K, \
(const type *)src0->data + src0_offset + nb_start * K, \
(float *)dst->data + dst_offset + mb_start * ldc + nb_start, ldc)
// re-organize in the format {NB, KB, TILE_SIZE}:
@ -2019,11 +1983,11 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
}
};
#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE) \
tinygemm_kernel_vnni<vec_dot_type, type, float, 1, NB_SIZE, blck_size>::apply( \
KB, (const char *)wdata + 0 * row_size_A, \
(const char *)src0->data + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), \
(float *) dst->data + 0 * N + nb_start, ldc)
#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE) \
tinygemm_kernel_vnni<vec_dot_type, type, float, 1, NB_SIZE, blck_size>::apply( \
KB, wdata_batch, \
(const char *)src0->data + src0_offset + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), \
(float *) dst->data + dst_offset + nb_start, ldc)
template <typename TA, typename TB, typename TC, int BLOCK_K,
typename std::enable_if<!is_type_qkk<TB>::value, int>::type = 0>
@ -2079,7 +2043,7 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
_tile_stored(TMM5, Tile5(C_pre), TILE_N * sizeof(int32_t));
if (need_unpack) {
unpack_B<TB>(Tile1, B_blk0);
unpack_B<TB>(Tile1, B_blk1);
_tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
} else {
_tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
@ -2336,6 +2300,13 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
});
}
// ne2 is passed explicitly to help compiler optimize repeated calls
inline int64_t ggml_batch_offset(const ggml_tensor * t, int64_t batch_idx, int64_t ne2) {
const int64_t i2 = batch_idx % ne2;
const int64_t i3 = batch_idx / ne2;
return i3 * t->nb[3] + i2 * t->nb[2];
}
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
struct ggml_tensor * src0 = dst->src[0];
@ -2348,12 +2319,13 @@ size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
const int M = dst->ne[1];
const int K = src0->ne[0];
const int64_t n_batch = dst->ne[2] * dst->ne[3];
size_t desired_wsize = 0;
GGML_DISPATCH_QTYPES(TYPE, [&] {
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
desired_wsize = M * row_size_A;
desired_wsize = n_batch * M * row_size_A;
});
return desired_wsize;
@ -2365,7 +2337,7 @@ size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
// src1: input in shape of {M, K}, float32
// dst: output in shape of {M, N}, float32
//
// the function performs: dst = src1 @ src0.T
// the function performs: dst = src1 @ src0.T for each batch
//
void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) {
struct ggml_tensor * src0 = dst->src[0];
@ -2382,17 +2354,26 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
const int K = src0->ne[0];
const int ldc = dst->nb[1] / dst->nb[0];
const int64_t ne2 = dst->ne[2];
const int64_t n_batch = ne2 * dst->ne[3];
if (is_floating_type) {
constexpr int BLOCK_M = 4;
constexpr int BLOCK_N = 6;
const int MB = div_up(M, BLOCK_M);
const int NB = div_up(N, BLOCK_N);
parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
parallel_for_ggml(params, n_batch * MB * NB, [&](int begin, int end) {
GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
for (int i = begin; i < end; ++i) {
int mb = i / NB;
int nb = i % NB;
int batch_idx = i / (MB * NB);
int remaining = i % (MB * NB);
int mb = remaining / NB;
int nb = remaining % NB;
int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2);
int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2);
int mb_start = mb * BLOCK_M;
int mb_size = std::min(BLOCK_M, M - mb_start);
@ -2424,10 +2405,10 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
void * wdata = params->wdata;
//TODO: performance improvement: merge quant A
if (params->ith == 0) {
// if (params->ith == 0) {
GGML_DISPATCH_QTYPES(TYPE, [&] {
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
const size_t desired_wsize = M * row_size_A;
const size_t desired_wsize = n_batch * M * row_size_A;
if (params->wsize < desired_wsize) {
GGML_ABORT("insufficient work space size");
}
@ -2436,12 +2417,19 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
const float * A_data = static_cast<const float *>(src1->data);
for (int m = 0; m < M; ++m) {
from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
}
parallel_for_ggml(params, n_batch, [&](int begin, int end) {
for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
const float * A_data = (const float *)((const char *)src1->data + src1_offset);
char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
for (int m = 0; m < M; ++m) {
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
}
}
});
});
}
// }
ggml_barrier(params->threadpool);
@ -2451,13 +2439,19 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
constexpr int BLOCK_N = TILE_N * kTilesN;
const int NB = div_up(N, BLOCK_N);
parallel_for_ggml(params, NB, [&](int begin, int end) {
parallel_for_ggml(params, n_batch * NB, [&](int begin, int end) {
GGML_DISPATCH_QTYPES(TYPE, [&] {
const int KB = K / blck_size;
const int TILE_SIZE = get_tile_size<type>();
const int row_size_A = KB * sizeof(vec_dot_type);
for (int i = begin; i < end; ++i) {
int nb = i;
int batch_idx = i / NB;
int nb = i % NB;
int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2);
int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2);
const char * wdata_batch = (const char *)wdata + batch_idx * row_size_A;
int nb_start = nb * BLOCK_N;
int nb_size = std::min(BLOCK_N, N - nb_start); // 32, 64, 96
@ -2481,7 +2475,7 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
const int MB = div_up(M, BLOCK_M);
const int NB = div_up(N, BLOCK_N);
parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
parallel_for_ggml(params, n_batch * MB * NB, [&](int begin, int end) {
// init tile config for each thread
ggml_tile_config_init();
@ -2491,8 +2485,14 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
const int row_size_A = KB * sizeof(vec_dot_type);
for (int i = begin; i < end; ++i) {
int mb = i / NB;
int nb = i % NB;
int batch_idx = i / (MB * NB);
int remaining = i % (MB * NB);
int mb = remaining / NB;
int nb = remaining % NB;
int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2);
int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2);
const char * wdata_batch = (const char *)wdata + batch_idx * M * row_size_A;
int mb_start = mb * BLOCK_M;
int mb_size = std::min(BLOCK_M, M - mb_start);
@ -2501,9 +2501,9 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
tinygemm_kernel_amx<vec_dot_type, type, float, blck_size>(
mb_size, nb_size, KB,
(const char *)wdata + mb_start * row_size_A,
(const char *)src0->data + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE),
(float *) dst->data + mb_start * N + nb_start, ldc);
wdata_batch + mb_start * row_size_A,
(const char *)src0->data + src0_offset + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE),
(float *) dst->data + dst_offset + mb_start * N + nb_start, ldc);
}
});
});

View File

@ -42,11 +42,14 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@ -55,11 +58,14 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
@ -67,8 +73,10 @@
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp
@ -77,19 +85,23 @@
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__POWERPC__) || defined(__powerpc__)
@ -110,11 +122,14 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@ -123,11 +138,14 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__loongarch64)
@ -148,11 +166,14 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@ -161,11 +182,14 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__riscv)
@ -187,11 +211,14 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@ -199,11 +226,14 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__s390x__)
@ -230,11 +260,14 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@ -243,11 +276,14 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__wasm__)
@ -276,11 +312,14 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@ -289,11 +328,14 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#endif

View File

@ -498,6 +498,81 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
assert (n % qk == 0);
assert (nc % ncols_interleaved == 0);
UNUSED(s);
UNUSED(bs);
UNUSED(vx);
UNUSED(vy);
UNUSED(nr);
UNUSED(nc);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
const int8x16_t kvalues = vld1q_s8(kvalues_mxfp4);
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
float * res_ptr = s;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
float32x4_t sumf = vdupq_n_f32(0);
for (int l = 0; l < nb; l++) {
uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
int32x4_t sumi = vdupq_n_s32(0);
sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
float32x4_t b_d = {
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[0]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[1]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[2]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[3]),
};
float32x4_t d = a_d * b_d;
sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
}
vst1q_f32(res_ptr + x * 4, sumf);
}
return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
ggml_gemv_mxfp4_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
@ -785,6 +860,165 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q5_K_8x4_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 4;
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
constexpr int col_groups = ncols_interleaved / 4; // 0123 and 4567
const uint8x16_t m4b = vdupq_n_u8(0x0f);
const uint8x16_t mone = vdupq_n_u8(1);
const uint8x16_t mtwo = vdupq_n_u8(2);
// 1x8 tile = 2 x 4
float32x4_t acc_f32[col_groups];
const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q5_Kx8 * GGML_RESTRICT q5_ptr = (const block_q5_Kx8 *) vx + (x * nb);
for (int i = 0; i < col_groups; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
float32x4_t q5_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d)); // d0 d1 d2 d3
float32x4_t q5_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d + 4)); // d4 d5 d6 d7
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d);
float32x4_t sb_scale_0123 = vmulq_f32(q5_d_0, q8_d);
float32x4_t sb_scale_4567 = vmulq_f32(q5_d_1, q8_d);
float32x4_t q5_dmin_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin)); // dmin 0..3
float32x4_t q5_dmin_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin + 4)); // dmin 4..7
float32x4_t sb_min_0123 = vmulq_f32(q5_dmin_0, q8_d);
float32x4_t sb_min_4567 = vmulq_f32(q5_dmin_1, q8_d);
// interleaved bias_acc: [0]->r0 0123, [1]->r0 4567
int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
int32x4_t acc_lo[col_groups];
int32x4_t acc_hi[col_groups];
// Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block
const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8));
int16_t bsums_arr[8];
vst1q_s16(bsums_arr, bsums);
uint8x16_t qh[col_groups][8];
for (int c = 0; c < col_groups; c++) {
for (int i = 0; i < 8; i++) {
qh[c][i] = vld1q_u8(q5_ptr[b].qh + i * 32 + 16 * c);
}
}
for (int sb = 0; sb < QK_K / 64; sb++) {
for (int i = 0; i < col_groups; i++) {
acc_lo[i] = vdupq_n_s32(0);
acc_hi[i] = vdupq_n_s32(0);
}
// Need scales for the low and high nibbles
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
int16x8_t q5sb_mins[2];
int16x8_t q5sb_scales[2];
for (int i = 0; i < 2; i++) {
int8_t aux_q5sb[8];
const int offset = sb * 24 + i * 12;
decode_q_Kx8_6bit_scales(&q5_ptr[b].scales[offset], &q5sb_mins[i], aux_q5sb);
q5sb_scales[i] = vmovl_s8(vld1_s8(aux_q5sb));
}
int8x16_t q8_qs[4];
for (int i = 0; i < 4; i++) {
q8_qs[i] = vld1q_s8(q8_ptr[b].qs + sb * 64 + i * 16);
}
for (int c = 0; c < col_groups; c++) {
uint8x16_t q5_cols[8];
uint8x16_t hbit_lo[8];
uint8x16_t hbit_hi[8];
int8x16_t q5_lo[8];
int8x16_t q5_hi[8];
for (int i = 0; i < 8; i++) {
q5_cols[i] = vld1q_u8(q5_ptr[b].qs + sb * QK_K + i * 32 + 16 * c);
hbit_lo[i] = vandq_u8(qh[c][i], mone);
hbit_hi[i] = vshlq_n_u8(vandq_u8(qh[c][i], mtwo), 3);
qh[c][i] = vshrq_n_u8(qh[c][i], 2);
q5_lo[i] = vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q5_cols[i], m4b), hbit_lo[i], 4));
q5_hi[i] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5_cols[i], 4), hbit_hi[i]));
}
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[0], q8_qs[0], 0);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[1], q8_qs[0], 1);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[2], q8_qs[0], 2);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[3], q8_qs[0], 3);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[4], q8_qs[1], 0);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[5], q8_qs[1], 1);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[6], q8_qs[1], 2);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[7], q8_qs[1], 3);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[0], q8_qs[2], 0);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[1], q8_qs[2], 1);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[2], q8_qs[2], 2);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[3], q8_qs[2], 3);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[4], q8_qs[3], 0);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[5], q8_qs[3], 1);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[6], q8_qs[3], 2);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[7], q8_qs[3], 3);
}
// Scales
// row c0123 blk0 and blk1
const int16x4_t sc_0123_lo = vget_low_s16(q5sb_scales[0]);
const int16x4_t sc_0123_hi = vget_low_s16(q5sb_scales[1]);
const float32x4_t sumf_0123 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[0]),
vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[0])));
acc_f32[0] = vfmaq_f32(acc_f32[0], sb_scale_0123, sumf_0123);
// row c4567 blk0 and blk1
const int16x4_t sc_4567_lo = vget_high_s16(q5sb_scales[0]);
const int16x4_t sc_4567_hi = vget_high_s16(q5sb_scales[1]);
const float32x4_t sumf_4567 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[1]),
vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[1])));
acc_f32[1] = vfmaq_f32(acc_f32[1], sb_scale_4567, sumf_4567);
// Bias Correction
const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]);
const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]);
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q5sb_mins[0]));
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q5sb_mins[1]));
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q5sb_mins[0]));
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q5sb_mins[1]));
} // for sb
acc_f32[0] = vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0123);
acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_4567);
} // for b
int base = x * ncols_interleaved;
vst1q_f32(s + base, acc_f32[0]);
vst1q_f32(s + base + 4, acc_f32[1]);
} // for x
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
ggml_gemv_q5_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q5_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
@ -3005,6 +3239,87 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
assert (n % qk == 0);
assert (nr % 4 == 0);
assert (nc % ncols_interleaved == 0);
UNUSED(s);
UNUSED(bs);
UNUSED(vx);
UNUSED(vy);
UNUSED(nr);
UNUSED(nc);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
const int8x16_t kvalues = vld1q_s8(kvalues_mxfp4);
for (int y = 0; y < nr / 4; y++) {
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
float32x4_t sumf[4];
for (int m = 0; m < 4; m++) {
sumf[m] = vdupq_n_f32(0);
}
for (int l = 0; l < nb; l++) {
float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
float32x4_t b_d = {
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[0]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[1]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[2]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[3]),
};
int32x4_t sumi_0 = vdupq_n_s32(0);
int32x4_t sumi_1 = vdupq_n_s32(0);
int32x4_t sumi_2 = vdupq_n_s32(0);
int32x4_t sumi_3 = vdupq_n_s32(0);
for (int k = 0; k < 4; k++) {
int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
}
sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
}
for (int m = 0; m < 4; m++) {
vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
}
}
}
return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
ggml_gemm_mxfp4_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
@ -3205,6 +3520,235 @@ void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
ggml_gemm_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q5_K_8x4_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 4;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
constexpr int q8_k_blocklen = 4;
constexpr int acc_size = 2 * 4; // 2 row pairs, 4 col pairs
constexpr int col_groups = ncols_interleaved / 4;
const uint8x16_t m4b = vdupq_n_u8(0x0f);
const uint8x16_t mone = vdupq_n_u8(1);
const uint8x16_t mtwo = vdupq_n_u8(2);
// 8 accumulators: 2 row pairs, 4 col pairs
float32x4_t acc_f32[acc_size];
for (int y = 0; y < nr / q8_k_blocklen; y++) {
const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q5_Kx8 * GGML_RESTRICT q5_ptr = (const block_q5_Kx8 *) vx + (x * nb);
for (int i = 0; i < acc_size; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
// d5 0 1 2 3, 4 5 6 7
float32x4_t q5_d_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d));
float32x4_t q5_d_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d + 4));
// d8 0 1 2 3
float32x4_t q8_d_0123 = vld1q_f32(q8_ptr[b].d);
// mins
float32x4_t q5_dmin_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin));
float32x4_t q5_dmin_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin + 4));
// Precomputation of scales and mins
float32x4_t sbd_scale_0123[q8_k_blocklen];
float32x4_t sbd_scale_4567[q8_k_blocklen];
float32x4_t sbd_min_0123[q8_k_blocklen];
float32x4_t sbd_min_4567[q8_k_blocklen];
sbd_scale_0123[0] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 0);
sbd_scale_4567[0] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 0);
sbd_min_0123[0] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 0);
sbd_min_4567[0] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 0);
sbd_scale_0123[1] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 1);
sbd_scale_4567[1] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 1);
sbd_min_0123[1] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 1);
sbd_min_4567[1] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 1);
sbd_scale_0123[2] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 2);
sbd_scale_4567[2] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 2);
sbd_min_0123[2] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 2);
sbd_min_4567[2] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 2);
sbd_scale_0123[3] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 3);
sbd_scale_4567[3] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 3);
sbd_min_0123[3] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 3);
sbd_min_4567[3] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 3);
// Precomputation of bsums, each vpaddq calcs all the bsums for each row
const int16x8_t bsums[q8_k_blocklen] = {
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)),
};
int16_t bsums_arr[QK_K / 64][8];
for (int q8_row = 0; q8_row < 4; q8_row++) {
vst1q_s16(bsums_arr[q8_row], bsums[q8_row]);
}
// interleaved bias_acc: [0]->r0 0123, [1]->r1 0123, .., [4]->r0 4567, [5]->r1 4567 ..
int32x4_t bias_acc[acc_size];
for (int i = 0; i < acc_size; i++) {
bias_acc[i] = vdupq_n_s32(0);
}
uint8x16_t qh[col_groups][8];
for (int c = 0; c < col_groups; c++) {
for (int i = 0; i < 8; i++) {
qh[c][i] = vld1q_u8(q5_ptr[b].qh + i * 32 + 16 * c);
}
}
for (int sb = 0; sb < QK_K / 64; sb++) {
// Int accumulators for qs vecdot (4 row * 2 col quartets)
int32x4_t acc_lo[acc_size];
int32x4_t acc_hi[acc_size];
for (int i = 0; i < acc_size; i++) {
acc_lo[i] = vdupq_n_s32(0);
acc_hi[i] = vdupq_n_s32(0);
}
// Need scales for the low and high nibbles
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
int16x8_t q5sb_scales[2];
int16x8_t q5sb_mins[2];
for (int i = 0; i < 2; i++) {
int8_t aux_q5sb[8];
const int offset = sb * 24 + i * 12;
decode_q_Kx8_6bit_scales(&q5_ptr[b].scales[offset], &q5sb_mins[i], aux_q5sb);
q5sb_scales[i] = vmovl_s8(vld1_s8(aux_q5sb));
}
constexpr int reads_per_sb = 8; // 8 * 16 bytes each => 32 qs * 4 rows
for (int k = 0; k < reads_per_sb; k++) {
const int8x16_t q8_blk0 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k);
const int8x16_t q8_blk1 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k + 128);
// 0..3 & 32..35
const uint8x16_t q5_0123 = vld1q_u8(q5_ptr[b].qs + sb * QK_K + 32 * k);
const uint8x16_t q5_4567 = vld1q_u8(q5_ptr[b].qs + sb * QK_K + 32 * k + 16);
// NOTE: This is the only difference with q4_K
const uint8x16_t hbit_lo_0123 = vandq_u8(qh[0][k], mone);
const uint8x16_t hbit_hi_0123 = vshlq_n_u8(vandq_u8(qh[0][k], mtwo), 3);
qh[0][k] = vshrq_n_u8(qh[0][k], 2);
const uint8x16_t hbit_lo_4567 = vandq_u8(qh[1][k], mone);
const uint8x16_t hbit_hi_4567 = vshlq_n_u8(vandq_u8(qh[1][k], mtwo), 3);
qh[1][k] = vshrq_n_u8(qh[1][k], 2);
// From here, same as q4_K
const int8x16_t q5_0123_lo =
vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q5_0123, m4b), hbit_lo_0123, 4));
const int8x16_t q5_0123_hi =
vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5_0123, 4), hbit_hi_0123));
acc_lo[0] = vdotq_laneq_s32(acc_lo[0], q5_0123_lo, q8_blk0, 0); // 0..3 r0 c0123
acc_lo[1] = vdotq_laneq_s32(acc_lo[1], q5_0123_lo, q8_blk0, 1); // 0..3 r1 c0123
acc_lo[2] = vdotq_laneq_s32(acc_lo[2], q5_0123_lo, q8_blk0, 2); // 0..3 r2 c0123
acc_lo[3] = vdotq_laneq_s32(acc_lo[3], q5_0123_lo, q8_blk0, 3); // 0..3 r3 c0123
acc_hi[0] = vdotq_laneq_s32(acc_hi[0], q5_0123_hi, q8_blk1, 0); // 32..35 r0 c0123
acc_hi[1] = vdotq_laneq_s32(acc_hi[1], q5_0123_hi, q8_blk1, 1); // 32..35 r1 c0123
acc_hi[2] = vdotq_laneq_s32(acc_hi[2], q5_0123_hi, q8_blk1, 2); // 32..35 r2 c0123
acc_hi[3] = vdotq_laneq_s32(acc_hi[3], q5_0123_hi, q8_blk1, 3); // 32..35 r3 c0123
const int8x16_t q5_4567_lo =
vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q5_4567, m4b), hbit_lo_4567, 4));
const int8x16_t q5_4567_hi =
vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5_4567, 4), hbit_hi_4567));
acc_lo[4] = vdotq_laneq_s32(acc_lo[4], q5_4567_lo, q8_blk0, 0); // 0..3 r0 c4567
acc_lo[5] = vdotq_laneq_s32(acc_lo[5], q5_4567_lo, q8_blk0, 1); // 0..3 r1 c4567
acc_lo[6] = vdotq_laneq_s32(acc_lo[6], q5_4567_lo, q8_blk0, 2); // 0..3 r2 c4567
acc_lo[7] = vdotq_laneq_s32(acc_lo[7], q5_4567_lo, q8_blk0, 3); // 0..3 r3 c4567
acc_hi[4] = vdotq_laneq_s32(acc_hi[4], q5_4567_hi, q8_blk1, 0); // 32..35 r0 c4567
acc_hi[5] = vdotq_laneq_s32(acc_hi[5], q5_4567_hi, q8_blk1, 1); // 32..35 r1 c4567
acc_hi[6] = vdotq_laneq_s32(acc_hi[6], q5_4567_hi, q8_blk1, 2); // 32..35 r2 c4567
acc_hi[7] = vdotq_laneq_s32(acc_hi[7], q5_4567_hi, q8_blk1, 3); // 32..35 r3 c4567
}
// Scale and bias application
// acc is stored interleaved to match output layout
const int16x4_t sc_0123_lo = vget_low_s16(q5sb_scales[0]);
const int16x4_t sc_4567_lo = vget_high_s16(q5sb_scales[0]);
const int16x4_t sc_0123_hi = vget_low_s16(q5sb_scales[1]);
const int16x4_t sc_4567_hi = vget_high_s16(q5sb_scales[1]);
for (int row = 0; row < q8_k_blocklen; row++) {
// Bias correction
// row c0123 blk0 and blk1
const float32x4_t sumf_0123 =
vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[row]),
vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[row])));
acc_f32[2 * row] = vfmaq_f32(acc_f32[2 * row], sbd_scale_0123[row], sumf_0123);
// row c4567 blk0 and blk1
const float32x4_t sumf_4567 =
vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[row + 4]),
vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[row + 4])));
acc_f32[2 * row + 1] = vfmaq_f32(acc_f32[2 * row + 1], sbd_scale_4567[row], sumf_4567);
// Bias
const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][row * 2]);
const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][row * 2 + 1]);
// row c0123 blk0 and blk1
bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_lo, vget_low_s16(q5sb_mins[0]));
bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_hi, vget_low_s16(q5sb_mins[1]));
// row c4567 blk0 and blk1
bias_acc[2 * row + 1] =
vmlal_s16(bias_acc[2 * row + 1], bsums_vec_lo, vget_high_s16(q5sb_mins[0]));
bias_acc[2 * row + 1] =
vmlal_s16(bias_acc[2 * row + 1], bsums_vec_hi, vget_high_s16(q5sb_mins[1]));
}
} // for sb
for (int row = 0; row < q8_k_blocklen; row++) {
acc_f32[2 * row] = vmlsq_f32(acc_f32[2 * row], vcvtq_f32_s32(bias_acc[2 * row]), sbd_min_0123[row]);
acc_f32[2 * row + 1] =
vmlsq_f32(acc_f32[2 * row + 1], vcvtq_f32_s32(bias_acc[2 * row + 1]), sbd_min_4567[row]);
}
} // for b
for (int i = 0; i < q8_k_blocklen; i++) {
int row = y * q8_k_blocklen + i;
for (int j = 0; j < 2; j++) {
int col = x * ncols_interleaved + j * 4;
int offset = row * bs + col;
vst1q_f32(s + offset, acc_f32[2 * i + j]);
}
}
} // for x
} // for y
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
ggml_gemm_q5_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q4_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,

View File

@ -522,7 +522,8 @@ template<typename block_tx8>
static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, __m256i signextendlut) {
static_assert(
std::is_same_v<block_tx8, block_q4_0x8> ||
std::is_same_v<block_tx8, block_iq4_nlx8>,
std::is_same_v<block_tx8, block_iq4_nlx8> ||
std::is_same_v<block_tx8, block_mxfp4x8>,
"Unsupported block type");
const int qk = QK8_0;
@ -580,6 +581,18 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
std::is_same_v<block_tx8, block_q4_0x8> ||
std::is_same_v<block_tx8, block_iq4_nlx8>) {
col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
} else if constexpr (std::is_same_v<block_tx8, block_mxfp4x8>) {
// Load 8 E8M0 exponents and convert to float via LUT
// Rearranged to match changemask order: 0,4,1,5,2,6,3,7
col_scale_f32 = _mm256_set_ps(
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[7]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[3]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[6]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[2]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[5]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[1]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[4]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[0]));
}
// Load and convert to FP32 scale from block_q8_0
@ -628,7 +641,8 @@ template<typename block_tx8>
static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, __m256i signextendlut) {
static_assert(
std::is_same_v<block_tx8, block_q4_0x8> ||
std::is_same_v<block_tx8, block_iq4_nlx8>,
std::is_same_v<block_tx8, block_iq4_nlx8> ||
std::is_same_v<block_tx8, block_mxfp4x8>,
"Unsupported block type");
const int qk = QK8_0;
@ -749,6 +763,25 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
std::is_same_v<block_tx8, block_q4_0x8> ||
std::is_same_v<block_tx8, block_iq4_nlx8>) {
col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
} else if constexpr (std::is_same_v<block_tx8, block_mxfp4x8>) {
//TODO: simd-ify
col_scale_f32 = _mm512_set_ps(
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[7]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[6]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[5]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[4]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[3]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[2]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[1]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[0]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[7]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[6]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[5]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[4]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[3]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[2]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[1]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[0]));
}
// Process LHS in pairs of rows
@ -941,6 +974,25 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
std::is_same_v<block_tx8, block_q4_0x8> ||
std::is_same_v<block_tx8, block_iq4_nlx8>) {
col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
} else if constexpr (std::is_same_v<block_tx8, block_mxfp4x8>) {
//TODO: simd-ify
col_scale_f32 = _mm512_set_ps(
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[7]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[6]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[5]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[4]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[3]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[2]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[1]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[0]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[7]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[6]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[5]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[4]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[3]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[2]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[1]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_0[b].e[0]));
}
// Load the four blocks of quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
@ -1123,6 +1175,16 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
std::is_same_v<block_tx8, block_q4_0x8> ||
std::is_same_v<block_tx8, block_iq4_nlx8>) {
col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
} else if constexpr (std::is_same_v<block_tx8, block_mxfp4x8>) {
col_scale_f32 = _mm256_set_ps(
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[7]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[6]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[5]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[4]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[3]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[2]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[1]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[0]));
}
// Process LHS in groups of four
@ -1283,6 +1345,16 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
std::is_same_v<block_tx8, block_q4_0x8> ||
std::is_same_v<block_tx8, block_iq4_nlx8>) {
col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
} else if constexpr (std::is_same_v<block_tx8, block_mxfp4x8>) {
col_scale_f32 = _mm256_set_ps(
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[7]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[6]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[5]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[4]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[3]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[2]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[1]),
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[b].e[0]));
}
// Load the four blocks of quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
@ -1625,6 +1697,19 @@ void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
ggml_gemv_iq4_nl_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_mxfp4_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
#if defined(__AVX2__)
__m256i signextendlut = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)kvalues_mxfp4));
signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
gemv_q4_b32_8x8_q8_0_lut_avx<block_mxfp4x8>(n, s, bs, vx, vy, nr, nc, signextendlut);
return;
#endif
ggml_gemv_mxfp4_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK_K;
const int nb = n / qk;
@ -3423,6 +3508,21 @@ void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_mxfp4_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
#if defined(__AVX2__) || defined(__AVX512F__)
{
__m256i signextendlut = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)kvalues_mxfp4));
signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
gemm_q4_b32_8x8_q8_0_lut_avx<block_mxfp4x8>(n, s, bs, vx, vy, nr, nc, signextendlut);
return;
}
#endif // defined(__AVX2__) || defined(__AVX512F__)
ggml_gemm_mxfp4_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK_K;
const int nb = n / qk;

View File

@ -450,6 +450,208 @@ static void ggml_gemm_q6_K_NxM_q8_K_generic_impl(int n,
}
}
template <int M, int N>
static void ggml_gemv_q5_K_NxM_q8_K_generic_impl(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int blocklen = M;
constexpr int ncols_interleaved = N;
const int qk = QK_K;
const int nb = n / qk;
static const uint32_t kmask1 = 0x3f3f3f3f;
static const uint32_t kmask2 = 0x0f0f0f0f;
static const uint32_t kmask3 = 0x03030303;
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
UNUSED(nr);
float sumf[ncols_interleaved];
float sum_minf[ncols_interleaved];
uint32_t utmp[32];
int sumi1;
int sumi2;
int sumi;
const block_q8_K * a_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) {
sumf[j] = 0.0;
sum_minf[j] = 0.0;
}
for (int l = 0; l < nb; l++) {
for (int sb = 0; sb < 8; sb++) {
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
utmp[sb * 4 + 2] = uaux_0;
utmp[sb * 4 + 0] &= kmask1;
}
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
constexpr int scale_stride = 32;
uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
const int qh_shift = (k / (32 / blocklen)) * 2;
for (int j = 0; j < ncols_interleaved; j++) {
sumi1 = 0;
sumi2 = 0;
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
const int qh_idx = (k * blocklen + i) % 32;
const int qh_chunk = qh_idx / blocklen;
const int qh_pos = qh_idx % blocklen;
const int b_qh_offset = qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
const uint8_t h0 = (qh_val >> qh_shift) & 1;
const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
const int q8_offset = (k / (32 / blocklen)) * 64 + (k % (32 / blocklen)) * blocklen + i;
sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]);
sumi1 = sumi1 * scales_0[j];
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
}
}
for (int sb = 0; sb < 8; sb++) {
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
for (int j = 0; j < ncols_interleaved; j++) {
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
}
}
}
for (int j = 0; j < ncols_interleaved; j++) {
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
}
}
}
template <int M, int N>
static void ggml_gemm_q5_K_NxM_q8_K_generic_impl(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int blocklen = M;
constexpr int ncols_interleaved = N;
const int qk = QK_K;
const int nb = n / qk;
static const uint32_t kmask1 = 0x3f3f3f3f;
static const uint32_t kmask2 = 0x0f0f0f0f;
static const uint32_t kmask3 = 0x03030303;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
float sumf[4][ncols_interleaved];
float sum_minf[4][ncols_interleaved];
uint32_t utmp[32];
int sumi1;
int sumi2;
int sumi;
for (int y = 0; y < nr / 4; y++) {
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumf[m][j] = 0.0;
sum_minf[m][j] = 0.0;
}
}
for (int l = 0; l < nb; l++) {
for (int sb = 0; sb < 8; sb++) {
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
utmp[sb * 4 + 2] = uaux_0;
utmp[sb * 4 + 0] &= kmask1;
}
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
constexpr int scale_stride = 32;
uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
const int qh_shift = (k / (32 / blocklen)) * 2;
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi1 = 0;
sumi2 = 0;
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
const int qh_idx = (k * blocklen + i) % 32;
const int qh_chunk = qh_idx / blocklen;
const int qh_pos = qh_idx % blocklen;
const int b_qh_offset =
qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
const uint8_t h0 = (qh_val >> qh_shift) & 1;
const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
const int q8_offset = (k / (32 / blocklen)) * 256 +
(k % (32 / blocklen)) * 4 * blocklen + m * blocklen + i;
sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]);
sumi1 = sumi1 * scales_0[j];
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
}
}
}
for (int sb = 0; sb < 8; sb++) {
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
for (int m = 0; m < 4; m++) {
const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
for (int j = 0; j < ncols_interleaved; j++) {
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
}
}
}
}
}
extern "C" {
void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@ -803,98 +1005,12 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
void ggml_gemv_q5_K_8x8_q8_K_generic(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 8;
static const uint32_t kmask1 = 0x3f3f3f3f;
static const uint32_t kmask2 = 0x0f0f0f0f;
static const uint32_t kmask3 = 0x03030303;
void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
ggml_gemv_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
}
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
UNUSED(nr);
float sumf[8];
float sum_minf[8];
uint32_t utmp[32];
int sumi1;
int sumi2;
int sumi;
const block_q8_K * a_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) {
sumf[j] = 0.0;
sum_minf[j] = 0.0;
}
for (int l = 0; l < nb; l++) {
for (int sb = 0; sb < 8; sb++) {
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
utmp[sb * 4 + 2] = uaux_0;
utmp[sb * 4 + 0] &= kmask1;
}
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
uint8_t * scales_0 = (uint8_t *) utmp + (k / 4) * 32;
uint8_t * scales_1 = (uint8_t *) utmp + (k / 4) * 32 + 16;
const int qh_shift = (k / 4) * 2;
for (int j = 0; j < ncols_interleaved; j++) {
sumi1 = 0;
sumi2 = 0;
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
const int qh_idx = (k * 8 + i) % 32;
const int qh_chunk = qh_idx / 8;
const int qh_pos = qh_idx % 8;
const int b_qh_offset = qh_chunk * 64 + j * 8 + qh_pos;
const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
const uint8_t h0 = (qh_val >> qh_shift) & 1;
const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
const int q8_offset = (k >> 2) * 64 + (k % 4) * blocklen + i;
sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]);
sumi1 = sumi1 * scales_0[j];
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
}
}
for (int sb = 0; sb < 8; sb++) {
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
for (int j = 0; j < ncols_interleaved; j++) {
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
}
}
}
for (int j = 0; j < ncols_interleaved; j++) {
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
}
}
void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
}
@ -982,6 +1098,82 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
}
}
void ggml_gemv_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
assert(nr == 1);
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
UNUSED(nr);
float sumf[4];
int sumi;
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
}
sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
}
}
void ggml_gemv_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 8;
assert(nr == 1);
assert(n % qk == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(bs);
UNUSED(nr);
float sumf[8];
int sumi;
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
}
sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
}
}
void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
float * GGML_RESTRICT s,
size_t bs,
@ -1494,107 +1686,12 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
void ggml_gemm_q5_K_8x8_q8_K_generic(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
const int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 8;
void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
ggml_gemm_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
}
constexpr uint32_t kmask1 = 0x3f3f3f3f;
constexpr uint32_t kmask2 = 0x0f0f0f0f;
constexpr uint32_t kmask3 = 0x03030303;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
float sumf[4][8];
float sum_minf[4][8];
uint32_t utmp[32];
int sumi1;
int sumi2;
int sumi;
for (int y = 0; y < nr / 4; y++) {
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumf[m][j] = 0.0;
sum_minf[m][j] = 0.0;
}
}
for (int l = 0; l < nb; l++) {
for (int sb = 0; sb < 8; sb++) {
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
utmp[sb * 4 + 2] = uaux_0;
utmp[sb * 4 + 0] &= kmask1;
}
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
uint8_t * scales_0 = (uint8_t *) utmp + (k / 4) * 32;
uint8_t * scales_1 = (uint8_t *) utmp + (k / 4) * 32 + 16;
const int qh_shift = (k / 4) * 2;
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi1 = 0;
sumi2 = 0;
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
const int qh_idx = (k * 8 + i) % 32;
const int qh_chunk = qh_idx / 8;
const int qh_pos = qh_idx % 8;
const int b_qh_offset = qh_chunk * 64 + j * 8 + qh_pos;
const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
const uint8_t h0 = (qh_val >> qh_shift) & 1;
const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
const int q8_offset = (k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i;
sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]);
sumi1 = sumi1 * scales_0[j];
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
}
}
}
for (int sb = 0; sb < 8; sb++) {
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
for (int m = 0; m < 4; m++) {
const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
for (int j = 0; j < ncols_interleaved; j++) {
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
}
}
}
}
void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
ggml_gemm_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@ -1705,6 +1802,94 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
}
}
void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
float sumf[4][4];
int sumi;
for (int y = 0; y < nr / 4; y++) {
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
}
sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++)
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
}
}
}
}
void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
float sumf[4][8];
int sumi;
for (int y = 0; y < nr / 4; y++) {
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
}
sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++)
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
}
}
}
}
void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
float * GGML_RESTRICT s,
size_t bs,
@ -2029,18 +2214,16 @@ static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_in
const int end = QK_K * 4 / blck_size_interleave;
// Interleave Q5_K quants by taking 8 bytes at a time
// Interleave Q5_K quants by taking blck_size_interleave bytes at a time
for (int i = 0; i < end; ++i) {
int src_id = i % 8;
int src_offset = (i / 8) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
}
// Repeat for low bits 8 bytes at a time as well, since
// Repeat for high bits with the same chunk size, since
// the high bits are interleaved in Q5_K and the index is
// qh_idx = (qs_idx % 32);
// qh_val = qh[qh_idx] >> (qs_idx / 32);
@ -2049,9 +2232,7 @@ static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_in
int src_offset = (i / 8) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elems;
memcpy(&elems, &in[src_id].qh[src_offset], sizeof(uint64_t));
memcpy(&out.qh[dst_offset], &elems, sizeof(uint64_t));
memcpy(&out.qh[dst_offset], &in[src_id].qh[src_offset], blck_size_interleave);
}
// The below logic is copied over from Q4_K
@ -2249,7 +2430,7 @@ static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t,
const void * GGML_RESTRICT data,
size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q5_K);
GGML_ASSERT(interleave_block == 8);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
constexpr int nrows_interleaved = 8;
block_q5_Kx8 * dst = (block_q5_Kx8 *) t->data;
@ -2493,6 +2674,121 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
GGML_UNUSED(data_size);
}
static block_mxfp4x4 make_block_mxfp4x4(block_mxfp4 * in, unsigned int blck_size_interleave) {
block_mxfp4x4 out;
for (int i = 0; i < 4; i++) {
out.e[i] = in[i].e;
}
const int end = QK_MXFP4 * 2 / blck_size_interleave;
if (blck_size_interleave == 4) {
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
}
} else {
GGML_ASSERT(false);
}
return out;
}
static int repack_mxfp4_to_mxfp4_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
GGML_ASSERT(interleave_block == 4);
const block_mxfp4 * src = (const block_mxfp4 *)data;
block_mxfp4x4 * dst = ( block_mxfp4x4 *)t->data;
block_mxfp4 dst_tmp[4];
int nrow = ggml_nrows(t);
int nrows_interleaved = 4;
int nblocks = t->ne[0] / QK_MXFP4;
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}
for (int b = 0; b < nrow; b += nrows_interleaved) {
for (int64_t x = 0; x < nblocks; x++) {
for (int i = 0; i < nrows_interleaved; i++) {
dst_tmp[i] = src[x + i * nblocks];
}
*dst++ = make_block_mxfp4x4(dst_tmp, interleave_block);
}
src += nrows_interleaved * nblocks;
}
return 0;
GGML_UNUSED(data_size);
}
static block_mxfp4x8 make_block_mxfp4x8(block_mxfp4 * in, unsigned int blck_size_interleave) {
block_mxfp4x8 out;
for (int i = 0; i < 8; i++) {
out.e[i] = in[i].e;
}
const int end = QK_MXFP4 * 4 / blck_size_interleave;
if (blck_size_interleave == 8) {
for (int i = 0; i < end; ++i) {
int src_id = i % 8;
int src_offset = (i / 8) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
}
} else {
GGML_ASSERT(false);
}
return out;
}
static int repack_mxfp4_to_mxfp4_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
GGML_ASSERT(interleave_block == 8);
const block_mxfp4 * src = (const block_mxfp4 *)data;
block_mxfp4x8 * dst = ( block_mxfp4x8 *)t->data;
block_mxfp4 dst_tmp[8];
int nrow = ggml_nrows(t);
int nrows_interleaved = 8;
int nblocks = t->ne[0] / QK_MXFP4;
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
if (t->ne[1] % nrows_interleaved != 0) {
return -1;
}
for (int b = 0; b < nrow; b += nrows_interleaved) {
for (int64_t x = 0; x < nblocks; x++) {
for (int i = 0; i < nrows_interleaved; i++) {
dst_tmp[i] = src[x + i * nblocks];
}
*dst++ = make_block_mxfp4x8(dst_tmp, interleave_block);
}
src += nrows_interleaved * nblocks;
}
return 0;
GGML_UNUSED(data_size);
}
namespace ggml::cpu::repack {
// repack
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
@ -2523,6 +2819,10 @@ template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * da
return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
}
template <> int repack<block_q5_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q5_K_to_q5_K_8_bl(t, 4, data, data_size);
}
template <> int repack<block_q5_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q5_K_to_q5_K_8_bl(t, 8, data, data_size);
}
@ -2548,6 +2848,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
}
template <> int repack<block_mxfp4, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_mxfp4_to_mxfp4_4_bl(t, 4, data, data_size);
}
template <> int repack<block_mxfp4, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_mxfp4_to_mxfp4_8_bl(t, 8, data, data_size);
}
template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
}
@ -2591,6 +2899,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
@ -2611,6 +2923,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
@ -2654,6 +2974,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
@ -2674,6 +2998,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
@ -3068,6 +3400,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
// instance for Q5_K
static const ggml::cpu::repack::tensor_traits<block_q5_K, 4, 8, GGML_TYPE_Q8_K> q5_K_8x4_q8_K;
static const ggml::cpu::repack::tensor_traits<block_q5_K, 8, 8, GGML_TYPE_Q8_K> q5_K_8x8_q8_K;
// instance for Q6_K
@ -3081,6 +3414,10 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
// instance for MXFP4
static const ggml::cpu::repack::tensor_traits<block_mxfp4, 4, 4, GGML_TYPE_Q8_0> mxfp4_4x4_q8_0;
static const ggml::cpu::repack::tensor_traits<block_mxfp4, 8, 8, GGML_TYPE_Q8_0> mxfp4_8x8_q8_0;
// instance for Q8_0
static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
@ -3130,6 +3467,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
return &q5_K_8x8_q8_K;
}
}
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
if (cur->ne[1] % 8 == 0) {
return &q5_K_8x4_q8_K;
}
}
} else if (cur->type == GGML_TYPE_Q6_K) {
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
if (cur->ne[1] % 8 == 0) {
@ -3152,6 +3494,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
return &iq4_nl_4x4_q8_0;
}
}
} else if (cur->type == GGML_TYPE_MXFP4) {
if (ggml_cpu_has_avx2()) {
if (cur->ne[1] % 8 == 0) {
return &mxfp4_8x8_q8_0;
}
}
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
if (cur->ne[1] % 4 == 0) {
return &mxfp4_4x4_q8_0;
}
}
} else if (cur->type == GGML_TYPE_Q8_0) {
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
if (cur->ne[1] % 4 == 0) {

View File

@ -97,6 +97,19 @@ struct block_iq4_nlx8 {
static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
struct block_mxfp4x4 {
uint8_t e[4];
uint8_t qs[QK_MXFP4 * 2];
};
static_assert(sizeof(block_mxfp4x4) == 4 + QK_MXFP4 * 2, "wrong mxfp4x4 block size/padding");
struct block_mxfp4x8 {
uint8_t e[8];
uint8_t qs[QK_MXFP4 * 4];
};
static_assert(sizeof(block_mxfp4x8) == 8 + QK_MXFP4 * 4, "wrong mxfp4x8 block size/padding");
#if defined(__cplusplus)
extern "C" {
#endif
@ -111,22 +124,28 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q5_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q5_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q6_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_mxfp4_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q5_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q5_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q6_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_mxfp4_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@ -143,22 +162,28 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

View File

@ -1149,8 +1149,7 @@ struct ggml_cuda_graph {
size_t num_nodes = 0;
std::vector<cudaGraphNode_t> nodes;
bool disable_due_to_gpu_arch = false;
bool disable_due_to_too_many_updates = false;
int number_consecutive_updates = 0;
bool warmup_complete = false;
std::vector<ggml_cuda_graph_node_properties> props;
// these are extra tensors (inputs) that participate in the ggml graph but are not nodes
@ -1159,21 +1158,9 @@ struct ggml_cuda_graph {
// ref: https://github.com/ggml-org/llama.cpp/pull/19165
std::vector<ggml_cuda_graph_node_properties> extra;
void record_update(bool use_graph, bool update_required) {
if (use_graph && update_required) {
number_consecutive_updates++;
} else {
number_consecutive_updates = 0;
}
if (number_consecutive_updates >= 4) {
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
disable_due_to_too_many_updates = true;
}
}
bool is_enabled() const {
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env);
}
#endif
};

View File

@ -16,27 +16,27 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
return;
}
const int64_t i01 = blockIdx.y;
for (int64_t i01 = blockIdx.y; i01 < ne01; i01 += gridDim.y) {
for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) {
const uint2 dm = fast_div_modulo((uint32_t)i0203, ne02);
const int64_t i02 = dm.y;
const int64_t i03 = dm.x;
for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) {
const uint2 dm = fast_div_modulo((uint32_t)i0203, ne02);
const int64_t i02 = dm.y;
const int64_t i03 = dm.x;
const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
const int64_t ib = ibx0 + i00/qk; // block index
const int64_t iqs = (i00%qk)/qr; // quant index
const int64_t iybs = i00 - i00%qk; // y block start index
const int64_t y_offset = qr == 1 ? 1 : qk/2;
const int64_t ib = ibx0 + i00/qk; // block index
const int64_t iqs = (i00%qk)/qr; // quant index
const int64_t iybs = i00 - i00%qk; // y block start index
const int64_t y_offset = qr == 1 ? 1 : qk/2;
// dequantize
float2 v;
dequantize_kernel(vx, ib, iqs, v);
// dequantize
float2 v;
dequantize_kernel(vx, ib, iqs, v);
const int64_t iy0 = (i0203*ne01 + i01)*ne00 + iybs + iqs;
y[iy0 + 0] = ggml_cuda_cast<dst_t>(v.x);
y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
const int64_t iy0 = (i0203*ne01 + i01)*ne00 + iybs + iqs;
y[iy0 + 0] = ggml_cuda_cast<dst_t>(v.x);
y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
}
}
}
@ -492,7 +492,7 @@ static void dequantize_block_cuda(const void * vx, dst_t * y,
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
const int64_t ne0203 = ne02*ne03;
const uint3 ne02_fdv = init_fastdiv_values(ne02);
const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, (int)std::min(ne0203, (int64_t)65535));
const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), (int)std::min(ne01, (int64_t)65535), (int)std::min(ne0203, (int64_t)65535));
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
(vx, y, ne00, ne01, ne0203, ne02_fdv, s01, s02, s03);
}
@ -628,18 +628,18 @@ static __global__ void convert_unary(
return;
}
const int64_t i01 = blockIdx.y;
const src_t * x = (const src_t *) vx;
for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) {
const uint2 dm = fast_div_modulo((uint32_t)i0203, ne02);
const int64_t i02 = dm.y;
const int64_t i03 = dm.x;
for (int64_t i01 = blockIdx.y; i01 < ne01; i01 += gridDim.y) {
for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) {
const uint2 dm = fast_div_modulo((uint32_t)i0203, ne02);
const int64_t i02 = dm.y;
const int64_t i03 = dm.x;
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
const int64_t iy = (i0203*ne01 + i01)*ne00 + i00;
y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
const int64_t iy = (i0203*ne01 + i01)*ne00 + i00;
y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
}
}
}
@ -649,7 +649,7 @@ static void convert_unary_cuda(const void * vx, dst_t * y,
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
const int64_t ne0203 = ne02*ne03;
const uint3 ne02_fdv = init_fastdiv_values(ne02);
const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, (int)std::min(ne0203, (int64_t)65535));
const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, (int)std::min(ne01, (int64_t)65535), (int)std::min(ne0203, (int64_t)65535));
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
(vx, y, ne00, ne01, ne0203, ne02_fdv, s01, s02, s03);
}

View File

@ -111,6 +111,44 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
}
static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_cdna(const int DKQ, const int DV, const int ncols) {
// Conservative configs for CDNA (MI100+): 64KB LDS, wavefront64, nstages=1 (no cp.async).
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 8, 128, 2, 128, 32, 32, 32, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 16, 128, 2, 64, 32, 32, 32, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 32, 128, 2, 64, 32, 32, 32, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 64, 256, 2, 64, 32, 32, 32, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 8, 128, 2, 128, 40, 40, 40, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 16, 128, 2, 64, 40, 40, 40, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 32, 128, 2, 64, 40, 40, 40, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 64, 256, 2, 64, 40, 40, 40, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 8, 128, 2, 128, 48, 48, 48, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 16, 128, 2, 64, 48, 48, 48, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 32, 128, 2, 64, 48, 48, 48, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 64, 256, 2, 64, 48, 48, 48, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 8, 128, 2, 128, 56, 56, 56, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 16, 128, 2, 64, 56, 56, 56, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 32, 128, 2, 64, 56, 56, 56, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 64, 256, 2, 64, 56, 56, 56, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 8, 128, 2, 128, 64, 64, 64, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 16, 128, 2, 64, 64, 64, 64, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 128, 2, 64, 64, 64, 64, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 64, 256, 2, 64, 64, 64, 64, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 8, 64, 4, 64, 128, 128, 128, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 64, 4, 32, 128, 128, 128, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 32, 128, 128, 128, 1, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 256, 2, 32, 128, 128, 128, 1, true);
// Fallback for unsupported DKQ values (e.g. 576). Must return non-zero values to satisfy
// compile-time static_asserts even though the kernel guard prevents runtime execution.
// nthreads=256 gives nwarps=4 (warp_size=64) or 8 (warp_size=32), nbatch_fa=128 satisfies np*16 divisibility.
return fattn_mma_config(256, 1, 128, 4, 4, 4, 1, false);
}
static __host__ fattn_mma_config ggml_cuda_fattn_mma_get_config(const int DKQ, const int DV, const int ncols, const int cc) {
if (ampere_mma_available(cc)) {
return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
@ -118,6 +156,9 @@ static __host__ fattn_mma_config ggml_cuda_fattn_mma_get_config(const int DKQ, c
if (turing_mma_available(cc)) {
return ggml_cuda_fattn_mma_get_config_turing(DKQ, DV, ncols);
}
if (amd_mfma_available(cc)) {
return ggml_cuda_fattn_mma_get_config_cdna(DKQ, DV, ncols);
}
if (amd_wmma_available(cc)) {
return ggml_cuda_fattn_mma_get_config_rdna(DKQ, DV, ncols);
}
@ -130,6 +171,8 @@ static constexpr __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config(cons
return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
#elif defined(TURING_MMA_AVAILABLE)
return ggml_cuda_fattn_mma_get_config_turing(DKQ, DV, ncols);
#elif defined(AMD_MFMA_AVAILABLE)
return ggml_cuda_fattn_mma_get_config_cdna(DKQ, DV, ncols);
#elif defined(VOLTA_MMA_AVAILABLE)
return ggml_cuda_fattn_mma_get_config_volta(DKQ, DV, ncols);
#elif defined(AMD_WMMA_AVAILABLE)
@ -205,15 +248,15 @@ static constexpr __device__ bool ggml_cuda_fattn_mma_get_Q_in_reg(const int DKQ,
}
static constexpr __device__ int get_cols_per_thread() {
#if defined(AMD_WMMA_AVAILABLE)
return 1; // RDNA has a single column.
#if defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
return 1; // AMD has a single column per thread.
#else
return 2; // This is specifically KQ columns, Volta only has a single VKQ column.
#endif // defined(AMD_WMMA_AVAILABLE)
#endif // defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
}
static __host__ int get_cols_per_warp(const int cc) {
if (turing_mma_available(cc) || amd_wmma_available(cc)) {
if (turing_mma_available(cc) || amd_wmma_available(cc) || amd_mfma_available(cc)) {
return 16;
} else {
// Volta
@ -241,6 +284,7 @@ static constexpr __device__ int ggml_cuda_fattn_mma_get_nstages(const int DKQ, c
template<int stride_tile, int nwarps, int nbatch_fa, bool use_cp_async, bool oob_check>
static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int D2, const int stride_KV, const int i_sup) {
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
// K/V data is loaded with decreasing granularity for D for better memory bandwidth.
// The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
if constexpr (use_cp_async) {
@ -252,10 +296,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV);
auto load = [&] __device__ (auto n) {
const int stride_k = WARP_SIZE >> n;
const int k0_start = stride_k == WARP_SIZE ? 0 : chunks_per_row - chunks_per_row % (2*stride_k);
const int stride_k = warp_size >> n;
const int k0_start = stride_k == warp_size ? 0 : chunks_per_row - chunks_per_row % (2*stride_k);
const int k0_stop = chunks_per_row - chunks_per_row % (1*stride_k);
const int stride_i = WARP_SIZE / stride_k;
const int stride_i = warp_size / stride_k;
if (k0_start == k0_stop) {
return;
@ -263,7 +307,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
#pragma unroll
for (int i0 = 0; i0 < nbatch_fa; i0 += nwarps*stride_i) {
const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k);
const int i = i0 + threadIdx.y*stride_i + (stride_k == warp_size ? 0 : threadIdx.x / stride_k);
if (i0 + nwarps*stride_i > nbatch_fa && i >= nbatch_fa) {
break;
@ -271,7 +315,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
#pragma unroll
for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
const int k = k0 + (stride_k == warp_size ? threadIdx.x : threadIdx.x % stride_k);
cp_async_cg_16<preload>(tile_KV_32 + i*(stride_tile*sizeof(half2)) + k*16, KV + i*stride_KV + k*h2_per_chunk);
}
@ -287,10 +331,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
} else {
// TODO use ggml_cuda_memcpy_1
auto load = [&] __device__ (const int n) {
const int stride_k = WARP_SIZE >> n;
const int k0_start = stride_k == WARP_SIZE ? 0 : D2 - D2 % (2*stride_k);
const int stride_k = warp_size >> n;
const int k0_start = stride_k == warp_size ? 0 : D2 - D2 % (2*stride_k);
const int k0_stop = D2 - D2 % (1*stride_k);
const int stride_i = WARP_SIZE / stride_k;
const int stride_i = warp_size / stride_k;
if (k0_start == k0_stop) {
return;
@ -298,7 +342,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
#pragma unroll
for (int i0 = 0; i0 < nbatch_fa; i0 += nwarps*stride_i) {
const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k);
const int i = i0 + threadIdx.y*stride_i + (stride_k == warp_size ? 0 : threadIdx.x / stride_k);
if (i0 + nwarps*stride_i > nbatch_fa && i >= nbatch_fa) {
break;
@ -306,7 +350,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
#pragma unroll
for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
const int k = k0 + (stride_k == warp_size ? threadIdx.x : threadIdx.x % stride_k);
tile_KV[i*stride_tile + k] = !oob_check || i < i_sup ? KV[i*stride_KV + k] : make_half2(0.0f, 0.0f);
}
@ -324,18 +368,19 @@ template<int ncols1, int nwarps, int nbatch_fa, bool use_cp_async, bool oob_chec
static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
const half * const __restrict__ mask_h, half * const __restrict__ tile_mask,
const int stride_mask, const int i_sup, const int j0, const uint3 ne01) {
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
if constexpr (use_cp_async) {
static_assert(nbatch_fa <= 8*WARP_SIZE && nbatch_fa % 8 == 0, "bad nbatch_fa");
static_assert(nbatch_fa <= 8*warp_size && nbatch_fa % 8 == 0, "bad nbatch_fa");
static_assert(!oob_check, "OOB check incompatible with cp_async");
constexpr int preload = nbatch_fa >= 32 ? nbatch_fa * sizeof(half) : 64;
constexpr int cols_per_warp = 8*WARP_SIZE/nbatch_fa;
constexpr int cols_per_warp = 8*warp_size/nbatch_fa;
constexpr int stride_j = nwarps * cols_per_warp;
const unsigned int tile_mask_32 = ggml_cuda_cvta_generic_to_shared(tile_mask);
#pragma unroll
for (int j1 = 0; j1 < ncols1; j1 += stride_j) {
const int j_sram = j1 + threadIdx.y*cols_per_warp + threadIdx.x / (WARP_SIZE/cols_per_warp);
const int j_sram = j1 + threadIdx.y*cols_per_warp + threadIdx.x / (warp_size/cols_per_warp);
const int j_vram = fastmodulo(j0 + j_sram, ne01);
if (j1 + stride_j > ncols1 && j_sram >= ncols1) {
@ -357,25 +402,25 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
}
#pragma unroll
for (int i0 = 0; i0 < nbatch_fa; i0 += WARP_SIZE) {
for (int i0 = 0; i0 < nbatch_fa; i0 += warp_size) {
const int i = i0 + threadIdx.x;
tile_mask[j_sram*(nbatch_fa + 8) + i] = i < i_sup ? mask_h[j_vram*stride_mask + i] : half(0.0f);
}
}
} else if constexpr (nbatch_fa < 2*WARP_SIZE) {
constexpr int cols_per_warp = 2*WARP_SIZE/nbatch_fa;
} else if constexpr (nbatch_fa < 2*warp_size) {
constexpr int cols_per_warp = 2*warp_size/nbatch_fa;
constexpr int stride_j = nwarps * cols_per_warp;
#pragma unroll
for (int j1 = 0; j1 < ncols1; j1 += stride_j) {
const int j_sram = j1 + threadIdx.y*cols_per_warp + threadIdx.x / (WARP_SIZE/cols_per_warp);
const int j_sram = j1 + threadIdx.y*cols_per_warp + threadIdx.x / (warp_size/cols_per_warp);
const int j_vram = fastmodulo(j0 + j_sram, ne01);
if (j1 + stride_j > ncols1 && j_sram >= ncols1) {
break;
}
const int i = threadIdx.x % (WARP_SIZE/cols_per_warp);
const int i = threadIdx.x % (warp_size/cols_per_warp);
ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + 2*i, mask_h + j_vram*stride_mask + 2*i);
}
@ -390,7 +435,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
}
#pragma unroll
for (int i0 = 0; i0 < nbatch_fa; i0 += 2*WARP_SIZE) {
for (int i0 = 0; i0 < nbatch_fa; i0 += 2*warp_size) {
const int i = i0 + 2*threadIdx.x;
ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + i, mask_h + j_vram*stride_mask + i);
@ -428,7 +473,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
const int jt,
const int kb0,
const int k_VKQ_sup) {
#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE)
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int ncols = ncols1 * ncols2;
constexpr int cols_per_warp = T_B_KQ::I;
constexpr int cols_per_thread = get_cols_per_thread();
@ -447,7 +493,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
const int k_VKQ_0 = kb0 * nbatch_fa;
#if defined(TURING_MMA_AVAILABLE)
T_C_KQ KQ_C[nbatch_fa/(np*(cols_per_warp == 8 ? T_C_KQ::I : T_C_KQ::J))];
#elif defined(AMD_WMMA_AVAILABLE)
#elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
T_C_KQ KQ_C[nbatch_fa/(np*T_C_KQ::J)];
#else // Volta
T_C_KQ KQ_C[nbatch_fa/(np*T_C_KQ::J)];
@ -500,13 +546,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[k_KQ_0/T_A_KQ::J]);
} else {
// Wide version of KQ_C is column-major
#if defined(AMD_WMMA_AVAILABLE)
// RDNA matrix C is column-major.
#if defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
// AMD matrix C is column-major.
mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[k_KQ_0/T_A_KQ::J]);
#else
// swap A and B for CUDA.
mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[k_KQ_0/T_A_KQ::J], K_A);
#endif // defined(AMD_WMMA_AVAILABLE)
#endif // defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
}
}
}
@ -526,13 +572,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
} else {
// Wide version of KQ_C is column-major
#if defined(AMD_WMMA_AVAILABLE)
// RDNA matrix C is column-major.
#if defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
// AMD matrix C is column-major.
mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
#else
// swap A and B for CUDA.
mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
#endif // defined(AMD_WMMA_AVAILABLE)
#endif // defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
}
}
}
@ -585,12 +631,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
#pragma unroll
for (int l = 0; l < T_C_KQ::ne; ++l) {
if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
#if defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
constexpr int KQ_idx = 0;
#else
// Turing + Volta:
const int KQ_idx = l % 2;
#endif // defined(AMD_WMMA_AVAILABLE)
#endif // defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
KQ_max_new[KQ_idx] = fmaxf(KQ_max_new[KQ_idx], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
}
}
@ -601,7 +647,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
for (int col = 0; col < cols_per_thread; ++col) {
#pragma unroll
for (int offset = 16; offset >= 4; offset >>= 1) {
KQ_max_new[col] = fmaxf(KQ_max_new[col], __shfl_xor_sync(0xFFFFFFFF, KQ_max_new[col], offset, WARP_SIZE));
KQ_max_new[col] = fmaxf(KQ_max_new[col], __shfl_xor_sync(0xFFFFFFFF, KQ_max_new[col], offset, warp_size));
}
}
@ -611,12 +657,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
#pragma unroll
for (int l = 0; l < T_C_KQ::ne; ++l) {
if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
#if defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
constexpr int KQ_idx = 0;
#else
// Turing + Volta:
const int KQ_idx = l % 2;
#endif // defined(AMD_WMMA_AVAILABLE)
#endif // defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
KQ_C[k0/(np*T_C_KQ::I)].x[l] = expf(KQ_C[k0/(np*T_C_KQ::I)].x[l] - KQ_max_new[KQ_idx]);
KQ_rowsum_add[KQ_idx] += KQ_C[k0/(np*T_C_KQ::I)].x[l];
} else {
@ -649,12 +695,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
#pragma unroll
for (int l = 0; l < T_C_KQ::ne; ++l) {
if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
#if defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
constexpr int KQ_idx = 0;
#else
// Turing + Volta:
const int KQ_idx = (l/2) % 2;
#endif // defined(AMD_WMMA_AVAILABLE)
#endif // defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
KQ_max_new[KQ_idx] = fmaxf(KQ_max_new[KQ_idx], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
}
}
@ -666,6 +712,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
// Values per KQ column are spread across 4 threads:
constexpr int offset_first = 2;
constexpr int offset_last = 1;
#elif defined(AMD_MFMA_AVAILABLE)
// MFMA: 4 threads per Q column (threadIdx.x % 16 == col, spaced by 16).
constexpr int offset_first = 32;
constexpr int offset_last = 16;
#elif defined(AMD_WMMA_AVAILABLE)
// Values per KQ column are spread across 2 threads:
constexpr int offset_first = 16;
@ -677,7 +727,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
#endif // defined(TURING_MMA_AVAILABLE)
#pragma unroll
for (int offset = offset_first; offset >= offset_last; offset >>= 1) {
KQ_max_new[col] = fmaxf(KQ_max_new[col], __shfl_xor_sync(0xFFFFFFFF, KQ_max_new[col], offset, WARP_SIZE));
KQ_max_new[col] = fmaxf(KQ_max_new[col], __shfl_xor_sync(0xFFFFFFFF, KQ_max_new[col], offset, warp_size));
}
}
@ -687,12 +737,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
#pragma unroll
for (int l = 0; l < T_C_KQ::ne; ++l) {
if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
#if defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
constexpr int KQ_idx = 0;
#else
// Turing + Volta:
const int KQ_idx = (l/2) % 2;
#endif // defined(AMD_WMMA_AVAILABLE)
#endif // defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
KQ_C[(k0/(np*T_C_KQ::J))].x[l] = expf(KQ_C[(k0/(np*T_C_KQ::J))].x[l] - KQ_max_new[KQ_idx]);
KQ_rowsum_add[KQ_idx] += KQ_C[(k0/(np*T_C_KQ::J))].x[l];
} else {
@ -739,7 +789,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
}
}
}
#elif defined(AMD_WMMA_AVAILABLE)
#elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
const half2 KQ_max_scale_h2 = make_half2(
KQ_max_scale[0], KQ_max_scale[0]);
#pragma unroll
@ -818,7 +868,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
}
const half2 * tile_V_i = !V_is_K_view || i0_stop > 2*nbatch_K2 ? tile_V : tile_V + i0_start/2;
#if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
constexpr int i0_stride = cols_per_warp == 8 ? T_C_VKQ::I : 2*T_C_VKQ::J;
#pragma unroll
for (int i_VKQ_0 = i0_start; i_VKQ_0 < i0_stop; i_VKQ_0 += i0_stride) {
@ -830,24 +880,38 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
T_A_VKQ A; // Transposed in SRAM but not in registers, gets transposed on load.
#if defined(LDMATRIX_TRANS_AVAILABLE)
load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
#elif defined(AMD_MFMA_AVAILABLE)
// MFMA A register layout: A_mat[i=lane%16][k=4*(lane/16)+reg].
// Normal load gives A_mat[seq][dv] but we need A_mat[dv][seq] = V^T.
// Load with transposed addressing: 4 strided half loads.
{
const half2 * xs0 = tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2;
const half * xs0_h = (const half *) xs0;
const int stride_h = stride_tile_V * 2; // stride in half units
half * A_h = (half *) A.x;
#pragma unroll
for (int l = 0; l < 4; ++l) {
A_h[l] = xs0_h[(4*(threadIdx.x / 16) + l) * stride_h + threadIdx.x % 16];
}
}
#else
// TODO: Try to transpose tile_V when loading gmem to smem.
// Use mma to transpose T_A_VKQ for RDNA.
T_A_VKQ A_trans;
load_ldmatrix(A_trans, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
mma(A, A_trans, A_identity);
#endif // defined(TURING_MMA_AVAILABLE)
#endif // defined(LDMATRIX_TRANS_AVAILABLE)
if constexpr (T_B_KQ::I == 8) {
mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
} else {
// Wide version of VKQ_C is column-major.
#if defined(AMD_WMMA_AVAILABLE)
// RDNA matrix C is column-major.
#if defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
// AMD matrix C is column-major.
mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
#else
// swap A and B for CUDA.
mma(VKQ_C[i_VKQ_0/i0_stride], B[k00/(np*T_A_VKQ::J)], A);
#endif // defined(AMD_WMMA_AVAILABLE)
#endif // defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
}
}
}
@ -866,7 +930,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
mma(VKQ_C[i_VKQ_0/i0_stride], B[k00/(np*T_A_VKQ::I)], A);
}
}
#endif // defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#endif // defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
if constexpr (nstages <= 1) {
__syncthreads(); // Only needed if tile_K == tile_V.
@ -879,7 +943,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
tile_Q, tile_K, tile_V, tile_mask,
Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
NO_DEVICE_CODE;
#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE)
}
#if defined(TURING_MMA_AVAILABLE)
@ -899,7 +963,7 @@ template<> struct mma_tile_sizes<8> {
using T_B_VKQ = tile< 8, 8, half2>; // column-major
using T_C_VKQ = tile<16, 4, half2>; // row-major
};
#elif defined(AMD_WMMA_AVAILABLE)
#elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
template<int ncols> struct mma_tile_sizes {
using T_A_KQ = tile<16, 8, half2>; // row-major
using T_B_KQ = tile<16, 8, half2>; // column-major
@ -944,9 +1008,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
const int zt_gqa,
const int kb0_start,
const int kb0_stop) {
#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE)
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int ncols = ncols1 * ncols2;
using T_A_KQ = typename mma_tile_sizes<ncols>::T_A_KQ;
using T_B_KQ = typename mma_tile_sizes<ncols>::T_B_KQ;
@ -986,7 +1051,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
T_B_KQ Q_B[(Q_in_reg ? DKQ/(2*T_B_KQ::J) : 1)];
#if defined(TURING_MMA_AVAILABLE)
T_C_VKQ VKQ_C[cols_per_warp == 8 ? DV/T_C_VKQ::I : DV/(2*T_C_VKQ::J)];
#elif defined(AMD_WMMA_AVAILABLE)
#elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
T_C_VKQ VKQ_C[ DV/(2*T_C_VKQ::J)];
#else // Volta
T_C_VKQ VKQ_C[ DV/(2*T_C_VKQ::J)];
@ -1004,10 +1069,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
// The loading is done with decreasing granularity for D for better memory bandwidth.
const half2 scale_h2 = make_half2(scale, scale);
#pragma unroll
for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) {
const int k0_start = stride_k == WARP_SIZE ? 0 : DKQ/2 - (DKQ/2) % (2*stride_k);
for (int stride_k : {warp_size, warp_size/2, warp_size/4, warp_size/8}) {
const int k0_start = stride_k == warp_size ? 0 : DKQ/2 - (DKQ/2) % (2*stride_k);
const int k0_stop = DKQ/2 - (DKQ/2) % (1*stride_k);
const int stride_jc = WARP_SIZE / stride_k;
const int stride_jc = warp_size / stride_k;
if (k0_start == k0_stop) {
continue;
@ -1015,7 +1080,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
#pragma unroll
for (int jc0 = 0; jc0 < ncols; jc0 += nwarps*stride_jc) {
const int jc = jc0 + threadIdx.y*stride_jc + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k);
const int jc = jc0 + threadIdx.y*stride_jc + (stride_k == warp_size ? 0 : threadIdx.x / stride_k);
if (jc0 + nwarps*stride_jc > ncols && jc >= ncols) {
break;
@ -1027,7 +1092,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
if ((ncols1 == 1 || jt*ncols1 + j < int(ne01.z)) && (ncols2 == 1 || zt_gqa*ncols2 + c < gqa_ratio)) {
#pragma unroll
for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
const int k = k0 + (stride_k == warp_size ? threadIdx.x : threadIdx.x % stride_k);
const float2 tmp = Q_f2[(jt*ncols1 + j)*stride_Q1 + c*stride_Q2 + k];
tile_Q[jc*stride_tile_Q + k] = scale_h2 * make_half2(tmp.x, tmp.y);
@ -1035,7 +1100,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
} else {
#pragma unroll
for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
const int k = k0 + (stride_k == warp_size ? threadIdx.x : threadIdx.x % stride_k);
tile_Q[jc*stride_tile_Q + k] = make_half2(0.0f, 0.0f);
}
@ -1127,6 +1192,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
// The partial sums are spread across 8/4 threads.
constexpr int offset_first = cols_per_warp == 8 ? 16 : 2;
constexpr int offset_last = cols_per_warp == 8 ? 4 : 1;
#elif defined(AMD_MFMA_AVAILABLE)
// The partial sums are spread across 4 threads (wavefront64, 16 cols).
constexpr int offset_first = 32;
constexpr int offset_last = 16;
#elif defined(AMD_WMMA_AVAILABLE)
// The partial sums are spread across 2 threads.
constexpr int offset_first = 16;
@ -1140,7 +1209,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
for (int col = 0; col < cols_per_thread; ++col) {
#pragma unroll
for (int offset = offset_first; offset >= offset_last; offset >>= 1) {
KQ_rowsum[col] += __shfl_xor_sync(0xFFFFFFFF, KQ_rowsum[col], offset, WARP_SIZE);
KQ_rowsum[col] += __shfl_xor_sync(0xFFFFFFFF, KQ_rowsum[col], offset, warp_size);
}
}
}
@ -1189,7 +1258,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
}
}
}
#elif defined(AMD_WMMA_AVAILABLE)
#elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[0]);
#pragma unroll
for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) {
@ -1249,7 +1318,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
const int jc_cwm = threadIdx.y*cols_per_warp + T_C_VKQ::get_i(threadIdx.x % 4);
const float2 KQ_cmr = make_float2(KQ_max[threadIdx.x % cols_per_thread], KQ_rowsum[threadIdx.x % cols_per_thread]);
const bool thread_should_write = threadIdx.x % 4 < cols_per_thread;
#elif defined(AMD_WMMA_AVAILABLE)
#elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
const int jc_cwm = threadIdx.y*cols_per_warp + T_C_VKQ::get_i(0);
const float2 KQ_cmr = make_float2(KQ_max[0], KQ_rowsum[0]);
const bool thread_should_write = threadIdx.x / 16 < cols_per_thread;
@ -1283,14 +1352,14 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
// Warps with threadIdx.y % np != 0 must NOT return early.
// All threads must return simultaneously to avoid race conditions with work on the next tile.
constexpr int nmeta = np*cols_per_warp >= WARP_SIZE ? np*cols_per_warp/WARP_SIZE : 1;
constexpr int nmeta = np*cols_per_warp >= warp_size ? np*cols_per_warp/warp_size : 1;
const int jc_meta = threadIdx.y*cols_per_warp + (np*cols_per_warp < WARP_SIZE ? threadIdx.x % (np*cols_per_warp) : threadIdx.x);
const int jc_meta = threadIdx.y*cols_per_warp + (np*cols_per_warp < warp_size ? threadIdx.x % (np*cols_per_warp) : threadIdx.x);
float2 * const meta_ptr = ((float2 *) tile_Q) + jc_meta*(tile_stride/2) + nbatch_combine/2;
float2 meta[nmeta];
#pragma unroll
for (int imeta = 0; imeta < nmeta; ++imeta) {
meta[imeta] = meta_ptr[imeta * WARP_SIZE * tile_stride/2];
meta[imeta] = meta_ptr[imeta * warp_size * tile_stride/2];
}
float KQ_cmn = meta[0].x; // KQ combine max new, max between all parallel warps.
@ -1300,8 +1369,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
}
#pragma unroll
for (int offset = np*cols_per_warp/2; offset >= cols_per_warp; offset >>= 1) {
if (offset < WARP_SIZE) {
KQ_cmn = fmaxf(KQ_cmn, __shfl_xor_sync(0xFFFFFFFF, KQ_cmn, offset, WARP_SIZE));
if (offset < warp_size) {
KQ_cmn = fmaxf(KQ_cmn, __shfl_xor_sync(0xFFFFFFFF, KQ_cmn, offset, warp_size));
}
}
@ -1318,8 +1387,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
}
#pragma unroll
for (int offset = np*cols_per_warp/2; offset >= cols_per_warp; offset >>= 1) {
if (offset < WARP_SIZE) {
KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE);
if (offset < warp_size) {
KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, warp_size);
}
}
@ -1328,19 +1397,19 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
// Write back combined meta data:
#pragma unroll
for (int imeta = 0; imeta < nmeta; ++imeta) {
if (np*cols_per_warp >= WARP_SIZE || threadIdx.x < np*cols_per_warp) {
if (np*cols_per_warp >= warp_size || threadIdx.x < np*cols_per_warp) {
// Combined KQ max scale + rowsum.
meta_ptr[imeta * WARP_SIZE * tile_stride/2] = make_float2(KQ_cms[imeta], KQ_crs);
meta_ptr[imeta * warp_size * tile_stride/2] = make_float2(KQ_cms[imeta], KQ_crs);
}
}
// Combined KQ max + rowsum.
static_assert(cols_per_warp <= WARP_SIZE);
if (needs_fixup && (cols_per_warp == WARP_SIZE || threadIdx.x < cols_per_warp)) {
static_assert(cols_per_warp <= warp_size);
if (needs_fixup && (cols_per_warp == warp_size || threadIdx.x < cols_per_warp)) {
float2 * dstk_fixup_meta = dstk_fixup + blockIdx.x*ncols;
dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
}
if (is_fixup && (cols_per_warp == WARP_SIZE || threadIdx.x < cols_per_warp)) {
if (is_fixup && (cols_per_warp == warp_size || threadIdx.x < cols_per_warp)) {
float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;
dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
}
@ -1388,10 +1457,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
float2 * dstk_fixup_data = dstk_fixup + gridDim.x*(2*ncols) + blockIdx.x*(ncols*(DV/2));
#pragma unroll
for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) {
const int k0_start = stride_k == WARP_SIZE ? 0 : nbatch_combine - nbatch_combine % (2*stride_k);
for (int stride_k : {warp_size, warp_size/2, warp_size/4, warp_size/8}) {
const int k0_start = stride_k == warp_size ? 0 : nbatch_combine - nbatch_combine % (2*stride_k);
const int k0_stop = nbatch_combine - nbatch_combine % (1*stride_k);
const int stride_jc = WARP_SIZE / stride_k;
const int stride_jc = warp_size / stride_k;
if (k0_start == k0_stop) {
continue;
@ -1399,7 +1468,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
#pragma unroll
for (int jc0_dst = 0; jc0_dst < ncols; jc0_dst += (nwarps/np)*stride_jc) {
const int jc_dst = jc0_dst + (threadIdx.y/np)*stride_jc + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k);
const int jc_dst = jc0_dst + (threadIdx.y/np)*stride_jc + (stride_k == warp_size ? 0 : threadIdx.x / stride_k);
if (jc0_dst + (nwarps/np)*stride_jc > ncols && jc_dst >= ncols) {
break;
@ -1417,7 +1486,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
const float * meta_j = (const float *) tile_Q + jc_tile_K*tile_stride + nbatch_combine;
#pragma unroll
for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
const int k = k0 + (stride_k == warp_size ? threadIdx.x : threadIdx.x % stride_k);
float2 dstk_val = make_float2(0.0f, 0.0f);
#pragma unroll
@ -1453,7 +1522,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
stride_Q1, stride_Q2, stride_K, stride_V, stride_mask,
jt, kb0_start, kb0_stop);
NO_DEVICE_CODE;
#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE)
}
template<int DKQ, int DV, int ncols1, int ncols2, bool use_logit_softcap, bool V_is_K_view>
@ -1480,7 +1549,7 @@ static __global__ void flash_attn_ext_f16(
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)))
#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE))
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(DKQ == 128 || DKQ == 256)) {
@ -1508,10 +1577,18 @@ static __global__ void flash_attn_ext_f16(
}
#endif // defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
if (DKQ != 64 && DKQ != 80 && DKQ != 96 && DKQ != 112 && DKQ != 128) {
NO_DEVICE_CODE;
return;
}
#endif // defined(AMD_MFMA_AVAILABLE)
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int ncols = ncols1 * ncols2;
constexpr int nbatch_fa = ggml_cuda_fattn_mma_get_nbatch_fa(DKQ, DV, ncols);
constexpr int nthreads = ggml_cuda_fattn_mma_get_nthreads(DKQ, DV, ncols);
constexpr int nwarps = nthreads / WARP_SIZE;
constexpr int nwarps = nthreads / warp_size;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
@ -1624,7 +1701,7 @@ static __global__ void flash_attn_ext_f16(
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)))
#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE))
}
template <int DKQ, int DV, int ncols1, int ncols2>
@ -1644,7 +1721,8 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
const int nstages = ggml_cuda_fattn_mma_get_nstages (DKQ, DV, ncols1, ncols2, cc);
const int cols_per_warp = std::min(ncols, get_cols_per_warp(cc));
const int nwarps = nthreads / WARP_SIZE;
const int warp_size_host = ggml_cuda_info().devices[ctx.device].warp_size;
const int nwarps = nthreads / warp_size_host;
constexpr bool V_is_K_view = DKQ == 576; // Guaranteed by the kernel selection logic in fattn.cu
@ -1694,7 +1772,7 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
}
launch_fattn<DV, ncols1, ncols2>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared_total, nbatch_fa, true, true, true);
(ctx, dst, fattn_kernel, nwarps, nbytes_shared_total, nbatch_fa, true, true, true, warp_size_host);
}

View File

@ -440,6 +440,18 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
return BEST_FATTN_KERNEL_MMA_F16;
}
// Use MFMA flash attention for CDNA (MI100+):
if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 256 && Q->ne[0] != 576) {
const int64_t eff_nq = Q->ne[1] * (gqa_opt_applies ? gqa_ratio : 1);
// MMA vs tile crossover benchmarked on MI300X @ d32768:
// hsk=64 (gqa=4): MMA wins at eff >= 128 (+11%)
// hsk=128 (gqa=4): MMA wins at eff >= 128 (+4%)
if (eff_nq >= (GGML_CUDA_CC_IS_CDNA1(cc) && Q->ne[0] == 64 ? 64 : 128)) {
return BEST_FATTN_KERNEL_MMA_F16;
}
// Fall through to tile kernel for small effective batch sizes.
}
// If there are no tensor cores available, use the generic tile kernel:
if (can_use_vector_kernel) {
if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {

View File

@ -2979,10 +2979,6 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
if (graph->instance == nullptr) {
res = true;
}
// Check if the graph size has changed
if (graph->props.size() != (size_t)cgraph->n_nodes) {
res = true;
@ -3931,14 +3927,35 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
#ifdef USE_CUDA_GRAPH
graph_key = ggml_cuda_graph_get_key(cgraph);
use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
ggml_cuda_graph_set_enabled(cuda_ctx, graph_key);
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
if (graph->is_enabled()) {
cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
const bool graph_compatible = ggml_cuda_graph_check_compability(cgraph);
if (graph_compatible) {
const bool properties_changed = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
graph->record_update(use_cuda_graph, cuda_graph_update_required);
if (!graph->warmup_complete) {
// Warmup: need at least 2 calls with no property change on the 2nd call
if (!properties_changed) {
graph->warmup_complete = true;
GGML_LOG_DEBUG("%s: CUDA graph warmup complete\n", __func__);
use_cuda_graph = true;
cuda_graph_update_required = true;
}
// else: properties changed or first call - execute directly (use_cuda_graph stays false)
} else {
// Post-warmup: normal CUDA graph operation
if (properties_changed) {
// Properties changed - reset warmup, execute directly until stable again
graph->warmup_complete = false;
GGML_LOG_DEBUG("%s: CUDA graph warmup reset\n", __func__);
} else {
use_cuda_graph = true;
cuda_graph_update_required = graph->instance == nullptr;
}
}
}
}
#endif // USE_CUDA_GRAPH

View File

@ -668,7 +668,7 @@ namespace ggml_cuda_mma {
return ret;
}
#elif defined(AMD_WMMA_AVAILABLE)
#elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)
template <int I, int J>
static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
tile<I, J/2, half2> ret;
@ -964,6 +964,34 @@ namespace ggml_cuda_mma {
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // defined(RDNA4)
#elif defined(AMD_MFMA_AVAILABLE)
// MFMA: FP16 input, FP32 accumulate, convert back to half2.
using halfx4_t = __attribute__((ext_vector_type(4))) _Float16;
using floatx4_t = __attribute__((ext_vector_type(4))) float;
// Convert existing half2 accumulator to float for MFMA:
floatx4_t acc_f32;
{
const halfx4_t acc_h = reinterpret_cast<const halfx4_t&>(D.x[0]);
#pragma unroll
for (int i = 0; i < 4; ++i) {
acc_f32[i] = (float)acc_h[i];
}
}
const halfx4_t& a_frag = reinterpret_cast<const halfx4_t&>(A.x[0]);
const halfx4_t& b_frag = reinterpret_cast<const halfx4_t&>(B.x[0]);
acc_f32 = __builtin_amdgcn_mfma_f32_16x16x16f16(a_frag, b_frag, acc_f32, 0, 0, 0);
// Convert back to half2:
{
halfx4_t result_h;
#pragma unroll
for (int i = 0; i < 4; ++i) {
result_h[i] = (_Float16)acc_f32[i];
}
reinterpret_cast<halfx4_t&>(D.x[0]) = result_h;
}
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;

View File

@ -1749,23 +1749,6 @@ static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backe
return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
}
static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_tensor * y) {
if (x->ne[0] != y->ne[0]) {
return false;
}
if (x->ne[1] != y->ne[1]) {
return false;
}
if (x->ne[2] != y->ne[2]) {
return false;
}
if (x->ne[3] != y->ne[3]) {
return false;
}
return true;
}
static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
@ -1797,43 +1780,6 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
return opt_experimental;
}
static bool hex_supported_src0_type(ggml_type t) {
return t == GGML_TYPE_F32;
}
static bool hex_supported_src1_type(ggml_type t) {
return t == GGML_TYPE_F32;
}
static bool hex_supported_src2_type(ggml_type t) {
return t == GGML_TYPE_F32;
}
static bool hex_supported_src1_type2(ggml_type t) {
return t == GGML_TYPE_F16;
}
static bool hex_supported_src1_type3(ggml_type t) {
return t == GGML_TYPE_I32;
}
static bool hex_supported_dst_type(ggml_type t) {
return t == GGML_TYPE_F32;
}
static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_tensor * y) {
// TODO: support broadcast for ne[2 and 3]
if (x->ne[0] != y->ne[0]) {
return false;
}
if (x->ne[2] != y->ne[2]) {
return false;
}
if (x->ne[3] != y->ne[3]) {
return false;
}
return true;
}
static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
@ -1919,19 +1865,19 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
const struct ggml_tensor * src1 = op->src[1];
const struct ggml_tensor * dst = op;
if (!hex_supported_src0_type(src0->type)) {
if (src0->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_src1_type(src1->type)) {
if (src1->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_dst_type(dst->type)) {
if (dst->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_dims2(src0, dst)) {
if (!ggml_are_same_shape(src0, dst)) {
return false;
}
if (!ggml_can_repeat(src1, src0)) {
if (!ggml_can_repeat(src1, src0) || ggml_is_permuted(src1)) {
return false;
}
@ -1943,16 +1889,16 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
const struct ggml_tensor * src1 = op->src[1];
const struct ggml_tensor * dst = op;
if (!hex_supported_src0_type(src0->type)) {
if (src0->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_src1_type(src1->type)) {
if (src1->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_dst_type(dst->type)) {
if (dst->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_dims2(src0, dst)) {
if (!ggml_are_same_shape(src0, dst)) {
return false;
}
@ -1968,13 +1914,13 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * dst = op;
if (!hex_supported_src0_type(src0->type)) {
if (src0->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_dst_type(dst->type)) {
if (dst->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_dims2(src0, dst)) {
if (!ggml_are_same_shape(src0, dst)) {
return false;
}
@ -1990,10 +1936,10 @@ static bool ggml_hexagon_supported_sum_rows(const struct ggml_hexagon_session *
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * dst = op;
if (!hex_supported_src0_type(src0->type)) {
if (src0->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_dst_type(dst->type)) {
if (dst->type != GGML_TYPE_F32) {
return false;
}
@ -2011,10 +1957,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
const struct ggml_tensor * src1 = op->src[1];
const struct ggml_tensor * dst = op;
if (!hex_supported_src0_type(src0->type)) {
if (src0->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_dst_type(dst->type)) {
if (dst->type != GGML_TYPE_F32) {
return false;
}
@ -2023,10 +1969,10 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
}
if (src1) {
if (!hex_supported_src1_type(src1->type)) {
if (src1->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_dims2(src0, src1)) {
if (!ggml_are_same_shape(src0, src1)) {
return false;
}
if (!ggml_is_contiguous(src1)) {
@ -2047,15 +1993,15 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
return false; // FIXME: add support for sinks
}
if (!hex_supported_src0_type(src0->type)) {
if (src0->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_dst_type(dst->type)) {
if (dst->type != GGML_TYPE_F32) {
return false;
}
if (src1) {
if (!hex_supported_src1_type(src1->type) && !hex_supported_src1_type2(src1->type)) {
if (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) {
return false;
}
if (src0->ne[0] != src1->ne[0]) {
@ -2162,17 +2108,17 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
const struct ggml_tensor * src2 = op->src[2];
const struct ggml_tensor * dst = op;
if (!hex_supported_src0_type(src0->type)) {
if (src0->type != GGML_TYPE_F32) {
return false; // FIXME: add support for GGML_TYPE_F16 for src0
}
if (!hex_supported_dst_type(dst->type)) {
if (dst->type != GGML_TYPE_F32) {
return false;
}
if (!hex_supported_src1_type3(src1->type)) {
if (src1->type != GGML_TYPE_I32) {
return false;
}
if (src2) {
if (!hex_supported_src2_type(src2->type)) {
if (src2->type != GGML_TYPE_F32) {
return false;
}
int n_dims = op_params[1];

View File

@ -69,27 +69,45 @@
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0,
const struct htp_tensor * src1,
struct htp_tensor * dst,
const int32_t * op_params,
struct htp_spad * src0_spad,
struct htp_spad * src1_spad,
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread,
dma_queue * dma_queue) {
struct htp_act_context {
struct htp_ops_context * octx;
// Precomputed values
const uint8_t * data_src0;
const uint8_t * data_src1;
uint8_t * data_dst;
size_t src0_row_size;
size_t src1_row_size;
size_t dst_row_size;
size_t src0_row_size_aligned;
size_t src1_row_size_aligned;
size_t dst_row_size_aligned;
size_t src0_spad_half_size;
size_t src1_spad_half_size;
size_t dst_spad_half_size;
uint32_t block;
uint32_t src0_nrows;
uint32_t src0_nrows_per_thread;
int nc;
};
static void glu_swiglu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
struct htp_act_context * actx = (struct htp_act_context *) data;
const struct htp_tensor * src0 = &actx->octx->src0;
const struct htp_tensor * src1 = &actx->octx->src1;
const struct htp_tensor * dst = &actx->octx->dst;
htp_act_preamble3;
size_t src0_row_size = nb01;
size_t src1_row_size = nb11;
size_t dst_row_size = nb1;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
size_t src0_row_size = actx->src0_row_size;
size_t src1_row_size = actx->src1_row_size;
size_t dst_row_size = actx->dst_row_size;
const uint32_t src0_nrows = actx->src0_nrows;
const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
@ -101,43 +119,34 @@ static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0,
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
const uint8_t * restrict data_src0 = actx->data_src0;
const uint8_t * restrict data_src1 = actx->data_src1;
uint8_t * restrict data_dst = actx->data_dst;
const bool src1_valid = src1->ne[0];
const int nc = (src1_valid) ? ne00 : ne00 / 2;
if (!src1_valid) {
const int32_t swapped = op_params[1];
data_src1 = data_src0;
src1_row_size = src0_row_size;
const int nc = actx->nc;
const size_t nc_in_bytes = nc * SIZEOF_FP32;
data_src0 += swapped ? nc_in_bytes : 0;
data_src1 += swapped ? 0 : nc_in_bytes;
}
const size_t src0_row_size_aligned = actx->src0_row_size_aligned;
const size_t src1_row_size_aligned = actx->src1_row_size_aligned;
const size_t dst_row_size_aligned = actx->dst_row_size_aligned;
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
uint8_t * restrict src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread);
uint8_t * restrict src1_spad_data = actx->octx->src1_spad.data + (ith * actx->octx->src1_spad.size_per_thread);
uint8_t * restrict dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread);
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
size_t src0_spad_half_size = actx->src0_spad_half_size;
size_t src1_spad_half_size = actx->src1_spad_half_size;
size_t dst_spad_half_size = actx->dst_spad_half_size;
// While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
size_t src1_spad_half_size = src1_spad->size_per_thread / 2;
size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
const int BLOCK = actx->block;
if (BLOCK == 0) {
FARF(ERROR,
"swiglu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
src0_spad->size_per_thread, src0_row_size_aligned);
actx->octx->src0_spad.size_per_thread, src0_row_size_aligned);
return;
}
dma_queue * dma_queue = actx->octx->ctx->dma[ith];
// See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
@ -196,27 +205,22 @@ static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0,
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0,
const struct htp_tensor * src1,
struct htp_tensor * dst,
const int32_t * op_params,
struct htp_spad * src0_spad,
struct htp_spad * src1_spad,
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread,
dma_queue * dma_queue) {
static void glu_swiglu_oai_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
struct htp_act_context * actx = (struct htp_act_context *) data;
const struct htp_tensor * src0 = &actx->octx->src0;
const struct htp_tensor * src1 = &actx->octx->src1;
const struct htp_tensor * dst = &actx->octx->dst;
htp_act_preamble3;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
size_t src0_row_size = nb01;
size_t src1_row_size = nb11;
size_t dst_row_size = nb1;
size_t src0_row_size = actx->src0_row_size;
size_t src1_row_size = actx->src1_row_size;
size_t dst_row_size = actx->dst_row_size;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t src0_nrows = actx->src0_nrows;
const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
@ -226,45 +230,36 @@ static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0,
return;
}
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
const uint8_t * restrict data_src0 = actx->data_src0;
const uint8_t * restrict data_src1 = actx->data_src1;
uint8_t * restrict data_dst = actx->data_dst;
const bool src1_valid = src1->ne[0];
const int nc = (src1_valid) ? ne00 : ne00 / 2;
if (!src1_valid) {
const int32_t swapped = op_params[1];
data_src1 = data_src0;
src1_row_size = src0_row_size;
const int nc = actx->nc;
const size_t nc_in_bytes = nc * SIZEOF_FP32;
data_src0 += swapped ? nc_in_bytes : 0;
data_src1 += swapped ? 0 : nc_in_bytes;
}
const size_t src0_row_size_aligned = actx->src0_row_size_aligned;
const size_t src1_row_size_aligned = actx->src1_row_size_aligned;
const size_t dst_row_size_aligned = actx->dst_row_size_aligned;
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
uint8_t * restrict src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread);
uint8_t * restrict src1_spad_data = actx->octx->src1_spad.data + (ith * actx->octx->src1_spad.size_per_thread);
uint8_t * restrict dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread);
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
size_t src0_spad_half_size = actx->src0_spad_half_size;
size_t src1_spad_half_size = actx->src1_spad_half_size;
size_t dst_spad_half_size = actx->dst_spad_half_size;
// While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
size_t src1_spad_half_size = src1_spad->size_per_thread / 2;
size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
const int BLOCK = actx->block;
if (BLOCK == 0) {
FARF(ERROR,
"swiglu-oai-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least "
"%zu\n",
src0_spad->size_per_thread, src0_row_size_aligned);
actx->octx->src0_spad.size_per_thread, src0_row_size_aligned);
return;
}
const float alpha = ((const float *) (op_params))[2];
const float limit = ((const float *) (op_params))[3];
const float alpha = ((const float *) (actx->octx->op_params))[2];
const float limit = ((const float *) (actx->octx->op_params))[3];
dma_queue * dma_queue = actx->octx->ctx->dma[ith];
// See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
@ -335,26 +330,22 @@ static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0,
}
static void unary_gelu_f32_per_thread(const struct htp_tensor * src0,
struct htp_tensor * dst,
const int32_t * op_params,
struct htp_spad * src0_spad,
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread,
dma_queue * dma_queue) {
static void unary_gelu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
struct htp_act_context * actx = (struct htp_act_context *) data;
const struct htp_tensor * src0 = &actx->octx->src0;
const struct htp_tensor * dst = &actx->octx->dst;
htp_act_preamble2;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1;
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
const size_t src0_row_size = actx->src0_row_size;
const size_t dst_row_size = actx->dst_row_size;
const size_t src0_row_size_aligned = actx->src0_row_size_aligned;
const size_t dst_row_size_aligned = actx->dst_row_size_aligned;
const uint32_t src0_nrows = ne01 * ne02 * ne03;
const uint32_t src0_nrows = actx->src0_nrows;
const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
@ -364,25 +355,29 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0,
return;
}
const uint8_t * data_src0 = (const uint8_t *) src0->data;
uint8_t * data_dst = (uint8_t *) dst->data;
const uint8_t * data_src0 = actx->data_src0;
uint8_t * data_dst = actx->data_dst;
uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
// nc/ne0 matches.
const int ne0_val = actx->nc; // == dst->ne[0]
// While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
uint8_t * src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread);
uint8_t * dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread);
size_t src0_spad_half_size = actx->src0_spad_half_size;
size_t dst_spad_half_size = actx->dst_spad_half_size;
// In gelu = x*sigmoid(x*1.702)
const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
const int BLOCK = actx->block;
if (BLOCK == 0) {
FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
src0_spad->size_per_thread, src0_row_size_aligned);
actx->octx->src0_spad.size_per_thread, src0_row_size_aligned);
return;
}
dma_queue * dma_queue = actx->octx->ctx->dma[ith];
// See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
@ -408,9 +403,9 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0,
float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
// gelu = x * sigmoid(1.702 * x) // current implementation
hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (float) 1.702, ne0);
hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (float) 1.702, ne0_val);
hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0_val);
hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0_val);
}
dma_queue_push_vtcm_to_ddr(dma_queue,
@ -435,34 +430,23 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0,
ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void unary_gelu_f32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
unary_gelu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
octx->src0_nrows_per_thread, octx->ctx->dma[i]);
}
static void unary_silu_f32_per_thread(const struct htp_tensor * src0,
struct htp_tensor * dst,
const int32_t * op_params,
struct htp_spad * src0_spad,
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread,
dma_queue * dma_queue) {
static void unary_silu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
struct htp_act_context * actx = (struct htp_act_context *) data;
const struct htp_tensor * src0 = &actx->octx->src0;
const struct htp_tensor * dst = &actx->octx->dst;
htp_act_preamble2;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1;
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
const size_t src0_row_size = actx->src0_row_size;
const size_t dst_row_size = actx->dst_row_size;
const size_t src0_row_size_aligned = actx->src0_row_size_aligned;
const size_t dst_row_size_aligned = actx->dst_row_size_aligned;
const uint32_t src0_nrows = ne01 * ne02 * ne03;
const uint32_t src0_nrows = actx->src0_nrows;
const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
@ -472,24 +456,27 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0,
return;
}
const uint8_t * data_src0 = (const uint8_t *) src0->data;
uint8_t * data_dst = (uint8_t *) dst->data;
const uint8_t * data_src0 = actx->data_src0;
uint8_t * data_dst = actx->data_dst;
uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
const int ne0_val = actx->nc; // == dst->ne[0]
// While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
uint8_t * src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread);
uint8_t * dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread);
const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
size_t src0_spad_half_size = actx->src0_spad_half_size;
size_t dst_spad_half_size = actx->dst_spad_half_size;
const int BLOCK = actx->block;
if (BLOCK == 0) {
FARF(ERROR, "silu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
src0_spad->size_per_thread, src0_row_size_aligned);
actx->octx->src0_spad.size_per_thread, src0_row_size_aligned);
return;
}
dma_queue * dma_queue = actx->octx->ctx->dma[ith];
// See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
@ -515,8 +502,8 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0,
float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
// silu = x * sigmoid(x)
hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, ne0);
hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0);
hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, ne0_val);
hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0_val);
}
dma_queue_push_vtcm_to_ddr(dma_queue,
@ -544,27 +531,22 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0,
static const float GELU_COEF_A = 0.044715f;
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
static void glu_geglu_f32_per_thread(const struct htp_tensor * src0,
const struct htp_tensor * src1,
struct htp_tensor * dst,
const int32_t * op_params,
struct htp_spad * src0_spad,
struct htp_spad * src1_spad,
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread,
dma_queue * dma_queue) {
static void glu_geglu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
struct htp_act_context * actx = (struct htp_act_context *) data;
const struct htp_tensor * src0 = &actx->octx->src0;
const struct htp_tensor * src1 = &actx->octx->src1;
const struct htp_tensor * dst = &actx->octx->dst;
htp_act_preamble3;
size_t src0_row_size = nb01;
size_t src1_row_size = nb11;
size_t dst_row_size = nb1;
size_t src0_row_size = actx->src0_row_size;
size_t src1_row_size = actx->src1_row_size;
size_t dst_row_size = actx->dst_row_size;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t src0_nrows = actx->src0_nrows;
const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
@ -574,43 +556,34 @@ static void glu_geglu_f32_per_thread(const struct htp_tensor * src0,
return;
}
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
const uint8_t * restrict data_src0 = actx->data_src0;
const uint8_t * restrict data_src1 = actx->data_src1;
uint8_t * restrict data_dst = actx->data_dst;
const bool src1_valid = src1->ne[0];
const int nc = (src1_valid) ? ne00 : ne00 / 2;
if (!src1_valid) {
const int32_t swapped = op_params[1];
data_src1 = data_src0;
src1_row_size = src0_row_size;
const int nc = actx->nc;
const size_t nc_in_bytes = nc * SIZEOF_FP32;
data_src0 += swapped ? nc_in_bytes : 0;
data_src1 += swapped ? 0 : nc_in_bytes;
}
const size_t src0_row_size_aligned = actx->src0_row_size_aligned;
const size_t src1_row_size_aligned = actx->src1_row_size_aligned;
const size_t dst_row_size_aligned = actx->dst_row_size_aligned;
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
uint8_t * restrict src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread);
uint8_t * restrict src1_spad_data = actx->octx->src1_spad.data + (ith * actx->octx->src1_spad.size_per_thread);
uint8_t * restrict dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread);
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread);
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
size_t src0_spad_half_size = actx->src0_spad_half_size;
size_t src1_spad_half_size = actx->src1_spad_half_size;
size_t dst_spad_half_size = actx->dst_spad_half_size;
// While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
size_t src1_spad_half_size = src1_spad->size_per_thread / 2;
size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
const int BLOCK = actx->block;
if (BLOCK == 0) {
FARF(ERROR,
"geglu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
src0_spad->size_per_thread, src0_row_size_aligned);
actx->octx->src0_spad.size_per_thread, src0_row_size_aligned);
return;
}
dma_queue * dma_queue = actx->octx->ctx->dma[ith];
// See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
@ -678,33 +651,7 @@ static void glu_geglu_f32_per_thread(const struct htp_tensor * src0,
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void unary_silu_f32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
unary_silu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
octx->src0_nrows_per_thread, octx->ctx->dma[i]);
}
static void glu_swiglu_f32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
glu_swiglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
&octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
}
static void glu_swiglu_oai_f32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
glu_swiglu_oai_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
&octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
}
static void glu_geglu_f32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
glu_geglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
&octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]);
}
static int execute_op_activations_f32(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
@ -719,26 +666,26 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) {
switch (octx->op) {
case HTP_OP_UNARY_SILU:
act_op_func = unary_silu_f32;
act_op_func = (worker_callback_t)unary_silu_f32_per_thread;
op_type = "silu-f32";
break;
case HTP_OP_GLU_SWIGLU:
act_op_func = glu_swiglu_f32;
act_op_func = (worker_callback_t)glu_swiglu_f32_per_thread;
op_type = "swiglu-f32";
break;
case HTP_OP_GLU_SWIGLU_OAI:
act_op_func = glu_swiglu_oai_f32;
act_op_func = (worker_callback_t)glu_swiglu_oai_f32_per_thread;
op_type = "swiglu-oai-f32";
break;
case HTP_OP_UNARY_GELU:
act_op_func = unary_gelu_f32;
act_op_func = (worker_callback_t)unary_gelu_f32_per_thread;
op_type = "gelu-f32";
break;
case HTP_OP_GLU_GEGLU:
act_op_func = glu_geglu_f32;
act_op_func = (worker_callback_t)glu_geglu_f32_per_thread;
op_type = "geglu-f32";
break;
default:
@ -797,13 +744,58 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) {
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
}
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
if ((octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
return HTP_STATUS_OK;
}
return err;
uint32_t n_jobs = MIN(n_threads, src0_nrows);
// Prepare context
struct htp_act_context actx;
actx.octx = octx;
actx.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
actx.src0_row_size = src0_row_size;
actx.src1_row_size = src1_row_size;
actx.dst_row_size = dst_row_size;
actx.src0_row_size_aligned = src0_row_size_aligned;
actx.src1_row_size_aligned = src1_row_size_aligned;
actx.dst_row_size_aligned = dst_row_size_aligned;
actx.src0_spad_half_size = octx->src0_spad.size_per_thread / 2;
actx.src1_spad_half_size = octx->src1_spad.size_per_thread / 2;
actx.dst_spad_half_size = octx->dst_spad.size_per_thread / 2;
actx.block = actx.src0_spad_half_size / actx.src0_row_size_aligned;
actx.src0_nrows = src0_nrows;
actx.nc = dst->ne[0];
// Pointers and GLU logic
const uint8_t * data_src0 = (const uint8_t *) src0->data;
const uint8_t * data_src1 = (const uint8_t *) src1->data;
if (!src1_valid && (octx->op == HTP_OP_GLU_SWIGLU || octx->op == HTP_OP_GLU_SWIGLU_OAI || octx->op == HTP_OP_GLU_GEGLU)) {
const int32_t swapped = octx->op_params[1];
data_src1 = data_src0;
actx.src1_row_size = actx.src0_row_size;
size_t nc_in_bytes = actx.nc * SIZEOF_FP32;
if (swapped) {
data_src0 += nc_in_bytes;
} else {
data_src1 += nc_in_bytes;
}
}
actx.data_src0 = data_src0;
actx.data_src1 = data_src1;
actx.data_dst = (uint8_t *) dst->data;
worker_pool_run_func(octx->ctx->worker_pool, act_op_func, &actx, n_jobs);
return HTP_STATUS_OK;
}
int op_activations(struct htp_ops_context * octx) {

View File

@ -15,6 +15,13 @@
#include "htp-ops.h"
#include "hvx-utils.h"
struct get_rows_context {
struct htp_ops_context * octx;
uint32_t src1_nrows_per_thread;
struct fastdiv_values get_rows_div_ne10;
struct fastdiv_values get_rows_div_ne10_ne11;
};
#define get_rows_preamble \
const uint32_t ne00 = octx->src0.ne[0]; \
const uint32_t ne01 = octx->src0.ne[1]; \
@ -39,20 +46,22 @@
\
const uint32_t nr = ne10 * ne11 * ne12;
static int get_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const int ith) {
static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
struct get_rows_context * grctx = (struct get_rows_context *)data;
struct htp_ops_context * octx = grctx->octx;
get_rows_preamble;
// parallelize by src1 elements (which correspond to dst rows)
const uint32_t dr = octx->src1_nrows_per_thread;
const uint32_t dr = grctx->src1_nrows_per_thread;
const uint32_t ir0 = dr * ith;
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
const bool is_i32 = (octx->src1.type == HTP_TYPE_I32);
for (uint32_t i = ir0; i < ir1; ++i) {
const uint32_t i12 = fastdiv(i, &octx->get_rows_div_ne10_ne11);
const uint32_t i12 = fastdiv(i, &grctx->get_rows_div_ne10_ne11);
const uint32_t rem = i - i12 * ne11 * ne10;
const uint32_t i11 = fastdiv(rem, &octx->get_rows_div_ne10);
const uint32_t i11 = fastdiv(rem, &grctx->get_rows_div_ne10);
const uint32_t i10 = rem - i11 * ne10;
const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
@ -68,12 +77,6 @@ static int get_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth,
const uintptr_t dst_ptr = octx->dst.data + i10*nb1 + i11*nb2 + i12*nb3;
hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
}
return HTP_STATUS_OK;
}
static void get_rows_work_f32_f32(unsigned int n, unsigned int i, void *data) {
get_rows_thread_f32_f32((struct htp_ops_context *) data, n, i);
}
int op_get_rows(struct htp_ops_context * octx) {
@ -95,12 +98,14 @@ int op_get_rows(struct htp_ops_context * octx) {
return HTP_STATUS_OK;
}
octx->get_rows_div_ne10 = init_fastdiv_values(octx->src1.ne[0]);
octx->get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src1.ne[0] * octx->src1.ne[1]);
struct get_rows_context grctx;
grctx.octx = octx;
grctx.get_rows_div_ne10 = init_fastdiv_values(octx->src1.ne[0]);
grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src1.ne[0] * octx->src1.ne[1]);
const uint32_t n_jobs = MIN(nr, octx->n_threads);
octx->src1_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
grctx.src1_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, get_rows_work_f32_f32, octx, n_jobs);
worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32, &grctx, n_jobs);
return HTP_STATUS_OK;
}

View File

@ -102,7 +102,7 @@ static inline bool dma_queue_push(dma_queue * q,
dmlink(q->tail, desc);
q->tail = desc;
// FARF(ERROR, "dma-push: i %u len %u dst %p src %p\n", q->push_idx, len, dst, src);
// FARF(ERROR, "dma-push: i %u width %u nrows %d dst %p src %p\n", q->push_idx, width, nrows, dptr.dst, dptr.src);
q->push_idx = (q->push_idx + 1) & q->idx_mask;
return true;
}
@ -144,11 +144,37 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) {
dptr = q->dptr[q->pop_idx];
// FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst);
// FARF(ERROR, "dma-pop: i %u dst %p src %p\n", q->pop_idx, dptr.dst, dptr.src);
q->pop_idx = (q->pop_idx + 1) & q->idx_mask;
return dptr;
}
static inline dma_ptr dma_queue_pop_nowait(dma_queue * q) {
dma_ptr dptr = { NULL };
if (q->push_idx == q->pop_idx) {
return dptr;
}
dptr = q->dptr[q->pop_idx];
// FARF(ERROR, "dma-pop-nowait: i %u dst %p src %p\n", q->pop_idx, dptr.dst, dptr.src);
q->pop_idx = (q->pop_idx + 1) & q->idx_mask;
return dptr;
}
static inline bool dma_queue_empty(dma_queue * q) {
return q->push_idx == q->pop_idx;
}
static inline uint32_t dma_queue_depth(dma_queue * q) {
return (q->push_idx - q->pop_idx) & q->idx_mask;
}
static inline uint32_t dma_queue_capacity(dma_queue * q) {
return q->capacity;
}
#ifdef __cplusplus
} // extern "C"
#endif

View File

@ -44,32 +44,6 @@ struct htp_ops_context {
uint32_t src0_nrows_per_thread;
uint32_t src1_nrows_per_thread;
struct fastdiv_values src0_div1; // fastdiv values for ne1
struct fastdiv_values src0_div2; // fastdiv values for ne2
struct fastdiv_values src0_div3; // fastdiv values for ne3
struct fastdiv_values src0_div21; // fastdiv values for ne2 * ne1
struct fastdiv_values src1_div1; // fastdiv values for ne1
struct fastdiv_values src1_div2; // fastdiv values for ne2
struct fastdiv_values src1_div3; // fastdiv values for ne3
struct fastdiv_values src1_div21; // fastdiv values for ne2 * ne1
struct fastdiv_values src3_div1; // fastdiv values for ne1
struct fastdiv_values src3_div2; // fastdiv values for ne2
struct fastdiv_values src3_div3; // fastdiv values for ne3
struct fastdiv_values src3_div21; // fastdiv values for ne2 * ne1
struct fastdiv_values broadcast_rk2;
struct fastdiv_values broadcast_rk3;
struct fastdiv_values broadcast_rv2;
struct fastdiv_values broadcast_rv3;
struct fastdiv_values set_rows_div_ne12; // fastdiv values for ne12
struct fastdiv_values set_rows_div_ne11; // fastdiv values for ne11
struct fastdiv_values get_rows_div_ne10; // fastdiv values for ne10
struct fastdiv_values get_rows_div_ne10_ne11; // fastdiv values for ne10 * ne11
uint32_t flags;
};

View File

@ -49,62 +49,6 @@ struct htp_matmul_context {
struct fastdiv_values mm_div_r3;
};
// vdelta control to replicate first 4x fp32 values across lanes
static const uint8_t __attribute__((aligned(128))) repl_4x_f32[128] = {
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10,
0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20,
0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04,
0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40,
0x44, 0x44, 0x44, 0x44, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04,
0x04, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04,
0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10,
};
// vdelta control to replicate and interleave first 8x fp32 values across lanes
static const uint8_t __attribute__((aligned(128))) repl_interleave_8x_f32[128] = {
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00,
0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20,
0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04,
0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40,
0x44, 0x44, 0x44, 0x44, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40, 0x44, 0x44, 0x44,
0x44, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04,
0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20,
};
// vdelta control to replicate first fp32 value across all elements
static const uint8_t __attribute__((aligned(128))) repl_1x_f32[128] = {
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10,
0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04,
0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08,
0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08,
0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04,
0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10,
0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
};
// vdelta control to replicate first fp16 value across all elements
static const uint8_t __attribute__((aligned(128))) repl_1x_f16[128] = {
0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02,
0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04,
0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08,
0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x40, 0x40, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02,
0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02,
0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10,
0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
};
// vdelta control to replicate first fp16 value across all elements
static const uint8_t __attribute__((aligned(128))) repl_2x_f16[128] = {
0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02,
};
// vdelta control to expand first 32 e8m0 values into 32 uint32 elements
static const uint8_t __attribute__((aligned(128))) expand_x32_e8m0[128] = {
0x00, 0x00, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, 0x01, 0x02, 0x00, 0x04, 0x04, 0x00, 0x00,
@ -2067,10 +2011,10 @@ static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restric
HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements
// Convert to QF32
HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero);
HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero);
HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero);
HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero);
HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero); // replicated over all lanes
HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero); // replicated over all lanes
HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero); // replicated over all lanes
HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero); // replicated over all lanes
// Combine and convert to fp16
HVX_Vector vmax01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax1_qf, vmax0_qf)));
@ -2080,11 +2024,6 @@ static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restric
HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf)));
HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf)));
// Replicate first fp16 scale across all lanes
HVX_Vector ctrl = *(const HVX_Vector *) repl_2x_f16;
vmax01_hf = Q6_V_vdelta_VV(vmax01_hf, ctrl);
vmax23_hf = Q6_V_vdelta_VV(vmax23_hf, ctrl);
HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16);
@ -2130,13 +2069,8 @@ static inline void quantize_block_f32_q8x2(float * restrict x, uint8_t * restric
HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf)));
// Compute max and scale
HVX_Vector vmax01_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf));
HVX_Vector vmax23_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx23_hf));
// Replicate first fp16 scale across all lanes
HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_f16;
vmax01_hf = Q6_V_vdelta_VV(vmax01_hf, ctrl);
vmax23_hf = Q6_V_vdelta_VV(vmax23_hf, ctrl);
HVX_Vector vmax01_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); // replicated over all lanes
HVX_Vector vmax23_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx23_hf)); // replicated over all lanes
HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
@ -2179,11 +2113,7 @@ static inline void quantize_block_f32_q8x4(float * restrict x, uint8_t * restric
// Compute max and scale
HVX_Vector vmax_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf));
vmax_hf = hvx_vec_reduce_max2_f16(hvx_vec_abs_f16(vx23_hf), vmax_hf);
// Replicate first fp16 scale across all lanes
HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_f16;
vmax_hf = Q6_V_vdelta_VV(vmax_hf, ctrl);
vmax_hf = hvx_vec_reduce_max2_f16(hvx_vec_abs_f16(vx23_hf), vmax_hf); // replicated over all lanes
HVX_Vector vd_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
HVX_Vector vd_hf = Q6_Vhf_equals_Vqf16(vd_qf16);

View File

@ -10,6 +10,7 @@
#include "hex-dma.h"
#include "hvx-utils.h"
#include "hex-fastdiv.h"
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
@ -21,6 +22,9 @@
#define HTP_ROPE_TYPE_NORMAL 0
#define HTP_ROPE_TYPE_NEOX 2
#define HTP_ROPE_SPAD_NROWS 16
#define HTP_ROPE_SPAD_BLOCK (HTP_ROPE_SPAD_NROWS/2)
#define htp_rope_preamble \
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
@ -42,7 +46,7 @@
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
struct rope_th_ctx {
struct htp_rope_context {
int32_t n_dims;
int32_t mode;
int32_t n_ctx_orig;
@ -57,7 +61,19 @@ struct rope_th_ctx {
float theta_scale;
float corr_dims[2];
uint32_t src0_nrows_per_thread;
size_t spad_stride;
struct htp_ops_context * octx;
size_t src0_row_size;
size_t dst_row_size;
size_t src0_row_size_aligned;
size_t dst_row_size_aligned;
size_t theta_cache_offset;
uint32_t src0_nrows;
uint64_t t_start;
};
static float rope_yarn_ramp(const float low, const float high, const int i0) {
@ -117,64 +133,23 @@ static void rope_corr_dims(int n_dims,
dims[1] = MIN(n_dims - 1, end);
}
static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context * octx) {
memset(rope_ctx, 0, sizeof(struct rope_th_ctx));
static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) {
const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0;
const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache;
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
const int32_t * op_params = &octx->op_params[0];
uint32_t nvec = (ne / (VLEN_FP32 * 2) * 2); // 2 vecs per loop, step of 2
rope_ctx->n_dims = ((const int32_t *) op_params)[1];
rope_ctx->mode = ((const int32_t *) op_params)[2];
rope_ctx->n_ctx_orig = ((const int32_t *) op_params)[4];
uint32_t he = ne / 2; // half_dims offset in elements
uint32_t hv = he / VLEN_FP32; // half_dims offset in vectors
memcpy(&rope_ctx->freq_base, (int32_t *) op_params + 5, sizeof(float));
memcpy(&rope_ctx->freq_scale, (int32_t *) op_params + 6, sizeof(float));
memcpy(&rope_ctx->ext_factor, (int32_t *) op_params + 7, sizeof(float));
memcpy(&rope_ctx->attn_factor, (int32_t *) op_params + 8, sizeof(float));
memcpy(&rope_ctx->beta_fast, (int32_t *) op_params + 9, sizeof(float));
memcpy(&rope_ctx->beta_slow, (int32_t *) op_params + 10, sizeof(float));
memcpy(&rope_ctx->sections, (int32_t *) op_params + 11, sizeof(int) * 4);
#pragma unroll(2)
for (uint32_t i = 0; i < nvec; i += 2) {
HVX_Vector v0 = vsrc[i/2+0];
HVX_Vector v1 = vsrc[i/2+hv];
rope_ctx->theta_scale = powf(rope_ctx->freq_base, -2.0f / rope_ctx->n_dims);
rope_corr_dims(rope_ctx->n_dims, rope_ctx->n_ctx_orig, rope_ctx->freq_base, rope_ctx->beta_fast,
rope_ctx->beta_slow, rope_ctx->corr_dims);
rope_ctx->octx = octx;
FARF(HIGH, "rope-f32 n_dims:%d, ext_factor:%.6f, theta_scale:%.6f, attn_factor:%.6f\n", rope_ctx->n_dims,
rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor);
}
static void hvx_calc_rope_neox_f32(const float * restrict src0,
float * restrict dst,
const int num_elems,
const float * restrict theta_cache) {
// for (int i = 0; i < num_elems; i += 2) {
//const float cos_theta = theta_cache[i + 0];
//const float sin_theta = theta_cache[i + 1];
//const float x0 = src[0];
//const float x1 = src[num_elems/2];
//dst[0] = x0*cos_theta - x1*sin_theta;
//dst[num_elems/2] = x0*sin_theta + x1*cos_theta;
//src += 1;
//dst += 1;
// }
const uint8_t * restrict src0_curr = (const uint8_t *) src0;
const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache;
uint8_t * restrict dst_curr = (uint8_t *) dst;
int step_of_1 = num_elems >> 6; // 6 because we process two vectors at once
int half_size = (sizeof(float) * (num_elems / 2));
for (int i = 0; i < step_of_1; i++) {
HVX_Vector v0 = *(HVX_Vector *) src0_curr;
HVX_Vector v1 = *(HVX_Vector *) (src0_curr + half_size);
HVX_Vector v2 = *(HVX_Vector *) theta_curr;
HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN);
HVX_Vector v2 = vtheta[i+0];
HVX_Vector v3 = vtheta[i+1];
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
@ -186,45 +161,34 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0,
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
*(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);
vdst[i/2+0] = Q6_Vsf_equals_Vqf32(v4);
vdst[i/2+hv] = Q6_Vsf_equals_Vqf32(v5);
}
src0_curr += VLEN;
theta_curr += 2 * VLEN;
dst_curr += VLEN;
for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) {
const float cos_theta = theta_cache[i+0];
const float sin_theta = theta_cache[i+1];
float x0 = src0[i/2];
float x1 = src0[i/2 + he];
dst[i/2] = x0 * cos_theta - x1 * sin_theta;
dst[i/2 + he] = x0 * sin_theta + x1 * cos_theta;
}
}
static void hvx_calc_rope_f32(const float * restrict src0,
float * restrict dst,
const int num_elems,
const float * restrict theta_cache) {
// for (int i = 0; i < num_elems; i += 2) {
//const float cos_theta = theta_cache[i + 0];
//const float sin_theta = theta_cache[i + 1];
static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) {
const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0;
const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache;
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
//const float x0 = src[0];
//const float x1 = src[1];
uint32_t nvec = (ne / (VLEN_FP32 * 2)) * 2; // 2 vecs per loop, step of two
//dst[0] = x0*cos_theta - x1*sin_theta;
//dst[1] = x0*sin_theta + x1*cos_theta;
#pragma unroll(2)
for (uint32_t i = 0; i < nvec; i+=2) {
HVX_Vector v0 = vsrc[i+0];
HVX_Vector v1 = vsrc[i+1];
//src += 2;
//dst += 2;
// }
const uint8_t * restrict src0_curr = (const uint8_t *) src0;
const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache;
uint8_t * restrict dst_curr = (uint8_t *) dst;
int step_of_1 = num_elems >> 6; // 6 because we process two vectors at once
for (int i = 0; i < step_of_1; i++) {
HVX_Vector v0 = *(HVX_Vector *) src0_curr;
HVX_Vector v1 = *(HVX_Vector *) (src0_curr + VLEN);
HVX_Vector v2 = *(HVX_Vector *) theta_curr;
HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN);
HVX_Vector v2 = vtheta[i+0];
HVX_Vector v3 = vtheta[i+1];
HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4); // vx0_x1[0] = x0, vx0_x1[1] = x1
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
@ -239,116 +203,65 @@ static void hvx_calc_rope_f32(const float * restrict src0,
HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
*(HVX_Vector *) dst_curr = Q6_V_lo_W(vstore);
*(HVX_Vector *) (dst_curr + VLEN) = Q6_V_hi_W(vstore);
vdst[i+0] = Q6_V_lo_W(vstore);
vdst[i+1] = Q6_V_hi_W(vstore);
}
src0_curr += 2 * VLEN;
theta_curr += 2 * VLEN;
dst_curr += 2 * VLEN;
for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) {
const float cos_theta = theta_cache[i+0];
const float sin_theta = theta_cache[i+1];
float x0 = src0[i+0];
float x1 = src0[i+1];
dst[i+0] = x0 * cos_theta - x1 * sin_theta;
dst[i+1] = x0 * sin_theta + x1 * cos_theta;
}
}
static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
const uint32_t ir0,
const uint32_t ir1,
int nth,
int ith,
const int opt_path) {
struct htp_ops_context * octx = rope_ctx->octx;
static void inline rope_basic_f32(struct htp_rope_context * rctx, uint8_t * restrict dst, uint8_t * restrict src,
uint32_t nr, uint32_t ne0, const float * restrict theta_cache) {
#pragma unroll(4)
for (uint32_t i = 0; i < nr; i++) {
float * d = (float *) (dst + i * rctx->dst_row_size_aligned);
float * s = (float *) (src + i * rctx->src0_row_size_aligned);
hvx_rope_f32_aa(d, s, rctx->n_dims, theta_cache);
// fill the remain channels with data from src tensor
if (rctx->n_dims < ne0) {
hvx_copy_f32_uu((uint8_t *)(d + rctx->n_dims), (uint8_t *)(s + rctx->n_dims), ne0 - rctx->n_dims);
}
}
}
static void inline rope_neox_f32(struct htp_rope_context * rctx, uint8_t * restrict dst, uint8_t * restrict src,
uint32_t nr, uint32_t ne0, const float * restrict theta_cache) {
#pragma unroll(4)
for (uint32_t i = 0; i < nr; i++) {
float * d = (float *) (dst + i * rctx->dst_row_size_aligned);
float * s = (float *) (src + i * rctx->src0_row_size_aligned);
hvx_rope_neox_f32_aa(d, s, rctx->n_dims, theta_cache);
// fill the remain channels with data from src tensor
if (rctx->n_dims < ne0) {
hvx_copy_f32_uu((uint8_t *)(d + rctx->n_dims), (uint8_t *)(s + rctx->n_dims), ne0 - rctx->n_dims);
}
}
}
static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
struct htp_rope_context * rctx = (struct htp_rope_context *) data;
struct htp_ops_context * octx = rctx->octx;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
const struct htp_tensor * src2 = &octx->src2;
struct htp_tensor * dst = &octx->dst;
const int32_t mode = rope_ctx->mode;
const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
htp_rope_preamble;
const int32_t * pos = (const int32_t *) src1->data;
float * wp0 = (float *) (octx->src0_spad.data + (ith * nb01));
const float * freq_factors = NULL;
if (src2 != NULL) {
freq_factors = (const float *) src2->data;
}
const uint32_t i1_end = MIN(ir1, ne1);
const int32_t half_dims = rope_ctx->n_dims / 2;
const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float);
for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch
for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len
const int32_t p = pos[i2];
rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);
for (uint32_t i1 = ir0; i1 < i1_end; i1++) { // attn-heads
const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);
const float * src_loc = src;
float * dst_data_loc = dst_data;
if (1 == opt_path) {
if (is_neox) {
hvx_calc_rope_neox_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
} else {
hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
}
src_loc += rope_ctx->n_dims;
dst_data_loc += rope_ctx->n_dims;
} else {
for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
const float cos_theta = wp0[i0 + 0];
const float sin_theta = wp0[i0 + 1];
if (is_neox) {
const float x0 = src_loc[0];
const float x1 = src_loc[half_dims];
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta;
src_loc += 1;
dst_data_loc += 1;
} else {
const float x0 = src_loc[0];
const float x1 = src_loc[1];
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta;
src_loc += 2;
dst_data_loc += 2;
}
}
src_loc += (is_neox ? half_dims : 0);
dst_data_loc += (is_neox ? half_dims : 0);
}
// TODO: use simd to speed up the remaining elements copy
memcpy(dst_data_loc, src_loc, remain_bytes);
}
}
}
}
static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int ith) {
struct htp_ops_context * octx = rope_ctx->octx;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
htp_rope_preamble;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;
const uint32_t src0_nrows = rctx->src0_nrows;
const uint32_t src0_nrows_per_thread = rctx->src0_nrows_per_thread;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
@ -358,32 +271,114 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int
return;
}
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
uint64_t tt = HAP_perf_get_qtimer_count();
int is_aligned = 1;
int opt_path = 0;
if ((0 == hex_is_aligned((void *) src0->data, VLEN)) || (0 == hex_is_aligned((void *) src1->data, VLEN)) ||
(0 == hex_is_aligned((void *) dst->data, VLEN))) {
FARF(HIGH, "rope-f32: unaligned addresses in rope op, possibly slower execution\n");
is_aligned = 0;
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
const int32_t mode = rctx->mode;
const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
// VTCM setup
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
float * theta_cache = (float *) (src0_spad_base);
src0_spad_base = src0_spad_base + rctx->theta_cache_offset;
uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
dma_queue * dma_queue = octx->ctx->dma[ith];
const int32_t * pos = (const int32_t *) src1->data;
const float * freq_factors = src2->data ? (const float *) src2->data : NULL;
uint32_t ir = 0;
uint32_t prev_i2 = (uint32_t) -1;
for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch
for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len
for (uint32_t i1 = 0; i1 < ne1; ) { // attn-heads
if (ir < src0_start_row) { ir++; i1++; continue; }
if (ir >= src0_end_row) goto done;
// Rows in this block
const uint32_t nrows = MIN(src0_end_row - ir, ne1 - i1);
// Depth before prefetch
uint32_t dma_depth = dma_queue_depth(dma_queue);
// FARF(HIGH, "rope-block %u: ir %u n-rows %u dma-depth %u : usec %u", ith, ir, nrows, dma_depth,
// (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start));
// Prefetch loop
for (uint32_t pnr = 0, pr = 0; pr < nrows && pr < HTP_ROPE_SPAD_NROWS; pr += pnr) {
pnr = MIN(nrows - pr, HTP_ROPE_SPAD_BLOCK);
uint32_t pi1 = i1 + pr;
uint32_t pir = ir + pr;
// Dummy DMA transaction for sequencing (interleaving dst,src,dst,...)
dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr((void *) dst->data, dst_spad_base + pr * rctx->dst_row_size_aligned), 0, 0, 0);
const uint8_t * src_addr = (const uint8_t *) src0->data + i3 * nb03 + i2 * nb02 + pi1 * nb01;
uint8_t * src_spad = src0_spad_base + pr * rctx->src0_row_size_aligned;
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src_spad, src_addr),
rctx->src0_row_size_aligned, rctx->src0_row_size, pnr);
// FARF(HIGH, "rope-prefetch %u: pr %u i1 %u i2 %u i3 %u src-spad %p src-addr %p pnr %u", ith, pir, pi1, i2, i3, src_spad, src_addr, pnr);
}
// Update theta cache
if (i2 != prev_i2) {
prev_i2 = i2;
const int32_t p = pos[i2];
rope_cache_init(p, rctx->freq_scale, freq_factors, rctx->corr_dims, ne0, rctx->ext_factor, rctx->attn_factor, theta_cache, rctx->theta_scale);
// FARF(HIGH, "rope-theta %u: ir %u i1 %u i2 %u i3 %u cache %p : usec %u", ith, ir, i1, i2, i3, theta_cache,
// (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start));
}
// Skip DMA transactions from prev block (if any)
// No need to wait for these since the DMA is setup for in-order processing
for (uint32_t d=0; d < dma_depth; d++) { dma_queue_pop_nowait(dma_queue); }
// Compute loop
for (uint32_t cnr = 0, cr = 0; cr < nrows; cr += cnr, ir += cnr, i1 += cnr) {
// Number of rows to compute
cnr = MIN(nrows - cr, HTP_ROPE_SPAD_BLOCK);
uint8_t * dst_spad = (uint8_t *) dma_queue_pop(dma_queue).src;
uint8_t * src_spad = (uint8_t *) dma_queue_pop(dma_queue).dst;
// FARF(HIGH, "rope-compute %u: ir %u i1 %u i2 %u i3 %u src-spad %p cnr %u : usec %u", ith, ir, i1, i2, i3, src_spad, cnr,
// (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start));
if (is_neox) {
rope_neox_f32(rctx, dst_spad, src_spad, cnr, ne0, theta_cache);
} else {
rope_basic_f32(rctx, dst_spad, src_spad, cnr, ne0, theta_cache);
}
uint8_t * dst_addr = (uint8_t *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1;
dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(dst_addr, dst_spad), rctx->dst_row_size, rctx->dst_row_size_aligned, cnr);
// Prefetch more rows (if any)
if ((cr + HTP_ROPE_SPAD_NROWS) < nrows) {
uint32_t pnr = MIN(nrows - (cr + HTP_ROPE_SPAD_NROWS), HTP_ROPE_SPAD_BLOCK);
uint32_t pi1 = i1 + HTP_ROPE_SPAD_NROWS;
uint32_t pir = ir + HTP_ROPE_SPAD_NROWS;
const uint8_t * src_addr = (const uint8_t *) src0->data + i3 * nb03 + i2 * nb02 + pi1 * nb01;
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src_spad, src_addr),
rctx->src0_row_size_aligned, rctx->src0_row_size, pnr);
// FARF(HIGH, "rope-prefetch %u: pr %u i1 %u i2 %u i3 %u src-spad %p src-addr %p pnr %u", ith, pir, pi1, i2, i3, src_spad, src_addr, pnr);
}
}
}
}
}
rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path);
done:
dma_queue_flush(dma_queue);
tt = HAP_perf_get_qtimer_count() - tt;
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row,
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
struct rope_th_ctx * rope_ctx = (struct rope_th_ctx *) data;
rope_job_f32_per_thread(rope_ctx, n, i);
FARF(HIGH, "rope-f32: %d/%d: (%u:%u) usec %u\n", ith, nth, src0_start_row, src0_end_row, (unsigned) HAP_perf_qtimer_count_to_us(tt));
}
static int execute_op_rope_f32(struct htp_ops_context * octx) {
@ -394,17 +389,10 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
const struct htp_tensor * src2 = &octx->src2;
struct htp_tensor * dst = &octx->dst;
worker_callback_t op_func;
const char * op_type = NULL;
struct rope_th_ctx rope_ctx;
const char * op_type = "rope-f32";
switch (octx->op) {
case HTP_OP_ROPE:
op_func = rope_job_dispatcher_f32;
op_type = "rope-f32";
init_rope_ctx(&rope_ctx, octx);
break;
default:
@ -415,49 +403,79 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
const uint32_t n_threads = octx->n_threads;
const size_t src0_row_size = src0->nb[1];
const size_t src1_row_size = src0_row_size;
const size_t dst_row_size = dst->nb[1];
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads;
octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads;
octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads;
// Aligned row sizes for VTCM
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 128);
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
// Calculate spad sizes per thread
size_t src0_spad_per_thread = theta_cache_size_aligned + HTP_ROPE_SPAD_NROWS * src0_row_size_aligned;
size_t dst_spad_per_thread = HTP_ROPE_SPAD_NROWS * dst_row_size_aligned;
size_t spad_per_thread = src0_spad_per_thread + dst_spad_per_thread;
if (src2->ne[0]) {
FARF(HIGH,
"%s: %ux%ux%ux%u (x %ux%ux%ux%u x %ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u "
"dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], dst->ne[1], dst->ne[2],
dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
} else {
FARF(HIGH,
"%s: %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
octx->dst_spad.size);
}
// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size);
// Check if we fit in VTCM
size_t total_vtcm_needed = spad_per_thread * n_threads;
if (octx->ctx->vtcm_size < total_vtcm_needed) {
FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, total_vtcm_needed);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
// Assign sizes
octx->src0_spad.size_per_thread = src0_spad_per_thread;
octx->dst_spad.size_per_thread = dst_spad_per_thread;
octx->src0_spad.size = n_threads * src0_spad_per_thread;
octx->dst_spad.size = n_threads * dst_spad_per_thread;
octx->src1_spad.size = 0;
// Assign pointers
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = NULL;
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
// Fill context
struct htp_rope_context rctx;
memset(&rctx, 0, sizeof(struct htp_rope_context));
rctx.t_start = HAP_perf_get_qtimer_count();
rctx.octx = octx;
const int32_t * op_params = &octx->op_params[0];
rctx.n_dims = ((const int32_t *) op_params)[1];
rctx.mode = ((const int32_t *) op_params)[2];
rctx.n_ctx_orig = ((const int32_t *) op_params)[4];
memcpy(&rctx.freq_base, (int32_t *) op_params + 5, sizeof(float));
memcpy(&rctx.freq_scale, (int32_t *) op_params + 6, sizeof(float));
memcpy(&rctx.ext_factor, (int32_t *) op_params + 7, sizeof(float));
memcpy(&rctx.attn_factor, (int32_t *) op_params + 8, sizeof(float));
memcpy(&rctx.beta_fast, (int32_t *) op_params + 9, sizeof(float));
memcpy(&rctx.beta_slow, (int32_t *) op_params + 10, sizeof(float));
memcpy(&rctx.sections, (int32_t *) op_params + 11, sizeof(int) * 4);
rctx.theta_scale = powf(rctx.freq_base, -2.0f / rctx.n_dims);
rope_corr_dims(rctx.n_dims, rctx.n_ctx_orig, rctx.freq_base, rctx.beta_fast, rctx.beta_slow, rctx.corr_dims);
rctx.src0_row_size = src0_row_size;
rctx.dst_row_size = dst_row_size;
rctx.src0_row_size_aligned = src0_row_size_aligned;
rctx.dst_row_size_aligned = dst_row_size_aligned;
rctx.theta_cache_offset = theta_cache_size_aligned;
uint32_t ne0 = dst->ne[0];
uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
rctx.src0_nrows = src0_nrows;
FARF(HIGH, "rope-f32 n-rows %u n-dims %d ne0 %u ext-factor %.6f theta-scale %.6f attn-factor %.6f\n", rctx.src0_nrows, rctx.n_dims, ne0,
rctx.ext_factor, rctx.theta_scale, rctx.attn_factor);
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, op_func, &rope_ctx, n_jobs);
uint32_t n_jobs = MIN(n_threads, src0_nrows);
rctx.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, rope_job_f32, &rctx, n_jobs);
}
return err;

View File

@ -43,11 +43,21 @@
\
const uint32_t nr = ne01;
static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const int ith) {
struct htp_set_rows_context {
struct htp_ops_context * octx;
struct fastdiv_values div_ne12;
struct fastdiv_values div_ne11;
uint32_t src0_nrows_per_thread;
};
static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
struct htp_set_rows_context * srctx = (struct htp_set_rows_context *)data;
struct htp_ops_context * octx = srctx->octx;
set_rows_preamble;
// parallelize by rows of src0
const uint32_t dr = octx->src0_nrows_per_thread;
const uint32_t dr = srctx->src0_nrows_per_thread;
const uint32_t ir0 = dr * ith;
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
@ -56,8 +66,8 @@ static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth,
for (uint32_t i03 = 0; i03 < ne03; ++i03) {
for (uint32_t i02 = 0; i02 < ne02; ++i02) {
for (uint32_t i = ir0; i < ir1; ++i) {
const uint32_t i12 = fastmodulo(i03, ne12, &octx->set_rows_div_ne12);
const uint32_t i11 = fastmodulo(i02, ne11, &octx->set_rows_div_ne11);
const uint32_t i12 = fastmodulo(i03, ne12, &srctx->div_ne12);
const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11);
const uint32_t i10 = i;
const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
@ -76,15 +86,16 @@ static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth,
}
}
}
return HTP_STATUS_OK;
}
static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth, const int ith) {
static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *data) {
struct htp_set_rows_context * srctx = (struct htp_set_rows_context *)data;
struct htp_ops_context * octx = srctx->octx;
set_rows_preamble;
// parallelize by rows of src0
const uint32_t dr = octx->src0_nrows_per_thread;
const uint32_t dr = srctx->src0_nrows_per_thread;
const uint32_t ir0 = dr * ith;
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
@ -93,8 +104,8 @@ static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth,
for (uint32_t i03 = 0; i03 < ne03; ++i03) {
for (uint32_t i02 = 0; i02 < ne02; ++i02) {
for (uint32_t i = ir0; i < ir1; ++i) {
const uint32_t i12 = fastmodulo(i03, ne12, &octx->set_rows_div_ne12);
const uint32_t i11 = fastmodulo(i02, ne11, &octx->set_rows_div_ne11);
const uint32_t i12 = fastmodulo(i03, ne12, &srctx->div_ne12);
const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11);
const uint32_t i10 = i;
const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
@ -112,16 +123,6 @@ static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth,
}
}
}
return HTP_STATUS_OK;
}
static void set_rows_work_f16_f32(unsigned int n, unsigned int i, void *data) {
set_rows_thread_f16_f32((struct htp_ops_context *) data, n, i);
}
static void set_rows_work_f32_f32(unsigned int n, unsigned int i, void *data) {
set_rows_thread_f32_f32((struct htp_ops_context *) data, n, i);
}
int op_set_rows(struct htp_ops_context * octx) {
@ -143,18 +144,20 @@ int op_set_rows(struct htp_ops_context * octx) {
return HTP_STATUS_OK;
}
octx->set_rows_div_ne12 = init_fastdiv_values(ne12);
octx->set_rows_div_ne11 = init_fastdiv_values(ne11);
struct htp_set_rows_context srctx;
srctx.octx = octx;
srctx.div_ne12 = init_fastdiv_values(ne12);
srctx.div_ne11 = init_fastdiv_values(ne11);
const uint32_t n_jobs = MIN(nr, octx->n_threads);
octx->src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
srctx.src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
switch(octx->dst.type) {
case HTP_TYPE_F32:
worker_pool_run_func(octx->ctx->worker_pool, set_rows_work_f32_f32, octx, n_jobs);
worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f32_f32, &srctx, n_jobs);
break;
case HTP_TYPE_F16:
worker_pool_run_func(octx->ctx->worker_pool, set_rows_work_f16_f32, octx, n_jobs);
worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f16_f32, &srctx, n_jobs);
break;
default:
return HTP_STATUS_NO_SUPPORT;

View File

@ -10,6 +10,7 @@
#include "hex-dma.h"
#include "hvx-utils.h"
#include "hex-fastdiv.h"
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
@ -48,7 +49,7 @@
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
struct softmax_th_ctx {
struct htp_softmax_context {
bool use_f16;
bool use_src1;
uint32_t n_head;
@ -59,28 +60,48 @@ struct softmax_th_ctx {
float m0;
float m1;
uint32_t src0_nrows_per_thread;
struct fastdiv_values fastdiv_ne01;
struct fastdiv_values fastdiv_ne02;
struct fastdiv_values fastdiv_ne12; // For mask broadcasting
struct fastdiv_values fastdiv_ne13; // For mask broadcasting
size_t spad_stride;
struct htp_ops_context * octx;
};
static void init_softmax_ctx(struct softmax_th_ctx * softmax_ctx, struct htp_ops_context * octx) {
static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_context * octx) {
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
memset(softmax_ctx, 0, sizeof(struct softmax_th_ctx));
memset(smctx, 0, sizeof(struct htp_softmax_context));
memcpy(&softmax_ctx->scale, (float *) octx->op_params, sizeof(float));
memcpy(&softmax_ctx->max_bias, (float *) octx->op_params + 1, sizeof(float));
memcpy(&smctx->scale, (float *) octx->op_params, sizeof(float));
memcpy(&smctx->max_bias, (float *) octx->op_params + 1, sizeof(float));
softmax_ctx->n_head = src0->ne[2];
softmax_ctx->n_head_log2 = 1u << (uint32_t) floor(log2(softmax_ctx->n_head));
smctx->n_head = src0->ne[2];
smctx->n_head_log2 = 1u << (uint32_t) floor(log2(smctx->n_head));
softmax_ctx->m0 = powf(2.0f, -(softmax_ctx->max_bias) / softmax_ctx->n_head_log2);
softmax_ctx->m1 = powf(2.0f, -(softmax_ctx->max_bias / 2.0f) / softmax_ctx->n_head_log2);
smctx->m0 = powf(2.0f, -(smctx->max_bias) / smctx->n_head_log2);
smctx->m1 = powf(2.0f, -(smctx->max_bias / 2.0f) / smctx->n_head_log2);
softmax_ctx->use_src1 = (src1->ne[0] != 0);
softmax_ctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16);
smctx->use_src1 = (src1->ne[0] != 0);
smctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16);
softmax_ctx->octx = octx;
smctx->octx = octx;
// Initialize fastdiv values
const uint32_t ne01 = src0->ne[1];
const uint32_t ne02 = src0->ne[2];
if (ne01 > 0) smctx->fastdiv_ne01 = init_fastdiv_values(ne01);
if (ne02 > 0) smctx->fastdiv_ne02 = init_fastdiv_values(ne02);
const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1;
const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1;
if (ne12 > 0) smctx->fastdiv_ne12 = init_fastdiv_values(ne12);
if (ne13 > 0) smctx->fastdiv_ne13 = init_fastdiv_values(ne13);
}
static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src,
@ -139,8 +160,7 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src,
max_vec = Q6_Vsf_vmax_VsfVsf(max_vec, v1);
}
HVX_Vector v = hvx_vec_reduce_max_f32(max_vec);
max_vec = hvx_vec_repl4(v);
max_vec = hvx_vec_reduce_max_f32(max_vec); // replicated over all lanes
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
@ -154,8 +174,7 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src,
v_pad[i] = v3;
}
v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_vec));
sum_vec = hvx_vec_repl4(v);
sum_vec = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_vec)); // replicated over all lanes
HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v);
HVX_Vector v4 = hvx_vec_inverse_f32(sum_vec);
@ -183,83 +202,9 @@ static float hvx_softmax_f32(const uint8_t * restrict src,
return sum;
}
static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ctx, int opt_path) {
struct htp_ops_context * octx = softmax_ctx->octx;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
const struct htp_tensor * dst = &octx->dst;
htp_softmax_preamble3;
uint8_t * src0_spad_data = octx->src0_spad.data + (ith * nb01);
uint8_t * src1_spad_data = octx->src1_spad.data + (ith * nb01);
uint8_t * dst_spad_data = octx->dst_spad.data + (ith * nb1);
float * wp0 = (float *) src0_spad_data;
float * wp1 = (float *) src1_spad_data;
float * wp2 = (float *) dst_spad_data;
for (uint32_t i03 = 0; i03 < ne03; i03++) {
for (uint32_t i02 = 0; i02 < ne02; i02++) {
for (uint32_t i01 = ith; i01 < ne01; i01 += nth) {
const uint32_t i11 = i01;
const uint32_t i12 = i02 % ne12;
const uint32_t i13 = i03 % ne13;
// ALiBi
const uint32_t h = i02; // head
const float slope = (softmax_ctx->max_bias > 0.0f) ?
h < softmax_ctx->n_head_log2 ?
powf(softmax_ctx->m0, h + 1) :
powf(softmax_ctx->m1, 2 * (h - softmax_ctx->n_head_log2) + 1) :
1.0f;
float * sp = (float *) ((char *) octx->src0.data + i01 * nb01 + i02 * nb02 + i03 * nb03);
float * dp = (float *) ((char *) octx->dst.data + i01 * nb1 + i02 * nb2 + i03 * nb3);
// broadcast the mask across rows
__fp16 * mp_f16 = (softmax_ctx->use_src1) ?
(__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
NULL;
float * mp_f32 = (softmax_ctx->use_src1) ?
(float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
NULL;
if ((1 == opt_path) && (mp_f32) && !(softmax_ctx->use_f16)) {
hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale,
(const uint8_t *) mp_f32, slope);
} else {
hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, softmax_ctx->scale);
if (mp_f32) {
if (softmax_ctx->use_f16) {
for (int i = 0; i < ne00; ++i) {
wp0[i] += slope * (float) mp_f16[i];
}
} else {
for (int i = 0; i < ne00; ++i) {
wp0[i] += slope * mp_f32[i];
}
}
}
}
if (1 == opt_path) {
hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00);
} else {
float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00);
float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max);
sum = sum > 0.0 ? (1.0 / sum) : 1;
hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum);
}
}
}
}
}
static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int nth, int ith) {
struct htp_ops_context * octx = softmax_ctx->octx;
static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) {
struct htp_softmax_context * smctx = (struct htp_softmax_context *) data;
struct htp_ops_context * octx = smctx->octx;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
@ -268,7 +213,7 @@ static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int
htp_softmax_preamble3;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;
const uint32_t src0_nrows_per_thread = smctx->src0_nrows_per_thread;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
@ -291,20 +236,103 @@ static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int
opt_path = 1;
}
softmax_htp_f32(nth, ith, softmax_ctx, opt_path);
uint8_t * src0_spad_data = octx->src0_spad.data + (ith * smctx->spad_stride);
uint8_t * src1_spad_data = octx->src1_spad.data + (ith * smctx->spad_stride);
uint8_t * dst_spad_data = octx->dst_spad.data + (ith * smctx->spad_stride);
float * wp0 = (float *) src0_spad_data;
float * wp1 = (float *) src1_spad_data;
float * wp2 = (float *) dst_spad_data;
uint32_t prev_i2 = (uint32_t)-1;
float slope = 1.0f;
for (uint32_t r = src0_start_row; r < src0_end_row; ++r) {
uint32_t i1 = fastmodulo(r, ne01, &smctx->fastdiv_ne01);
uint32_t r_div_ne01 = fastdiv(r, &smctx->fastdiv_ne01);
uint32_t i2 = fastmodulo(r_div_ne01, ne02, &smctx->fastdiv_ne02);
uint32_t i3 = fastdiv(r_div_ne01, &smctx->fastdiv_ne02);
// Map to original logic indices
// i01 = i1
// i02 = i2
// i03 = i3
const uint32_t i11 = i1;
// const uint32_t i12 = i2 % ne12;
// const uint32_t i13 = i3 % ne13;
uint32_t i12, i13;
if (ne12 == ne02) {
i12 = i2;
} else {
i12 = fastmodulo(i2, ne12, &smctx->fastdiv_ne12);
}
if (ne13 == ne03) {
i13 = i3;
} else {
i13 = fastmodulo(i3, ne13, &smctx->fastdiv_ne13);
}
// ALiBi
if (i2 != prev_i2) {
const uint32_t h = i2; // head
slope = (smctx->max_bias > 0.0f) ?
h < smctx->n_head_log2 ?
powf(smctx->m0, h + 1) :
powf(smctx->m1, 2 * (h - smctx->n_head_log2) + 1) :
1.0f;
prev_i2 = i2;
}
float * sp = (float *) ((char *) octx->src0.data + i1 * nb01 + i2 * nb02 + i3 * nb03);
float * dp = (float *) ((char *) octx->dst.data + i1 * nb1 + i2 * nb2 + i3 * nb3);
// broadcast the mask across rows
__fp16 * mp_f16 = (smctx->use_src1) ?
(__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
NULL;
float * mp_f32 = (smctx->use_src1) ?
(float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
NULL;
if ((1 == opt_path) && (mp_f32) && !(smctx->use_f16)) {
hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, smctx->scale,
(const uint8_t *) mp_f32, slope);
} else {
hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, smctx->scale);
if (mp_f32) {
if (smctx->use_f16) {
for (int i = 0; i < ne00; ++i) {
wp0[i] += slope * (float) mp_f16[i];
}
} else {
for (int i = 0; i < ne00; ++i) {
wp0[i] += slope * mp_f32[i];
}
}
}
}
if (1 == opt_path) {
hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00);
} else {
float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00);
float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max);
sum = sum > 0.0 ? (1.0 / sum) : 1;
hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum);
}
}
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
softmax_ctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13,
smctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13,
ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void softmax_job_dispatcher_f32(unsigned int n, unsigned int i, void * p_data) {
struct softmax_th_ctx * p_softmax_ctx = (struct softmax_th_ctx *) p_data;
softmax_job_f32_per_thread(p_softmax_ctx, n, i);
}
static int execute_op_softmax_f32(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
@ -312,17 +340,12 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) {
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
worker_callback_t op_func;
const char * op_type = NULL;
struct softmax_th_ctx softmax_ctx;
struct htp_softmax_context smctx;
const char * op_type = "softmax-f32";
switch (octx->op) {
case HTP_OP_SOFTMAX:
op_func = softmax_job_dispatcher_f32;
op_type = "softmax-f32";
init_softmax_ctx(&softmax_ctx, octx);
init_softmax_ctx(&smctx, octx);
break;
default:
@ -342,6 +365,9 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) {
octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads;
octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads;
// Use stride for calculating offset
smctx.spad_stride = hex_round_up(src0_row_size, 128);
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
if (src1->ne[0]) {
@ -371,8 +397,8 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) {
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, op_func, &softmax_ctx, n_jobs);
smctx.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, softmax_job_f32, &smctx, n_jobs);
}
return err;

View File

@ -17,7 +17,6 @@
#include "htp-msg.h"
#include "htp-ops.h"
#define sum_rows_preamble \
struct htp_tensor *src0 = &octx->src0;\
struct htp_tensor *dst = &octx->dst; \
@ -42,53 +41,54 @@
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3]; \
static int sum_rows_thread_f32(struct htp_ops_context * octx, const int nth, const int ith) {
sum_rows_preamble;
struct sum_rows_context {
const uint8_t * src_data;
uint8_t * dst_data;
uint32_t ne00;
size_t src_stride;
size_t dst_stride;
uint32_t rows_per_thread;
uint32_t total_rows;
bool opt_path;
};
const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;
const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1;
static void sum_rows_thread_f32(unsigned int nth, unsigned int ith, void *data) {
const struct sum_rows_context * smctx = (const struct sum_rows_context *) data;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t rows_per_thread = smctx->rows_per_thread;
const uint32_t total_rows = smctx->total_rows;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
const uint32_t start_row = rows_per_thread * ith;
const uint32_t end_row = MIN(start_row + rows_per_thread, total_rows);
// no work for this thread
if (src0_start_row >= src0_end_row) {
return HTP_STATUS_OK;
if (start_row >= end_row) {
return;
}
int opt_path = 0;
if ((0 == hex_is_aligned((void *) src0->data, VLEN)) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
}
const size_t src_stride = smctx->src_stride;
const size_t dst_stride = smctx->dst_stride;
const uint32_t ne00 = smctx->ne00;
const bool opt_path = smctx->opt_path;
const uint8_t * restrict data_src = (const uint8_t *) src0->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
const float * restrict src_th = (const float *) (smctx->src_data + (start_row * src_stride));
float * restrict dst_th = (float *) (smctx->dst_data + (start_row * dst_stride));
const float * restrict src_th = (float *) (data_src + (src0_start_row * src0_row_size));
float * restrict dst_th = (float *) (data_dst + (src0_start_row * dst_row_size));
// Calculate actual number of rows for this thread
const uint32_t n_rows = end_row - start_row;
for (uint32_t ir = 0; ir < src0_nrows_per_thread; ir++) {
const float * restrict src_local = src_th + (ir * ne00);
for (uint32_t ir = 0; ir < n_rows; ir++) {
const float * restrict src_local = src_th + (ir * (src_stride / sizeof(float)));
if (ir + 1 < src0_nrows_per_thread) {
hex_l2fetch(src_local + ne00, src0_row_size, src0_row_size, 1);
if (ir + 1 < n_rows) {
hex_l2fetch(src_local + (src_stride / sizeof(float)), src_stride, src_stride, 1);
}
if (1 == opt_path) {
if (opt_path) {
dst_th[ir] = hvx_reduce_sum_f32_a((const uint8_t *) src_local, ne00);
} else {
dst_th[ir] = hvx_reduce_sum_f32((const uint8_t *) src_local, ne00);
}
}
return HTP_STATUS_OK;
}
static void sum_rows_work_f32(unsigned int n, unsigned int i, void *data) {
sum_rows_thread_f32((struct htp_ops_context *) data, n, i);
}
int op_sum_rows(struct htp_ops_context * octx) {
@ -106,10 +106,25 @@ int op_sum_rows(struct htp_ops_context * octx) {
const uint32_t src0_nrows = ne01 * ne02 * ne03;
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
uint32_t rows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, sum_rows_work_f32, octx, n_jobs);
bool opt_path = false;
if ((0 == hex_is_aligned((void *) src0->data, VLEN)) && !(nb01 & (VLEN - 1))) {
opt_path = true;
}
struct sum_rows_context smctx = {
.src_data = (const uint8_t *) src0->data,
.dst_data = (uint8_t *) dst->data,
.ne00 = ne00,
.src_stride = nb01,
.dst_stride = nb1,
.rows_per_thread = rows_per_thread,
.total_rows = src0_nrows,
.opt_path = opt_path,
};
worker_pool_run_func(octx->ctx->worker_pool, sum_rows_thread_f32, &smctx, n_jobs);
return HTP_STATUS_OK;
}

View File

@ -17,6 +17,28 @@
#include "htp-msg.h"
#include "htp-ops.h"
struct htp_unary_context {
struct htp_ops_context * octx;
// Precomputed values
const uint8_t * data_src0;
uint8_t * data_dst;
size_t src0_row_size;
size_t dst_row_size;
size_t src0_row_size_aligned;
size_t dst_row_size_aligned;
size_t src0_spad_half_size;
size_t dst_spad_half_size;
uint32_t block;
uint32_t src0_nrows;
uint32_t src0_nrows_per_thread;
uint32_t nc;
};
#define htp_unary_preamble \
const uint32_t ne00 = src->ne[0]; \
const uint32_t ne01 = src->ne[1]; \
@ -57,8 +79,7 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
}
HVX_Vector reduced_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
sum_v = hvx_vec_repl4(reduced_sum);
sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v)); // replicated over all lanes
HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems);
HVX_Vector denom_v = hvx_vec_inverse_f32(t_v);
@ -75,128 +96,95 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
}
}
static void scale_htp_f32(const float * restrict src,
float * restrict dst,
uint8_t * restrict spad,
const uint32_t num_rows,
const uint32_t row_elems,
const size_t row_size,
int32_t * op_params,
int opt_path) {
static void scale_f32(const float * restrict src,
float * restrict dst,
uint8_t * restrict spad,
const uint32_t num_rows,
const uint32_t row_elems,
const size_t row_size,
int32_t * op_params) {
float scale = 0.f;
float bias = 0.f;
memcpy(&scale, &op_params[0], sizeof(float));
memcpy(&bias, &op_params[1], sizeof(float));
for (uint32_t ir = 0; ir < num_rows; ir++) {
const float * restrict src_local = src + (ir * row_elems);
float * restrict dst_local = dst + (ir * row_elems);
const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size);
uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size);
if (ir + 1 < num_rows) {
hex_l2fetch(src_local + row_elems, row_size, row_size, 1);
}
hvx_scale_offset_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale, bias);
hvx_scale_offset_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale, bias);
}
}
static void rms_norm_htp_f32(const float * restrict src,
float * restrict dst,
uint8_t * restrict spad,
const uint32_t num_rows,
const uint32_t row_elems,
const size_t row_size,
int32_t * op_params,
int opt_path) {
static void rms_norm_f32(const float * restrict src,
float * restrict dst,
uint8_t * restrict spad,
const uint32_t num_rows,
const uint32_t row_elems,
const size_t row_size,
int32_t * op_params) {
float epsilon = 0.f;
memcpy(&epsilon, op_params, sizeof(float));
for (uint32_t ir = 0; ir < num_rows; ir++) {
const float * restrict src_local = src + (ir * row_elems);
float * restrict dst_local = dst + (ir * row_elems);
const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size);
uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size);
if (ir + 1 < num_rows) {
hex_l2fetch(src_local + row_elems, row_size, row_size, 1);
}
if (1 == opt_path) {
hvx_fast_rms_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon);
} else {
float sum = hvx_sum_of_squares_f32((const uint8_t *) src_local, row_elems);
const float mean = sum / row_elems;
const float scale = 1.0f / sqrtf(mean + epsilon);
hvx_scale_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale);
}
hvx_fast_rms_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon);
}
}
static void sqr_htp_f32(const float * restrict src,
float * restrict dst,
uint8_t * restrict spad,
const uint32_t num_rows,
const uint32_t row_elems,
const size_t row_size,
int32_t * op_params,
int opt_path) {
static void sqr_f32(const float * restrict src,
float * restrict dst,
uint8_t * restrict spad,
const uint32_t num_rows,
const uint32_t row_elems,
const size_t row_size,
int32_t * op_params) {
for (uint32_t ir = 0; ir < num_rows; ir++) {
const float * restrict src_local = src + (ir * row_elems);
float * restrict dst_local = dst + (ir * row_elems);
const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size);
uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size);
if (ir + 1 < num_rows) {
hex_l2fetch(src_local + row_elems, row_size, row_size, 1);
}
if (1 == opt_path) {
hvx_sqr_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems);
} else {
hvx_sqr_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems);
}
hvx_sqr_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems);
}
}
static void sqrt_htp_f32(const float * restrict src,
float * restrict dst,
uint8_t * restrict spad,
const uint32_t num_rows,
const uint32_t row_elems,
const size_t row_size,
int32_t * op_params,
int opt_path) {
static void sqrt_f32(const float * restrict src,
float * restrict dst,
uint8_t * restrict spad,
const uint32_t num_rows,
const uint32_t row_elems,
const size_t row_size,
int32_t * op_params) {
for (uint32_t ir = 0; ir < num_rows; ir++) {
const float * restrict src_local = src + (ir * row_elems);
float * restrict dst_local = dst + (ir * row_elems);
const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size);
uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size);
if (ir + 1 < num_rows) {
hex_l2fetch(src_local + row_elems, row_size, row_size, 1);
}
if (1 == opt_path) {
hvx_sqrt_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems);
} else {
hvx_sqrt_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems);
}
hvx_sqrt_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems);
}
}
static void unary_job_f32_per_thread(const struct htp_tensor * src,
struct htp_tensor * dst,
uint8_t * spad,
int htp_op,
int32_t * op_params,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread) {
static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
const struct htp_unary_context * uctx = (const struct htp_unary_context *) data;
struct htp_ops_context * octx = uctx->octx;
const struct htp_tensor * src = &octx->src0;
const struct htp_tensor * dst = &octx->dst;
htp_unary_preamble;
const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1;
int htp_op = octx->op;
int32_t * op_params = octx->op_params;
uint32_t src0_nrows_per_thread = uctx->src0_nrows_per_thread;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const size_t src0_row_size = uctx->src0_row_size;
const size_t dst_row_size = uctx->dst_row_size;
const size_t src0_row_size_aligned = uctx->src0_row_size_aligned;
const size_t dst_row_size_aligned = uctx->dst_row_size_aligned;
const uint32_t src0_nrows = uctx->src0_nrows;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
@ -208,79 +196,104 @@ static void unary_job_f32_per_thread(const struct htp_tensor * src,
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
int is_aligned = 1;
int opt_path = 0;
if ((0 == hex_is_aligned((void *) src->data, VLEN)) || (0 == hex_is_aligned((void *) dst->data, VLEN))) {
is_aligned = 0;
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
const uint8_t * restrict data_src = uctx->data_src0;
uint8_t * restrict data_dst = uctx->data_dst;
uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
uint8_t * dst_spad_data = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread);
size_t src0_spad_half_size = uctx->src0_spad_half_size;
size_t dst_spad_half_size = uctx->dst_spad_half_size;
const int BLOCK = uctx->block;
if (BLOCK == 0) {
FARF(ERROR, "unary-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
octx->src0_spad.size_per_thread, src0_row_size_aligned);
return;
}
const uint8_t * restrict data_src = (const uint8_t *) src->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
dma_queue * dma_queue = octx->ctx->dma[ith];
const float * restrict src_th = (float *) (data_src + (src0_start_row * src0_row_size));
float * restrict dst_th = (float *) (data_dst + (src0_start_row * dst_row_size));
uint8_t * restrict spad_th = (uint8_t *) spad + (ith * nb01);
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
switch (htp_op) {
case HTP_OP_RMS_NORM:
rms_norm_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
break;
case HTP_OP_SCALE:
scale_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
break;
case HTP_OP_SQR:
sqr_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
break;
case HTP_OP_SQRT:
sqrt_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
break;
// Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
dst_row_size, dst_row_size_aligned, 0);
default:
break;
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + (ir * src0_row_size)),
src0_row_size_aligned, src0_row_size, block_size);
}
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
float * dst_spad = (float *) dma_queue_pop(dma_queue).src;
float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
// Process block in VTCM
switch (htp_op) {
case HTP_OP_RMS_NORM:
rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
break;
case HTP_OP_SCALE:
scale_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
break;
case HTP_OP_SQR:
sqr_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
break;
case HTP_OP_SQRT:
sqrt_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
break;
default:
break;
}
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
dst_row_size, dst_row_size_aligned, block_size);
// prefetch N+2 loop iteration if any
const uint32_t pref_block = (ir + BLOCK * 2);
if (pref_block < src0_end_row) {
const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad, data_src + (pref_block * src0_row_size)),
src0_row_size_aligned, src0_row_size, pref_block_size);
}
}
dma_queue_flush(dma_queue);
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "unary-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, src->ne[0],
FARF(HIGH, "unary-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, src->ne[0],
src->ne[1], src->ne[2], src->ne[3], src0_start_row, src0_end_row, dst->ne[0], dst->ne[1], dst->ne[2],
dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void unary_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
unary_job_f32_per_thread(&octx->src0, &octx->dst, octx->src0_spad.data, octx->op, octx->op_params, n, i,
octx->src0_nrows_per_thread);
}
static int execute_op_unary_f32(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
const struct htp_tensor * src0 = &octx->src0;
struct htp_tensor * dst = &octx->dst;
worker_callback_t unary_op_func;
const char * op_type = NULL;
const char * op_type = NULL;
switch (octx->op) {
case HTP_OP_RMS_NORM:
unary_op_func = unary_job_dispatcher_f32;
op_type = "rmsnorm-f32";
op_type = "rmsnorm-f32";
break;
case HTP_OP_SCALE:
unary_op_func = unary_job_dispatcher_f32;
op_type = "scale-f32";
op_type = "scale-f32";
break;
case HTP_OP_SQR:
unary_op_func = unary_job_dispatcher_f32;
op_type = "sqr-f32";
op_type = "sqr-f32";
break;
case HTP_OP_SQRT:
unary_op_func = unary_job_dispatcher_f32;
op_type = "sqrt-f32";
op_type = "sqrt-f32";
break;
default:
@ -294,32 +307,61 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
const size_t src0_row_size = src0->nb[1];
const size_t dst_row_size = dst->nb[1];
// VTCM scratchpads for all tensors
octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads;
octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads;
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
size_t spad_size = octx->src0_spad.size + octx->dst_spad.size;
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
// Double buffering requires 2x size per buffer
size_t spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned);
size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row);
// Make sure the reserved vtcm size is sufficient
if (vtcm_row_per_thread == 0) {
FARF(ERROR, "unary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size_per_row * n_threads);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread * 2;
octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread * 2;
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "unary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
struct htp_unary_context uctx = {
.octx = octx,
.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs,
.src0_nrows = src0_nrows,
worker_pool_run_func(octx->ctx->worker_pool, unary_op_func, octx, n_jobs);
.data_src0 = (const uint8_t *)src0->data,
.data_dst = (uint8_t *)dst->data,
.src0_row_size = src0_row_size,
.dst_row_size = dst_row_size,
.src0_row_size_aligned = src0_row_size_aligned,
.dst_row_size_aligned = dst_row_size_aligned,
.src0_spad_half_size = octx->src0_spad.size_per_thread / 2,
.dst_spad_half_size = octx->dst_spad.size_per_thread / 2,
.block = (octx->src0_spad.size_per_thread / 2) / src0_row_size_aligned,
.nc = src0->ne[0],
};
worker_pool_run_func(octx->ctx->worker_pool, unary_job_f32_per_thread, &uctx, n_jobs);
}
return err;

View File

@ -55,7 +55,11 @@ void ggml_sycl_add_id(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
const int32_t* src2_d = (const int32_t*)src2->data;
float* dst_d = (float*)dst->data;
int threads = std::min((int)ne00, 768); // cols
const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
int threads = std::min((unsigned int)ne00, max_work_group_size); // cols
ctx.stream()->parallel_for(
sycl::nd_range<3>(
sycl::range<3>(1, ne02, ne01) * sycl::range<3>(1, 1, threads),

View File

@ -11,8 +11,8 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
int s00, int s01, int s02, int s03,
int s10, int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
@ -44,7 +44,7 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
for (int i0 = i0s; i0 < ne0;
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0*s00] : 0.0f, (float)src1_row[i10*s10]);
}
}
@ -53,8 +53,8 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
int s00, int s01, int s02, int s03,
int s10, int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
@ -82,7 +82,7 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
dst_t * dst_row = dst + i_dst;
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0*s00] : 0.0f, (float)src1_row[i10*s10]);
}
@ -95,7 +95,8 @@ struct bin_bcast_sycl {
const int64_t ne3, const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03,
const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
const bool src1_is_contiguous, const bool src0_is_permuted, const bool src1_is_permuted,
queue_ptr stream) {
int nr0 = ne10 / ne0;
int nr1 = ne11/ne1;
int nr2 = ne12/ne2;
@ -123,7 +124,7 @@ struct bin_bcast_sycl {
cnb[3] *= cne[3];
};
if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
if (src0_is_contiguous && src1_is_contiguous && !src0_is_permuted && !src1_is_permuted) {
for (int i = 0; i < 4; i++) {
if (nr[i] != 1) {
break;
@ -164,7 +165,7 @@ struct bin_bcast_sycl {
size_t nb12 = cnb1[2];
size_t nb13 = cnb1[3];
size_t s0 = nb0 / sizeof(dst_t);
// size_t s0 = nb0 / sizeof(dst_t);
size_t s1 = nb1 / sizeof(dst_t);
size_t s2 = nb2 / sizeof(dst_t);
size_t s3 = nb3 / sizeof(dst_t);
@ -196,9 +197,6 @@ struct bin_bcast_sycl {
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
GGML_ASSERT(s0 == 1);
GGML_ASSERT(s10 == 1);
const int block_size = 128;
int64_t hne0 = std::max(ne0/2LL, 1LL);
@ -232,8 +230,8 @@ struct bin_bcast_sycl {
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast_unravel<bin_op>(
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
s03, s11, s12, s13, item_ct1);
ne10, ne11, ne12, ne13, s1, s2, s3, s00, s01, s02,
s03, s10, s11, s12, s13, item_ct1);
});
}
} else {
@ -251,7 +249,7 @@ struct bin_bcast_sycl {
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
ne2, ne3, ne10, ne11, ne12, ne13,
s1, s2, s3, s01, s02, s03, s11, s12, s13,
s1, s2, s3, s00, s01, s02, s03, s10, s11, s12, s13,
item_ct1);
});
}
@ -268,24 +266,27 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
op()((const float *) src0->data, (const float *) src1->data, (float *) dst->data, ne00, ne01, ne02, ne03, ne10,
ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3,
ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
op()((const sycl::half *) src0->data, (const sycl::half *) src1->data, (sycl::half *) dst->data, ne00, ne01,
ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13,
nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst),
nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
op()((const sycl::half *) src0->data, (const float *) src1->data, (sycl::half *) dst->data, ne00, ne01, ne02,
ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1,
nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
main_stream);
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
op()((const int32_t *) src0->data, (const int32_t *) src1->data, (int32_t *) dst->data, ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
main_stream);
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
op()((const int16_t *) src0->data, (const int16_t *) src1->data, (int16_t *) dst->data, ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
main_stream);
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
ggml_type_name(src0->type), ggml_type_name(src1->type));

View File

@ -7,9 +7,21 @@
#include <cstdint>
static uint32_t validate_graph_operation(size_t cgraph_size, uint32_t shmem_res_id, const char * operation) {
if (cgraph_size == 0) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Zero-size computation graph\n", operation);
return 1;
}
// place-holder: validate that the size of shmem_res_id is <= cgraph_size
// need to add another method in the Virgl->APIR callback interface
GGML_UNUSED(shmem_res_id);
return 0; // Valid
}
uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(enc);
static bool async_backend_initialized = false;
static bool async_backend;
@ -34,10 +46,26 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v
size_t cgraph_size;
apir_decode_size_t(dec, &cgraph_size);
if (validate_graph_operation(cgraph_size, shmem_res_id, __func__) != 0) {
apir_decoder_set_fatal(dec);
return 1;
}
apir_decoder secondary_dec = apir_new_decoder((const char *) shmem_data, cgraph_size);
ggml_cgraph * cgraph = apir_decode_ggml_cgraph(&secondary_dec, cgraph_size);
if (!cgraph || apir_decoder_get_fatal(&secondary_dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to deserialize computation graph\n", __func__);
return 1;
}
if (cgraph->n_nodes < 0 || cgraph->n_leafs < 0) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid negative node/leaf count: nodes=%d leafs=%d\n", __func__,
cgraph->n_nodes, cgraph->n_leafs);
return 1;
}
ggml_status status;
#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1
for (int idx = 0; idx < cgraph->n_nodes; idx++) {
@ -45,7 +73,8 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v
if (dev->iface.supports_op(dev, op)) {
continue;
}
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op));
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", __func__, idx,
ggml_op_desc(op));
status = GGML_STATUS_ABORTED;
apir_encode_ggml_status(enc, &status);
@ -53,9 +82,17 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v
return 0;
}
#endif
// Check if backend is properly initialized
if (!bck) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Backend not initialized (bck is null)\n", __func__);
return 1;
}
status = bck->iface.graph_compute(bck, cgraph);
if (async_backend) {
if (async_backend && bck->iface.synchronize) {
bck->iface.synchronize(bck);
}

View File

@ -85,7 +85,19 @@ uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * d
const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec);
size_t value = buft->iface.get_alloc_size(buft, op);
// Check for decode error
if (op == nullptr) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to decode tensor\n", __func__);
apir_decoder_set_fatal(dec);
return 1;
}
size_t value;
if (buft->iface.get_alloc_size) {
value = buft->iface.get_alloc_size(buft, op);
} else {
value = ggml_nbytes(op); // Default fallback
}
apir_encode_size_t(enc, &value);

View File

@ -6,11 +6,26 @@
#include <cstdint>
static uint32_t validate_buffer_operation(size_t offset, size_t size, const char * operation) {
// Only check for critical integer overflow - no arbitrary size limits
if (offset > SIZE_MAX - size) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Integer overflow in offset+size: %zu + %zu\n", operation, offset, size);
return 1;
}
return 0; // Valid
}
uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer);
apir_encode_uintptr_t(enc, &base);
@ -24,6 +39,11 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
ggml_tensor * tensor;
// safe to remove the const qualifier here
tensor = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec);
@ -37,6 +57,10 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl
size_t size;
apir_decode_size_t(dec, &size);
if (validate_buffer_operation(offset, size, __func__) != 0) {
return 1;
}
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_data) {
@ -56,6 +80,11 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
const ggml_tensor * tensor;
// safe to remove the const qualifier here
tensor = apir_decode_ggml_tensor(dec);
@ -69,6 +98,10 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl
size_t size;
apir_decode_size_t(dec, &size);
if (validate_buffer_operation(offset, size, __func__) != 0) {
return 1;
}
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_data) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
@ -86,6 +119,11 @@ uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
const ggml_tensor * src;
// safe to remove the const qualifier here
src = apir_decode_ggml_tensor(dec);
@ -105,6 +143,11 @@ uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
uint8_t value;
apir_decode_uint8_t(dec, &value);
@ -120,6 +163,11 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
if (!apir_untrack_backend_buffer(buffer)) {
GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: unknown buffer %p\n", __func__, (void *) buffer);
return 1;

View File

@ -1,6 +1,6 @@
#include "backend-dispatched.h"
#include "backend-virgl-apir.h"
#include "backend-virgl-apir.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
@ -28,19 +28,24 @@ uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) {
return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED;
}
if (!reg->iface.get_device_count(reg)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device found\n", __func__);
size_t device_count = reg->iface.get_device_count(reg);
if (!device_count) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: no device found\n", __func__);
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
}
dev = reg->iface.get_device(reg, 0);
if (!dev) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device received\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: failed to get device\n", __func__);
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
}
bck = dev->iface.init_backend(dev, NULL);
if (!bck) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed\n", __func__);
return APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED;
}
return APIR_BACKEND_INITIALIZE_SUCCESS;
}

View File

@ -32,64 +32,6 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg
/* backend */
uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
static inline const char * backend_dispatch_command_name(ApirBackendCommandType type) {
switch (type) {
/* device */
case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT:
return "backend_device_get_device_count";
case APIR_COMMAND_TYPE_DEVICE_GET_COUNT:
return "backend_device_get_count";
case APIR_COMMAND_TYPE_DEVICE_GET_NAME:
return "backend_device_get_name";
case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION:
return "backend_device_get_description";
case APIR_COMMAND_TYPE_DEVICE_GET_TYPE:
return "backend_device_get_type";
case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY:
return "backend_device_get_memory";
case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP:
return "backend_device_supports_op";
case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE:
return "backend_device_get_buffer_type";
case APIR_COMMAND_TYPE_DEVICE_GET_PROPS:
return "backend_device_get_props";
case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR:
return "backend_device_buffer_from_ptr";
/* buffer-type */
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME:
return "backend_buffer_type_get_name";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT:
return "backend_buffer_type_get_alignment";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE:
return "backend_buffer_type_get_max_size";
case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST:
return "backend_buffer_type_is_host (DEPRECATED)";
case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER:
return "backend_buffer_type_alloc_buffer";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE:
return "backend_buffer_type_get_alloc_size";
/* buffer */
case APIR_COMMAND_TYPE_BUFFER_GET_BASE:
return "backend_buffer_get_base";
case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR:
return "backend_buffer_set_tensor";
case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR:
return "backend_buffer_get_tensor";
case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR:
return "backend_buffer_cpy_tensor";
case APIR_COMMAND_TYPE_BUFFER_CLEAR:
return "backend_buffer_clear";
case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER:
return "backend_buffer_free_buffer";
/* backend */
case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE:
return "backend_backend_graph_compute";
default:
return "unknown";
}
}
extern "C" {
static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {

View File

@ -1,5 +1,6 @@
#pragma once
// clang-format off
#include <cstdint>
#include <cstddef>
@ -10,6 +11,7 @@
#include "shared/apir_backend.h"
#include "shared/apir_cs.h"
#include "shared/apir_cs_ggml.h"
// clang-format on
#define GGML_VIRTGPU_BCK "ggml-virtgpu-backend: "

View File

@ -19,7 +19,7 @@ struct virgl_apir_callbacks {
};
extern "C" {
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs);
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks * virgl_cbs);
void apir_backend_deinit(uint32_t virgl_ctx_id);
uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
virgl_apir_callbacks * virgl_cbs,

View File

@ -1,6 +1,5 @@
#include "backend-dispatched.h"
#include "backend-virgl-apir.h"
#include "shared/api_remoting.h"
#include "shared/apir_backend.h"
#include "shared/apir_cs.h"
@ -17,10 +16,10 @@
#define GGML_DEFAULT_BACKEND_REG "ggml_backend_init"
static void * backend_library_handle = NULL;
static FILE * apir_logfile = NULL;
static FILE * apir_logfile = NULL;
static void log_to_file_callback(enum ggml_log_level level, const char * text, void * user_data) {
FILE * logfile = (FILE *)user_data;
FILE * logfile = (FILE *) user_data;
fprintf(logfile, "[%d] %s", level, text);
fflush(logfile);
}
@ -48,9 +47,9 @@ void apir_backend_deinit(uint32_t virgl_ctx_id) {
}
#define APIR_GGML_LIBRARY_PATH_KEY "ggml.library.path"
#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg"
#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg"
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs) {
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks * virgl_cbs) {
const char * dlsym_error;
const char * apir_log_to_file = getenv(APIR_LLAMA_CPP_LOG_TO_FILE_ENV);
@ -63,15 +62,13 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
}
}
const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY);
const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY);
const char * virgl_library_reg = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_REG_KEY);
const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG;
const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG;
if (!library_name) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot open the GGML library: env var '%s' not defined\n",
__func__, APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot open the GGML library: env var '%s' not defined\n", __func__,
APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
}
@ -79,16 +76,14 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
backend_library_handle = dlopen(library_name, RTLD_LAZY);
if (!backend_library_handle) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot open the GGML library: %s\n", __func__, dlerror());
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot open the GGML library: %s\n", __func__, dlerror());
return APIR_LOAD_LIBRARY_CANNOT_OPEN;
}
if (!library_reg) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot register the GGML library: env var '%s' not defined\n",
__func__, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot register the GGML library: env var '%s' not defined\n", __func__,
APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV);
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
}
@ -96,11 +91,9 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
void * ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg);
dlsym_error = dlerror();
if (dlsym_error) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n",
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n",
__func__, library_reg, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error);
return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
}
@ -132,13 +125,12 @@ uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
virgl_apir_context ctx = {
.ctx_id = virgl_ctx_id,
.iface = virgl_cbs,
.iface = virgl_cbs,
};
if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: Received an invalid dispatch index (%d >= %d)\n",
__func__, cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Received an invalid dispatch index (%d >= %d)\n", __func__, cmd_type,
APIR_BACKEND_DISPATCH_TABLE_COUNT);
return APIR_BACKEND_FORWARD_INDEX_INVALID;
}

View File

@ -16,28 +16,32 @@ enum ApirCommandType {
APIR_COMMAND_TYPE_LOADLIBRARY = 1,
APIR_COMMAND_TYPE_FORWARD = 2,
APIR_COMMAND_TYPE_LENGTH = 3,
APIR_COMMAND_TYPE_LENGTH = 3,
};
typedef uint64_t ApirCommandFlags;
enum ApirLoadLibraryReturnCode {
APIR_LOAD_LIBRARY_SUCCESS = 0,
// these error codes are returned by the Virglrenderer APIR component
APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR = 1,
APIR_LOAD_LIBRARY_ALREADY_LOADED = 2,
APIR_LOAD_LIBRARY_ENV_VAR_MISSING = 3,
APIR_LOAD_LIBRARY_CANNOT_OPEN = 4,
APIR_LOAD_LIBRARY_SYMBOL_MISSING = 5,
APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, // anything above this is a APIR backend library initialization return code
// any value greater than this is an APIR *backend library* initialization return code
APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6,
};
enum ApirForwardReturnCode {
APIR_FORWARD_SUCCESS = 0,
APIR_FORWARD_NO_DISPATCH_FCT = 1,
APIR_FORWARD_TIMEOUT = 2,
APIR_FORWARD_BASE_INDEX = 3, // anything above this is a APIR backend library forward return code
} ;
APIR_FORWARD_SUCCESS = 0,
// these error codes are returned by the Virglrenderer APIR component
APIR_FORWARD_NO_DISPATCH_FCT = 1,
APIR_FORWARD_TIMEOUT = 2,
APIR_FORWARD_FAILED_TO_SYNC_STREAMS = 3,
// any value greater than this index an APIR *backend library* forward return code
APIR_FORWARD_BASE_INDEX = 4,
};
__attribute__((unused)) static inline const char * apir_command_name(ApirCommandType type) {
switch (type) {
@ -82,6 +86,7 @@ __attribute__((unused)) static const char * apir_forward_error(ApirForwardReturn
APIR_FORWARD_ERROR(APIR_FORWARD_SUCCESS);
APIR_FORWARD_ERROR(APIR_FORWARD_NO_DISPATCH_FCT);
APIR_FORWARD_ERROR(APIR_FORWARD_TIMEOUT);
APIR_FORWARD_ERROR(APIR_FORWARD_FAILED_TO_SYNC_STREAMS);
APIR_FORWARD_ERROR(APIR_FORWARD_BASE_INDEX);
return "Unknown APIR_COMMAND_TYPE_FORWARD error";

View File

@ -34,3 +34,61 @@ typedef enum ApirBackendCommandType {
// last command_type index + 1
APIR_BACKEND_DISPATCH_TABLE_COUNT = 23,
} ApirBackendCommandType;
static inline const char * apir_dispatch_command_name(ApirBackendCommandType type) {
switch (type) {
/* device */
case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT:
return "device_get_device_count";
case APIR_COMMAND_TYPE_DEVICE_GET_COUNT:
return "device_get_count";
case APIR_COMMAND_TYPE_DEVICE_GET_NAME:
return "device_get_name";
case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION:
return "device_get_description";
case APIR_COMMAND_TYPE_DEVICE_GET_TYPE:
return "device_get_type";
case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY:
return "device_get_memory";
case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP:
return "device_supports_op";
case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE:
return "device_get_buffer_type";
case APIR_COMMAND_TYPE_DEVICE_GET_PROPS:
return "device_get_props";
case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR:
return "device_buffer_from_ptr";
/* buffer-type */
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME:
return "buffer_type_get_name";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT:
return "buffer_type_get_alignment";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE:
return "buffer_type_get_max_size";
case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST:
return "buffer_type_is_host";
case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER:
return "buffer_type_alloc_buffer";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE:
return "buffer_type_get_alloc_size";
/* buffer */
case APIR_COMMAND_TYPE_BUFFER_GET_BASE:
return "buffer_get_base";
case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR:
return "buffer_set_tensor";
case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR:
return "buffer_get_tensor";
case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR:
return "buffer_cpy_tensor";
case APIR_COMMAND_TYPE_BUFFER_CLEAR:
return "buffer_clear";
case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER:
return "buffer_free_buffer";
/* backend */
case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE:
return "backend_graph_compute";
default:
return "unknown";
}
}

View File

@ -14,7 +14,7 @@
#define APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED 6
#define APIR_BACKEND_INITIALIZE_ALREADY_INITED 7
#define APIR_BACKEND_INITIALIZE_NO_DEVICE 8
#define APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED 9
// new entries here need to be added to the apir_backend_initialize_error function below
@ -39,6 +39,10 @@ static const char * apir_backend_initialize_error(int code) {
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_FAILED);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_ALREADY_INITED);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_NO_DEVICE);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED);
return "Unknown APIR_BACKEND_INITIALIZE error:/";

View File

@ -13,7 +13,6 @@ struct apir_encoder {
const char * start;
const char * end;
bool fatal;
};
struct apir_decoder {
@ -28,8 +27,8 @@ struct apir_decoder {
static apir_decoder apir_new_decoder(const char * ptr, size_t size) {
apir_decoder dec = {
.cur = ptr,
.end = ptr + size,
.cur = ptr,
.end = ptr + size,
.fatal = false,
};
@ -79,10 +78,7 @@ static inline bool apir_decoder_get_fatal(const apir_decoder * dec) {
* encode peek
*/
static inline bool apir_decoder_peek_internal(apir_decoder * dec,
size_t size,
void * val,
size_t val_size) {
static inline bool apir_decoder_peek_internal(apir_decoder * dec, size_t size, void * val, size_t val_size) {
assert(val_size <= size);
if (unlikely(size > (size_t) (dec->end - dec->cur))) {
@ -332,8 +328,7 @@ static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t
static inline void * apir_decoder_alloc_array(size_t size, size_t count) {
size_t alloc_size;
if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) {
GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n",
__func__, size, count);
GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n", __func__, size, count);
return NULL;
}
@ -352,20 +347,19 @@ static inline void apir_decode_bool_t(apir_decoder * dec, bool * val) {
/* apir_buffer_type_host_handle_t */
static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc,
static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc,
const apir_buffer_type_host_handle_t * val) {
apir_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
}
static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec,
static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec,
apir_buffer_type_host_handle_t * val) {
apir_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
}
/* apir_buffer_host_handle_t */
static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc,
const apir_buffer_host_handle_t * val) {
static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc, const apir_buffer_host_handle_t * val) {
apir_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
}

View File

@ -1,11 +1,10 @@
#include "ggml-impl.h"
#include "apir_cs.h"
#include "apir_cs_rpc.h"
#include "ggml-impl.h"
// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer);
static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc,
const apir_buffer_host_handle_t * handle);
static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle);
static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec);
@ -22,8 +21,7 @@ static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder
return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
}
static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec,
uint32_t n_tensors) {
static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec, uint32_t n_tensors) {
size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors;
return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
@ -45,9 +43,9 @@ static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) {
}
ggml_init_params params{
/*.mem_size =*/ ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
/*.mem_size =*/ggml_tensor_overhead(),
/*.mem_buffer =*/NULL,
/*.no_alloc =*/true,
};
ggml_context * ctx = ggml_init(params);
@ -105,6 +103,19 @@ static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec)
apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size);
// SECURITY: Validate buffer handle against tracked buffers to prevent
// guest VM from providing arbitrary host memory addresses
if (buffer) {
extern std::unordered_set<ggml_backend_buffer_t> backend_buffers;
if (backend_buffers.find(buffer) == backend_buffers.end()) {
GGML_LOG_WARN("ggml-virtgpu-backend: %s: Invalid buffer handle from guest: %p\n", __func__,
(void *) buffer);
// Set fatal flag to prevent further processing with invalid handle
apir_decoder_set_fatal(dec);
return NULL;
}
}
return buffer;
}

View File

@ -1,3 +1,6 @@
#pragma once
// clang-format off
#include "ggml.h"
#include "ggml-backend-impl.h"
@ -5,6 +8,7 @@
#include <unordered_set>
#include <vector>
#include <cstdint>
// clang-format on
// ggml_tensor is serialized into apir_rpc_tensor
struct apir_rpc_tensor {

View File

@ -34,6 +34,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml
static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
virtgpu * gpu = BUFT_TO_GPU(buft);
// Return the prefixed name that was built once during initialization
return gpu->cached_buffer_type.name;
}
@ -53,9 +54,8 @@ static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buff
const ggml_tensor * tensor) {
virtgpu * gpu = BUFT_TO_GPU(buft);
if (tensor->buffer == NULL
|| !tensor->buffer->context
|| !buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) {
if (tensor->buffer == NULL || !tensor->buffer->context ||
!buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) {
return ggml_nbytes(tensor);
}

View File

@ -3,6 +3,7 @@
static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
// Return the prefixed name that was built once during initialization
return gpu->cached_device_info.name;
}
@ -22,7 +23,7 @@ static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_bac
static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
virtgpu * gpu = DEV_TO_GPU(dev);
*free = gpu->cached_device_info.memory_free;
*free = gpu->cached_device_info.memory_free;
*total = gpu->cached_device_info.memory_total;
}
@ -72,7 +73,7 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_
ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
static std::atomic<bool> initialized = false;
static std::atomic<bool> initialized = false;
static ggml_backend_buffer_type buft;
if (!initialized) {
@ -95,7 +96,7 @@ ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_bac
static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
static std::atomic<bool> initialized = false;
static std::atomic<bool> initialized = false;
static ggml_backend_buffer_type buft;
if (!initialized) {

View File

@ -7,8 +7,8 @@
void ggml_virtgpu_cleanup(virtgpu * gpu);
static virtgpu * apir_initialize() {
static virtgpu * gpu = NULL;
static std::atomic<bool> initialized = false;
static virtgpu * gpu = NULL;
static std::atomic<bool> initialized = false;
if (initialized) {
// fast track
@ -31,29 +31,53 @@ static virtgpu * apir_initialize() {
}
// Pre-fetch and cache all device information, it will not change
gpu->cached_device_info.description = apir_device_get_description(gpu);
gpu->cached_device_info.description = apir_device_get_description(gpu);
if (!gpu->cached_device_info.description) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device description", __func__);
}
gpu->cached_device_info.name = apir_device_get_name(gpu);
if (!gpu->cached_device_info.name) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device name", __func__);
}
gpu->cached_device_info.device_count = apir_device_get_count(gpu);
gpu->cached_device_info.type = apir_device_get_type(gpu);
apir_device_get_memory(gpu,
&gpu->cached_device_info.memory_free,
&gpu->cached_device_info.memory_total);
{
// Get the remote name and create prefixed version
char * rmt_device_name = apir_device_get_name(gpu);
if (!rmt_device_name) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to get the virtgpu device name", __func__);
}
size_t device_name_len = strlen(rmt_device_name) + 11; // "[virtgpu] " + null terminator
gpu->cached_device_info.name = (char *) malloc(device_name_len);
if (!gpu->cached_device_info.name) {
free(rmt_device_name);
GGML_ABORT(GGML_VIRTGPU "%s: failed to allocate memory for prefixed device name", __func__);
}
snprintf(gpu->cached_device_info.name, device_name_len, "[virtgpu] %s", rmt_device_name);
free(rmt_device_name);
}
apir_device_get_memory(gpu, &gpu->cached_device_info.memory_free, &gpu->cached_device_info.memory_total);
apir_buffer_type_host_handle_t buft_host_handle = apir_device_get_buffer_type(gpu);
gpu->cached_buffer_type.host_handle = buft_host_handle;
gpu->cached_buffer_type.name = apir_buffer_type_get_name(gpu, buft_host_handle);
if (!gpu->cached_buffer_type.name) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu buffer type name", __func__);
{
// Get the remote name and create prefixed version
char * rmt_name = apir_buffer_type_get_name(gpu, buft_host_handle);
if (!rmt_name) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to get the virtgpu buffer type name", __func__);
}
size_t prefixed_len = strlen(rmt_name) + 11; // "[virtgpu] " + null terminator
gpu->cached_buffer_type.name = (char *) malloc(prefixed_len);
if (!gpu->cached_buffer_type.name) {
free(rmt_name);
GGML_ABORT(GGML_VIRTGPU "%s: failed to allocate memory for prefixed buffer type name", __func__);
}
snprintf(gpu->cached_buffer_type.name, prefixed_len, "[virtgpu] %s", rmt_name);
free(rmt_name);
}
gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle);
gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle);
gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle);
gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle);
initialized = true;
}
@ -98,7 +122,7 @@ static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
static std::atomic<bool> initialized = false;
if (initialized) {
return; // fast track
return; // fast track
}
{

View File

@ -1,5 +1,5 @@
#include "ggml-remoting.h"
#include "../../include/ggml-virtgpu.h"
#include "ggml-remoting.h"
static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) {
UNUSED(backend);

View File

@ -9,7 +9,7 @@
#include <string>
#define GGML_VIRTGPU_NAME "ggml-virtgpu"
#define GGML_VIRTGPU "ggml-virtgpu: "
#define GGML_VIRTGPU "ggml-virtgpu: "
// USE_ALWAYS_TRUE_SUPPORTS_OP: 1 is fast, 0 avoid micro-benchmark crashes

View File

@ -3,7 +3,7 @@
#include <stdint.h>
struct virgl_renderer_capset_apir {
uint32_t apir_version;
uint32_t supports_blob_resources;
uint32_t reserved[4]; // For future expansion
uint32_t apir_version;
uint32_t supports_blob_resources;
uint32_t reserved[4]; // For future expansion
};

View File

@ -145,8 +145,31 @@ class RemotingCodebaseGenerator:
enum_lines.append(f" APIR_BACKEND_DISPATCH_TABLE_COUNT = {total_count},")
enum_lines.append("} ApirBackendCommandType;")
# Generate function name mapping
func_lines = []
func_lines.append("static inline const char * apir_dispatch_command_name(ApirBackendCommandType type) {")
func_lines.append(" switch (type) {")
current_group = None
for func in functions:
# Add comment for new group
if func['group_name'] != current_group:
func_lines.append(f" /* {func['group_description']} */")
current_group = func['group_name']
# Generate clean function name without backend_ prefix
clean_name = f"{func['group_name']}_{func['function_name']}"
func_lines.append(f" case {func['enum_name']}:")
func_lines.append(f" return \"{clean_name}\";")
func_lines.append("")
func_lines.append(" default:")
func_lines.append(" return \"unknown\";")
func_lines.append(" }")
func_lines.append("}")
# Full header template
header_content = NL.join(enum_lines) + "\n"
header_content = NL.join(enum_lines) + "\n\n" + NL.join(func_lines) + "\n"
return header_content
@ -170,19 +193,6 @@ class RemotingCodebaseGenerator:
decl_lines.append(f"{signature} {func['backend_function']}({params});")
# Switch cases
switch_lines = []
current_group = None
for func in functions:
if func['group_name'] != current_group:
switch_lines.append(f" /* {func['group_description']} */")
current_group = func['group_name']
deprecated = " (DEPRECATED)" if func['deprecated'] else ""
switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}{deprecated}\";")
# Dispatch table
table_lines = []
current_group = None
@ -201,15 +211,6 @@ class RemotingCodebaseGenerator:
{NL.join(decl_lines)}
static inline const char *backend_dispatch_command_name(ApirBackendCommandType type)
{{
switch (type) {{
{NL.join(switch_lines)}
default: return "unknown";
}}
}}
extern "C" {{
static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {{
{NL.join(table_lines)}

View File

@ -17,8 +17,8 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
size_t cgraph_size = apir_serialize_ggml_cgraph(cgraph, cgraph_data);
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
if (cgraph_size <= gpu->data_shmem.mmap_size) {
// Lock mutex before using shared data_shmem buffer
@ -26,7 +26,7 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
}
using_shared_shmem = true;
shmem = &gpu->data_shmem;
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) {
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
}

View File

@ -62,7 +62,9 @@ size_t apir_buffer_type_get_max_size(virtgpu * gpu, apir_buffer_type_host_handle
return max_size;
}
apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, size_t size) {
apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu,
apir_buffer_type_host_handle_t host_handle,
size_t size) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
@ -84,7 +86,9 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_t
return buffer_context;
}
size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, const ggml_tensor * op) {
size_t apir_buffer_type_get_alloc_size(virtgpu * gpu,
apir_buffer_type_host_handle_t host_handle,
const ggml_tensor * op) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;

View File

@ -35,8 +35,8 @@ void apir_buffer_set_tensor(virtgpu * gpu,
apir_encode_ggml_tensor(encoder, tensor);
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
if (size <= gpu->data_shmem.mmap_size) {
// Lock mutex before using shared data_shmem buffer
@ -44,7 +44,7 @@ void apir_buffer_set_tensor(virtgpu * gpu,
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
}
using_shared_shmem = true;
shmem = &gpu->data_shmem;
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
@ -86,8 +86,8 @@ void apir_buffer_get_tensor(virtgpu * gpu,
apir_encode_ggml_tensor(encoder, tensor);
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
if (size <= gpu->data_shmem.mmap_size) {
// Lock mutex before using shared data_shmem buffer
@ -95,7 +95,7 @@ void apir_buffer_get_tensor(virtgpu * gpu,
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
}
using_shared_shmem = true;
shmem = &gpu->data_shmem;
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);

View File

@ -26,7 +26,7 @@ char * apir_device_get_name(virtgpu * gpu) {
REMOTE_CALL(gpu, encoder, decoder, ret);
const size_t string_size = apir_decode_array_size_unchecked(decoder);
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
if (!string) {
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device name buffer\n", __func__);
return NULL;
@ -173,7 +173,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(virtgpu * gpu, size_t size, si
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR);
if (virtgpu_shmem_create(gpu, size, &buffer_context.shmem)) {
GGML_ABORT(GGML_VIRTGPU "Couldn't allocate the guest-host shared buffer");
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate %ldb of guest-host shared buffer", __func__, size);
}
apir_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem.res_id);

View File

@ -1,29 +1,36 @@
#include "virtgpu.h"
#pragma once
// clang-format off
#include "virtgpu.h"
#include "ggml-remoting.h"
#include "backend/shared/apir_backend.h"
#include "backend/shared/apir_cs_ggml.h"
#include "ggml-backend-impl.h"
// clang-format on
#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \
do { \
int32_t forward_flag = (int32_t) apir_command_type__; \
encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, forward_flag); \
if (!encoder_name) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \
} \
#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \
int32_t REMOTE_CALL_PREPARE_forward_flag = (int32_t) apir_command_type__; \
const char * REMOTE_CALL_PREPARE_command_name = apir_dispatch_command_name(apir_command_type__); \
do { \
encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, REMOTE_CALL_PREPARE_forward_flag); \
if (!encoder_name) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \
} \
} while (0)
#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \
do { \
ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \
if (!decoder_name) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \
} \
if (ret_name < APIR_FORWARD_BASE_INDEX) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \
apir_forward_error(ret_name), ret_name); \
} \
ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \
#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \
do { \
ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \
if (!decoder_name) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \
} \
if (ret_name < APIR_FORWARD_BASE_INDEX) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \
apir_forward_error(ret_name), ret_name); \
} \
ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \
if (ret_name != 0) { \
GGML_ABORT(GGML_VIRTGPU "backend function '%s' failed (return code: %d)", \
REMOTE_CALL_PREPARE_command_name, ret_name); \
} \
} while (0)

View File

@ -20,6 +20,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu * gpu,
char * apir_buffer_type_get_name(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
/* apir_buffer_type_is_host is deprecated. */
apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu,
apir_buffer_type_host_handle_t host_handle,
size_t size);

View File

@ -53,9 +53,9 @@ static int virtgpu_handshake(virtgpu * gpu) {
if (!decoder) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initiate the communication with the virglrenderer library. "
"Most likely, the wrong virglrenderer library was loaded in the hypervisor.",
__func__);
"%s: failed to initiate the communication with the virglrenderer library. "
"Most likely, the wrong virglrenderer library was loaded in the hypervisor.",
__func__);
return 1;
}
@ -65,8 +65,7 @@ static int virtgpu_handshake(virtgpu * gpu) {
uint32_t host_minor;
if (ret_magic != APIR_HANDSHAKE_MAGIC) {
GGML_ABORT(GGML_VIRTGPU
"%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic,
GGML_ABORT(GGML_VIRTGPU "%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic,
apir_backend_initialize_error(ret_magic));
} else {
apir_decode_uint32_t(decoder, &host_major);
@ -140,15 +139,13 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
"Make sure virglrenderer is correctly configured by the hypervisor. (%s) ",
__func__, apir_load_library_error(ret));
} else {
GGML_ABORT(GGML_VIRTGPU
"%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)", __func__,
apir_load_library_error(ret), ret);
GGML_ABORT(GGML_VIRTGPU "%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)",
__func__, apir_load_library_error(ret), ret);
}
return ret;
}
GGML_LOG_INFO(GGML_VIRTGPU
"%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__);
GGML_LOG_INFO(GGML_VIRTGPU "%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__);
ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
@ -158,10 +155,11 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
__func__, apir_load_library_error(apir_ret));
} else if (apir_ret == APIR_LOAD_LIBRARY_SYMBOL_MISSING) {
GGML_ABORT(GGML_VIRTGPU
"%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. "
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
__func__, apir_load_library_error(apir_ret));
GGML_ABORT(
GGML_VIRTGPU
"%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. "
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
__func__, apir_load_library_error(apir_ret));
} else if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
GGML_ABORT(GGML_VIRTGPU
"%s: the API Remoting backend library couldn't load the GGML backend library: apir code=%d | %s)",
@ -169,8 +167,8 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
} else {
uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX;
GGML_ABORT(GGML_VIRTGPU
"%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__,
lib_ret);
"%s: the API Remoting backend library failed to initialize its backend library: apir code=%d)",
__func__, lib_ret);
}
return ret;
}
@ -184,55 +182,49 @@ virtgpu * create_virtgpu() {
// Initialize mutex to protect shared data_shmem buffer
if (mtx_init(&gpu->data_shmem_mutex, mtx_plain) != thrd_success) {
delete gpu;
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize data_shmem mutex", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize data_shmem mutex", __func__);
return NULL;
}
if (virtgpu_open(gpu) != APIR_SUCCESS) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to open the virtgpu device\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to open the virtgpu device\n", __func__);
return NULL;
}
if (virtgpu_init_capset(gpu) != APIR_SUCCESS) {
if (gpu->use_apir_capset) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library supports it.", __func__);
"%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library "
"supports it.",
__func__);
} else {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the virtgpu Venus capset", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu Venus capset", __func__);
}
return NULL;
}
if (virtgpu_init_context(gpu) != APIR_SUCCESS) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the GPU context", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the GPU context", __func__);
return NULL;
}
if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to create the shared reply memory pages", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to create the shared reply memory pages", __func__);
return NULL;
}
if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to create the shared data memory pages", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to create the shared data memory pages", __func__);
return NULL;
}
if (virtgpu_handshake(gpu)) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to handshake with the virglrenderer library", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to handshake with the virglrenderer library", __func__);
return NULL;
}
if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to load the backend library", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to load the backend library", __func__);
return NULL;
}
@ -243,8 +235,7 @@ static virt_gpu_result_t virtgpu_open(virtgpu * gpu) {
drmDevicePtr devs[8];
int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs));
if (count < 0) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to enumerate DRM devices\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to enumerate DRM devices\n", __func__);
return APIR_ERROR_INITIALIZATION_FAILED;
}
@ -266,19 +257,17 @@ static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr d
int fd = open(node_path, O_RDWR | O_CLOEXEC);
if (fd < 0) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to open %s", __func__, node_path);
GGML_ABORT(GGML_VIRTGPU "%s: failed to open %s", __func__, node_path);
return APIR_ERROR_INITIALIZATION_FAILED;
}
drmVersionPtr version = drmGetVersion(fd);
if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) {
if (version) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: unknown DRM driver %s version %d\n", __func__, version->name, version->version_major);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: unknown DRM driver %s version %d\n", __func__, version->name,
version->version_major);
} else {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to get DRM driver version\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to get DRM driver version\n", __func__);
}
if (version) {
@ -322,9 +311,8 @@ static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) {
virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data));
if (ret) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to get APIR v%d capset: %s\n",
__func__, gpu->capset.version, strerror(errno));
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to get APIR v%d capset: %s\n", __func__, gpu->capset.version,
strerror(errno));
return APIR_ERROR_INITIALIZATION_FAILED;
}
@ -547,13 +535,10 @@ static void log_call_duration(long long call_duration_ns, const char * name) {
double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds
if (call_duration_s > 1) {
GGML_LOG_INFO(GGML_VIRTGPU
"waited %.2fs for the %s host reply...\n", call_duration_s, name);
GGML_LOG_INFO(GGML_VIRTGPU "waited %.2fs for the %s host reply...\n", call_duration_s, name);
} else if (call_duration_ms > 1) {
GGML_LOG_INFO(GGML_VIRTGPU
"waited %.2fms for the %s host reply...\n", call_duration_ms, name);
GGML_LOG_INFO(GGML_VIRTGPU "waited %.2fms for the %s host reply...\n", call_duration_ms, name);
} else {
GGML_LOG_INFO(GGML_VIRTGPU
"waited %lldns for the %s host reply...\n", call_duration_ns, name);
GGML_LOG_INFO(GGML_VIRTGPU "waited %lldns for the %s host reply...\n", call_duration_ns, name);
}
}

View File

@ -1,5 +1,6 @@
#pragma once
// clang-format off
#include "virtgpu-utils.h"
#include "virtgpu-shm.h"
#include "virtgpu-apir.h"
@ -23,20 +24,21 @@
#include "apir_hw.h"
#include <drm/virtgpu_drm.h>
#include "venus_hw.h"
// clang-format on
#ifndef VIRTGPU_DRM_CAPSET_APIR
// Will be defined include/drm/virtgpu_drm.h when
// https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590/diffs
// is merged
#define VIRTGPU_DRM_CAPSET_APIR 10
# define VIRTGPU_DRM_CAPSET_APIR 10
#endif
// Mesa/Virlgrenderer Venus internal. Only necessary during the
// Venus->APIR transition in Virglrenderer
#define VENUS_COMMAND_TYPE_LENGTH 331
#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16
#define VIRTGPU_DRM_CAPSET_VENUS 4
#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16
# define VIRTGPU_DRM_CAPSET_VENUS 4
#endif
typedef uint32_t virgl_renderer_capset;

File diff suppressed because it is too large Load Diff

View File

@ -3,9 +3,13 @@
#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#ifdef FLOAT16
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_shader_subgroup_extended_types_float16 : require
#endif
#extension GL_KHR_shader_subgroup_shuffle : enable
#extension GL_KHR_shader_subgroup_vote : enable
@ -15,8 +19,10 @@
const uint32_t HSK_per_thread = HSK / D_split;
const uint32_t HSV_per_thread = HSV / D_split;
const uint32_t cols_per_iter = WorkGroupSize / D_split;
const uint32_t rows_per_thread = Br / row_split;
const uint32_t cols_per_iter = WorkGroupSize / D_split / row_split;
const uint32_t cols_per_thread = Bc / cols_per_iter;
const uint32_t num_subgroups = SubGroupSize == 0 ? 0 : WorkGroupSize / SubGroupSize;
layout (binding = 0) readonly buffer Q {float data_q[];};
@ -27,20 +33,22 @@ layout (binding = 2) readonly buffer V {float16_t data_v[];};
layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
layout (binding = 3) readonly buffer M {float16_t data_m[];};
// Store the output when doing grouped query attention.
// Rows index by Q's dimension 2, and the first N rows are valid.
D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
{
uint32_t offset = (iq2 + r) * HSV + c;
data_o[o_offset + offset] = D_TYPE(elem);
return elem;
}
// If SubGroupSize is set to 0 then only use shmem reductions
const uint32_t tmpsh_size = (SubGroupSize > 0) ? (row_split == 1 ? num_subgroups * D_split : num_subgroups) : WorkGroupSize;
shared float tmpsh[tmpsh_size];
shared FLOAT_TYPEV4 tmpshv4[tmpsh_size];
shared FLOAT_TYPE tmpsh[WorkGroupSize];
shared vec4 tmpshv4[WorkGroupSize];
const uint32_t masksh_stride = Br + 1;
shared FLOAT_TYPE masksh[Bc * masksh_stride];
shared float masksh[Bc][Br];
shared vec4 Qf[Br][HSK / 4];
const uint32_t qf_stride = HSK / 4 + 1;
shared FLOAT_TYPEV4 Qf[Br * qf_stride];
const uint32_t D = HSK > HSV ? HSK : HSV;
const uint32_t kvsh_stride = D / 4 + 1;
shared FLOAT_TYPEV4 kvsh[SHMEM_STAGING != 0 ? Bc * kvsh_stride : 1];
shared vec4 occupancy_limiter[LIMIT_OCCUPANCY_SHMEM > 0 ? LIMIT_OCCUPANCY_SHMEM : 1];
void main() {
#ifdef NEEDS_INIT_IQ_SHMEM
@ -50,8 +58,24 @@ void main() {
init_indices();
const uint32_t tid = gl_LocalInvocationIndex;
const uint32_t threads_per_rowgroup = gl_WorkGroupSize.x / row_split;
const uint32_t row_tid = gl_LocalInvocationIndex / threads_per_rowgroup;
const uint32_t rowgroup_tid = gl_LocalInvocationIndex % threads_per_rowgroup;
const uint32_t d_tid = gl_LocalInvocationIndex % D_split;
const uint32_t col_tid = gl_LocalInvocationIndex / D_split;
const uint32_t col_tid = (gl_LocalInvocationIndex % threads_per_rowgroup) / D_split;
if (LIMIT_OCCUPANCY_SHMEM > 0) {
// This just exists to avoid the occupancy_limiter array getting optimized out
occupancy_limiter[tid] = vec4(tid);
barrier();
if (occupancy_limiter[tid] == vec4(99999.0)) {
data_ov4[0] = D_TYPEV4(occupancy_limiter[tid]);
}
}
#define tile_row(r) (row_tid * rows_per_thread + (r))
uint32_t q_offset = gqa_iq1*p.nb01 + (iq2*p.nb02 + iq3*p.nb03) / 4;
@ -60,37 +84,37 @@ void main() {
uint32_t r = (idx + tid) / (HSK / 4);
if (r < Br && d < HSK / 4 &&
i * Br + r < N) {
Qf[r][d] = vec4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d]) * p.scale;
Qf[r * qf_stride + d] = FLOAT_TYPEV4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale);
}
}
barrier();
vec4 Of[Br][HSV_per_thread / 4];
FLOAT_TYPEV4 Of[rows_per_thread][HSV_per_thread / 4];
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
Of[r][d] = vec4(0.0);
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d] = FLOAT_TYPEV4(0.0);
}
}
float Lf[Br], Mf[Br];
float Lf[rows_per_thread], Mf[rows_per_thread];
// Use -FLT_MAX/2 rather than -inf to reduce the possibility of NaNs, e.g. when computing Mold-M.
const float NEG_FLT_MAX_OVER_2 = uintBitsToFloat(0xFEFFFFFF);
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Lf[r] = 0;
Mf[r] = NEG_FLT_MAX_OVER_2;
}
float slope[Br];
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
slope[r] = 1.0;
ACC_TYPE slope[rows_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
slope[r] = ACC_TYPE(1.0);
}
// ALiBi
if (p.max_bias > 0.0f) {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
slope[r] = perElemOpComputeSlope(r, col_tid, ACC_TYPE(0), iq2);
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
slope[r] = perElemOpComputeSlope(tile_row(r), col_tid, ACC_TYPE(0), iq2);
}
}
@ -113,75 +137,141 @@ void main() {
uint32_t mask_opt = 0;
uint32_t mask_opt_idx = ~0;
uint32_t mask_opt_bits = 0;
[[dont_unroll]]
for (uint32_t j = start_j; j < end_j; ++j) {
if (MASK_ENABLE) {
if (USE_MASK_OPT && mask_opt_idx != j / 16) {
mask_opt_idx = j / 16;
mask_opt = data_mask_opt[mo_offset + mask_opt_idx];
}
mask_opt_bits = (mask_opt >> ((j % 16) * 2)) & 0x3;
if (mask_opt_bits == MASK_OPT_ALL_NEG_INF) {
// skip this block
continue;
}
// Only load if the block is not all zeros
if (mask_opt_bits != MASK_OPT_ALL_ZERO) {
bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;
if (USE_MASK_OPT && mask_opt_idx != j / 16) {
mask_opt_idx = j / 16;
mask_opt = data_mask_opt[mo_offset + mask_opt_idx];
float max_mask = NEG_FLT_MAX_OVER_2;
barrier();
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
uint32_t c = (idx + tid) % Bc;
uint32_t r = (idx + tid) / Bc;
if (idx + tid < Bc * Br) {
if ((!KV_bounds_check || j * Bc + c < KV) && (!nem1_bounds_check || i * Br + r < p.nem1)) {
FLOAT_TYPE m = FLOAT_TYPE(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]);
masksh[c * masksh_stride + r] = m;
max_mask = max(max_mask, float(m));
} else {
masksh[c * masksh_stride + r] = FLOAT_TYPE(0);
}
}
}
// skip the block if the mask is entirely -inf
bool all_less = subgroupAll(max_mask <= NEG_FLT_MAX_OVER_2);
barrier();
if (gl_SubgroupInvocationID == 0) {
tmpsh[gl_SubgroupID] = all_less ? NEG_FLT_MAX_OVER_2 : 0.0f;
}
barrier();
[[unroll]] for (uint s = 0; s < gl_NumSubgroups; ++s) {
max_mask = max(max_mask, tmpsh[s]);
}
if (max_mask <= NEG_FLT_MAX_OVER_2) {
continue;
}
}
}
uint32_t mask_opt_bits = (mask_opt >> ((j % 16) * 2)) & 0x3;
if (mask_opt_bits == MASK_OPT_ALL_NEG_INF) {
// skip this block
continue;
}
// Only load if the block is not all zeros
if (MASK_ENABLE && mask_opt_bits != MASK_OPT_ALL_ZERO) {
bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;
float max_mask = NEG_FLT_MAX_OVER_2;
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
uint32_t c = (idx + tid) % Bc;
uint32_t r = (idx + tid) / Bc;
if (idx + tid < Bc * Br) {
if ((!KV_bounds_check || j * Bc + c < KV) && (!nem1_bounds_check || i * Br + r < p.nem1)) {
float m = float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]);
masksh[c][r] = m;
max_mask = max(max_mask, m);
ACC_TYPE Sf[rows_per_thread][cols_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
Sf[r][c] = ACC_TYPE(0.0);
}
}
if (SHMEM_STAGING != 0) {
barrier();
[[unroll]] for (uint32_t idx = 0; idx < Bc * HSK / 4; idx += gl_WorkGroupSize.x) {
uint32_t d = (idx + tid) % (HSK / 4);
uint32_t c = (idx + tid) / (HSK / 4);
if (idx + gl_WorkGroupSize.x <= Bc * HSK / 4 || c < Bc) {
FLOAT_TYPEV4 K_Tf = FLOAT_TYPEV4(0);
if (!KV_bounds_check || j * Bc + c < KV) {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE + 4 * d;
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
#else
K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
#endif
}
kvsh[c * kvsh_stride + d] = K_Tf;
}
}
barrier();
}
// More d iterations means Q register caching becomes relevant
// Few iterations means the additional registers needed are worse than the speed-up from caching
if (HSK_per_thread / 4 > 4) {
[[unroll]] for (uint32_t d = 0; d < HSK_per_thread / 4; ++d) {
FLOAT_TYPEV4 Q_cache[rows_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Q_cache[r] = Qf[tile_row(r) * qf_stride + d * D_split + d_tid];
}
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
if (KV_bounds_check && j * Bc + c * cols_per_iter + col_tid >= KV) {
continue;
}
FLOAT_TYPEV4 K_Tf;
if (SHMEM_STAGING != 0) {
K_Tf = kvsh[(c * cols_per_iter + col_tid) * kvsh_stride + (d * D_split + d_tid)];
} else {
masksh[c][r] = float(0);
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
#else
K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * k_stride / 4 + d * D_split + d_tid]);
#endif
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Sf[r][c] += ACC_TYPE(dot(Q_cache[r], K_Tf));
}
}
}
// skip the block if the mask is entirely -inf
bool all_less = subgroupAll(max_mask <= NEG_FLT_MAX_OVER_2);
barrier();
if (gl_SubgroupInvocationID == 0) {
tmpsh[gl_SubgroupID] = all_less ? NEG_FLT_MAX_OVER_2 : 0.0f;
}
barrier();
[[unroll]] for (uint s = 0; s < gl_NumSubgroups; ++s) {
max_mask = max(max_mask, tmpsh[s]);
}
if (max_mask <= NEG_FLT_MAX_OVER_2) {
continue;
}
}
float Sf[Br][cols_per_thread];
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
} else {
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
Sf[r][c] = 0.0;
}
}
if (KV_bounds_check && j * Bc + c * cols_per_iter + col_tid >= KV) {
continue;
}
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
if (KV_bounds_check && j * Bc + c * cols_per_iter + col_tid >= KV) {
continue;
}
[[unroll]] for (uint32_t d = 0; d < HSK_per_thread / 4; ++d) {
[[unroll]] for (uint32_t d = 0; d < HSK_per_thread / 4; ++d) {
FLOAT_TYPEV4 K_Tf;
if (SHMEM_STAGING != 0) {
K_Tf = kvsh[(c * cols_per_iter + col_tid) * kvsh_stride + (d * D_split + d_tid)];
} else {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
vec4 K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
#else
vec4 K_Tf = vec4(data_kv4[k_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * k_stride / 4 + d * D_split + d_tid]);
K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * k_stride / 4 + d * D_split + d_tid]);
#endif
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
Sf[r][c] += dot(Qf[r][d * D_split + d_tid], K_Tf);
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Sf[r][c] += ACC_TYPE(dot(Qf[tile_row(r) * qf_stride + d * D_split + d_tid], K_Tf));
}
}
}
}
@ -189,89 +279,109 @@ void main() {
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
// Compute sum across the D_split
[[unroll]] for (uint s = D_split / 2; s > 0; s >>= 1) {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Sf[r][c] += subgroupShuffleXor(Sf[r][c], s);
}
}
}
if (LOGIT_SOFTCAP) {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
Sf[r][c] = p.logit_softcap * tanh(Sf[r][c]);
Sf[r][c] = ACC_TYPE(p.logit_softcap * tanh(Sf[r][c]));
}
}
}
if (MASK_ENABLE && mask_opt_bits != MASK_OPT_ALL_ZERO) {
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
float mvf = masksh[c * cols_per_iter + col_tid][r];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
FLOAT_TYPE mvf = masksh[(c * cols_per_iter + col_tid) * masksh_stride + tile_row(r)];
Sf[r][c] += slope[r]*mvf;
}
}
barrier();
}
float rowmaxf[Br], Pf[Br][cols_per_thread], rowsumf[Br], eMf[Br], Moldf[Br];
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
rowmaxf[r] = NEG_FLT_MAX_OVER_2;
float eMf[rows_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
float rowmaxf = NEG_FLT_MAX_OVER_2;
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
if (KV_bounds_check && j * Bc + c * cols_per_iter + col_tid >= KV) {
continue;
}
rowmaxf[r] = max(rowmaxf[r], Sf[r][c]);
rowmaxf = max(rowmaxf, float(Sf[r][c]));
}
Moldf[r] = Mf[r];
float Moldf = Mf[r];
// M = max(rowmax, Mold)
// P = e^(S - M)
// eM = e^(Mold - M)
Mf[r] = max(rowmaxf[r], Moldf[r]);
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
Pf[r][c] = exp(Sf[r][c] - Mf[r]);
}
eMf[r] = exp(Moldf[r] - Mf[r]);
// Compute sum across row of P
rowsumf[r] = 0.0;
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
if (KV_bounds_check && j * Bc + c * cols_per_iter + col_tid >= KV) {
continue;
}
rowsumf[r] += Pf[r][c];
}
Lf[r] = eMf[r]*Lf[r] + rowsumf[r];
Mf[r] = max(rowmaxf, Moldf);
eMf[r] = exp(Moldf - Mf[r]);
Lf[r] = eMf[r]*Lf[r];
}
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
Of[r][d] = eMf[r] * Of[r][d];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d] = FLOAT_TYPE(eMf[r]) * Of[r][d];
}
}
if (SHMEM_STAGING != 0) {
barrier();
[[unroll]] for (uint32_t idx = 0; idx < Bc * HSV / 4; idx += gl_WorkGroupSize.x) {
uint32_t d = (idx + tid) % (HSV / 4);
uint32_t c = (idx + tid) / (HSV / 4);
if (idx + gl_WorkGroupSize.x <= Bc * HSV / 4 || c < Bc) {
FLOAT_TYPEV4 V_Tf = FLOAT_TYPEV4(0);
if (!KV_bounds_check || j * Bc + c < KV) {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c) * v_stride * BLOCK_SIZE + 4 * d;
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
V_Tf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
#else
V_Tf = FLOAT_TYPEV4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]);
#endif
}
kvsh[c * kvsh_stride + d] = V_Tf;
}
}
barrier();
}
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
if (KV_bounds_check && j * Bc + c * cols_per_iter + col_tid >= KV) {
continue;
}
FLOAT_TYPE Pf[rows_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Pf[r] = FLOAT_TYPE(exp(float(Sf[r][c]) - Mf[r]));
Lf[r] += Pf[r];
}
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
FLOAT_TYPEV4 Vf;
if (SHMEM_STAGING != 0) {
Vf = kvsh[(c * cols_per_iter + col_tid) * kvsh_stride + (d * D_split + d_tid)];
} else {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
vec4 Vf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
Vf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
#else
vec4 Vf = vec4(data_vv4[v_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * v_stride / 4 + d * D_split + d_tid]);
Vf = FLOAT_TYPEV4(data_vv4[v_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * v_stride / 4 + d * D_split + d_tid]);
#endif
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
Of[r][d] += Pf[r][c] * Vf;
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d] += FLOAT_TYPEV4(Pf[r] * Vf);
}
}
}
barrier();
}
// prevent race on tmpsh
@ -279,58 +389,115 @@ void main() {
// reduce across threads
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
float rowmaxf, eMf;
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
float rowmaxf = Mf[r];
tmpsh[tid] = Mf[r];
// Compute max across the row
barrier();
[[unroll]] for (int s = int(gl_WorkGroupSize.x) / 2; s >= D_split; s >>= 1) {
if (tid < s) {
tmpsh[tid] = max(tmpsh[tid], tmpsh[tid + s]);
if (SubGroupSize > 0) {
[[unroll]] for (uint s = D_split; s < SubGroupSize; s *= 2) {
rowmaxf = max(rowmaxf, subgroupShuffleXor(rowmaxf, s));
}
if (row_split == 1) {
// Reduce inside workgroup with shmem
barrier();
if (gl_SubgroupInvocationID == d_tid) {
tmpsh[gl_SubgroupID * D_split + d_tid] = rowmaxf;
}
barrier();
rowmaxf = tmpsh[d_tid];
[[unroll]] for (uint32_t s = 1; s < num_subgroups; ++s) {
rowmaxf = max(rowmaxf, tmpsh[s * D_split + d_tid]);
}
}
} else {
barrier();
tmpsh[tid] = rowmaxf;
barrier();
[[unroll]] for (int s = int(threads_per_rowgroup) / 2; s >= D_split; s >>= 1) {
if (rowgroup_tid < s) {
tmpsh[tid] = max(tmpsh[tid], tmpsh[tid ^ s]);
}
barrier();
}
rowmaxf = tmpsh[row_tid * threads_per_rowgroup + d_tid];
}
rowmaxf = tmpsh[d_tid];
barrier();
float Moldf = Mf[r];
// M = max(rowmax, Mold)
// eM = e^(Mold - M)
Mf[r] = max(rowmaxf, Moldf);
eMf = exp(Moldf - Mf[r]);
float eMf = exp(Moldf - Mf[r]);
Lf[r] = eMf*Lf[r];
tmpsh[tid] = Lf[r];
// Compute sum across the row
barrier();
[[unroll]] for (int s = int(gl_WorkGroupSize.x) / 2; s >= D_split; s >>= 1) {
if (tid < s) {
tmpsh[tid] = tmpsh[tid] + tmpsh[tid + s];
if (SubGroupSize > 0) {
[[unroll]] for (uint s = D_split; s < SubGroupSize; s *= 2) {
Lf[r] += subgroupShuffleXor(Lf[r], s);
}
if (row_split == 1) {
barrier();
if (gl_SubgroupInvocationID == d_tid) {
tmpsh[gl_SubgroupID * D_split + d_tid] = Lf[r];
}
barrier();
Lf[r] = tmpsh[d_tid];
[[unroll]] for (uint32_t s = 1; s < num_subgroups; ++s) {
Lf[r] += tmpsh[s * D_split + d_tid];
}
}
} else {
barrier();
}
Lf[r] = tmpsh[d_tid];
barrier();
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
Of[r][d] = eMf * Of[r][d];
tmpshv4[tid] = Of[r][d];
tmpsh[tid] = Lf[r];
barrier();
[[unroll]] for (int s = int(gl_WorkGroupSize.x) / 2; s >= D_split; s >>= 1) {
if (tid < s) {
Of[r][d] += tmpshv4[tid + s];
tmpshv4[tid] = Of[r][d];
[[unroll]] for (int s = int(threads_per_rowgroup) / 2; s >= D_split; s >>= 1) {
if (rowgroup_tid < s) {
tmpsh[tid] = tmpsh[tid] + tmpsh[tid ^ s];
}
barrier();
}
Of[r][d] = tmpshv4[d_tid];
barrier();
Lf[r] = tmpsh[row_tid * threads_per_rowgroup + d_tid];
}
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
Of[r][d] = FLOAT_TYPE(eMf) * Of[r][d];
if (SubGroupSize > 0) {
[[unroll]] for (uint s = D_split; s < SubGroupSize; s *= 2) {
if (!OLD_AMD_WINDOWS) {
Of[r][d] += subgroupShuffleXor(Of[r][d], s);
} else {
// Something about f16vec4 subgroupShuffleXor is broken on AMD Windows RDNA2 and below.
// Shuffle full vec4 as workaround.
// See https://github.com/ggml-org/llama.cpp/issues/19881#issuecomment-3958643697
Of[r][d] += FLOAT_TYPEV4(subgroupShuffleXor(vec4(Of[r][d]), s));
}
}
if (row_split == 1) {
barrier();
if (gl_SubgroupInvocationID == d_tid) {
tmpshv4[gl_SubgroupID * D_split + d_tid] = Of[r][d];
}
barrier();
Of[r][d] = tmpshv4[d_tid];
[[unroll]] for (uint32_t s = 1; s < num_subgroups; ++s) {
Of[r][d] += tmpshv4[s * D_split + d_tid];
}
}
} else {
barrier();
tmpshv4[tid] = Of[r][d];
barrier();
[[unroll]] for (int s = int(threads_per_rowgroup) / 2; s >= D_split; s >>= 1) {
if (rowgroup_tid < s) {
Of[r][d] += tmpshv4[tid ^ s];
tmpshv4[tid] = Of[r][d];
}
barrier();
}
Of[r][d] = tmpshv4[row_tid * threads_per_rowgroup + d_tid];
}
}
}
@ -338,33 +505,53 @@ void main() {
// If there is split_k, then the split_k resolve shader does the final
// division by L. Store the intermediate O value and per-row m and L values.
if (p.k_num > 1) {
// note: O and Q have swapped coord 1,2.
uint32_t o_offset = HSV * p.ne1 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
if (p.gqa_ratio > 1) {
// note: O and Q have swapped coord 1,2.
uint32_t o_offset = HSV * p.ne1 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3)) / 4;
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
if (r < N) {
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
perElemOpGqaStore(r, 4*(d * D_split + d_tid) + comp, Of[r][d][comp], o_offset, iq2, N);
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
const uint row = tile_row(r);
if (row < N) {
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
gqaStore(row, d * D_split + d_tid, Of[r][d], o_offset, iq2, N);
}
}
}
}
o_offset = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3 + p.ne1 * 2 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
if (r < N) {
perElemOpStoreCol0(r, 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N);
perElemOpStoreCol0(r, 0u, ACC_TYPE(Mf[r]), o_offset + p.ne1, iq2, N);
o_offset = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3 + p.ne1 * 2 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
const uint row = tile_row(r);
if (row < N) {
perElemOpStoreCol0(row, 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N);
perElemOpStoreCol0(row, 0u, ACC_TYPE(Mf[r]), o_offset + p.ne1, iq2, N);
}
}
} else {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
const uint row = tile_row(r);
const uint global_row = i * Br + row;
if (global_row < N) {
uint32_t o_offset = HSV * p.ne1 * (split_k_index + p.k_num * (global_row + p.ne2 * iq3)) / 4;
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
data_ov4[o_offset + iq2 * HSV/4 + d * D_split + d_tid] = D_TYPEV4(Of[r][d]);
}
}
if (global_row < N && d_tid == 0 && col_tid == 0) {
uint32_t lm_offset = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3 + p.ne1 * 2 * (split_k_index + p.k_num * (global_row + p.ne2 * iq3));
data_o[lm_offset + iq2] = D_TYPE(Lf[r]);
data_o[lm_offset + p.ne1 + iq2] = D_TYPE(Mf[r]);
}
}
}
return;
}
if ((p.mask_n_head_log2 & SINK_ENABLE_BIT) != 0) {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
float sink = perElemOpGetSink(r, 0u, ACC_TYPE(0), iq2);
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
float sink = perElemOpGetSink(tile_row(r), 0u, ACC_TYPE(0), iq2);
float ms = 1.0f;
float vs = 1.0f;
@ -373,7 +560,7 @@ void main() {
ms = exp(Mf[r] - sink);
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
Of[r][d] *= ms;
Of[r][d] *= FLOAT_TYPE(ms);
}
} else {
vs = exp(sink - Mf[r]);
@ -383,39 +570,37 @@ void main() {
}
}
float Lfrcp[Br];
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
float Lfrcp[rows_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Lfrcp[r] = (Lf[r] == 0.0) ? 0.0 : (1.0 / Lf[r]);
}
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
Of[r][d] *= Lfrcp[r];
#if defined(ACC_TYPE_MAX)
Of[r][d] = clamp(Of[r][d], -vec4(ACC_TYPE_MAX), vec4(ACC_TYPE_MAX));
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d] *= FLOAT_TYPE(Lfrcp[r]);
#if defined(FLOAT_TYPE_MAX)
Of[r][d] = clamp(Of[r][d], -FLOAT_TYPE_MAX, FLOAT_TYPE_MAX);
#endif
}
}
uint32_t o_offset = gqa_iq1*p.ne1*HSV + iq3*p.ne2*p.ne1*HSV;
uint32_t o_offset = (gqa_iq1*p.ne1*HSV + iq3*p.ne2*p.ne1*HSV) / 4;
if (p.gqa_ratio > 1) {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
if (r < N) {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
const uint row = tile_row(r);
if (row < N) {
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
perElemOpGqaStore(r, 4*(d * D_split + d_tid) + comp, Of[r][d][comp], o_offset, iq2, N);
}
gqaStore(row, d * D_split + d_tid, Of[r][d], o_offset, iq2, N);
}
}
}
} else {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
if (i * Br + r < N) {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
const uint row = tile_row(r);
if (i * Br + row < N) {
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
data_o[o_offset + iq2 * HSV + (i * Br + r) * p.ne1 * HSV + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]);
}
data_ov4[o_offset + (iq2 * HSV + (i * Br + row) * p.ne1 * HSV) / 4 + d * D_split + d_tid] = D_TYPEV4(Of[r][d]);
}
}
}

View File

@ -1,20 +1,23 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint32_t WorkGroupSize = 128;
layout (constant_id = 1) const uint32_t Br = 1;
layout (constant_id = 2) const uint32_t Bc = 32;
layout (constant_id = 3) const uint32_t HSK = 32;
layout (constant_id = 4) const uint32_t HSV = 32;
layout (constant_id = 5) const uint32_t Clamp = 0;
layout (constant_id = 6) const uint32_t D_split = 16;
layout (constant_id = 7) const uint32_t SubGroupSize = 32;
layout (constant_id = 8) const uint32_t K_LOAD_SHMEM = 0;
layout (constant_id = 9) const uint32_t Flags = 0;
layout (constant_id = 0) const uint32_t WorkGroupSize = 128;
layout (constant_id = 1) const uint32_t Br = 1;
layout (constant_id = 2) const uint32_t Bc = 32;
layout (constant_id = 3) const uint32_t HSK = 32;
layout (constant_id = 4) const uint32_t HSV = 32;
layout (constant_id = 5) const uint32_t Clamp = 0;
layout (constant_id = 6) const uint32_t D_split = 16;
layout (constant_id = 7) const uint32_t row_split = 1;
layout (constant_id = 8) const uint32_t SubGroupSize = 32;
layout (constant_id = 9) const uint32_t SHMEM_STAGING = 0;
layout (constant_id = 10) const uint32_t Flags = 0;
layout (constant_id = 11) const uint32_t LIMIT_OCCUPANCY_SHMEM = 0;
const bool USE_MASK_OPT = (Flags & 1) != 0;
const bool MASK_ENABLE = (Flags & 2) != 0;
const bool LOGIT_SOFTCAP = (Flags & 4) != 0;
const bool USE_MASK_OPT = (Flags & 1) != 0;
const bool MASK_ENABLE = (Flags & 2) != 0;
const bool LOGIT_SOFTCAP = (Flags & 4) != 0;
const bool OLD_AMD_WINDOWS = (Flags & 8) != 0;
// Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths
const uint32_t HSK_pad = (HSK + 15) & ~15;
@ -69,6 +72,7 @@ layout (push_constant) uniform parameter {
layout (binding = 4) readonly buffer S {float data_s[];};
layout (binding = 5) writeonly buffer O {D_TYPE data_o[];};
layout (binding = 5) writeonly buffer OV4 {D_TYPEV4 data_ov4[];};
layout (binding = 6) readonly buffer MO {uint32_t data_mask_opt[];};
@ -94,12 +98,12 @@ layout (binding = 2) readonly buffer V_PACKED16 {A_TYPE_PACKED16 v_data_packed16
#define BLOCK_SIZE 4
#define BLOCK_BYTE_SIZE 16
vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
// iqs is currently always zero in the flash attention shaders
if (binding_idx == BINDING_IDX_K) {
return k_packed.k_data_packed[a_offset + ib];
return FLOAT_TYPEV4(k_packed.k_data_packed[a_offset + ib]);
} else {
return v_packed.v_data_packed[a_offset + ib];
return FLOAT_TYPEV4(v_packed.v_data_packed[a_offset + ib]);
}
}
#endif
@ -107,7 +111,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
#if defined(DATA_A_Q4_0)
#define BLOCK_BYTE_SIZE 18
vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
if (binding_idx == BINDING_IDX_K) {
uint vui_lo = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
uint vui_hi = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
@ -115,7 +119,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
vui_lo >>= shift;
vui_hi >>= shift;
return float(k_packed.k_data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - FLOAT_TYPE(8.0f));
} else {
uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
@ -123,24 +127,24 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
vui_lo >>= shift;
vui_hi >>= shift;
return float(v_packed.v_data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - FLOAT_TYPE(8.0f));
}
}
#endif
#if defined(DATA_A_Q8_0)
#define BLOCK_BYTE_SIZE 34
vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
if (binding_idx == BINDING_IDX_K) {
const i8vec2 v0 = unpack8(int32_t(k_packed.k_data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
const i8vec2 v1 = unpack8(int32_t(k_packed.k_data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
return float(k_packed.k_data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * FLOAT_TYPEV4(v0.x, v0.y, v1.x, v1.y);
} else {
const i8vec2 v0 = unpack8(int32_t(v_packed.v_data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
const i8vec2 v1 = unpack8(int32_t(v_packed.v_data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
return float(v_packed.v_data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * FLOAT_TYPEV4(v0.x, v0.y, v1.x, v1.y);
}
}
#endif
@ -189,10 +193,16 @@ void init_indices()
KV = p.KV;
if (p.k_num > 1) {
i = 0;
// batch and split_k share gl_WorkGroupID.x
gqa_iq1 = gl_WorkGroupID.x / p.k_num;
split_k_index = gl_WorkGroupID.x % p.k_num;
if (p.gqa_ratio > 1) {
i = 0;
// batch and split_k share gl_WorkGroupID.x
gqa_iq1 = gl_WorkGroupID.x / p.k_num;
split_k_index = gl_WorkGroupID.x % p.k_num;
} else {
gqa_iq1 = 0;
split_k_index = gl_WorkGroupID.x % p.k_num;
i = gl_WorkGroupID.x / p.k_num;
}
} else if (p.gqa_ratio > 1) {
i = 0;
gqa_iq1 = gl_WorkGroupID.x;
@ -244,3 +254,11 @@ void init_indices()
// Bias applied to softmax to stay in fp16 range.
// Based on ggml-cuda issue https://github.com/ggml-org/llama.cpp/issues/18606
const float FATTN_KQ_MAX_OFFSET = 3.0f*0.6931f;
// Store the output when doing grouped query attention.
// Rows index by Q's dimension 2, and the first N rows are valid.
void gqaStore(const in uint32_t r, const in uint32_t c, const in FLOAT_TYPEV4 elems, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
{
uint32_t offset = (iq2 + r) * HSV / 4 + c;
data_ov4[o_offset + offset] = D_TYPEV4(elems);
}

View File

@ -19,7 +19,6 @@
const uint32_t MatBr = 16;
const uint32_t MatBc = 16;
const uint32_t row_split = Bc / MatBc;
const uint32_t rows_per_thread = Br / row_split;
const uint32_t cols_per_iter = gl_WorkGroupSize.x / row_split;
const uint32_t cols_per_thread = Bc / cols_per_iter;
@ -33,15 +32,6 @@ layout (binding = 2) readonly buffer V {float16_t data_v[];};
layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
layout (binding = 3) readonly buffer M {float16_t data_m[];};
// Store the output when doing grouped query attention.
// Rows index by Q's dimension 2, and the first N rows are valid.
D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
{
uint32_t offset = (iq2 + r) * HSV + c;
data_o[o_offset + offset] = D_TYPE(elem);
return elem;
}
shared float tmpsh[row_split];
const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4
@ -54,10 +44,14 @@ shared f16vec4 Psh[Bc * psh_stride];
const uint32_t sfshstride = (HSK <= 128) ? (Br / 4 + 2) : Br / 4;
shared ACC_TYPEV4 sfsh[Bc * sfshstride];
const uint32_t kshstride = (K_LOAD_SHMEM != 0 ? HSK_pad : MatBr) / 4 + 2; // in units of f16vec4
const uint32_t D_pad = HSK_pad > HSV_pad ? HSK_pad : HSV_pad;
const uint32_t kvsh_stride = (SHMEM_STAGING != 0 ? D_pad : MatBr) / 4 + 2; // in units of f16vec4
const uint v_cols = MatBc / 4 * row_split; // total cols, 4 vec4s per MatBc * number of subgroups
const uint vsh_stride = v_cols;
shared f16vec4 ksh[(kshstride >= vsh_stride) ? (Bc * kshstride) : (Bc * vsh_stride)];
shared f16vec4 kvsh[(kvsh_stride >= vsh_stride) ? (Bc * kvsh_stride) : (Bc * vsh_stride)];
const uint32_t osh_stride = row_split * MatBr / 4;
shared f16vec4 pvsh[MatBc * osh_stride];
shared ACC_TYPE slope[Br];
@ -84,11 +78,6 @@ void main() {
Qf[i + tid] = f16vec4(0);
}
}
[[unroll]] for (uint i = 0; i < Bc * kshstride; i += gl_WorkGroupSize.x) {
if (i + tid < Bc * kshstride) {
ksh[i + tid] = f16vec4(0);
}
}
barrier();
}
@ -104,10 +93,10 @@ void main() {
}
barrier();
ACC_TYPEV4 Of[rows_per_thread][d_per_thread];
f16vec4 Of[rows_per_thread][d_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
[[unroll]] for (uint32_t d = 0; d < d_per_thread; ++d) {
Of[r][d] = ACC_TYPEV4(0.0);
Of[r][d] = f16vec4(0.0);
}
}
@ -153,22 +142,22 @@ void main() {
uint32_t mask_opt = 0;
uint32_t mask_opt_idx = ~0;
uint32_t mask_opt_bits = 0;
f16vec4 mask_cache[Bc * Br / 4 / WorkGroupSize];
[[dont_unroll]]
for (uint32_t j = start_j; j < end_j; ++j) {
f16vec4 mask_cache[Bc * Br / 4 / WorkGroupSize];
[[unroll]] for (uint32_t idx = 0; idx < mask_cache.length(); ++idx) {
mask_cache[idx] = f16vec4(0);
}
if (MASK_ENABLE) {
if (USE_MASK_OPT && mask_opt_idx != j / 16) {
mask_opt_idx = j / 16;
mask_opt = data_mask_opt[mo_offset + mask_opt_idx];
}
uint32_t mask_opt_bits = (mask_opt >> ((j % 16) * 2)) & 0x3;
mask_opt_bits = (mask_opt >> ((j % 16) * 2)) & 0x3;
if (mask_opt_bits == MASK_OPT_ALL_NEG_INF) {
// skip this block
continue;
@ -231,24 +220,24 @@ void main() {
}
}
if (K_LOAD_SHMEM != 0) {
[[unroll]] for (uint32_t idx = 0; idx < Bc * HSK / 4; idx += gl_WorkGroupSize.x) {
uint32_t d = (idx + tid) % (HSK / 4);
uint32_t c = (idx + tid) / (HSK / 4);
if (c < Bc && d < HSK / 4) {
if (SHMEM_STAGING != 0) {
[[unroll]] for (uint32_t idx = 0; idx < Bc * HSK_pad / 4; idx += gl_WorkGroupSize.x) {
uint32_t d = (idx + tid) % (HSK_pad / 4);
uint32_t c = (idx + tid) / (HSK_pad / 4);
if (idx + gl_WorkGroupSize.x <= Bc * HSK_pad / 4 || c < Bc) {
f16vec4 K_Tf = f16vec4(0);
if (!KV_bounds_check || j * Bc + c < KV) {
if ((!KV_bounds_check || j * Bc + c < KV) && (HSK == HSK_pad || d < HSK / 4)) {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE + 4 * d;
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
K_Tf = f16vec4(dequantize4(ib, iqs, k_offset, BINDING_IDX_K));
K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
#else
K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
#endif
}
ksh[c * kshstride + d] = K_Tf;
kvsh[c * kvsh_stride + d] = K_Tf;
}
}
barrier();
@ -262,7 +251,11 @@ void main() {
coopmat<float16_t, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
[[unroll]] for (uint32_t d = 0; d < HSK_pad / 16; ++d) {
if (K_LOAD_SHMEM == 0) {
// If SHMEM_STAGING is set, a Bc * HSK_pad size tile of K is loaded to shmem
// If not, f16 K is loaded directly from global memory if aligned, otherwise
// staged through a Bc * MatBr size staging buffer.
// If K is not type f16, then it is always staged for dequantization.
if (SHMEM_STAGING == 0) {
#if BLOCK_SIZE == 1
if (KV_bounds_check || d * 16 + 16 > HSK) {
#endif
@ -277,13 +270,13 @@ void main() {
uint coord = (j * Bc + row) * k_stride * BLOCK_SIZE + d * 16 + col_vec * 4;
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
K_Tf = f16vec4(dequantize4(ib, iqs, k_offset, BINDING_IDX_K));
K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
#else
K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]);
#endif
}
ksh[row * kshstride + col_vec] = K_Tf;
kvsh[row * kvsh_stride + col_vec] = K_Tf;
}
}
barrier();
@ -295,8 +288,8 @@ void main() {
if (KV_bounds_check || d * 16 + 16 > HSK)
#endif
{
uint coord = (gl_SubgroupID * MatBc) * kshstride;
coopMatLoad(KMat, ksh, coord, kshstride, gl_CooperativeMatrixLayoutRowMajor);
uint coord = (gl_SubgroupID * MatBc) * kvsh_stride;
coopMatLoad(KMat, kvsh, coord, kvsh_stride, gl_CooperativeMatrixLayoutRowMajor);
}
#if BLOCK_SIZE == 1
else {
@ -305,8 +298,8 @@ void main() {
}
#endif
} else {
uint coord = (gl_SubgroupID * MatBc) * kshstride + d * 16 / 4;
coopMatLoad(KMat, ksh, coord, kshstride, gl_CooperativeMatrixLayoutRowMajor);
uint coord = (gl_SubgroupID * MatBc) * kvsh_stride + d * 16 / 4;
coopMatLoad(KMat, kvsh, coord, kvsh_stride, gl_CooperativeMatrixLayoutRowMajor);
}
coopMatLoad(QMat, Qf, d * 16 / 4, qstride, gl_CooperativeMatrixLayoutColumnMajor);
@ -329,7 +322,7 @@ void main() {
barrier();
}
if (MASK_ENABLE) {
if (MASK_ENABLE && mask_opt_bits != MASK_OPT_ALL_ZERO) {
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br / 4; idx += gl_WorkGroupSize.x) {
uint32_t c = (idx + tid) / (Br / 4);
uint32_t r = (idx + tid) % (Br / 4);
@ -374,7 +367,7 @@ void main() {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d_local] = ACC_TYPE(eMf[r]) * Of[r][d_local];
Of[r][d_local] = float16_t(eMf[r]) * Of[r][d_local];
}
}
@ -397,19 +390,47 @@ void main() {
}
}
if (SHMEM_STAGING != 0) {
[[unroll]] for (uint32_t idx = 0; idx < Bc * HSV_pad / 4; idx += gl_WorkGroupSize.x) {
uint32_t d = (idx + tid) % (HSV_pad / 4);
uint32_t c = (idx + tid) / (HSV_pad / 4);
if (idx + gl_WorkGroupSize.x <= Bc * HSV_pad / 4 || c < Bc) {
f16vec4 V_Tf = f16vec4(0);
if ((!KV_bounds_check || j * Bc + c < KV) && (HSV == HSV_pad || d < HSV / 4)) {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c) * v_stride * BLOCK_SIZE + 4 * d;
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
V_Tf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
#else
V_Tf = f16vec4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]);
#endif
}
kvsh[c * kvsh_stride + d] = V_Tf;
}
}
}
barrier();
const uint num_hsv_tiles = (HSV + MatBc * row_split - 1) / (MatBc * row_split); // round up
// Each subgroup handles HSV/4 columns
[[unroll]] for (uint32_t hsv_tile = 0; hsv_tile < num_hsv_tiles; ++hsv_tile) {
const uint hsv_offset = (hsv_tile * row_split + gl_SubgroupID) * 16;
SfMat = coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
coopmat<float16_t, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> PVMat = coopmat<float16_t, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
// Preload V tiles for [Bc, 16 * num subgroups]
const uint v_rows = Bc;
const uint v_total = v_rows * v_cols;
const uint v_loads_per_thread = v_total / gl_WorkGroupSize.x;
// If SHMEM_STAGING is set, a Bc * HSV_pad size tile of V is loaded to shmem.
// If not, f16 V is loaded directly from global memory if aligned, otherwise
// staged through a Bc * MatBr size staging buffer.
// If V is not type f16, then it is always staged for dequantization.
if (SHMEM_STAGING == 0) {
#if BLOCK_SIZE == 1
// For f16, only preload if not aligned
if (KV_bounds_check) {
@ -428,44 +449,52 @@ void main() {
if (!KV_bounds_check || (v_row < KV && v_col < HSV)) {
#if BLOCK_SIZE > 1
ksh[row * vsh_stride + col] = f16vec4(dequantize4(ib, iqs, v_offset, BINDING_IDX_V));
kvsh[row * vsh_stride + col] = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
#else
ksh[row * vsh_stride + col] = data_vv4[(v_offset + v_row * v_stride + v_col) / 4];
kvsh[row * vsh_stride + col] = data_vv4[(v_offset + v_row * v_stride + v_col) / 4];
#endif
} else {
ksh[row * vsh_stride + col] = f16vec4(0.0f);
kvsh[row * vsh_stride + col] = f16vec4(0.0f);
}
}
#if BLOCK_SIZE == 1
}
#endif
}
barrier();
[[unroll]] for (uint32_t bc_chunk = 0; bc_chunk < Bc / MatBc; ++bc_chunk) {
coopMatLoad(KMat, Psh, bc_chunk * MatBc * psh_stride, psh_stride, gl_CooperativeMatrixLayoutColumnMajor);
const uint o_offset = gl_SubgroupID * MatBr / 4;
if (hsv_offset < HSV_pad) {
[[unroll]] for (uint32_t bc_chunk = 0; bc_chunk < Bc / MatBc; ++bc_chunk) {
coopMatLoad(KMat, Psh, bc_chunk * MatBc * psh_stride, psh_stride, gl_CooperativeMatrixLayoutColumnMajor);
if (SHMEM_STAGING == 0) {
#if BLOCK_SIZE == 1
if (!KV_bounds_check) {
// F16 values can be loaded directly from global memory
const uint v_tile_row = j * Bc + bc_chunk * MatBc;
const uint v_tile_offset = v_offset / 4 + v_tile_row * v_stride / 4 + hsv_offset / 4;
coopMatLoad(QMat, data_vv4, v_tile_offset, v_stride / 4, gl_CooperativeMatrixLayoutRowMajor);
} else
if (!KV_bounds_check) {
// F16 values can be loaded directly from global memory
const uint v_tile_row = j * Bc + bc_chunk * MatBc;
const uint v_tile_offset = v_offset / 4 + v_tile_row * v_stride / 4 + hsv_offset / 4;
coopMatLoad(QMat, data_vv4, v_tile_offset, v_stride / 4, gl_CooperativeMatrixLayoutRowMajor);
} else
#endif
{
const uint v_tile_offset = bc_chunk * MatBr * v_cols + gl_SubgroupID * (MatBc / 4);
coopMatLoad(QMat, ksh, v_tile_offset, vsh_stride, gl_CooperativeMatrixLayoutRowMajor);
{
const uint v_tile_offset = bc_chunk * MatBr * v_cols + gl_SubgroupID * (MatBc / 4);
coopMatLoad(QMat, kvsh, v_tile_offset, vsh_stride, gl_CooperativeMatrixLayoutRowMajor);
}
} else {
const uint v_tile_offset = bc_chunk * MatBc * kvsh_stride + (hsv_tile * row_split + gl_SubgroupID) * (MatBc / 4);
coopMatLoad(QMat, kvsh, v_tile_offset, kvsh_stride, gl_CooperativeMatrixLayoutRowMajor);
}
PVMat = coopMatMulAdd(KMat, QMat, PVMat);
}
SfMat = coopMatMulAdd(KMat, QMat, SfMat);
// Store PVMat to pvsh and load into Of
coopMatStore(PVMat, pvsh, o_offset, osh_stride, gl_CooperativeMatrixLayoutRowMajor);
}
// Store SfMat to sfsh and load into Of
const uint osh_stride = row_split * MatBc / 4;
const uint o_offset = gl_SubgroupID * MatBc / 4;
coopMatStore(SfMat, sfsh, o_offset, osh_stride, gl_CooperativeMatrixLayoutRowMajor);
barrier();
const uint hsv_per_tile = row_split * MatBc;
@ -484,7 +513,7 @@ void main() {
if (hsv_col >= hsv_base && hsv_col < hsv_base + hsv_per_tile && hsv_col < HSV) {
const uint local_hsv = (hsv_col - hsv_base) / 4;
Of[r][d_local] += ACC_TYPEV4(sfsh[row * osh_stride + local_hsv]);
Of[r][d_local] += pvsh[row * osh_stride + local_hsv];
}
}
}
@ -500,27 +529,48 @@ void main() {
// If there is split_k, then the split_k resolve shader does the final
// division by L. Store the intermediate O value and per-row m and L values.
if (p.k_num > 1) {
// note: O and Q have swapped coord 1,2.
uint32_t o_offset = HSV * p.ne1 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
if (p.gqa_ratio > 1) {
// note: O and Q have swapped coord 1,2.
uint32_t o_offset = HSV * p.ne1 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3)) / 4;
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
if (tile_row(r) < N) {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d = d0 + col_tid;
if (d >= HSV/4) break;
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
perElemOpGqaStore(tile_row(r), 4 * d + comp, float(Of[r][d_local][comp]), o_offset, iq2, N);
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
if (tile_row(r) < N) {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d = d0 + col_tid;
if (d >= HSV/4) break;
const uint d_local = d0 / threads_per_rowgroup;
gqaStore(tile_row(r), d, Of[r][d_local], o_offset, iq2, N);
}
}
}
}
o_offset = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3 + p.ne1 * 2 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
if (tile_row(r) < N) {
perElemOpStoreCol0(tile_row(r), 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N);
perElemOpStoreCol0(tile_row(r), 0u, ACC_TYPE(Mf[r]), o_offset + p.ne1, iq2, N);
o_offset = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3 + p.ne1 * 2 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
if (tile_row(r) < N) {
perElemOpStoreCol0(tile_row(r), 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N);
perElemOpStoreCol0(tile_row(r), 0u, ACC_TYPE(Mf[r]), o_offset + p.ne1, iq2, N);
}
}
} else {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
const uint row = tile_row(r);
const uint global_row = i * Br + row;
if (global_row < N) {
uint32_t o_offset = HSV * p.ne1 * (split_k_index + p.k_num * (global_row + p.ne2 * iq3)) / 4;
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d = d0 + col_tid;
if (d >= HSV/4) break;
data_ov4[o_offset + iq2 * HSV/4 + d] = D_TYPEV4(Of[r][d/threads_per_rowgroup]);
}
}
if (global_row < N && col_tid == 0) {
uint32_t lm_offset = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3 + p.ne1 * 2 * (split_k_index + p.k_num * (global_row + p.ne2 * iq3));
data_o[lm_offset + iq2] = D_TYPE(Lf[r]);
data_o[lm_offset + p.ne1 + iq2] = D_TYPE(Mf[r]);
}
}
}
@ -539,7 +589,7 @@ void main() {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d_local = d0 / threads_per_rowgroup;
Of[r][d_local] *= ACC_TYPE(ms);
Of[r][d_local] *= float16_t(ms);
}
} else {
vs = exp(sink - Mf[r]);
@ -557,14 +607,14 @@ void main() {
[[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d_local] *= ACC_TYPE(Lfrcp[r]);
#if defined(ACC_TYPE_MAX)
Of[r][d_local] = clamp(Of[r][d_local], -ACC_TYPE_MAX, ACC_TYPE_MAX);
Of[r][d_local] *= float16_t(Lfrcp[r]);
#if defined(FLOAT_TYPE_MAX)
Of[r][d_local] = clamp(Of[r][d_local], -FLOAT_TYPE_MAX, FLOAT_TYPE_MAX);
#endif
}
}
uint32_t o_offset = gqa_iq1*p.ne1*HSV + iq3*p.ne2*p.ne1*HSV;
uint32_t o_offset = (gqa_iq1*p.ne1*HSV + iq3*p.ne2*p.ne1*HSV) / 4;
if (p.gqa_ratio > 1) {
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
@ -573,9 +623,7 @@ void main() {
const uint d = d0 + col_tid;
if (d >= HSV / 4) break;
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
perElemOpGqaStore(tile_row(r), 4 * d + comp, float(Of[r][d_local][comp]), o_offset, iq2, N);
}
gqaStore(tile_row(r), d, Of[r][d_local], o_offset, iq2, N);
}
}
}
@ -586,9 +634,7 @@ void main() {
const uint d = d0 + col_tid;
if (d >= HSV / 4) break;
const uint d_local = d0 / threads_per_rowgroup;
[[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
data_o[o_offset + iq2 * HSV + (i * Br + tile_row(r)) * p.ne1 * HSV + 4 * d + comp] = D_TYPE(Of[r][d_local][comp]);
}
data_ov4[o_offset + (iq2 * HSV + (i * Br + tile_row(r)) * p.ne1 * HSV) / 4 + d] = D_TYPEV4(Of[r][d_local]);
}
}
}

View File

@ -72,6 +72,28 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY
return elem;
}
// Store O values for non-GQA split_k. Rows are tokens, not heads.
D_TYPE perElemOpNonGqaSplitKStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t unused, const in uint32_t iq2, const in uint32_t N) {
uint32_t global_row = i * Br + r;
if (global_row < N && c < HSV) {
uint32_t o_off = HSV * p.ne1
* (split_k_index + p.k_num * (global_row + p.ne2 * iq3));
data_o[o_off + iq2 * HSV + c] = D_TYPE(elem);
}
return elem;
}
// Store L/M values for non-GQA split_k.
ACC_TYPE perElemOpNonGqaSplitKStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t lm_base, const in uint32_t iq2, const in uint32_t N) {
uint32_t global_row = i * Br + r;
if (global_row < N && c == 0) {
uint32_t lm_off = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3
+ p.ne1 * 2 * (split_k_index + p.k_num * (global_row + p.ne2 * iq3));
data_o[lm_off + lm_base + iq2] = D_TYPE(elem);
}
return elem;
}
void main() {
#ifdef NEEDS_INIT_IQ_SHMEM
init_iq_shmem(gl_WorkGroupSize);
@ -290,13 +312,19 @@ void main() {
if (p.k_num > 1) {
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(O);
// note: O and Q have swapped coord 1,2.
uint32_t o_offset = HSV * p.ne1 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
if (p.gqa_ratio > 1) {
// note: O and Q have swapped coord 1,2.
uint32_t o_offset = HSV * p.ne1 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
o_offset = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3 + p.ne1 * 2 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N);
coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N);
o_offset = HSV * p.ne1 * p.k_num * p.ne2 * p.ne3 + p.ne1 * 2 * (split_k_index + p.k_num * (gqa_iq1 + p.ne2 * iq3));
coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N);
coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N);
} else {
coopMatPerElementNV(O_D, O_D, perElemOpNonGqaSplitKStore, 0u, iq2, N);
coopMatPerElementNV(L, L, perElemOpNonGqaSplitKStoreCol0, 0u, iq2, N);
coopMatPerElementNV(M, M, perElemOpNonGqaSplitKStoreCol0, p.ne1, iq2, N);
}
return;
}

View File

@ -167,7 +167,9 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
uint id = ids[iter++];
uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
ballots_sh[gl_SubgroupID] = ballot;
if (gl_SubgroupInvocationID == 0) {
ballots_sh[gl_SubgroupID] = ballot;
}
barrier();
uint subgroup_base = 0;

View File

@ -43,7 +43,9 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
uint id = ids[iter++];
uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
ballots_sh[gl_SubgroupID] = ballot;
if (gl_SubgroupInvocationID == 0) {
ballots_sh[gl_SubgroupID] = ballot;
}
barrier();
uint subgroup_base = 0;

View File

@ -595,8 +595,6 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
}
void process_shaders() {
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}};
// matmul
for (const MatMulIdType& matmul_id_type : {MatMulIdType::NONE, MatMulIdType::DEFAULT, MatMulIdType::SUBGROUP}) {
// No coopmats
@ -622,49 +620,63 @@ void process_shaders() {
}
}
// flash attention
for (const auto& f16acc : {false, true}) {
std::map<std::string, std::string> fa_base_dict = base_dict;
fa_base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float";
fa_base_dict["ACC_TYPEV4"] = f16acc ? "f16vec4" : "vec4";
if (f16acc) {
fa_base_dict["ACC_TYPE_MAX"] = "float16_t(65504.0)";
for (const bool& fp16 : {false, true}) {
std::map<std::string, std::string> base_dict;
if (fp16) {
base_dict = {{"FLOAT_TYPE", "float16_t"}, {"FLOAT_TYPEV4", "f16vec4"}, {"FLOAT16", "1"}, {"FLOAT_TYPE_MAX", "float16_t(65504.0)"}};
} else {
base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV4", "vec4"}};
}
for (const auto& tname : type_names) {
if (tname == "bf16") continue;
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (tname == "f16") {
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}}), true, false, true, f16acc);
} else {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, false, true, f16acc);
// flash attention
for (const bool& f16acc : {false, true}) {
std::map<std::string, std::string> fa_base_dict = base_dict;
fa_base_dict["ACC_TYPE"] = fp16 && f16acc ? "float16_t" : "float";
fa_base_dict["ACC_TYPEV4"] = fp16 && f16acc ? "f16vec4" : "vec4";
if (fp16 && f16acc) {
fa_base_dict["ACC_TYPE_MAX"] = "float16_t(65504.0)";
}
for (const auto& tname : type_names) {
if (tname == "bf16") continue;
if (fp16) {
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (tname == "f16") {
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}}), fp16, false, true, f16acc);
} else {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), fp16, false, true, f16acc);
}
#endif
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
if (tname == "f16") {
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"COOPMAT", "1"}}), true, true, false, f16acc);
} else if (tname == "q4_0" || tname == "q8_0" || tname == "f32") {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname)}, {"COOPMAT", "1"}}), true, true, false, f16acc);
}
if (tname == "f16") {
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"COOPMAT", "1"}}), fp16, true, false, f16acc);
} else if (tname == "q4_0" || tname == "q8_0" || tname == "f32") {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname)}, {"COOPMAT", "1"}}), fp16, true, false, f16acc);
}
#endif
if (tname == "f16") {
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}}), true, false, false, f16acc);
} else if (tname == "q4_0" || tname == "q8_0" || tname == "f32") {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, false, false, f16acc);
}
if (tname == "f16") {
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}}), fp16, false, false, f16acc);
} else if (tname == "q4_0" || tname == "q8_0" || tname == "f32") {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), fp16, false, false, f16acc);
}
}
}
}
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPE_VEC2", "vec2"}};
for (const auto& tname : type_names) {
// mul mat vec
std::string data_a_key = "DATA_A_" + to_uppercase(tname);

View File

@ -1,12 +1,19 @@
ggml_add_backend_library(ggml-zendnn
ggml-zendnn.cpp)
# Get ZenDNN path
if (NOT DEFINED ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "")
set(ZENDNN_ROOT "$ENV{ZENDNN_ROOT}")
endif()
# Check if path is still empty or OFF
if (BUILD_SHARED_LIBS)
set(ZENDNN_SHARED_LIB ON)
set(ZENDNN_ARCHIVE_LIB OFF)
else()
set(ZENDNN_SHARED_LIB OFF)
set(ZENDNN_ARCHIVE_LIB ON)
endif()
# Download and build ZenDNN if not provided
if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
message(STATUS "ZENDNN_ROOT not set. Automatically downloading and building ZenDNN...")
message(STATUS "This will take several minutes on first build...")
@ -21,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
ExternalProject_Add(
zendnn
GIT_REPOSITORY https://github.com/amd/ZenDNN.git
GIT_TAG 21ce8f7879c86bf3637f707fae6f29e0951db5fe
GIT_TAG a18adf8c605fb5f5e52cefd7eda08a7b18febbaf # ZenDNN-2026-WW08
PREFIX ${ZENDNN_PREFIX}
SOURCE_DIR ${ZENDNN_SOURCE_DIR}
BINARY_DIR ${ZENDNN_BUILD_DIR}
@ -32,7 +39,9 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
-DZENDNNL_BUILD_DOXYGEN=OFF
-DZENDNNL_BUILD_GTEST=OFF
-DZENDNNL_BUILD_BENCHDNN=OFF
# Enable ALL matmul algorithm backends
-DZENDNNL_DEPENDS_FBGEMM=OFF
-DZENDNNL_LIB_BUILD_ARCHIVE=${ZENDNN_ARCHIVE_LIB}
-DZENDNNL_LIB_BUILD_SHARED=${ZENDNN_SHARED_LIB}
-DZENDNNL_DEPENDS_AOCLDLP=ON
-DZENDNNL_DEPENDS_ONEDNN=ON
-DZENDNNL_DEPENDS_LIBXSMM=ON
@ -45,47 +54,37 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
LOG_INSTALL ON
)
# Add dependency so ZenDNN builds before our library
add_dependencies(ggml-zendnn zendnn)
# Set ZENDNN_ROOT to the installation directory
set(ZENDNN_ROOT ${ZENDNN_INSTALL_DIR})
message(STATUS "ZenDNN will be built to: ${ZENDNN_ROOT}")
else()
message(STATUS "Using custom ZenDNN installation at: ${ZENDNN_ROOT}")
endif()
# ZenDNN headers + libs
target_include_directories(ggml-zendnn PRIVATE
${ZENDNN_ROOT}/zendnnl/include
${ZENDNN_ROOT}/deps/aocldlp/include
${ZENDNN_ROOT}/deps/aoclutils/include
${ZENDNN_ROOT}/deps/json/include
${ZENDNN_ROOT}/deps/libxsmm/include
${ZENDNN_ROOT}/deps/aoclutils/include
${ZENDNN_ROOT}/deps/aocldlp/include
${ZENDNN_ROOT}/deps/onednn/include
)
${ZENDNN_ROOT}/deps/libxsmm/include)
target_link_directories(ggml-zendnn PRIVATE
${ZENDNN_ROOT}/zendnnl/lib
${ZENDNN_ROOT}/deps/aocldlp/lib
${ZENDNN_ROOT}/deps/aoclutils/lib
${ZENDNN_ROOT}/deps/libxsmm/lib
${ZENDNN_ROOT}/deps/onednn/lib
)
if (ZENDNN_SHARED_LIB)
target_link_directories(ggml-zendnn PRIVATE ${ZENDNN_ROOT}/zendnnl/lib)
target_link_libraries(ggml-zendnn PRIVATE zendnnl)
elseif (ZENDNN_ARCHIVE_LIB)
target_link_libraries(ggml-zendnn PRIVATE
${ZENDNN_ROOT}/zendnnl/lib/libzendnnl_archive.a
${ZENDNN_ROOT}/deps/aoclutils/${CMAKE_INSTALL_LIBDIR}/libaoclutils.a
${ZENDNN_ROOT}/deps/aoclutils/${CMAKE_INSTALL_LIBDIR}/libau_cpuid.a
${ZENDNN_ROOT}/deps/aocldlp/lib/libaocl-dlp.a
${ZENDNN_ROOT}/deps/onednn/${CMAKE_INSTALL_LIBDIR}/libdnnl.a
${ZENDNN_ROOT}/deps/libxsmm/lib/libxsmm.a
${ZENDNN_ROOT}/deps/libxsmm/lib/libxsmmext.a
${ZENDNN_ROOT}/deps/libxsmm/lib/libxsmmnoblas.a)
endif()
target_link_libraries(ggml-zendnn PRIVATE
zendnnl_archive # ZenDNN main
aocl-dlp # AOCL libraries
aoclutils
au_cpuid
dnnl # OneDNN
xsmm # libxsmm small matrix math
xsmmext
xsmmnoblas
m
pthread
)
target_link_libraries(ggml-zendnn PRIVATE m pthread)
if (GGML_OPENMP)
target_link_libraries(ggml-zendnn PRIVATE OpenMP::OpenMP_CXX)

View File

@ -41,13 +41,13 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int
const TA * A, int64_t lda, const TB * B, int64_t ldb, TC * C,
int64_t ldc) {
zendnnl::lowoha::lowoha_params params;
zendnnl::lowoha::matmul::matmul_params params;
params.dtypes.src = ggml_to_zendnn_type<TB>();
params.dtypes.wei = ggml_to_zendnn_type<TA>();
params.dtypes.dst = ggml_to_zendnn_type<TC>();
params.num_threads = ctx->n_threads;
zendnnl::lowoha::status_t status = zendnnl::lowoha::matmul_direct(
zendnnl::error_handling::status_t status = zendnnl::lowoha::matmul::matmul_direct(
'r', false, true, // row-major, don't transpose B, transpose A (because it's column-major)
n, // M: rows of B and C
m, // N: cols of A^T and C
@ -63,7 +63,7 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int
params // params
);
if (status != zendnnl::lowoha::status_t::success) {
if (status != zendnnl::error_handling::status_t::success) {
GGML_LOG_ERROR("%s, ZenDNN matmul failed: status=%d\n", __func__, static_cast<int>(status));
return false;
}

View File

@ -899,7 +899,8 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
};
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
GGML_ASSERT(type < GGML_TYPE_COUNT);
assert(type >= 0);
assert(type < GGML_TYPE_COUNT);
return &type_traits[type];
}
@ -1265,27 +1266,33 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
}
int64_t ggml_blck_size(enum ggml_type type) {
assert(type >= 0);
assert(type < GGML_TYPE_COUNT);
return type_traits[type].blck_size;
}
size_t ggml_type_size(enum ggml_type type) {
assert(type >= 0);
assert(type < GGML_TYPE_COUNT);
return type_traits[type].type_size;
}
size_t ggml_row_size(enum ggml_type type, int64_t ne) {
assert(type >= 0);
assert(type < GGML_TYPE_COUNT);
assert(ne % ggml_blck_size(type) == 0);
return ggml_type_size(type)*ne/ggml_blck_size(type);
}
double ggml_type_sizef(enum ggml_type type) {
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
}
const char * ggml_type_name(enum ggml_type type) {
return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
assert(type >= 0);
assert(type < GGML_TYPE_COUNT);
return type_traits[type].type_name;
}
bool ggml_is_quantized(enum ggml_type type) {
assert(type >= 0);
assert(type < GGML_TYPE_COUNT);
return type_traits[type].is_quantized;
}
@ -1629,11 +1636,23 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
const size_t cur_end = cur_offs + cur_size;
// align to GGML_MEM_ALIGN
GGML_ASSERT(size <= SIZE_MAX - (GGML_MEM_ALIGN - 1));
size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
char * const mem_buffer = ctx->mem_buffer;
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
// integer overflow checks
if (cur_end > SIZE_MAX - size_needed) {
GGML_LOG_WARN("%s: overflow detected in cur_end (%zu) + size_needed (%zu)\n", __func__, cur_end, size_needed);
return NULL;
}
if (cur_end + size_needed > SIZE_MAX - GGML_OBJECT_SIZE) {
GGML_LOG_WARN("%s: overflow detected in cur_end (%zu) + size_needed (%zu) + GGML_OBJECT_SIZE (%zu)\n", __func__,
cur_end, size_needed, (size_t) GGML_OBJECT_SIZE);
return NULL;
}
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
@ -1702,6 +1721,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
obj_alloc_size = data_size;
}
GGML_ASSERT(GGML_TENSOR_SIZE <= SIZE_MAX - obj_alloc_size);
struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
GGML_ASSERT(obj_new);

View File

@ -15,6 +15,17 @@
#include <string>
#include <vector>
#define GGUF_MAX_STRING_LENGTH (1024*1024*1024)
#define GGUF_MAX_ARRAY_ELEMENTS (1024*1024*1024)
#ifdef _WIN32
# define gguf_ftell _ftelli64
# define gguf_fseek _fseeki64
#else
# define gguf_ftell ftello
# define gguf_fseek fseeko
#endif
template <typename T>
struct type_to_gguf_type;
@ -217,17 +228,64 @@ struct gguf_context {
};
struct gguf_reader {
FILE * file;
gguf_reader(FILE * file) : file(file) {
// read the remaining bytes once and update on each read
nbytes_remain = file_remain(file);
}
gguf_reader(FILE * file) : file(file) {}
// helper for remaining bytes in a file
static uint64_t file_remain(FILE * file) {
const int64_t cur = gguf_ftell(file);
if (cur < 0) {
return 0;
}
if (gguf_fseek(file, 0, SEEK_END) != 0) {
gguf_fseek(file, cur, SEEK_SET);
return 0;
}
const int64_t end = gguf_ftell(file);
if (end < 0) {
gguf_fseek(file, cur, SEEK_SET);
return 0;
}
gguf_fseek(file, cur, SEEK_SET);
return static_cast<uint64_t>(end - cur);
}
template <typename T>
bool read(T & dst) const {
return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
const size_t size = sizeof(dst);
if (nbytes_remain < size) {
return false;
}
const size_t nread = fread(&dst, 1, size, file);
nbytes_remain -= nread;
return nread == size;
}
template <typename T>
bool read(std::vector<T> & dst, const size_t n) const {
if (n > GGUF_MAX_ARRAY_ELEMENTS) {
return false;
}
if constexpr (std::is_same<T, std::string>::value) {
// strings are prefixed with their length, so we need to account for that
if (n > SIZE_MAX / sizeof(uint64_t)) {
return false;
}
if (nbytes_remain < n * sizeof(uint64_t)) {
return false;
}
} else {
if (n > SIZE_MAX / sizeof(T)) {
return false;
}
if (nbytes_remain < n * sizeof(T)) {
return false;
}
}
dst.resize(n);
for (size_t i = 0; i < dst.size(); ++i) {
if constexpr (std::is_same<T, bool>::value) {
@ -277,13 +335,33 @@ struct gguf_reader {
if (!read(size)) {
return false;
}
dst.resize(size);
return fread(dst.data(), 1, dst.length(), file) == dst.length();
if (size > GGUF_MAX_STRING_LENGTH) {
GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds maximum %" PRIu64 "\n", __func__, size, (uint64_t) GGUF_MAX_STRING_LENGTH);
return false;
}
if (size > nbytes_remain) {
GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds remaining file size %" PRIu64 " bytes\n", __func__, size, nbytes_remain);
return false;
}
dst.resize(static_cast<size_t>(size));
const size_t nread = fread(dst.data(), 1, size, file);
nbytes_remain -= nread;
return nread == size;
}
bool read(void * dst, const size_t size) const {
return fread(dst, 1, size, file) == size;
if (size > nbytes_remain) {
return false;
}
const size_t nread = fread(dst, 1, size, file);
nbytes_remain -= nread;
return nread == size;
}
private:
FILE * file;
mutable uint64_t nbytes_remain;
};
struct gguf_context * gguf_init_empty(void) {
@ -568,8 +646,8 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
// check that tensor type is within defined range
if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) {
GGML_LOG_ERROR("%s: tensor '%s' has invalid ggml type %d (%s)\n",
__func__, info.t.name, info.t.type, ggml_type_name(info.t.type));
GGML_LOG_ERROR("%s: tensor '%s' has invalid ggml type %d. should be in [0, %d)\n",
__func__, info.t.name, info.t.type, GGML_TYPE_COUNT);
ok = false;
break;
}
@ -618,14 +696,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);
// we require the data section to be aligned, so take into account any padding
if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) {
if (gguf_fseek(file, GGML_PAD(gguf_ftell(file), ctx->alignment), SEEK_SET) != 0) {
GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
gguf_free(ctx);
return nullptr;
}
// store the current file offset - this is where the data section starts
ctx->offset = ftell(file);
ctx->offset = gguf_ftell(file);
// compute the total size of the data section, taking into account the alignment
{
@ -657,10 +735,34 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
// the ggml_tensor structs to the appropriate locations in the binary blob
// compute the exact size needed for the new ggml_context
const size_t mem_size =
params.no_alloc ?
(n_tensors )*ggml_tensor_overhead() :
(n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
size_t mem_size = 0;
if (params.no_alloc) {
if (n_tensors != 0 && SIZE_MAX / n_tensors < ggml_tensor_overhead()) {
GGML_LOG_ERROR("%s: memory size overflow while allocating ggml context\n", __func__);
gguf_free(ctx);
return nullptr;
}
const size_t overhead = n_tensors * ggml_tensor_overhead();
mem_size = overhead;
} else {
if ((n_tensors + 1) != 0 && SIZE_MAX / (n_tensors + 1) < ggml_tensor_overhead()) {
GGML_LOG_ERROR("%s: memory size overflow while allocating ggml context\n", __func__);
gguf_free(ctx);
return nullptr;
}
const size_t overhead = (n_tensors + 1) * ggml_tensor_overhead();
if (SIZE_MAX - overhead < ctx->size) {
GGML_LOG_ERROR("%s: memory size overflow while allocating ggml context\n", __func__);
gguf_free(ctx);
return nullptr;
}
mem_size = overhead + ctx->size;
}
struct ggml_init_params pdata = {
/*mem_size =*/ mem_size,

View File

@ -379,6 +379,7 @@ class MODEL_ARCH(IntEnum):
NEO_BERT = auto()
JINA_BERT_V2 = auto()
JINA_BERT_V3 = auto()
EUROBERT = auto()
BLOOM = auto()
STABLELM = auto()
QWEN = auto()
@ -531,6 +532,7 @@ class MODEL_TENSOR(IntEnum):
FFN_GATE_EXP = auto()
FFN_DOWN_EXP = auto()
FFN_UP_EXP = auto()
FFN_GATE_UP_EXP = auto()
FFN_GATE_SHEXP = auto()
FFN_DOWN_SHEXP = auto()
FFN_UP_SHEXP = auto()
@ -820,6 +822,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.NEO_BERT: "neo-bert",
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3",
MODEL_ARCH.EUROBERT: "eurobert",
MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen",
@ -978,6 +981,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
MODEL_TENSOR.FFN_GATE_UP_EXP: "blk.{bid}.ffn_gate_up_exps",
MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
@ -1587,6 +1591,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.LAYER_OUT_NORM,
],
MODEL_ARCH.EUROBERT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_DOWN,
],
MODEL_ARCH.MPT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@ -1805,6 +1822,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_GATE_UP_EXP,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_DT,
@ -1894,6 +1912,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_GATE_UP_EXP,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_DT,
@ -2595,6 +2614,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,

View File

@ -175,6 +175,9 @@ class GGUFReader:
if new_align.types != [GGUFValueType.UINT32]:
raise ValueError('Bad type for general.alignment field')
self.alignment = new_align.parts[-1][0]
# Ensure alignment is a non-zero power of two
if self.alignment == 0 or (self.alignment & (self.alignment - 1)) != 0:
raise ValueError('Invalid alignment: must be a non-zero power of two')
padding = offs % self.alignment
if padding != 0:
offs += self.alignment - padding
@ -202,11 +205,11 @@ class GGUFReader:
def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
if field.name in self.fields:
# TODO: add option to generate error on duplicate keys
# raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
# TODO: add option to make this a warning and accept duplicate keys like below
raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
self.fields[field.name + '_{}'.format(field.offset)] = field
# logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
# self.fields[field.name + '_{}'.format(field.offset)] = field
else:
self.fields[field.name] = field
return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts)

View File

@ -501,6 +501,8 @@ class GGUFWriter:
self.add_uint32(Keys.General.QUANTIZATION_VERSION, quantization_version)
def add_custom_alignment(self, alignment: int) -> None:
if alignment <= 0 or (alignment & (alignment - 1)) != 0:
raise ValueError('Invalid alignment: must be a non-zero power of two')
self.data_alignment = alignment
self.add_uint32(Keys.General.ALIGNMENT, alignment)

View File

@ -567,6 +567,10 @@ class TensorNameMap:
"model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe
),
MODEL_TENSOR.FFN_GATE_UP_EXP: (
"model.layers.{bid}.mlp.experts.gate_up_proj",
),
# Feed-forward down
MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.17.1"
version = "0.18.0"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [

View File

@ -25,16 +25,12 @@ Example usage:
"""
def generate_input_prompt(length: int) -> list[str]:
CORPUS = """
You are an advanced AI assistant capable of using tools to gather information, perform calculations, or execute tasks. Always think step by step before responding. If a user's query requires external data, computation, or actions beyond your internal knowledge, use the appropriate tools via function calls.
### Tool Call Format:
When you need to use a tool, output the call in this exact XML format. Include the opening and closing tags. Do not escape arguments; they will be parsed as plain text.
You can make multiple calls in one go by placing them one after another.
"""
words = [w.strip() for w in CORPUS.strip().split(" ")]
def get_remote_corpus(url: str, length: int) -> list[str]:
response = requests.get(url)
response.raise_for_status()
corpus = response.text
words = [w.strip() for w in corpus.strip().split(" ")]
words = [w for w in words if "<" not in w] # make sure nothing looks like special tokens
words = [w for w in words if len(w) > 0] # filter out empty strings
while len(words) < length:
words += words
@ -226,9 +222,9 @@ def parse_args() -> argparse.Namespace:
)
parser_dump.add_argument(
"--file",
type=Path,
default=None,
help="File containing prompt to use instead of the default",
type=str,
default="https://raw.githubusercontent.com/ggml-org/llama.cpp/eaba92c3dcc980ebe753348855d4a5d75c069997/tools/server/README.md",
help="File containing prompt to use instead of the default (can also be an URL)",
)
parser_dump.add_argument(
"--pattern",
@ -259,17 +255,19 @@ def main():
if args.verb == "dump":
pattern = parse_pattern(args.pattern)
input_length = sum(n for _, n in pattern)
input_words = generate_input_prompt(input_length)
if args.file is not None:
with args.file.open("r") as f:
required_words = sum(n for _, n in pattern)
if args.file.startswith("http"):
input_words = get_remote_corpus(args.file, required_words)
logger.info(f"Fetched {len(input_words)} words from remote {args.file}")
else:
with open(args.file, "r") as f:
input_words = f.read().strip().split(" ")
if input_length < sum(n for _, n in pattern):
input_words = [w for w in input_words if len(w) > 0] # filter out empty strings
if len(input_words) < required_words:
raise ValueError(
f"Input file has only {input_length} words, but pattern requires at least {input_length} words."
f"Input file has only {len(input_words)} words, but pattern requires at least {required_words} words."
)
input_length = len(input_words)
logger.info(f"Using {input_length} words")
logger.info(f"Using {len(input_words)} words")
dump_logits(args.endpoint, args.output, input_words, pattern, args.api_key)
elif args.verb == "compare":
compare_logits(args.input1, args.input2, args.output)

View File

@ -54,6 +54,6 @@ adb $adbserial $adbhost shell " \
$verbose $experimental $sched $opmask $profile $nhvx $ndev $hb \
./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --batch-size 128 -fa on \
-ngl 99 --device $device $cli_opts $@ \
--ctx-size 8192 --ubatch-size 256 -fa on \
-ngl 99 --device $device $cli_opts $@ \
"

Some files were not shown because too many files have changed in this diff Show More