Merge branch 'master' into compilade/imatrix-batched-chunks

Francis Couture-Harpin 2025-07-12 13:31:19 -04:00
commit 0ee322cd0f
279 changed files with 42835 additions and 9772 deletions

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e
# Read the first argument into a variable

View File

@ -40,7 +40,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true

View File

@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true

11 .github/labeler.yml vendored
View File

@ -1,10 +1,4 @@
# https://github.com/actions/labeler
Kompute:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-kompute.h
- ggml/src/ggml-kompute/**
- README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
@ -93,3 +87,8 @@ Ascend NPU:
- ggml/include/ggml-cann.h
- ggml/src/ggml-cann/**
- docs/backend/CANN.md
OpenCL:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-opencl.h
- ggml/src/ggml-opencl/**

51 .github/workflows/build-cmake-pkg.yml vendored Normal file
View File

@ -0,0 +1,51 @@
name: Build relocatable cmake package
on:
workflow_dispatch:
workflow_call:
jobs:
linux:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y build-essential tcl
- name: Build
run: |
PREFIX="$(pwd)"/inst
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
cmake --install build --prefix "$PREFIX" --config Release
export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake
tclsh <<'EOF'
set build(commit) [string trim [exec git rev-parse --short HEAD]]
set build(number) [string trim [exec git rev-list --count HEAD]]
set build(version) "0.0.$build(number)"
set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]]
set checks [list "set\\(LLAMA_VERSION \\s+$build(version)\\)" \
"set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \
"set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"]
puts -nonewline "Checking llama-config.cmake version... "
foreach check $checks {
if {![regexp -expanded -- $check $llamaconfig]} {
puts "\"$check\" failed!"
exit 1
}
}
puts "success."
EOF
cd examples/simple-cmake-pkg
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake
cmake --build build
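
The tclsh step above only passes when the installed llama-config.cmake reports the expected version, build commit and build number. For readers who prefer it, here is a minimal Python sketch of the same kind of consistency check; the file path, function name and command-line shape are assumptions for illustration, not part of the workflow.

import re
import sys

def check_llama_config(path: str, version: str, commit: str, number: str) -> bool:
    # patterns assumed to match lines like: set(LLAMA_VERSION 0.0.1234)
    patterns = [
        rf"set\(LLAMA_VERSION\s+{re.escape(version)}\)",
        rf"set\(LLAMA_BUILD_COMMIT\s+{re.escape(commit)}\)",
        rf"set\(LLAMA_BUILD_NUMBER\s+{re.escape(number)}\)",
    ]
    with open(path, encoding="utf-8") as f:
        text = f.read()
    return all(re.search(p, text) for p in patterns)

if __name__ == "__main__":
    # hypothetical usage: check_llama_config.py <llama-config.cmake> <short-commit> <build-number>
    path, commit, number = sys.argv[1:4]
    sys.exit(0 if check_llama_config(path, f"0.0.{number}", commit, number) else 1)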

View File

@ -5,10 +5,43 @@ on:
push:
branches:
- master
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
paths: [
'.github/workflows/build.yml',
'.github/workflows/build-linux-cross.yml',
'.github/workflows/build-cmake-pkg.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp'
]
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
paths: [
'.github/workflows/build.yml',
'.github/workflows/build-linux-cross.yml',
'.github/workflows/build-cmake-pkg.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -51,7 +84,8 @@ jobs:
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_METAL_EMBED_LIBRARY=OFF \
-DGGML_METAL_SHADER_DEBUG=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
@ -308,7 +342,7 @@ jobs:
cd build
export GGML_VK_VISIBLE_DEVICES=0
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 3600
ctest -L main --verbose --timeout 4200
ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
@ -478,6 +512,9 @@ jobs:
build-linux-cross:
uses: ./.github/workflows/build-linux-cross.yml
build-cmake-pkg:
uses: ./.github/workflows/build-cmake-pkg.yml
macOS-latest-cmake-ios:
runs-on: macos-latest
@ -628,7 +665,7 @@ jobs:
./build-xcframework.sh
windows-msys2:
runs-on: windows-latest
runs-on: windows-2025
strategy:
fail-fast: false
@ -678,7 +715,7 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
runs-on: windows-latest
runs-on: windows-2025
env:
OPENBLAS_VERSION: 0.3.23
@ -689,17 +726,20 @@ jobs:
matrix:
include:
- build: 'cpu-x64 (static)'
arch: 'x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
- build: 'openblas-x64'
arch: 'x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'vulkan-x64'
arch: 'x64'
defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
- build: 'llvm-arm64'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
- build: 'llvm-arm64-opencl-adreno'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
# - build: 'kompute-x64'
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
steps:
- name: Clone
@ -713,12 +753,6 @@ jobs:
variant: ccache
evict-old-files: 1d
- name: Clone Kompute submodule
id: clone_kompute
if: ${{ matrix.build == 'kompute-x64' }}
run: |
git submodule update --init ggml/src/ggml-kompute/kompute
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@ -734,7 +768,7 @@ jobs:
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
if: ${{ matrix.build == 'vulkan-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
@ -769,6 +803,8 @@ jobs:
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
with:
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
- name: Build
id: cmake_build
@ -789,7 +825,7 @@ jobs:
- name: Test
id: cmake_test
if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }}
if: ${{ matrix.arch == 'x64' }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@ -894,7 +930,7 @@ jobs:
cmake --build build --config Release
windows-latest-cmake-sycl:
runs-on: windows-latest
runs-on: windows-2022
defaults:
run:
@ -928,7 +964,7 @@ jobs:
windows-latest-cmake-hip:
if: ${{ github.event.inputs.create_release != 'true' }}
runs-on: windows-latest
runs-on: windows-2022
steps:
- name: Clone

View File

@ -49,7 +49,8 @@ jobs:
run: |
sysctl -a
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
@ -103,7 +104,8 @@ jobs:
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
@ -160,6 +162,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
@ -211,6 +215,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
@ -235,7 +241,7 @@ jobs:
name: llama-bin-ubuntu-vulkan-x64.zip
windows-cpu:
runs-on: windows-latest
runs-on: windows-2025
strategy:
matrix:
@ -271,7 +277,7 @@ jobs:
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
cmake -S . -B build -G "Ninja Multi-Config" ^
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-DGGML_NATIVE=OFF ^
@ -288,7 +294,7 @@ jobs:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
@ -298,7 +304,7 @@ jobs:
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
windows:
runs-on: windows-latest
runs-on: windows-2025
env:
OPENBLAS_VERSION: 0.3.23
@ -448,7 +454,7 @@ jobs:
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
windows-sycl:
runs-on: windows-latest
runs-on: windows-2022
defaults:
run:
@ -520,7 +526,7 @@ jobs:
name: llama-bin-win-sycl-x64.zip
windows-hip:
runs-on: windows-latest
runs-on: windows-2022
strategy:
matrix:

40 .github/workflows/update-ops-docs.yml vendored Normal file
View File

@ -0,0 +1,40 @@
name: Update Operations Documentation
on:
push:
paths:
- 'docs/ops/**'
- 'scripts/create_ops_docs.py'
pull_request:
paths:
- 'docs/ops/**'
- 'scripts/create_ops_docs.py'
jobs:
update-ops-docs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Generate operations documentation to temporary file
run: |
mkdir -p /tmp/ops_check
./scripts/create_ops_docs.py /tmp/ops_check/ops.md
- name: Check if docs/ops.md matches generated version
run: |
if ! diff -q docs/ops.md /tmp/ops_check/ops.md; then
echo "Operations documentation (docs/ops.md) is not up to date with the backend CSV files."
echo "To fix: run ./scripts/create_ops_docs.py and commit the updated docs/ops.md along with your changes"
echo "Differences found:"
diff docs/ops.md /tmp/ops_check/ops.md || true
exit 1
fi
echo "Operations documentation is up to date."

3 .gitmodules vendored
View File

@ -1,3 +0,0 @@
[submodule "kompute"]
path = ggml/src/ggml-kompute/kompute
url = https://github.com/nomic-ai/kompute.git

View File

@ -95,7 +95,7 @@ endif()
if (NOT DEFINED LLAMA_BUILD_COMMIT)
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
endif()
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
# override ggml options
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
@ -120,7 +120,6 @@ endfunction()
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)

View File

@ -6,9 +6,9 @@
[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
LLM inference in C/C++
## Recent API changes
@ -17,10 +17,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
@ -134,6 +133,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
#### Multimodal

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
#
# Options
IOS_MIN_OS_VERSION=16.4

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
#
# sample usage:
#

View File

@ -86,8 +86,7 @@ if (LLAMA_CURL)
endif()
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
endif ()
if (LLAMA_LLGUIDANCE)
@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
# v0.7.20 (+ fix to build on GCC 15):
GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
# v1.0.1:
GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND cargo build --release
BUILD_COMMAND cargo build --release --package llguidance
INSTALL_COMMAND ""
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
UPDATE_COMMAND ""

View File

@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.public_path = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
add_opt(common_arg(
{"--api-prefix"}, "PREFIX",
string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
[](common_params & params, const std::string & value) {
params.api_prefix = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg(
{"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
@ -2794,6 +2801,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.ssl_file_cert = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
add_opt(common_arg(
{"--chat-template-kwargs"}, "STRING",
string_format("sets additional params for the json template parser"),
[](common_params & params, const std::string & value) {
auto parsed = json::parse(value);
for (const auto & item : parsed.items()) {
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
add_opt(common_arg(
{"-to", "--timeout"}, "N",
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),

View File

@ -17,6 +17,8 @@
#include <string>
#include <vector>
using json = nlohmann::ordered_json;
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
auto time = std::chrono::system_clock::to_time_t(now);
auto local_time = *std::localtime(&time);
@ -140,6 +142,7 @@ struct templates_params {
bool add_generation_prompt = true;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
json extra_context;
};
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@ -720,16 +723,23 @@ static void foreach_function(const json & tools, const std::function<void(const
static std::string apply(
const common_chat_template & tmpl,
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
const struct templates_params & inputs,
const std::optional<json> & messages_override = std::nullopt,
const std::optional<json> & tools_override = std::nullopt,
const std::optional<json> & additional_context = std::nullopt)
{
minja::chat_template_inputs tmpl_inputs;
tmpl_inputs.messages = messages;
tmpl_inputs.tools = tools;
tmpl_inputs.add_generation_prompt = add_generation_prompt;
tmpl_inputs.extra_context = extra_context;
tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
if (tools_override) {
tmpl_inputs.tools = *tools_override;
} else {
tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
}
tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
tmpl_inputs.extra_context = inputs.extra_context;
if (additional_context) {
tmpl_inputs.extra_context.merge_patch(*additional_context);
}
// TODO: add flag to control date/time, if only for testing purposes.
// tmpl_inputs.now = std::chrono::system_clock::now();
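
The reworked apply() above resolves its template inputs in three steps: take messages and tools from the shared templates_params unless an override is given, copy the baseline extra_context, then fold any additional context in via JSON merge-patch. Below is a small Python sketch of that precedence, using plain dicts as a stand-in for the minja input structure; all names are illustrative.

from typing import Any, Optional

def merge_patch(target: Any, patch: Any) -> Any:
    # JSON merge-patch (RFC 7386): objects merge recursively, null deletes a key, anything else replaces
    if not isinstance(patch, dict):
        return patch
    result = dict(target) if isinstance(target, dict) else {}
    for key, value in patch.items():
        if value is None:
            result.pop(key, None)
        else:
            result[key] = merge_patch(result.get(key), value)
    return result

def build_template_inputs(inputs: dict,
                          messages_override: Optional[list] = None,
                          tools_override: Optional[list] = None,
                          additional_context: Optional[dict] = None) -> dict:
    tmpl = {
        "messages": messages_override if messages_override is not None else inputs["messages"],
        "tools": tools_override if tools_override is not None else (inputs.get("tools") or None),
        "add_generation_prompt": inputs.get("add_generation_prompt", True),
        "extra_context": dict(inputs.get("extra_context", {})),
    }
    if additional_context is not None:
        tmpl["extra_context"] = merge_patch(tmpl["extra_context"], additional_context)
    return tmpl

if __name__ == "__main__":
    inputs = {"messages": [{"role": "user", "content": "hi"}], "extra_context": {"enable_thinking": True}}
    out = build_template_inputs(inputs, additional_context={"enable_thinking": False, "date_string": "12 Jul 2025"})
    assert out["extra_context"] == {"enable_thinking": False, "date_string": "12 Jul 2025"}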
@ -828,7 +838,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
inputs.messages,
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
data.format = COMMON_CHAT_FORMAT_GENERIC;
return data;
}
@ -904,7 +914,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
data.preserved_tokens = {
"[TOOL_CALLS]",
};
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
return data;
}
@ -934,7 +944,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
adjusted_messages.push_back(msg);
}
}
data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
if (!inputs.enable_thinking) {
@ -1122,7 +1132,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
} else {
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
}
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
{"date_string", format_time(inputs.now, "%d %b %Y")},
{"tools_in_user_message", false},
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@ -1187,7 +1197,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
auto prompt = apply(tmpl, inputs);
// Hacks to fix the official (broken) prompt.
// It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@ -1282,7 +1292,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
LOG_DBG("%s\n", __func__);
common_chat_params data;
data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
});
@ -1338,7 +1348,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
// Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
// If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
common_chat_params data;
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
if (inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@ -1465,7 +1475,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
}
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.prompt = apply(tmpl, inputs);
// TODO: if (has_raw_python)
return data;
}
@ -1498,14 +1508,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
json additional_context = {
json extra_context = json {
{"enable_thinking", inputs.enable_thinking},
};
extra_context.update(inputs.extra_context);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
if (string_ends_with(data.prompt, "<think>\n")) {
if (!inputs.enable_thinking) {
if (!extra_context["enable_thinking"]) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
@ -1691,7 +1702,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
data.grammar_lazy = false;
if (!inputs.json_schema.is_null()) {
@ -1722,6 +1733,12 @@ static common_chat_params common_chat_templates_apply_jinja(
params.enable_thinking = inputs.enable_thinking;
params.grammar = inputs.grammar;
params.now = inputs.now;
params.extra_context = json::object();
for (auto el : inputs.chat_template_kwargs) {
params.extra_context[el.first] = json::parse(el.second);
}
if (!inputs.json_schema.empty()) {
params.json_schema = json::parse(inputs.json_schema);
}

View File

@ -7,6 +7,7 @@
#include <chrono>
#include <string>
#include <vector>
#include <map>
struct common_chat_templates;
@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
std::map<std::string, std::string> chat_template_kwargs;
};
struct common_chat_params {

View File

@ -8,6 +8,7 @@
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <sstream>
#ifdef _WIN32
@ -369,6 +370,7 @@ struct common_params {
std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
std::string api_prefix = ""; // NOLINT
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
@ -381,6 +383,8 @@ struct common_params {
std::string ssl_file_key = ""; // NOLINT
std::string ssl_file_cert = ""; // NOLINT
std::map<std::string, std::string> default_template_kwargs;
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;

View File

@ -300,6 +300,7 @@ class ModelBase:
gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES,
gguf.MODEL_TENSOR.SSM_CONV1D,
gguf.MODEL_TENSOR.SHORTCONV_CONV,
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
gguf.MODEL_TENSOR.TIME_MIX_W1,
gguf.MODEL_TENSOR.TIME_MIX_W2,
@ -310,6 +311,8 @@ class ModelBase:
gguf.MODEL_TENSOR.POSNET_NORM2,
gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
)
)
or not new_name.endswith(".weight")
@ -320,7 +323,11 @@ class ModelBase:
self.match_model_tensor_name(new_name, key, bid)
for key in (
gguf.MODEL_TENSOR.TOKEN_EMBD,
gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
gguf.MODEL_TENSOR.OUTPUT,
gguf.MODEL_TENSOR.ALTUP_ROUTER,
gguf.MODEL_TENSOR.LAUREL_L,
gguf.MODEL_TENSOR.LAUREL_R,
)
):
if self.ftype in (
@ -809,6 +816,30 @@ class TextModel(ModelBase):
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
res = "minerva-7b"
if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
# ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
res = "hunyuan"
if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
# ref: https://huggingface.co/skt/A.X-4.0
res = "a.x-4.0"
if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
# ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
res = "falcon-h1"
if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
# ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
res = "falcon-h1"
if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
# ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
res = "falcon-h1"
if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
# ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
res = "falcon-h1"
if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
# ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
res = "midm-2.0"
if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
# ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
res = "lfm2"
if res is None:
logger.warning("\n")
@ -921,13 +952,20 @@ class TextModel(ModelBase):
tokenizer = SentencePieceProcessor()
tokenizer.LoadFromFile(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
vocab_size = self.find_hparam([
"vocab_size_per_layer_input", # gemma3n
"vocab_size",
], optional=True) or tokenizer.vocab_size()
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
for token_id in range(tokenizer.vocab_size()):
if token_id >= vocab_size:
logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
break
piece = tokenizer.IdToPiece(token_id)
text = piece.encode("utf-8")
score = tokenizer.GetScore(token_id)
@ -2730,6 +2768,52 @@ class Qwen2Model(TextModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Ernie4_5_ForCausalLM")
class Ernie4_5Model(TextModel):
model_arch = gguf.MODEL_ARCH.ERNIE4_5
def set_vocab(self):
self._set_vocab_sentencepiece()
def set_gguf_parameters(self):
super().set_gguf_parameters()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
num_heads = self.hparams["num_attention_heads"]
num_kv_heads = self.hparams["num_key_value_heads"]
head_dim = self.hparams["head_dim"]
if "ernie." in name:
name = name.replace("ernie.", "model.")
# split the qkv weights
# qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size]
if "qkv_proj" in name:
name_q = name.replace("qkv_proj.weight", "q_proj.weight")
name_k = name.replace("qkv_proj.weight", "k_proj.weight")
name_v = name.replace("qkv_proj.weight", "v_proj.weight")
total_q_dim = num_heads * head_dim
total_k_dim = num_kv_heads * head_dim
total_v_dim = num_kv_heads * head_dim
q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0)
return [
(self.map_tensor_name(name_q), q_proj_weight),
(self.map_tensor_name(name_k), k_proj_weight),
(self.map_tensor_name(name_v), v_proj_weight)
]
# split the up_gate_proj into gate and up
# up_gate_proj shape: [2 * intermediate_size, hidden_size]
if "up_gate_proj" in name:
name_up = name.replace("up_gate_proj.weight", "up_proj.weight")
name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight")
dim_half = data_torch.shape[0] // 2
gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0)
return [
(self.map_tensor_name(name_gate), gate_proj_weight),
(self.map_tensor_name(name_up), up_proj_weight)
]
return [(self.map_tensor_name(name), data_torch)]
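
The Ernie4_5 conversion above splits a fused qkv_proj weight of shape [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size] into Q, K and V, and halves up_gate_proj into gate and up. A self-contained sketch of the same splitting with toy dimensions (the sizes are arbitrary example values):

import torch

num_heads, num_kv_heads, head_dim, hidden = 8, 2, 16, 64

# fused attention projection: [(num_heads + 2 * num_kv_heads) * head_dim, hidden]
qkv = torch.randn((num_heads + 2 * num_kv_heads) * head_dim, hidden)
q, k, v = qkv.split([num_heads * head_dim, num_kv_heads * head_dim, num_kv_heads * head_dim], dim=0)
assert q.shape == (num_heads * head_dim, hidden)
assert k.shape == v.shape == (num_kv_heads * head_dim, hidden)

# fused MLP projection: [2 * intermediate_size, hidden] -> gate, up
intermediate_size = 128
up_gate = torch.randn(2 * intermediate_size, hidden)
gate, up = up_gate.split(intermediate_size, dim=0)
assert gate.shape == up.shape == (intermediate_size, hidden)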
@ModelBase.register(
"Qwen2VLModel",
"Qwen2VLForConditionalGeneration",
@ -4217,6 +4301,7 @@ class Gemma2Model(TextModel):
@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
class Gemma3Model(TextModel):
model_arch = gguf.MODEL_ARCH.GEMMA3
norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
def set_vocab(self):
self._set_vocab_sentencepiece()
@ -4238,9 +4323,8 @@ class Gemma3Model(TextModel):
self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
# both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
# attn_logit_softcapping is removed in Gemma3
assert hparams.get("attn_logit_softcapping") is None
assert hparams.get("final_logit_softcapping") is None
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
if hparams.get("rope_scaling") is not None:
@ -4252,7 +4336,7 @@ class Gemma3Model(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if name.startswith("language_model."):
if "language_model." in name:
name = name.replace("language_model.", "")
elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
@ -4267,8 +4351,9 @@ class Gemma3Model(TextModel):
# ref code in Gemma3RMSNorm
# output = output * (1.0 + self.weight.float())
# note: this is not the case on gemma3n
if name.endswith("norm.weight"):
data_torch = data_torch + 1
data_torch = data_torch + self.norm_shift
return [(self.map_tensor_name(name), data_torch)]
@ -4325,6 +4410,101 @@ class Gemma3VisionModel(MmprojModel):
return [] # skip other tensors
@ModelBase.register("Gemma3nForConditionalGeneration")
class Gemma3NModel(Gemma3Model):
model_arch = gguf.MODEL_ARCH.GEMMA3N
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
_altup_proj: list[Tensor] = []
_altup_unembd: list[Tensor] = []
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs"
self._altup_proj = [
torch.Tensor(), # to be replaced
torch.Tensor(), # to be replaced
torch.Tensor(), # to be replaced
]
self._altup_unembd = [
torch.Tensor(), # to be replaced
torch.Tensor(), # to be replaced
torch.Tensor(), # to be replaced
]
def set_vocab(self):
super().set_vocab()
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"])
self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"])
self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])
activation_sparsity_scale = []
for s in self.hparams["activation_sparsity_pattern"]:
normal_dist = torch.distributions.normal.Normal(0, 1)
std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32))
activation_sparsity_scale.append(std_multiplier.item())
self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale)
sliding_window_pattern = []
for t in self.hparams["layer_types"]:
sliding_window_pattern.append(t == "sliding_attention")
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None:
has_all = all(m.numel() > 0 for m in matrices)
if not has_all:
return None
else:
return torch.stack(matrices, dim=0)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.endswith("_scale"):
name = name + ".weight"
# TODO: implement self.prediction_coefs.weight.clamp_(...)
if "language_model." not in name:
return [] # skip non-language model tensors
if "altup_unembed_projections" in name:
data_torch = data_torch.to(device="cpu")
if ".0." in name:
self._altup_unembd[0] = data_torch
elif ".1." in name:
self._altup_unembd[1] = data_torch
elif ".2." in name:
self._altup_unembd[2] = data_torch
else:
raise ValueError(f"Unknown name: {name}")
out = self._stack_matrices(self._altup_unembd)
if out is not None:
return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)]
else:
return []
if "altup_projections" in name:
data_torch = data_torch.to(device="cpu")
if ".0." in name:
self._altup_proj[0] = data_torch
elif ".1." in name:
self._altup_proj[1] = data_torch
elif ".2." in name:
self._altup_proj[2] = data_torch
else:
raise ValueError(f"Unknown name: {name}")
out = self._stack_matrices(self._altup_proj)
if out is not None:
return [(self.map_tensor_name("model.altup_projections.weight"), out)]
else:
return []
return super().modify_tensors(data_torch, name, bid)
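
One detail worth calling out from Gemma3NModel.set_gguf_parameters above: each target activation sparsity fraction is converted into a standard-normal cutoff with the inverse CDF, and that cutoff is what gets written to the GGUF metadata. A short sketch of the conversion (the sample fractions are example values only):

import torch

def sparsity_to_std_multiplier(sparsity: float) -> float:
    # z-score below which a standard normal variable falls with probability `sparsity`
    normal_dist = torch.distributions.normal.Normal(0.0, 1.0)
    return normal_dist.icdf(torch.tensor(sparsity, dtype=torch.float32)).item()

if __name__ == "__main__":
    for s in (0.5, 0.9, 0.95):
        print(s, round(sparsity_to_std_multiplier(s), 4))
    # expected roughly: 0.5 -> 0.0, 0.9 -> 1.2816, 0.95 -> 1.6449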
@ModelBase.register("Starcoder2ForCausalLM")
class StarCoder2Model(TextModel):
model_arch = gguf.MODEL_ARCH.STARCODER2
@ -4623,6 +4803,14 @@ class ARwkv7Model(Rwkv7Model):
class MambaModel(TextModel):
model_arch = gguf.MODEL_ARCH.MAMBA
def __init__(self, dir_model: Path, *args, **kwargs):
# Avoid using AutoConfig for hparams
hparams = kwargs.pop("hparams", None)
if hparams is None:
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
def set_vocab(self):
vocab_size = self.hparams["vocab_size"]
# Round vocab size to next multiple of 8
@ -4697,6 +4885,216 @@ class MambaModel(TextModel):
return [(new_name, data_torch)]
@ModelBase.register("Mamba2ForCausalLM")
class Mamba2Model(TextModel):
model_arch = gguf.MODEL_ARCH.MAMBA2
def __init__(self, dir_model: Path, *args, **kwargs):
# Avoid using AutoConfig for hparams
# It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
hparams = kwargs.pop("hparams", None)
if hparams is None:
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
self.n_group = self.find_hparam(["n_groups"], optional=True) or 1
def set_vocab(self):
vocab_size = self.hparams["vocab_size"]
# Round vocab size to next multiple of 16
pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
# pad using ceiling division
# ref: https://stackoverflow.com/a/17511341/22827863
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
self.hparams["vocab_size"] = vocab_size
if (self.dir_model / "tokenizer.model").is_file():
self._set_vocab_sentencepiece()
elif (self.dir_model / "tokenizer.model.v3").is_file():
# mamba-codestral
raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
elif (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
else:
# Use the GPT-NeoX tokenizer when no tokenizer files are present
self._set_vocab_builtin("gpt-neox", vocab_size)
def set_gguf_parameters(self):
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
# Fail early for models which don't have a block expansion factor of 2
# TODO: does this really matter?
# skip the assertion for FalconH1 Model
if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
assert self.d_inner == 2 * self.d_model
assert self.d_inner % head_dim == 0
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(self.d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_ssm_conv_kernel(d_conv)
self.gguf_writer.add_ssm_inner_size(self.d_inner)
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
self.gguf_writer.add_ssm_group_count(self.n_group)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_file_type(self.ftype)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("model.backbone") or name.startswith("model.lm_head"):
# map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
name = name.removeprefix("model.")
if name.endswith(".dt_bias"):
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
new_name = self.map_tensor_name(name)
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
data_torch = data_torch.squeeze()
elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
gguf.MODEL_TENSOR.SSM_A,
gguf.MODEL_TENSOR.SSM_D,
]):
# unsqueeze A to use similar shape semantics as Mamba-1
# (D is also unsqueezed, but for more straightforward broadcast internally)
data_torch = data_torch.reshape((*data_torch.shape, 1))
elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
if name.endswith(".A_log"):
logger.debug("A_log --> A ==> " + new_name)
data_torch = -torch.exp(data_torch)
yield (new_name, data_torch)
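
Two small pieces of the Mamba2 conversion above can be checked in isolation: the vocab size is rounded up to the next multiple of pad_vocab_size_multiple with the negative-floor-division trick, and the SSM A tensor is stored as -exp(A_log). A standalone sketch with arbitrary numbers:

import torch

def pad_vocab_size(vocab_size: int, multiple: int = 16) -> int:
    # ceiling division: -(a // -b) == ceil(a / b)
    return -(vocab_size // -multiple) * multiple

assert pad_vocab_size(32000, 16) == 32000
assert pad_vocab_size(32003, 16) == 32016

# A_log -> A, as done for the SSM_A tensor
A_log = torch.tensor([0.0, 1.0, 2.0])
A = -torch.exp(A_log)
assert torch.allclose(A, torch.tensor([-1.0, -2.7182817, -7.389056]))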
@ModelBase.register("JambaForCausalLM")
class JambaModel(TextModel):
model_arch = gguf.MODEL_ARCH.JAMBA
def get_vocab_base_pre(self, tokenizer) -> str:
del tokenizer # unused
return "gpt-2"
def set_vocab(self):
if (self.dir_model / "tokenizer.model").is_file():
# Using Jamba's tokenizer.json causes errors on model load
# (something about "byte not found in vocab"),
# but there's a working tokenizer.model
self._set_vocab_sentencepiece()
else:
# Some Jamba models only have a tokenizer.json, which works.
self._set_vocab_gpt2()
def set_gguf_parameters(self):
d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
d_inner = self.hparams["mamba_expand"] * d_model
d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
# ceiling division
# ref: https://stackoverflow.com/a/17511341/22827863
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16)
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6
n_kv_head = self.hparams["num_key_value_heads"]
attn_offset = self.hparams["attn_layer_offset"]
attn_period = self.hparams["attn_layer_period"]
n_kv_vec = [0 for _ in range(attn_offset)] + [
n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count)
]
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(n_kv_vec)
self.gguf_writer.add_ssm_conv_kernel(d_conv)
self.gguf_writer.add_ssm_inner_size(d_inner)
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
self.gguf_writer.add_file_type(self.ftype)
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Mini-Jamba
name = name.replace(".moe.", ".feed_forward.")
if bid is not None:
moe_offset = self.hparams["expert_layer_offset"]
moe_period = self.hparams["expert_layer_period"]
if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
name = name.replace(".experts.0.", ".")
# process the experts separately
if ".feed_forward.experts." in name:
n_experts = self.hparams["num_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
# merge the experts into a single 3d tensor
for wid in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
# using the same merged name as qwen2moe
merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
new_name = self.map_tensor_name(merged_name)
yield new_name, data_torch
return
new_name = self.map_tensor_name(name)
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
data_torch = data_torch.squeeze()
if name.endswith(".A_log"):
logger.debug("A_log --> A ==> " + new_name)
data_torch = -torch.exp(data_torch)
yield (new_name, data_torch)
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("CohereForCausalLM")
class CommandR2Model(TextModel):
model_arch = gguf.MODEL_ARCH.COMMAND_R
@ -6058,18 +6456,148 @@ class GraniteMoeModel(GraniteModel):
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
]
has_experts = bool(self.hparams.get('num_local_experts'))
if name.endswith("shared_mlp.input_linear.weight"):
ffn_dim = self.hparams["shared_intermediate_size"]
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
gate, up = data_torch.split(ffn_dim, dim=-2)
if has_experts:
return [
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
]
return [
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate),
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up),
]
if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
return [
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch)
]
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
"""GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
layers and optionally uses MoE w/ a shared expert"""
model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
undo_permute = True
def __init__(self, *args, **kwargs):
# Hybrid mamba models use a prefix for the mamba-specific params.
# TODO: Extend this if the prefix(es) need to be configurable
self.hparam_prefixes = ["mamba"]
super().__init__(*args, **kwargs)
# Lists of which layers use ssm vs attention
self._attn_layers = self.get_attn_layers()
self._ssm_layers = [
i for i in range(self.block_count)
if i not in self._attn_layers
]
# n_group and d_inner are used during reshape_tensors for mamba2
self.d_model = self.find_hparam(["hidden_size", "d_model"])
self.n_group = self.find_hparam(["n_groups"])
self.d_inner = self.find_hparam(["expand"]) * self.d_model
def get_attn_layers(self):
# Explicit list of layer type names
if layer_types := self.hparams.get("layer_types"):
return [
i for i, typ in enumerate(layer_types)
if typ == "attention"
]
# Layer types indicated by index or period
attn_layers = self.hparams.get("attn_layer_indices", [])
if not attn_layers:
attn_period = self.hparams.get("attn_layer_period")
assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
attn_offset = self.hparams.get("attn_layer_offset")
assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
attn_layers = [
i for i in range(self.block_count)
if i % attn_period == attn_offset
]
return attn_layers
def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
prefixed = []
for pfx in self.hparam_prefixes:
prefixed.extend(
"_".join([pfx, k])
for k in keys
)
keys = list(keys) + prefixed
return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]:
if (
name.endswith("block_sparse_moe.input_linear.weight")
or "shared_mlp" in name
):
return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
# Determine whether this is a mamba layer or an attention layer
if bid in self._ssm_layers:
return Mamba2Model.modify_tensors(self, data_torch, name, bid)
elif bid in self._attn_layers:
return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
return [(self.map_tensor_name(name), data_torch)]
def set_gguf_parameters(self):
"""This method merges params from both parents and some that are
specific to this model. The result is some duplication of how the params
get set. The following warnings are expected during conversion:
WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
WARNING:Duplicated key name 'granitehybrid.context_length'
"""
GraniteMoeModel.set_gguf_parameters(self)
## Mamba mixer params ##
self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"]))
self.gguf_writer.add_ssm_group_count(self.n_group)
self.gguf_writer.add_ssm_inner_size(self.d_inner)
# NOTE: The mamba_dt_rank is _not_ the right field for how this is used
# in llama.cpp
self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))
## Attention params ##
head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
head_count_kv_vec = [
head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
]
if rope_dim := self.hparams.get("attn_rotary_emb"):
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_head_count_kv(head_count_kv_vec)
## If Bamba, use rope, otherwise don't
use_rope = "BambaForCausalLM" in self.hparams["architectures"]
self.gguf_writer.add_rope_scaling_finetuned(use_rope)
if not use_rope:
self.gguf_writer.add_context_length(2**20)
## Validation ##
d_head = self.find_hparam(["d_head"], optional=True) or 64
assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
def set_vocab(self):
self.hparams["pad_vocab_size_multiple"] = 8
Mamba2Model.set_vocab(self)
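
get_attn_layers above accepts three hparam conventions for marking attention layers: an explicit layer_types list, an explicit attn_layer_indices list, or an offset/period pair. The condensed Python sketch below mirrors that selection order; the sample hparams are invented for illustration.

def get_attn_layers(hparams: dict, block_count: int) -> list[int]:
    # 1) explicit per-layer type names
    if layer_types := hparams.get("layer_types"):
        return [i for i, typ in enumerate(layer_types) if typ == "attention"]
    # 2) explicit indices, else 3) offset/period pattern
    attn_layers = hparams.get("attn_layer_indices", [])
    if not attn_layers:
        period = hparams["attn_layer_period"]
        offset = hparams["attn_layer_offset"]
        attn_layers = [i for i in range(block_count) if i % period == offset]
    return attn_layers

assert get_attn_layers({"layer_types": ["mamba", "attention", "mamba"]}, 3) == [1]
assert get_attn_layers({"attn_layer_period": 4, "attn_layer_offset": 1}, 8) == [1, 5]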
@ModelBase.register("BailingMoeForCausalLM")
class BailingMoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.BAILINGMOE
@ -6278,6 +6806,321 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
super().set_gguf_parameters()
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
@ModelBase.register("FalconH1ForCausalLM")
class FalconH1Model(Mamba2Model):
model_arch = gguf.MODEL_ARCH.FALCON_H1
def __init__(self, *args, **kwargs):
# Set the hparam prefixes for Falcon Mamba2
self.hparam_prefixes = ["mamba"]
# Initialize the base Mamba2Model
super().__init__(*args, **kwargs)
# Use Llama conversion for attention
self._transformer_model_class = LlamaModel
# n_group and d_inner are used during reshape_tensors for mamba2
self.n_group = self.find_hparam(["n_groups"])
self.d_inner = self.find_hparam(["mamba_d_ssm"])
self.d_head = self.find_hparam(["d_head"])
# Initialize any Falcon Mamba2 specific attributes
self.has_attention = True # Falcon Mamba2 has attention components
# Load Falcon-H1 multipliers from hyperparameters
self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
self.intermediate_size = self.find_hparam(["intermediate_size"])
self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
prefixed = []
for pfx in self.hparam_prefixes:
prefixed.extend(
"_".join([pfx, k])
for k in keys
)
keys = list(keys) + prefixed
return super().find_hparam(keys, *args, **kwargs)
def set_vocab(self):
self._set_vocab_gpt2()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
tensors = list(super().modify_tensors(data_torch, name, bid))
tensor = tensors[0][1]
if "down_proj" in name:
tensor = tensor * self.mlp_multipliers[1]
elif "gate_proj" in name:
tensor = tensor * self.mlp_multipliers[0]
elif "k_proj" in name:
tensor = tensor * self.key_multiplier * self.attention_in_multiplier
elif "q_proj" in name:
tensor = tensor * self.attention_in_multiplier
elif "v_proj" in name:
tensor = tensor * self.attention_in_multiplier
elif "o_proj" in name:
tensor = tensor * self.attention_out_multiplier
elif "out_proj" in name:
tensor = tensor * self.ssm_out_multiplier
elif "in_proj" in name:
tensor = tensor * self.ssm_in_multiplier
zxbcdt_multipliers = self.hparams["ssm_multipliers"]
intermediate_size = self.hparams["mamba_d_ssm"]
groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
elif "lm_head" in name:
tensor = tensor * self.hparams["lm_head_multiplier"]
elif "embed_tokens" in name:
tensor = tensor * self.hparams["embedding_multiplier"]
elif "mamba.norm" in name:
tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
tensors = [(tensors[0][0], tensor)]
return tensors
def set_gguf_parameters(self):
super().set_gguf_parameters()
## General Params ##
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
# Override some Mamba2 defaults
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
## Attention params ##
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
self.gguf_writer.add_key_length(self.hparams["head_dim"])
self.gguf_writer.add_value_length(self.hparams["head_dim"])
## Validation ##
assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
# Add any other Falcon Mamba2 specific configuration
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
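
In FalconH1Model.modify_tensors above, the fused in_proj weight is rescaled block by block with the five ssm_multipliers: the first two row blocks each span the SSM intermediate size, the next two each span n_groups * d_state, and the remainder covers the rest of the rows. A schematic sketch of that block-wise scaling with toy dimensions (not the real layout sizes):

import torch

def scale_in_proj(w: torch.Tensor, multipliers: list[float], d_ssm: int, groups_time_state_size: int) -> torch.Tensor:
    w = w.clone()
    w[:d_ssm, :] *= multipliers[0]
    w[d_ssm:2 * d_ssm, :] *= multipliers[1]
    w[2 * d_ssm:2 * d_ssm + groups_time_state_size, :] *= multipliers[2]
    w[2 * d_ssm + groups_time_state_size:2 * d_ssm + 2 * groups_time_state_size, :] *= multipliers[3]
    w[2 * d_ssm + 2 * groups_time_state_size:, :] *= multipliers[4]
    return w

# toy check: d_ssm=2, groups_time_state_size=1 -> 2 + 2 + 1 + 1 + 1 = 7 rows
w = torch.ones(7, 3)
scaled = scale_in_proj(w, [1.0, 2.0, 3.0, 4.0, 5.0], d_ssm=2, groups_time_state_size=1)
assert scaled[:, 0].tolist() == [1.0, 1.0, 2.0, 2.0, 3.0, 4.0, 5.0]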
@ModelBase.register("HunYuanMoEV1ForCausalLM")
class HunYuanMoEModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# For handling tied embeddings
self._tok_embd = None
def set_vocab(self):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
# 1. Get the pre-tokenizer identifier hash
tokpre = self.get_vocab_base_pre(tokenizer)
# 2. Reverse-engineer the merges list from mergeable_ranks
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
if len(merged) == 2: # todo this is an assert in Qwen, why?
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
# 3. Generate the tokens and toktypes lists
vocab_size = self.hparams["vocab_size"]
assert tokenizer.vocab_size == vocab_size
special_tokens = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
tokens: list[str] = []
toktypes: list[int] = []
for i in range(vocab_size):
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.UNUSED)
else:
token = reverse_vocab[i]
tokens.append(token)
if i in special_tokens.values():
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.NORMAL)
# 4. Write all vocab-related fields to the GGUF writer
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_token_merges(merges)
# 5. Add special tokens and chat templates
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.add_to_gguf(self.gguf_writer)
# FIX for BOS token: Overwrite incorrect id read from config.json
self.gguf_writer.add_bos_token_id(127959) # <|bos|>
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_expert_count(hparams["num_experts"])
self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
moe_intermediate_size = hparams["moe_intermediate_size"]
assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
moe_topk = hparams["moe_topk"]
assert all(topk == moe_topk[0] for topk in moe_topk)
self.gguf_writer.add_expert_used_count(moe_topk[0])
moe_shared_expert = hparams["num_shared_expert"]
assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
# Rope
rope_scaling = hparams.get("rope_scaling", {})
if rope_scaling.get("type") == "dynamic":
# HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
# 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
alpha = rope_scaling.get("alpha", 1000)
base = hparams.get("rope_theta", 10000.0)
dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
self.gguf_writer.add_rope_freq_base(scaled_base)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_rope_scaling_factor(1)
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name == "model.embed_tokens.weight":
self._tok_embd = data_torch.clone()
if name == "lm_head.weight":
if self.hparams.get("tie_word_embeddings", False):
logger.info("Skipping tied output layer 'lm_head.weight'")
return []
if name.find("mlp.experts") != -1:
n_experts = self.hparams["num_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
# merge the experts into a single 3d tensor
tensors: list[tuple[str, Tensor]] = []
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("SmolLM3ForCausalLM")
class SmolLM3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.SMOLLM3
def set_vocab(self):
super().set_vocab()
# remove unsupported array slicing in chat template
# ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
if tokenizer.chat_template is not None:
chat_template = tokenizer.chat_template.replace("[:]", "")
self.gguf_writer.add_chat_template(chat_template)
@ModelBase.register("Lfm2ForCausalLM")
@ModelBase.register("LFM2ForCausalLM")
class LFM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.LFM2
def _add_feed_forward_length(self):
ff_dim = self.hparams["block_ff_dim"]
auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
ff_dim = self.hparams["block_ff_dim"]
ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
multiple_of = self.hparams["block_multiple_of"]
if auto_adjust_ff_dim:
ff_dim = int(2 * ff_dim / 3)
# custom dim factor multiplier
if ffn_dim_multiplier is not None:
ff_dim = int(ffn_dim_multiplier * ff_dim)
ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
self.gguf_writer.add_feed_forward_length(ff_dim)
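For example, with hypothetical values (not taken from any real LFM2 config) the auto-adjustment above works out as follows:

```python
# Illustrative walk-through of the feed-forward width adjustment above.
ff_dim             = 10240   # hypothetical block_ff_dim
ffn_dim_multiplier = None    # hypothetical block_ffn_dim_multiplier
multiple_of        = 256     # hypothetical block_multiple_of

ff_dim = int(2 * ff_dim / 3)                                # 6826
if ffn_dim_multiplier is not None:
    ff_dim = int(ffn_dim_multiplier * ff_dim)
ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
print(ff_dim)                                               # 6912, rounded up to a multiple of 256
```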
def set_gguf_parameters(self):
# set num_key_value_heads only for attention layers
self.hparams["num_key_value_heads"] = [
self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
for layer_type in self.hparams["layer_types"]
]
super().set_gguf_parameters()
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
self._add_feed_forward_length()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# conv op requires 2d tensor
if 'conv.conv' in name:
data_torch = data_torch.squeeze(1)
return [(self.map_tensor_name(name), data_torch)]
###### CONVERSION LOGIC ######
@ -6457,12 +7300,20 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
# maybe we should fallback to text model's arch in that case, since not many models have both
text_config = hparams.get("text_config", {})
vision_config = hparams.get("vision_config", {})
arch = hparams["architectures"][0]
arch = None
if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
arch = arches[0]
elif "ssm_cfg" in hparams:
# For non-hf Mamba and Mamba2 models
arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
# if "architectures" is found in the sub-config, use that instead
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
arch = text_config["architectures"][0]
elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
arch = vision_config["architectures"][0]
if arch is None:
raise ValueError("Failed to detect model architecture")
return arch

View File

@ -128,6 +128,9 @@ models = [
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
]
# some models are known to be broken upstream, so we will skip them as exceptions
@ -137,6 +140,12 @@ pre_computed_hashes = [
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
# falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
]

View File

@ -757,7 +757,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| Name | Value | Function |
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimized features for Intel GPUs. (Recommended to set to 1 for Intel devices older than Gen 10.) |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

View File

@ -16,7 +16,7 @@ cd llama.cpp
## CPU Build with BLAS
Building llama.cpp with BLAS support is highly recommended as it has shown to provide performance improvements.
Building llama.cpp with BLAS support is highly recommended as it has shown to provide performance improvements. Make sure to have OpenBLAS installed in your environment.
```bash
cmake -S . -B build \
@ -28,8 +28,9 @@ cmake --build build --config Release -j $(nproc)
```
**Notes**:
- For faster repeated compilation, install [ccache](https://ccache.dev/)
- By default, VXE/VXE2 is enabled. To disable it (not recommended):
- For faster repeated compilation, install [ccache](https://ccache.dev/)
- By default, VXE/VXE2 is enabled. To disable it (not recommended):
```bash
cmake -S . -B build \
@ -41,18 +42,29 @@ cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
```
- For debug builds:
- By default, NNPA is enabled when available. To disable it (not recommended):
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DGGML_NNPA=OFF
cmake --build build --config Release -j $(nproc)
```
- For debug builds:
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Debug \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config Debug -j $(nproc)
```
- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
```bash
cmake -S . -B build \
@ -70,12 +82,18 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)**
You can find popular models pre-converted and verified at [s390x Ready Models](hf.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
These models and their respective tokenizers are verified to run correctly on IBM Z & LinuxONE.
You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
These models have already been converted from `safetensors` to `GGUF Big-Endian`, and their respective tokenizers have been verified to run correctly on IBM z15 and later systems.
2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
![File Type - safetensors](https://img.shields.io/badge/File_Type-safetensors-da1e28)
The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
```bash
python3 convert_hf_to_gguf.py \
--outfile model-name-be.f16.gguf \
@ -96,32 +114,42 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
3. **Convert existing GGUF Little-Endian model to Big-Endian**
![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
```bash
python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
```
For example,
```bash
python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG
mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf
```
**Notes:**
- The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2.
## IBM Accelerators
### 1. SIMD Acceleration
Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14 or EC13. In such systems, the APIs can still run but will use a scalar implementation.
Only available on IBM z15 or later systems with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z14/arch12. On such systems, the APIs can still run but will use a scalar implementation.
### 2. zDNN Accelerator
### 2. NNPA Vector Intrinsics Acceleration
*Only available in IBM z16 or later system. No direction at the moment.*
Only available on IBM z16 or later systems with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13. On such systems, the APIs can still run but will use a scalar implementation.
### 3. Spyre Accelerator
### 3. zDNN Accelerator
*No direction at the moment.*
_Only available in IBM z16 or later system. No direction at the moment._
### 4. Spyre Accelerator
_No direction at the moment._
## Performance Tuning
@ -145,6 +173,22 @@ It is strongly recommended to disable SMT via the kernel boot parameters as it n
IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongly recommended to use BLAS.
## Frequently Asked Questions (FAQ)
1. I'm getting the following error message while trying to load a model: `gguf_init_from_file_impl: failed to load model: this GGUF file version 50331648 is extremely large, is there a mismatch between the host and model endianness?`
Answer: Please ensure that the model you have downloaded/converted is GGUFv3 Big-Endian. These models are usually denoted with the `-be` suffix, e.g., `granite-3.3-2b-instruct-be.F16.gguf`.
You may refer to the [Getting GGUF Models](#getting-gguf-models) section to manually convert a `safetensors` model to `GGUF` Big-Endian. A quick endianness check is sketched right after this FAQ.
2. I'm getting extremely poor performance when running inference on a model
Answer: Please refer to the [Appendix B: SIMD Support Matrix](#appendix-b-simd-support-matrix) to check if your model quantization is supported by SIMD acceleration.
3. I'm building on IBM z17 and getting the following error message: `invalid switch -march=z17`
Answer: Please ensure that your GCC compiler is at least version 15.1.0 and that `binutils` is updated to the latest version. If this does not fix the problem, kindly open an issue.
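Relating to question 1 above, here is a minimal sketch (not an official tool) that reads the GGUF version field to spot an endianness mismatch on a little-endian host; it assumes the standard GGUF layout of a 4-byte magic followed by a 32-bit version.

```python
# Minimal endianness check for a GGUF file on a little-endian host.
import struct
import sys

with open(sys.argv[1], "rb") as f:
    magic = f.read(4)                        # expected to be b"GGUF"
    (version,) = struct.unpack("<I", f.read(4))

if magic != b"GGUF":
    print("not a GGUF file")
elif version > 0xFFFF:                       # e.g. 50331648, which is 3 byte-swapped
    print("version looks byte-swapped: model and host endianness differ")
else:
    print(f"GGUF version {version}: endianness matches this host")
```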
## Getting Help on IBM Z & LinuxONE
1. **Bugs, Feature Requests**
@ -155,3 +199,48 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
Please reach out directly to [aionz@us.ibm.com](mailto:aionz@us.ibm.com).
## Appendix A: Hardware Support Matrix
| | Support | Minimum Compiler Version |
| ------- | ------- | ------------------------ |
| IBM z15 | ✅ | |
| IBM z16 | ✅ | |
| IBM z17 | ✅ | GCC 15.1.0 |
- ✅ - supported and verified to run as intended
- 🚫 - unsupported, we are unlikely to be able to provide support
## Appendix B: SIMD Support Matrix
| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
| ---------- | ----------- | ---- | ---- | ----- |
| FP32 | ✅ | ✅ | ❓ | ❓ |
| FP16 | ✅ | ✅ | ❓ | ❓ |
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ✅ | ❓ | ❓ |
| Q5_0 | 🚫 | 🚫 | ❓ | ❓ |
| Q5_1 | 🚫 | 🚫 | ❓ | ❓ |
| Q8_0 | ✅ | ✅ | ❓ | ❓ |
| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
| Q3_K | ✅ | ✅ | ❓ | ❓ |
| Q4_K | ✅ | ✅ | ❓ | ❓ |
| Q5_K | ✅ | ✅ | ❓ | ❓ |
| Q6_K | ✅ | ✅ | ❓ | ❓ |
| TQ1_0 | 🚫 | 🚫 | ❓ | ❓ |
| TQ2_0 | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_XXS | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_XS | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ3_XXS | 🚫 | 🚫 | ❓ | ❓ |
| IQ3_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ1_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ1_M | 🚫 | 🚫 | ❓ | ❓ |
| IQ4_NL | ✅ | ✅ | ❓ | ❓ |
| IQ4_XS | ✅ | ✅ | ❓ | ❓ |
| FP32->FP16 | 🚫 | ✅ | ❓ | ❓ |
| FP16->FP32 | 🚫 | ✅ | ❓ | ❓ |
- ✅ - acceleration available
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself

View File

@ -557,6 +557,10 @@ ninja
To read documentation for how to build on Android, [click here](./android.md)
## IBM Z & LinuxONE
To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
## Notes about GPU-accelerated backends
The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.

View File

@ -83,20 +83,22 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv
### 2. Define the model architecture in `llama.cpp`
The model params and tensors layout must be defined in `llama.cpp`:
1. Define a new `llm_arch`
2. Define the tensors layout in `LLM_TENSOR_NAMES`
3. Add any non-standard metadata in `llm_load_hparams`
4. Create the tensors for inference in `llm_load_tensors`
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
The model params and tensors layout must be defined in `llama.cpp` source files:
1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
2. In `src/llama-arch.cpp`:
- Add the architecture name to the `LLM_ARCH_NAMES` map.
- Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
### 3. Build the GGML graph implementation
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.
Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.

View File

@ -25,6 +25,9 @@ Additionally, there the following images, similar to the above:
- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`)
The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.

95
docs/ops.md Normal file
View File

@ -0,0 +1,95 @@
# GGML Operations
List of GGML operations and backend support status.
Legend:
- ✅ Fully supported by this backend
- 🟡 Partially supported by this backend
- ❌ Not supported by this backend
| Operation | BLAS | CPU | CUDA | Metal |
|-----------|------|------|------|------|
| ABS | ❌ | ✅ | 🟡 | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ |
| ADD | ❌ | ✅ | ✅ | 🟡 |
| ADD1 | ❌ | ✅ | ✅ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ |
| CLAMP | ❌ | ✅ | ✅ | 🟡 |
| CONCAT | ❌ | ✅ | 🟡 | ✅ |
| CONT | ❌ | ✅ | 🟡 | ✅ |
| CONV_2D_DW | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ |
| CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ |
| COS | ❌ | ✅ | ✅ | 🟡 |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 |
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ✅ | ✅ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 |
| DIV | ❌ | ✅ | ✅ | 🟡 |
| DUP | ❌ | ✅ | 🟡 | 🟡 |
| ELU | ❌ | ✅ | ❌ | 🟡 |
| EXP | ❌ | ✅ | 🟡 | ❌ |
| FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | 🟡 |
| GEGLU_ERF | ❌ | ✅ | ✅ | 🟡 |
| GEGLU_QUICK | ❌ | ✅ | ✅ | 🟡 |
| GELU | ❌ | ✅ | 🟡 | 🟡 |
| GELU_ERF | ❌ | ✅ | 🟡 | 🟡 |
| GELU_QUICK | ❌ | ✅ | 🟡 | 🟡 |
| GET_ROWS | ❌ | ✅ | 🟡 | ✅ |
| GET_ROWS_BACK | ❌ | 🟡 | 🟡 | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ |
| HARDSIGMOID | ❌ | ✅ | 🟡 | ❌ |
| HARDSWISH | ❌ | ✅ | 🟡 | ❌ |
| IM2COL | ❌ | ✅ | ✅ | 🟡 |
| L2_NORM | ❌ | ✅ | ✅ | ✅ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ |
| LOG | ❌ | ✅ | ✅ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ |
| MUL | ❌ | ✅ | ✅ | 🟡 |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | ✅ | ✅ | ✅ |
| NEG | ❌ | ✅ | 🟡 | 🟡 |
| NORM | ❌ | ✅ | ✅ | 🟡 |
| OPT_STEP_ADAMW | ❌ | ✅ | ✅ | ❌ |
| OUT_PROD | 🟡 | 🟡 | 🟡 | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ |
| PAD_REFLECT_1D | ❌ | ✅ | ❌ | ✅ |
| POOL_2D | ❌ | ✅ | ✅ | ✅ |
| REGLU | ❌ | ✅ | ✅ | 🟡 |
| RELU | ❌ | ✅ | 🟡 | 🟡 |
| REPEAT | ❌ | ✅ | 🟡 | ✅ |
| REPEAT_BACK | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | 🟡 |
| RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM_MUL | ❌ | ✅ | ✅ | ✅ |
| ROPE | ❌ | ✅ | ✅ | ✅ |
| ROPE_BACK | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ |
| RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ |
| SCALE | ❌ | ✅ | ✅ | ✅ |
| SET | ❌ | ✅ | ❌ | ✅ |
| SET_ROWS | ❌ | 🟡 | ❌ | 🟡 |
| SGN | ❌ | ✅ | 🟡 | ❌ |
| SIGMOID | ❌ | ✅ | 🟡 | 🟡 |
| SILU | ❌ | ✅ | 🟡 | 🟡 |
| SILU_BACK | ❌ | ✅ | ✅ | ❌ |
| SIN | ❌ | ✅ | ✅ | 🟡 |
| SOFT_MAX | ❌ | ✅ | ✅ | ✅ |
| SOFT_MAX_BACK | ❌ | 🟡 | 🟡 | ❌ |
| SQR | ❌ | ✅ | ✅ | 🟡 |
| SQRT | ❌ | ✅ | ✅ | 🟡 |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ |
| SSM_SCAN | ❌ | ✅ | ✅ | ✅ |
| STEP | ❌ | ✅ | 🟡 | ❌ |
| SUB | ❌ | ✅ | ✅ | 🟡 |
| SUM | ❌ | ✅ | ✅ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ |
| SWIGLU | ❌ | ✅ | ✅ | 🟡 |
| TANH | ❌ | ✅ | 🟡 | 🟡 |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ |
| UPSCALE | ❌ | ✅ | ✅ | 🟡 |

6534
docs/ops/BLAS.csv Normal file

File diff suppressed because it is too large Load Diff

6534
docs/ops/CPU.csv Normal file

File diff suppressed because it is too large Load Diff

6534
docs/ops/CUDA.csv Normal file

File diff suppressed because it is too large Load Diff

6534
docs/ops/Metal.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e
AI_NAME="${AI_NAME:-Miku}"

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -euo pipefail

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
#
# Temporary script - will be removed in the future

View File

@ -55,6 +55,8 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I64) {
v = (float) *(int64_t *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
@ -134,6 +136,11 @@ static bool run(llama_context * ctx, const common_params & params) {
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
if (tokens.empty()) {
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
return false;
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e
MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
cd `dirname $0`
cd ..

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@ -113,15 +113,16 @@ int main(int argc, char ** argv) {
while (true) {
// check if we have enough space in the context to evaluate this batch
int n_ctx = llama_n_ctx(ctx);
int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0);
int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) + 1;
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
exit(0);
}
if (llama_decode(ctx, batch)) {
GGML_ABORT("failed to decode\n");
int ret = llama_decode(ctx, batch);
if (ret != 0) {
GGML_ABORT("failed to decode, ret = %d\n", ret);
}
// sample the next token

View File

@ -1,4 +1,4 @@
#!/usr/bin/env bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# MIT license
# Copyright (C) 2024 Intel Corporation

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# MIT license
# Copyright (C) 2025 Intel Corporation

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
#
# ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
# python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json

View File

@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_NNPA "ggml: enable nnpa" ON)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@ -180,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@ -265,7 +265,6 @@ set(GGML_PUBLIC_HEADERS
include/ggml-cann.h
include/ggml-cpp.h
include/ggml-cuda.h
include/ggml-kompute.h
include/ggml-opt.h
include/ggml-metal.h
include/ggml-rpc.h
@ -359,6 +358,13 @@ write_basic_package_version_file(
VERSION ${GGML_INSTALL_VERSION}
COMPATIBILITY SameMajorVersion)
target_compile_definitions(ggml-base PRIVATE
GGML_VERSION="${GGML_INSTALL_VERSION}"
GGML_COMMIT="${GGML_BUILD_COMMIT}"
)
message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)

View File

@ -339,7 +339,7 @@ extern "C" {
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
// Tensor initialization
GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);

View File

@ -101,6 +101,7 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
@ -133,6 +134,7 @@ extern "C" {
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);

View File

@ -1,50 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_KOMPUTE_MAX_DEVICES 16
struct ggml_vk_device {
int index;
int type; // same as VkPhysicalDeviceType
size_t heapSize;
const char * name;
const char * vendor;
int subgroupSize;
uint64_t bufferAlignment;
uint64_t maxAlloc;
};
struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
bool ggml_vk_has_vulkan(void);
bool ggml_vk_has_device(void);
struct ggml_vk_device ggml_vk_current_device(void);
//
// backend API
//
// forward declaration
typedef struct ggml_backend * ggml_backend_t;
GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
#ifdef __cplusplus
}
#endif

View File

@ -314,6 +314,13 @@
extern "C" {
#endif
// Function type used in fatal error callbacks
typedef void (*ggml_abort_callback_t)(const char * error_message);
// Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
// Returns the old callback for chaining
GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
@ -470,6 +477,7 @@ extern "C" {
GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS,
GGML_OP_GET_ROWS_BACK,
GGML_OP_SET_ROWS,
GGML_OP_DIAG,
GGML_OP_DIAG_MASK_INF,
GGML_OP_DIAG_MASK_ZERO,
@ -481,12 +489,13 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_UPSCALE,
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
GGML_OP_ROLL,
@ -519,6 +528,8 @@ extern "C" {
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_OPT_STEP_ADAMW,
GGML_OP_GLU,
GGML_OP_COUNT,
};
@ -542,6 +553,16 @@ extern "C" {
GGML_UNARY_OP_COUNT,
};
enum ggml_glu_op {
GGML_GLU_OP_REGLU,
GGML_GLU_OP_GEGLU,
GGML_GLU_OP_SWIGLU,
GGML_GLU_OP_GEGLU_ERF,
GGML_GLU_OP_GEGLU_QUICK,
GGML_GLU_OP_COUNT,
};
enum ggml_object_type {
GGML_OBJECT_TYPE_TENSOR,
GGML_OBJECT_TYPE_GRAPH,
@ -627,6 +648,9 @@ extern "C" {
// misc
GGML_API const char * ggml_version(void);
GGML_API const char * ggml_commit(void);
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
GGML_API int64_t ggml_time_ms(void);
GGML_API int64_t ggml_time_us(void);
@ -657,6 +681,7 @@ extern "C" {
GGML_API const char * ggml_op_symbol(enum ggml_op op);
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
GGML_API const char * ggml_glu_op_name(enum ggml_glu_op op);
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
@ -687,6 +712,9 @@ extern "C" {
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
// true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@ -758,6 +786,7 @@ extern "C" {
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor);
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
@ -1086,6 +1115,89 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// gated linear unit ops
// A: n columns, r rows,
// result is n / 2 columns, r rows,
// expects gate in second half of row, unless swapped is true
GGML_API struct ggml_tensor * ggml_glu(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_glu_op op,
bool swapped);
GGML_API struct ggml_tensor * ggml_reglu(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_reglu_swapped(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu_swapped(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_swiglu(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_swiglu_swapped(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu_erf(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu_quick(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
struct ggml_context * ctx,
struct ggml_tensor * a);
// A: n columns, r rows,
// B: n columns, r rows,
GGML_API struct ggml_tensor * ggml_glu_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
enum ggml_glu_op op);
GGML_API struct ggml_tensor * ggml_reglu_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_geglu_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_swiglu_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_geglu_erf_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_geglu_quick_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
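For intuition, a rough numpy sketch of what the single-tensor SwiGLU variant computes, under the assumption (per the comment above) that the gate sits in the second half of each row unless `swapped` is set; this models the documented semantics, not the actual ggml implementation.

```python
# Rough numpy model of ggml_swiglu / ggml_swiglu_swapped semantics (sketch only).
import numpy as np

def silu(x):
    return x / (1.0 + np.exp(-x))

def swiglu(a, swapped=False):
    n = a.shape[-1] // 2
    x, gate = (a[..., n:], a[..., :n]) if swapped else (a[..., :n], a[..., n:])
    return silu(x) * gate        # result has n / 2 columns, same number of rows

a = np.random.randn(4, 8).astype(np.float32)
print(swiglu(a).shape)           # (4, 4)
```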
// normalize along rows
GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
@ -1185,6 +1297,19 @@ extern "C" {
struct ggml_tensor * a,
float s);
// x = s * a + b
GGML_API struct ggml_tensor * ggml_scale_bias(
struct ggml_context * ctx,
struct ggml_tensor * a,
float s,
float b);
GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
float s,
float b);
// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set(
struct ggml_context * ctx,
@ -1375,6 +1500,23 @@ extern "C" {
struct ggml_tensor * b, // row indices
struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
// a TD [n_embd, ne1, ne2, ne3]
// b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
// c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1)
//
// undefined behavior if destination rows overlap
//
// broadcast:
// ne2 % ne11 == 0
// ne3 % ne12 == 0
//
// return view(a)
GGML_API struct ggml_tensor * ggml_set_rows(
struct ggml_context * ctx,
struct ggml_tensor * a, // destination
struct ggml_tensor * b, // source
struct ggml_tensor * c); // row indices
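In numpy terms (ignoring the broadcast dimensions and ggml's reversed dimension order), `ggml_set_rows` behaves roughly like the row scatter below; this is a sketch of the documented semantics, not the actual implementation.

```python
# Rough numpy model of ggml_set_rows: copy rows of b into a at the indices in c.
import numpy as np

a = np.zeros((6, 4), dtype=np.float32)            # destination: 6 rows of n_embd = 4
b = np.arange(8, dtype=np.float32).reshape(2, 4)  # 2 source rows
c = np.array([5, 1], dtype=np.int64)              # row indices, each in [0, 6)

a[c] = b                                          # a[c[i]] = b[i]; indices must not repeat
print(a[5], a[1])
```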
GGML_API struct ggml_tensor * ggml_diag(
struct ggml_context * ctx,
struct ggml_tensor * a);
@ -1412,8 +1554,14 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// a [ne0, ne01, ne02, ne03]
// mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
//
// broadcast:
// ne02 % ne12 == 0
// ne03 % ne13 == 0
//
// fused soft_max(a*scale + mask*(ALiBi slope))
// mask is optional
// max_bias = 0.0f for no ALiBi
GGML_API struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx,
@ -1723,6 +1871,17 @@ extern "C" {
struct ggml_tensor * b,
int stride);
GGML_API struct ggml_tensor * ggml_conv_2d_direct(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
struct ggml_tensor * b, // input data [W, H, C, N]
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
enum ggml_op_pool {
GGML_OP_POOL_MAX,
GGML_OP_POOL_AVG,
@ -1765,6 +1924,12 @@ extern "C" {
enum ggml_scale_mode {
GGML_SCALE_MODE_NEAREST = 0,
GGML_SCALE_MODE_BILINEAR = 1,
GGML_SCALE_MODE_COUNT
};
enum ggml_scale_flag {
GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
};
// interpolate
@ -1777,14 +1942,26 @@ extern "C" {
// interpolate
// interpolate scale to specified dimensions
GGML_API struct ggml_tensor * ggml_upscale_ext(
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1,
int ne2,
int ne3,
enum ggml_scale_mode mode);
enum ggml_scale_mode mode),
"use ggml_interpolate instead");
// Up- or downsamples the input to the specified size.
// 2D scale modes (eg. bilinear) are applied to the first two dimensions.
GGML_API struct ggml_tensor * ggml_interpolate(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3,
uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...]
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
GGML_API struct ggml_tensor * ggml_pad(
@ -1847,11 +2024,17 @@ extern "C" {
#define GGML_KQ_MASK_PAD 64
// q: [n_embd_k, n_batch, n_head, 1]
// k: [n_embd_k, n_kv, n_head_kv, 1]
// v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
// res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
// q: [n_embd_k, n_batch, n_head, ne3 ]
// k: [n_embd_k, n_kv, n_head_kv, ne3 ]
// v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
// mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
// res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
//
// broadcast:
// n_head % n_head_kv == 0
// n_head % ne32 == 0
// ne3 % ne33 == 0
//
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
struct ggml_context * ctx,
struct ggml_tensor * q,
@ -1890,7 +2073,8 @@ extern "C" {
struct ggml_tensor * dt,
struct ggml_tensor * A,
struct ggml_tensor * B,
struct ggml_tensor * C);
struct ggml_tensor * C,
struct ggml_tensor * ids);
// partition into non-overlapping windows with padding if needed
// example:

View File

@ -365,7 +365,6 @@ ggml_add_backend(BLAS)
ggml_add_backend(CANN)
ggml_add_backend(CUDA)
ggml_add_backend(HIP)
ggml_add_backend(Kompute)
ggml_add_backend(METAL)
ggml_add_backend(MUSA)
ggml_add_backend(RPC)

View File

@ -61,10 +61,6 @@
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#endif
// disable C++17 deprecation warning for std::codecvt_utf8
#if defined(__clang__)
# pragma clang diagnostic push
@ -189,9 +185,6 @@ struct ggml_backend_registry {
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg());
#endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg());
#endif
@ -575,7 +568,6 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);
ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);

View File

@ -817,8 +817,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
}
if (sched->debug > 1) {
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
@ -1826,7 +1827,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
ggml_free(copy.ctx_unallocated);
}
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
if (copy.buffer == NULL) {
return false;
@ -1837,28 +1838,45 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
assert(g1->n_nodes == g2->n_nodes);
for (int i = 0; i < g1->n_nodes; i++) {
struct ggml_tensor * t1 = g1->nodes[i];
struct ggml_tensor * t2 = g2->nodes[i];
if (test_node != nullptr) {
// Compute the whole graph and only test the output for a specific tensor
ggml_backend_graph_compute(backend1, g1);
ggml_backend_graph_compute(backend2, g2);
assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
ggml_backend_graph_compute(backend1, &g1v);
ggml_backend_graph_compute(backend2, &g2v);
if (ggml_is_view_op(t1->op)) {
continue;
int test_node_idx = -1;
for (int i = 0; i < g1->n_nodes; i++) {
struct ggml_tensor * t1 = g1->nodes[i];
if (t1 == test_node) {
test_node_idx = i;
break;
}
}
GGML_ASSERT(test_node_idx != -1);
// compare results, calculate rms etc
if (!callback(i, t1, t2, user_data)) {
break;
callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
} else {
for (int i = 0; i < g1->n_nodes; i++) {
struct ggml_tensor * t1 = g1->nodes[i];
struct ggml_tensor * t2 = g2->nodes[i];
assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
ggml_backend_graph_compute(backend1, &g1v);
ggml_backend_graph_compute(backend2, &g2v);
if (ggml_is_view_op(t1->op)) {
continue;
}
// compare results, calculate rms etc
if (!callback(i, t1, t2, user_data)) {
break;
}
}
}
ggml_backend_graph_copy_free(copy);
return true;

View File

@ -65,8 +65,9 @@
#include <aclnnop/aclnn_eq_tensor.h>
#include <aclnnop/aclnn_gt_scalar.h>
#include <aclnnop/aclnn_pow.h>
#include <aclnnop/aclnn_grouped_matmul_v2.h>
#include <aclnnop/aclnn_grouped_matmul_v3.h>
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
#include <aclnnop/aclnn_zero.h>
#include <float.h>
#include <cmath>
@ -804,10 +805,11 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
nb[i] = nb[i - 1] * ne[i - 1];
}
ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
aclTensor* zero =
ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
return zero;
GGML_UNUSED(n_bytes);
}
/**
@ -2654,6 +2656,67 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
}
#ifdef ASCEND_310P
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;
if (src0->type == GGML_TYPE_F16) {
src0_row.type = GGML_TYPE_F32;
}
// src0_row [D, M, 1, 1] weight without permute
src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
src0_row.nb[0] = ori_src0_nb[0];
src0_row.nb[1] = ori_src0_nb[1];
src0_row.nb[2] = ori_src0_nb[1];
src0_row.nb[3] = ori_src0_nb[1];
// src1_row [D, 1, 1, 1] -> input
src1_row.ne[1] = 1;
src1_row.ne[2] = 1;
src1_row.ne[3] = 1;
src1_row.nb[2] = nb11;
src1_row.nb[3] = nb11;
// dst_row [M, 1, 1, 1] -> out
dst_row.ne[1] = 1;
dst_row.ne[2] = 1;
dst_row.ne[3] = 1;
dst_row.nb[2] = nb1;
dst_row.nb[3] = nb1;
//create weight for one row
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
for (int64_t id = 0; id < n_ids; id++) {
// expert index
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
// If B = 1 (broadcast), always use 0; otherwise, use id.
int64_t i11 = (ne11 == 1 ? 0 : id);
int64_t i12 = iid1;
int64_t i1 = id;
int64_t i2 = i12;
void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
src0_row.data = src0_tmp_ptr;
src1_row.data = src1_tmp_ptr;
dst_row.data = dst_tmp_ptr;
dst_row.src[0] = &src0_row;
dst_row.src[1] = &src1_row;
ggml_cann_mul_mat(ctx, &dst_row);
}
}
return;
#endif
std::vector<aclTensor*> src0_tensor_vec;
std::vector<aclTensor*> src1_tensor_vec;
std::vector<aclTensor*> dst_tensor_vec;
@ -2701,9 +2764,9 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
}
size_t GROUP_SIZE = 128;
// GroupedMatmulV2 required tensor_list.size < 128
// GroupedMatmulV3 requires tensor_list.size < 128
for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
// split and call GroupedMatmulV2
// split and call GroupedMatmulV3
size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
@ -2713,7 +2776,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);

View File

@ -359,7 +359,7 @@ struct ggml_backend_cann_context {
ggml_cann_set_device(device);
description = aclrtGetSocName();
bool async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
device, async_mode ? "ON" : "OFF");
}

View File

@ -2086,6 +2086,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
return false;
}
} break;
case GGML_OP_SET_ROWS:
{
// TODO: add support
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
return false;
} break;
case GGML_OP_CPY: {
ggml_tensor *src = op->src[0];
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
@ -2182,12 +2188,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_RMS_NORM:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_CLAMP:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
@ -2205,6 +2209,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_COUNT_EQUAL:
return true;
case GGML_OP_SCALE:
float bias;
memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
return bias == 0.0f; // TODO: support bias != 0.0f
case GGML_OP_SOFT_MAX:
// TODO: support broadcast
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
case GGML_OP_FLASH_ATTN_EXT:{
// derived from [ggml-cuda.cu]
if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
@ -2227,6 +2239,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
// DeepSeek MLA
return false;
}
// TODO: support broadcast
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
if (op->src[0]->ne[3] != 1) {
return false;
}

View File

@ -5,7 +5,7 @@ function(ggml_add_cpu_backend_features cpu_name arch)
# build, using set_source_files_properties() to set the arch flags is not possible
set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
@ -448,6 +448,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# TODO: Separation to determine activation of VX/VXE/VXE2
if (${S390X_M} MATCHES "8561|8562")
set(GGML_NNPA OFF)
message(STATUS "z15 target")
list(APPEND ARCH_FLAGS -march=z15)
elseif (${S390X_M} MATCHES "3931")
@ -464,7 +465,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
if (GGML_VXE)
message(STATUS "VX/VXE/VXE2 enabled")
list(APPEND ARCH_FLAGS -mvx -mzvector)
list(APPEND ARCH_DEFINITIONS GGML_VXE)
endif()
if (GGML_NNPA)
message(STATUS "NNPA enabled")
list(APPEND ARCH_DEFINITIONS GGML_NNPA)
endif()
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
message(STATUS "Wasm detected")
@ -581,4 +589,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (EMSCRIPTEN)
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
# The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
endif()
endfunction()

View File

@ -8,6 +8,7 @@
#include "mmq.h"
#include "ggml-impl.h"
#include "ggml-cpu-impl.h"
#include "simd-mappings.h"
#include "quants.h"
#include "ggml-quants.h"
#include <algorithm>
@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
// Quantize these floats
const float iscale = 127.f / amax;
y[i].d = GGML_FP32_TO_FP16(1 / iscale);
y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
const float id = ( amax != 0.0f ) ? iscale : 0.f;
const __m512 vscale = _mm512_set1_ps(id);
@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
for (int m = 0; m < nr; ++m) {
const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
__m512 vsum;
@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));
for (int m = 0; m < nr; ++m) {
const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
__m512 vsum;
@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
for (int m = 0; m < nr; ++m) {
const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
__m512 vsum;
@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
va[k] = _mm512_set1_epi32(a_ptr[k]);
vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
}
vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
}
// load b
@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
for (int k = 0; k < 8; ++k) {
va[k] = _mm512_set1_epi32(a_ptr[k]);
}
vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
}
// load b
@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
va[k] = _mm512_set1_epi32(a_ptr[k]);
va[k] = _mm512_add_epi8(va[k], off);
}
vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
}
// load b
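
The changes in this file and the ones that follow swap the generic GGML_FP16_TO_FP32/GGML_FP32_TO_FP16 macros for CPU-backend-local GGML_CPU_* variants declared in simd-mappings.h. For readers unfamiliar with the conversion itself, a portable bit-level fp16-to-fp32 fallback looks roughly like the sketch below; it is illustrative only, not the actual ggml implementation (which typically maps to hardware conversions such as F16C or NEON where available):

```c
#include <stdint.h>
#include <string.h>

// Sketch of an IEEE 754 binary16 -> binary32 conversion (assumed fallback).
static float fp16_to_fp32_sketch(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
    uint32_t exp  = (h >> 10) & 0x1Fu;
    uint32_t mant =  h        & 0x3FFu;
    uint32_t bits;

    if (exp == 0) {
        if (mant == 0) {
            bits = sign;                                   // +/- zero
        } else {                                           // subnormal half
            int shift = 0;
            while ((mant & 0x400u) == 0) { mant <<= 1; shift++; }
            mant &= 0x3FFu;
            bits = sign | ((uint32_t)(127 - 15 + 1 - shift) << 23) | (mant << 13);
        }
    } else if (exp == 0x1Fu) {
        bits = sign | 0x7F800000u | (mant << 13);          // inf / NaN
    } else {
        bits = sign | ((exp + (127 - 15)) << 23) | (mant << 13);  // normal
    }

    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}
```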

View File

@ -3,6 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "simd-mappings.h"
#include "../../quants.h"
#include "../../ggml-cpu-impl.h"
@ -62,7 +63,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
@ -104,7 +105,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
int32x4_t accv = vdupq_n_s32(0);
@ -120,7 +121,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
accv = vaddq_s32(accv, vi);
}
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
y[i].s = GGML_CPU_FP32_TO_FP16(d * vaddvq_s32(accv));
}
#else
GGML_UNUSED(nb);
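
The NEON paths above vectorize q8_0/q8_1 quantization. The reference scalar form they implement is roughly the sketch below (the real block_q8_0 stores the scale as fp16 via GGML_CPU_FP32_TO_FP16, and q8_1 additionally caches s = d * sum(qs); a plain float is used here to keep the example self-contained):

```c
#include <math.h>
#include <stdint.h>

#define QK8_0 32

typedef struct {
    float  d;              // block scale (stored as ggml_half in ggml)
    int8_t qs[QK8_0];      // quantized values in [-127, 127]
} block_q8_0_ref;

static void quantize_row_q8_0_sketch(const float * x, block_q8_0_ref * y, int64_t k) {
    const int64_t nb = k / QK8_0;
    for (int64_t i = 0; i < nb; ++i) {
        float amax = 0.0f;                        // absolute max of the block
        for (int j = 0; j < QK8_0; ++j) {
            const float v = fabsf(x[i*QK8_0 + j]);
            if (v > amax) amax = v;
        }
        const float d  = amax / ((1 << 7) - 1);   // the max maps to +/-127
        const float id = d ? 1.0f/d : 0.0f;
        y[i].d = d;
        for (int j = 0; j < QK8_0; ++j) {
            y[i].qs[j] = (int8_t) roundf(x[i*QK8_0 + j] * id);
        }
    }
}
```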
@ -194,10 +195,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
float32_t _scale[4] = {
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
};
float32x4_t scale = vld1q_f32(_scale);
@ -274,10 +275,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
// dot product
sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
@ -313,9 +314,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
// dot product
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
@ -354,9 +355,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
// dot product
sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}
sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
@ -404,8 +405,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@ -423,7 +424,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
}
*s = sumf;
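
Every SVE/NEON branch above reduces to the same per-block computation as this scalar tail. A sketch of one block pair, assuming ggml's q4_0 layout (QK4_0 == QK8_0 == 32, two 4-bit values per byte, low nibbles first, implicit -8 offset); not the verbatim fallback:

```c
#include <stdint.h>

static float dot_q4_0_q8_0_block_sketch(const uint8_t * xqs, float xd,
                                        const int8_t  * yqs, float yd) {
    int sumi = 0;
    for (int j = 0; j < 16; ++j) {
        const int v0 = (xqs[j] & 0x0F) - 8;   // low nibble  -> elements 0..15
        const int v1 = (xqs[j] >>   4) - 8;   // high nibble -> elements 16..31
        sumi += v0 * yqs[j] + v1 * yqs[j + 16];
    }
    return sumi * xd * yd;                    // both block scales applied once
}
```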
@ -464,10 +465,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i];
float32_t summs_t[4] = {
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)
GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y0->s),
GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y0->s),
GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y1->s),
GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y1->s)
};
summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
@ -490,10 +491,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
// mmla into int32x4_t
float32_t _scale[4] = {
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
};
float32x4_t scale = vld1q_f32(_scale);
@ -539,7 +540,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s) + GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
const uint8x16_t m4b = vdupq_n_u8(0x0F);
@ -562,8 +563,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
@ -582,7 +583,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
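
q4_1 additionally carries a per-block offset m, which is why q8_1 caches the s field used in the summs terms above: the offset factors out of the dot product and only needs the sum of the activations. A toy check of that identity (made-up values):

```c
#include <stdio.h>

int main(void) {
    const float dx = 0.1f, mx = 0.7f;        // q4_1 block scale and min
    const float dy = 0.05f;                  // q8_1 block scale
    const int   qx[4] = { 3,  0, 15,  8 };   // 4-bit weights (0..15)
    const int   qy[4] = {12, -7, 33, -1 };   // 8-bit activations

    float direct = 0.0f;
    int   sumi = 0, qysum = 0;
    for (int i = 0; i < 4; ++i) {
        direct += (dx*qx[i] + mx) * (dy*qy[i]);
        sumi  += qx[i]*qy[i];
        qysum += qy[i];
    }
    const float s     = dy * qysum;          // what q8_1 precomputes in y->s
    const float split = dx*dy*sumi + mx*s;   // what the kernels accumulate
    printf("direct=%g split=%g\n", direct, split);
    return 0;
}
```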
@ -666,10 +667,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@ -694,7 +695,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
}
*s = sumf;
@ -739,8 +740,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
const uint8x16_t m4b = vdupq_n_u8(0x0F);
summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
// extract the 5th bit via lookup table ((b) << 4)
memcpy(&qh0, x0->qh, sizeof(qh0));
@ -784,10 +785,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@ -812,7 +813,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -864,10 +865,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
float32_t _scale[4] = {
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
};
float32x4_t scale = vld1q_f32(_scale);
@ -934,10 +935,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}
sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
@ -960,9 +961,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
@ -1002,8 +1003,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
// scale creation
const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d);
const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d);
const float32_t deq1 = GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d);
const float32_t deq2 = GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d);
// duplicate deq1 in first half of vector and deq2 in second half of vector
const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
@ -1043,11 +1044,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@ -1059,7 +1060,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumi += x[ib].qs[j]*y[ib].qs[j];
}
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
}
*s = sumf;
@ -1217,7 +1218,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
#if defined(__ARM_FEATURE_DOTPROD)
sumi0 = vaddq_s32(sumi0, sumi1);
@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
}
}
sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
}
*s = sumf;
@ -1362,7 +1363,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
#if defined(__ARM_FEATURE_DOTPROD)
sumi0 = vaddq_s32(sumi0, sumi1);
@ -1393,7 +1394,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
}
}
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
sumf += (float) sumi * d;
}
@ -1425,9 +1426,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
switch (vector_length) {
case 128:
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
svfloat32_t d_broad = svdup_n_f32((float32_t)d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
@ -1570,9 +1571,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
case 256:
case 512:
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
svfloat32_t d_broad = svdup_n_f32((float32_t)d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
@ -1671,8 +1672,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
float sum = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -1742,8 +1743,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
summs += y[i].bsums[j] * (sc[j] >> 4);
}
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
int isum = 0;
int is = 0;
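
The K-quant kernels apply the same kind of factoring per sub-block: dall scales the quantized dot product while dmin multiplies the cached bsums, assuming the usual q2_K dequantization x = d*sc*q - dmin*m. A toy check of that split for a single sub-block (made-up values):

```c
#include <stdio.h>

int main(void) {
    const float d = 0.5f, dmin = 0.25f;      // super-block scales (fp16 in ggml)
    const int   sc = 3,   m    = 2;          // one sub-block scale / min pair
    const int   q[4] = { 0,  1, 2, 3 };      // 2-bit weights
    const int   y[4] = { 5, -1, 7, 2 };      // q8 activations

    float direct = 0.0f;
    int   qy = 0, ysum = 0;
    for (int i = 0; i < 4; ++i) {
        direct += (d*sc*q[i] - dmin*m) * y[i];
        qy   += q[i]*y[i];
        ysum += y[i];
    }
    const float split = d*sc*qy - dmin*m*ysum;   // the isum/summs shape of the kernels
    printf("direct=%g split=%g\n", direct, split);
    return 0;
}
```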
@ -1805,7 +1806,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT q3_sv = x[i].qs;
const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask;
@ -1981,7 +1982,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].hmask;
@ -2112,7 +2113,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -2258,18 +2259,18 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)),
vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins))));
const float32x4_t dmins = {
GGML_FP16_TO_FP32(x0->dmin) * y0->d,
GGML_FP16_TO_FP32(x0->dmin) * y1->d,
GGML_FP16_TO_FP32(x1->dmin) * y0->d,
GGML_FP16_TO_FP32(x1->dmin) * y1->d,
GGML_CPU_FP16_TO_FP32(x0->dmin) * y0->d,
GGML_CPU_FP16_TO_FP32(x0->dmin) * y1->d,
GGML_CPU_FP16_TO_FP32(x1->dmin) * y0->d,
GGML_CPU_FP16_TO_FP32(x1->dmin) * y1->d,
};
vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins);
const float32x4_t superblock_scale = {
GGML_FP16_TO_FP32(x0->d) * y0->d,
GGML_FP16_TO_FP32(x0->d) * y1->d,
GGML_FP16_TO_FP32(x1->d) * y0->d,
GGML_FP16_TO_FP32(x1->d) * y1->d,
GGML_CPU_FP16_TO_FP32(x0->d) * y0->d,
GGML_CPU_FP16_TO_FP32(x0->d) * y1->d,
GGML_CPU_FP16_TO_FP32(x1->d) * y0->d,
GGML_CPU_FP16_TO_FP32(x1->d) * y1->d,
};
vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
}
@ -2289,8 +2290,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
@ -2377,8 +2378,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
@ -2478,9 +2479,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -2520,8 +2521,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
@ -2630,9 +2631,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -2827,10 +2828,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
const float32x4_t superblock_scale = {
GGML_FP16_TO_FP32(x0->d) * y0->d,
GGML_FP16_TO_FP32(x0->d) * y1->d,
GGML_FP16_TO_FP32(x1->d) * y0->d,
GGML_FP16_TO_FP32(x1->d) * y1->d,
GGML_CPU_FP16_TO_FP32(x0->d) * y0->d,
GGML_CPU_FP16_TO_FP32(x0->d) * y1->d,
GGML_CPU_FP16_TO_FP32(x1->d) * y0->d,
GGML_CPU_FP16_TO_FP32(x1->d) * y1->d,
};
visum = vsubq_s32(visum, vibias);
@ -2858,7 +2859,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
svuint8_t q6h_1, q6h_2, q6h_3, q6h_4;
for (int i = 0; i < nb; ++i) {
const float d_all = GGML_FP16_TO_FP32(x[i].d);
const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT q6 = x[i].ql;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
@ -3011,7 +3012,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d_all = GGML_FP16_TO_FP32(x[i].d);
const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT q6 = x[i].ql;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
@ -3128,7 +3129,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -3199,7 +3200,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
float sumf1 = 0, sumf2 = 0;
@ -3234,7 +3235,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
int32_t bsum = 0;
@ -3284,7 +3285,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const uint8x8_t scales8 = vld1_u8(x[i].scales);
@ -3329,7 +3330,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const uint8_t * GGML_RESTRICT sc = x[i].scales;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -3398,7 +3399,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
@ -3458,7 +3459,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
float sumf = 0;
for (int i = 0; i < nb; i++) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const int8_t * q8 = y[i].qs;
const uint8_t * qs = x[i].qs;
const uint8_t * qh = x[i].qh;
@ -3521,7 +3522,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -3557,7 +3558,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -3630,7 +3631,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
@ -3691,7 +3692,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint8_t * GGML_RESTRICT signs = x[i].signs;
@ -3786,7 +3787,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
}
sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
sumf += y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
}
*s = sumf;
@ -3817,7 +3818,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
qs += 4;
}
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
}
*s = sumf;
@ -3905,7 +3906,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
}
sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
}
*s = sumf;
@ -3952,7 +3953,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
qh += 2;
}
sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
}
*s = sumf;
@ -4003,13 +4004,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
sumf +=
GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
GGML_CPU_FP16_TO_FP32(x[ib+0].d) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
GGML_CPU_FP16_TO_FP32(x[ib+1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
}
#endif
for (; ib < nb; ++ib) {
const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
int sumi1 = 0, sumi2 = 0;
for (int j = 0; j < QK4_NL/2; ++j) {
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@ -4071,7 +4072,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
}
sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
}
*s = sumf;
@ -4079,7 +4080,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
#else
float sumf = 0;
for (int ibl = 0; ibl < nb; ++ibl) {
const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
uint16_t h = x[ibl].scales_h;
const uint8_t * qs = x[ibl].qs;
const int8_t * q8 = y[ibl].qs;

View File

@ -6,6 +6,7 @@
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-impl.h"
#include "simd-mappings.h"
#include "traits.h"
#include <cmath>
@ -51,7 +52,7 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
const float d = amax / ((1 << 7) - 1);
id[row_iter] = d ? 1.0f / d : 0.0f;
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
}
for (int j = 0; j < 8; j++) {
@ -102,7 +103,7 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
const float d = amax / ((1 << 7) - 1);
id[row_iter] = d ? 1.0f / d : 0.0f;
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
}
for (int j = 0; j < QK8_0 * 4; j++) {
@ -145,7 +146,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
const float d = amax / ((1 << 7) - 1);
id[row_iter] = d ? 1.0f / d : 0.0f;
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
}
for (int j = 0; j < 4; j++) {
@ -221,7 +222,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
const float d = amax / ((1 << 7) - 1);
id[row_iter] = d ? 1.0f / d : 0.0f;
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
}
for (int j = 0; j < QK8_0 * 4; j++) {
@ -311,7 +312,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@ -399,7 +400,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@ -514,7 +515,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@ -608,7 +609,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@ -1117,7 +1118,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
@ -1570,7 +1571,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
@ -2039,7 +2040,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
@ -2147,7 +2148,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}

View File

@ -3,6 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "simd-mappings.h"
#include "../../quants.h"
#include "../../ggml-cpu-impl.h"
@ -474,7 +475,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
// Quantize these floats
const float d = max_scalar / 127.f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id );
@ -548,7 +549,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
// Quantize these floats
const float d = max_scalar / 127.f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
const __m256 mul = __lasx_xvreplfr2vr_s( id );
@ -576,7 +577,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
// Compute the sum of the quants and set y[i].s
const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3));
const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7));
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));
y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));
// Convert int32 to int16
ni0 = lsx_packs_w( ni0, ni1 );
@ -667,7 +668,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
/* Compute combined scale for the block */
const __m256 d = __lasx_xvreplfr2vr_s( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
const __m256 d = __lasx_xvreplfr2vr_s( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
@ -699,7 +700,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (; ib + 1 < nb; ib += 2) {
// Compute combined scale for the block 0 and 1
const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
@ -717,7 +718,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
//_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
// Compute combined scale for the block 2 and 3
const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
@ -766,7 +767,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
}
*s = sumf;
@ -797,10 +798,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
const float d0 = GGML_FP16_TO_FP32(x[ib].d);
const float d1 = GGML_FP16_TO_FP32(y[ib].d);
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );
@ -834,7 +835,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -865,7 +866,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
/* Compute combined scale for the block */
const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); //FIXME
const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); //FIXME
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
__m256i bxhi = bytes_from_bits_32(x[ib].qh);
@ -902,7 +903,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
}
*s = sumf;
@ -934,16 +935,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
const __m256 dx = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d));
const __m256 dx = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d));
summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
__m256i bxhi = bytes_from_bits_32(x[ib].qh);
bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10));
qx = __lasx_xvor_v(qx, bxhi);
const __m256 dy = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib].d));
const __m256 dy = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib].d));
const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
@ -973,7 +974,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -1003,7 +1004,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
// Compute combined scale for the block
const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
__m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0);
__m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
@ -1023,7 +1024,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumi += x[ib].qs[j]*y[ib].qs[j];
}
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
}
*s = sumf;
@ -1047,8 +1048,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -1116,8 +1117,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
summs += y[i].bsums[j] * (sc[j] >> 4);
}
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
int isum = 0;
int is = 0;
@ -1170,7 +1171,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
// Set up scales
@ -1294,7 +1295,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1330,8 +1331,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@ -1438,9 +1439,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1477,8 +1478,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@ -1593,9 +1594,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1624,7 +1625,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
@ -1713,7 +1714,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1780,7 +1781,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
__m256 accumf = (__m256)__lasx_xvldi(0);
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m256i sumi1 = __lasx_xvldi(0);
@ -1820,7 +1821,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
int32_t bsum = 0;
@ -1895,7 +1896,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
__m256 accumf = (__m256)__lasx_xvldi(0);
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -1980,7 +1981,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const uint8_t * GGML_RESTRICT sc = x[i].scales;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -2049,7 +2050,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
__m256 accumf = (__m256)__lasx_xvldi(0);
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
@ -2108,7 +2109,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
float sumf = 0;
for (int i = 0; i < nb; i++) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const int8_t * q8 = y[i].qs;
const uint8_t * qs = x[i].qs;
const uint8_t * qh = x[i].qh;
@ -2168,7 +2169,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
__m256 accumf = (__m256)__lasx_xvldi(0);
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -2213,7 +2214,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -2279,7 +2280,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
__m256 accumf = (__m256)__lasx_xvldi(0);
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
@ -2340,7 +2341,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint8_t * GGML_RESTRICT signs = x[i].signs;
@ -2451,7 +2452,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
}
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum);
accum1 += d * sumi1;
}
@ -2484,7 +2485,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
qs += 4;
}
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
}
*s = sumf;
@ -2530,9 +2531,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
const __m256i p_1 = lasx_madd_h(p16_1, mone);
const __m256i p_2 = lasx_madd_h(p16_2, mone);
accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
__lasx_xvffint_s_w(p_1), accum1);
accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
__lasx_xvffint_s_w(p_2), accum2);
}
@ -2540,7 +2541,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
#endif
for (; ib < nb; ++ib) {
const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
int sumi1 = 0, sumi2 = 0;
for (int j = 0; j < QK4_NL/2; ++j) {
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@ -2595,7 +2596,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
sumi1 = __lasx_xvadd_w(p_1, sumi1);
sumi2 = __lasx_xvadd_w(p_2, sumi2);
}
accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
__lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum);
}
@ -2604,7 +2605,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
#else
float sumf = 0;
for (int ibl = 0; ibl < nb; ++ibl) {
const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
uint16_t h = x[ibl].scales_h;
const uint8_t * qs = x[ibl].qs;
const int8_t * q8 = y[ibl].qs;

View File

@ -3,6 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "simd-mappings.h"
#include "../../quants.h"
#include "../../ggml-cpu-impl.h"
@ -67,7 +68,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float id = d ? 1.0f/d : 0.0f;
const vector float vid = vec_splats(id);
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
for (int j = 0; j < 8; j++) {
const vector float v = vec_round(vec_mul(srcv[j], vid));
@ -112,7 +113,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float id = d ? 1.0f/d : 0.0f;
const vector float vid = vec_splats(id);
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
vector int accv = vec_splats(0);
@ -127,7 +128,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
accv = vec_add(accv, vec_sld(accv, accv, 4));
accv = vec_add(accv, vec_sld(accv, accv, 8));
y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
y[i].s = GGML_CPU_FP32_TO_FP16(d * vec_extract(accv, 0));
}
#else
@ -170,8 +171,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
__builtin_prefetch(x[ib].qs, 0, 1);
__builtin_prefetch(y[ib].qs, 0, 1);
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
vector float vd = vec_mul(vxd, vyd);
vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@ -214,7 +215,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
}
*s = sumf;
@ -249,12 +250,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
__builtin_prefetch(x[ib].qs, 0, 1);
__builtin_prefetch(y[ib].qs, 0, 1);
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
vector float vd = vec_mul(vxd, vyd);
vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
vsumf0 = vec_madd(vxmin, vys, vsumf0);
vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@ -291,7 +292,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -326,8 +327,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
__builtin_prefetch(x[ib].qs, 0, 1);
__builtin_prefetch(y[ib].qs, 0, 1);
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
vector float vd = vec_mul(vxd, vyd);
vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])};
@ -379,7 +380,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
}
*s = sumf;
@ -415,12 +416,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
__builtin_prefetch(x[ib].qs, 0, 1);
__builtin_prefetch(y[ib].qs, 0, 1);
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
vector float vd = vec_mul(vxd, vyd);
vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
vsumf0 = vec_madd(vxmin, vys, vsumf0);
vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])};
@ -470,7 +471,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -502,8 +503,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
__builtin_prefetch(x[ib].qs, 0, 1);
__builtin_prefetch(y[ib].qs, 0, 1);
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
vector float vd = vec_mul(vxd, vyd);
vector signed char q8x0 = vec_xl( 0, x[ib].qs);
@ -542,7 +543,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumi += x[ib].qs[j]*y[ib].qs[j];
}
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
}
*s = sumf;
@ -574,11 +575,11 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
vector float vsumf3 = vec_splats(0.0f);
for (int i = 0; i < nb; ++i) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
vector float vdmin = vec_mul(vxmin, vyd);
vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
@ -708,8 +709,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
summs += y[i].bsums[j] * (sc[j] >> 4);
}
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
int isum = 0;
int is = 0;
@ -770,7 +771,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
vector float vsumf3 = vec_splats(0.0f);
for (int i = 0; i < nb; ++i) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
@ -962,7 +963,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1005,11 +1006,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
vector float vsumf3 = vec_splats(0.0f);
for (int i = 0; i < nb; ++i) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
vector float vdmin = vec_mul(vxmin, vyd);
vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
@ -1177,9 +1178,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1222,11 +1223,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
vector float vsumf3 = vec_splats(0.0f);
for (int i = 0; i < nb; ++i) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
vector float vdmin = vec_mul(vxmin, vyd);
UNUSED(kmask1);
@ -1394,9 +1395,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1432,7 +1433,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
vector float vsumf3 = vec_splats(0.0f);
for (int i = 0; i < nb; ++i) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
@ -1591,7 +1592,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1659,7 +1660,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
for (int i = 0; i < nb; ++i) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
@ -1742,7 +1743,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
int32_t bsum = 0;
@ -1790,7 +1791,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
for (int i = 0; i < nb; ++i) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
@ -1871,7 +1872,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const uint8_t * GGML_RESTRICT sc = x[i].scales;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -1939,7 +1940,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
for (int i = 0; i < nb; ++i) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
@ -2033,7 +2034,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
float sumf = 0;
for (int i = 0; i < nb; i++) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const int8_t * q8 = y[i].qs;
const uint8_t * qs = x[i].qs;
const uint8_t * qh = x[i].qh;
@ -2096,7 +2097,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
vector float vsumf3 = vec_splats(0.0f);
for (int i = 0; i < nb; ++i) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
@ -2176,7 +2177,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -2236,7 +2237,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
for (int i = 0; i < nb; ++i) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
@ -2329,7 +2330,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint8_t * GGML_RESTRICT signs = x[i].signs;
@ -2394,7 +2395,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
vector float vsumf3 = vec_splats(0.0f);
for (int i = 0; i < nb; ++i) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
vector float vyd = vec_splats(y[i].d);
vector float vd = vec_mul(vxd, vyd);
@ -2505,7 +2506,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
qs += 4;
}
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
}
*s = sumf;
@ -2546,8 +2547,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
__builtin_prefetch(y[ib].qs, 0, 1);
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
vector float vd = vec_mul(vxd, vyd);
vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@ -2582,7 +2583,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
#endif
for (; ib < nb; ++ib) {
const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
int sumi1 = 0, sumi2 = 0;
for (int j = 0; j < QK4_NL/2; ++j) {
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@ -2620,7 +2621,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
for (int ibl = 0; ibl < nb; ++ibl) {
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d));
vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ibl].d));
vector float vyd = vec_splats(y[ibl].d);
vector float vd = vec_mul(vxd, vyd);
@ -2697,7 +2698,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
#else
float sumf = 0;
for (int ibl = 0; ibl < nb; ++ibl) {
const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
uint16_t h = x[ibl].scales_h;
const uint8_t * qs = x[ibl].qs;
const int8_t * q8 = y[ibl].qs;
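
Most of the lines touched above sit in either the SIMD body or the scalar fallback of a block dot product, and the recurring shape is the same everywhere: accumulate an integer dot product over one block, then scale it by the product of the two block scales that the renamed macros convert from fp16. A self-contained scalar sketch of the q8_0 case, mirroring the fallback loop of ggml_vec_dot_q8_0_q8_0 above (the block layout here is illustrative, and the scale is kept in fp32 so the conversion itself is left out):

    #include <stdint.h>

    #define BLK 32                                    /* elements per q8_0 block */

    typedef struct {
        float  d;                                     /* block scale; stored as fp16 in ggml */
        int8_t qs[BLK];                               /* quantized values */
    } blk_q8_ref;

    static float dot_q8_q8_ref(const blk_q8_ref *x, const blk_q8_ref *y, int nb) {
        float sumf = 0.0f;
        for (int ib = 0; ib < nb; ++ib) {
            int sumi = 0;
            for (int j = 0; j < BLK; ++j) {
                sumi += x[ib].qs[j] * y[ib].qs[j];    /* integer multiply-accumulate */
            }
            /* real code: sumi * GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) */
            sumf += (float) sumi * x[ib].d * y[ib].d;
        }
        return sumf;
    }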

View File

@ -3,6 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "simd-mappings.h"
#include "../../quants.h"
#include "../../ggml-cpu-impl.h"
@ -45,7 +46,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
@ -85,7 +86,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
@ -102,7 +103,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
// set y[i].s
int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
y[i].s = GGML_FP32_TO_FP16(sum*d);
y[i].s = GGML_CPU_FP32_TO_FP16(sum*d);
}
#else
@ -160,7 +161,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
}
#endif
@ -177,7 +178,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
}
*s = sumf;
@ -225,7 +226,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
#endif
@ -242,7 +243,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -293,7 +294,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
sumf += (GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)) * sumi;
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
}
#endif
@ -316,7 +317,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
}
*s = sumf;
@ -366,7 +367,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
#endif
@ -389,7 +390,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -427,7 +428,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
}
#endif
@ -438,7 +439,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumi += x[ib].qs[j]*y[ib].qs[j];
}
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
}
*s = sumf;
@ -465,8 +466,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const uint8_t * q2 = x[i].qs;
const int8_t * q8 = y[i].qs;
const uint8_t * sc = x[i].scales;
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
uint8_t *patmp = atmp;
int vsums;
int tmp;
@ -569,8 +570,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int8_t * q8 = y[i].qs;
const uint8_t * sc = x[i].scales;
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
size_t vl = 16;
@ -644,8 +645,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const uint8_t * q2 = x[i].qs;
const int8_t * q8 = y[i].qs;
const uint8_t * sc = x[i].scales;
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
uint8_t *patmp = atmp;
int vsums;
int tmp;
@ -750,8 +751,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
summs += y[i].bsums[j] * (sc[j] >> 4);
}
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
int isum = 0;
int is = 0;
@ -916,7 +917,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
q3 += 32; q8 += 128; scale += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
sumf += d * isum;
}
@ -1017,7 +1018,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
sumf += d*sum_t;
@ -1134,7 +1135,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
q3 += 32; q8 += 128; scale += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
sumf += d * isum;
}
break;
@ -1202,7 +1203,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1239,8 +1240,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
int tmp, tmp2, sumi;
__asm__ __volatile__(
@ -1361,8 +1362,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
size_t vl = 8;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
@ -1422,8 +1423,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
break;
case 128:
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
int tmp, tmp2, sumi;
__asm__ __volatile__(
@ -1580,9 +1581,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1627,8 +1628,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const uint8_t * GGML_RESTRICT hm = x[i].qh;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl);
vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl);
@ -1749,9 +1750,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1778,7 +1779,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * restrict q6 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
@ -1862,7 +1863,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
case 256:
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT q6 = x[i].ql;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
@ -1943,7 +1944,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
case 128:
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * restrict q6 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
@ -2058,7 +2059,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
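
The quantize_row_q8_0 / quantize_row_q8_1 hunks change only how the scale is stored (GGML_CPU_FP32_TO_FP16 instead of GGML_FP32_TO_FP16); the quantization itself is untouched: per block, take the absolute maximum, derive d = amax/127, and emit each value as a rounded multiple of d. A scalar sketch with the scale left in fp32 (block layout is illustrative):

    #include <math.h>
    #include <stdint.h>

    #define BLK 32                                    /* QK8_0 */

    typedef struct {
        float  d;                                     /* stored via GGML_CPU_FP32_TO_FP16 in the real code */
        int8_t qs[BLK];
    } blk_q8_ref;

    static void quantize_block_q8_ref(const float *x, blk_q8_ref *y) {
        float amax = 0.0f;                            /* absolute max of the block */
        for (int j = 0; j < BLK; ++j) {
            const float ax = fabsf(x[j]);
            if (ax > amax) amax = ax;
        }
        const float d  = amax / 127.0f;               /* amax / ((1 << 7) - 1) */
        const float id = d ? 1.0f / d : 0.0f;         /* guarded inverse, as in the hunks */
        y->d = d;
        for (int j = 0; j < BLK; ++j) {
            y->qs[j] = (int8_t) roundf(x[j] * id);    /* nearest-integer quantization */
        }
    }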

View File

@ -6,6 +6,7 @@
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-impl.h"
#include "simd-mappings.h"
#include "traits.h"
#include <cmath>
@ -90,16 +91,16 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
// vector version needs Zvfhmin extension
const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d);
const float a_scale = GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
const float b_scales[8] = {
GGML_FP16_TO_FP32(b_ptr[l].d[0]),
GGML_FP16_TO_FP32(b_ptr[l].d[1]),
GGML_FP16_TO_FP32(b_ptr[l].d[2]),
GGML_FP16_TO_FP32(b_ptr[l].d[3]),
GGML_FP16_TO_FP32(b_ptr[l].d[4]),
GGML_FP16_TO_FP32(b_ptr[l].d[5]),
GGML_FP16_TO_FP32(b_ptr[l].d[6]),
GGML_FP16_TO_FP32(b_ptr[l].d[7])
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
};
const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
@ -129,7 +130,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@ -181,20 +182,20 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
// vector version needs Zvfhmin extension
const float a_scales[4] = {
GGML_FP16_TO_FP32(a_ptr[l].d[0]),
GGML_FP16_TO_FP32(a_ptr[l].d[1]),
GGML_FP16_TO_FP32(a_ptr[l].d[2]),
GGML_FP16_TO_FP32(a_ptr[l].d[3])
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[0]),
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[1]),
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[2]),
GGML_CPU_FP16_TO_FP32(a_ptr[l].d[3])
};
const float b_scales[8] = {
GGML_FP16_TO_FP32(b_ptr[l].d[0]),
GGML_FP16_TO_FP32(b_ptr[l].d[1]),
GGML_FP16_TO_FP32(b_ptr[l].d[2]),
GGML_FP16_TO_FP32(b_ptr[l].d[3]),
GGML_FP16_TO_FP32(b_ptr[l].d[4]),
GGML_FP16_TO_FP32(b_ptr[l].d[5]),
GGML_FP16_TO_FP32(b_ptr[l].d[6]),
GGML_FP16_TO_FP32(b_ptr[l].d[7])
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
};
const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
@ -382,7 +383,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
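
In the repacked GEMV/GEMM path the substitution hits the per-column scales instead: eight fp16 scales from the interleaved weight block and one from the activation block are widened and applied to the integer accumulators. Reduced to its scalar core (array shapes and the assumption that the scales are already converted are illustrative):

    #include <stdint.h>

    /* Apply the block scales to eight per-column integer accumulators, as in
     * the tail of ggml_gemv_q4_0_8x8_q8_0 above: b_scales[j] stands for
     * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) and a_scale for
     * GGML_CPU_FP16_TO_FP32(a_ptr[l].d). */
    static void apply_block_scales(float sumf[8], const int32_t sumi[8],
                                   const float b_scales[8], float a_scale) {
        for (int j = 0; j < 8; ++j) {
            sumf[j] += (float) sumi[j] * b_scales[j] * a_scale;
        }
    }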

View File

@ -3,6 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "simd-mappings.h"
#include "../../quants.h"
#include "../../ggml-cpu-impl.h"
@ -49,7 +50,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f / d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
for (int j = 0; j < 8; j++) {
const __vector float v = vec_mul(srcv[j], vec_splats(id));
@ -94,7 +95,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f / d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
__vector int32_t acc = vec_splats(0);
@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
acc = vec_add(acc, vi);
}
y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
}
#else
GGML_UNUSED(nb);
@ -164,7 +165,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
__vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
acc = vec_madd(v_xy, v_d, acc);
}
@ -185,7 +186,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
}
*s = sumf;
@ -219,7 +220,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
__builtin_prefetch(x[ib].qs, 0, 1);
__builtin_prefetch(y[ib].qs, 0, 1);
summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
const uint8x16_t v_x = vec_xl(0, x[ib].qs);
const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
@ -231,7 +232,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
const float32x4_t v_xy = vec_float(v_xy_);
const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
acc = vec_madd(v_xy, v_d, acc);
}
@ -252,7 +253,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
const float32x4_t v_xy = vec_float(v_xy_);
const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
acc = vec_madd(v_xy, v_d, acc);
}
@ -305,7 +306,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumi += x[ib].qs[j]*y[ib].qs[j];
}
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
}
*s = sumf;
@ -348,7 +349,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
float sum = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * restrict x0l = x[i].qs;
const uint8_t * restrict x0h = x[i].hmask;
@ -497,7 +498,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -537,8 +538,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
@ -647,9 +648,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -698,8 +699,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
@ -819,9 +820,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -859,7 +860,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
int8x16_t v_y[4];
for (int i = 0; i < nb; ++i) {
const float d_all = GGML_FP16_TO_FP32(x[i].d);
const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT x0l = x[i].ql;
const uint8_t * GGML_RESTRICT x0h = x[i].qh;
@ -1004,7 +1005,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1071,7 +1072,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
// float sumf = 0;
// for (int i = 0; i < nb; ++i) {
// const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
// const uint16_t * GGML_RESTRICT q2 = x[i].qs;
// const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -1121,7 +1122,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
// float sumf = 0.f;
// for (int i = 0; i < nb; ++i) {
// const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
// const uint16_t * GGML_RESTRICT q2 = x[i].qs;
// const int8_t * GGML_RESTRICT q8 = y[i].qs;
// int32_t bsum = 0;
@ -1182,12 +1183,12 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
}
#endif
for (; ib < nb; ++ib) {
const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
int sumi1 = 0, sumi2 = 0;
for (int j = 0; j < QK4_NL/2; ++j) {
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@ -1257,7 +1258,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
}
sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
}
*s = sumf;
@ -1265,7 +1266,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
#else
float sumf = 0;
for (int ibl = 0; ibl < nb; ++ibl) {
const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
uint16_t h = x[ibl].scales_h;
const uint8_t * qs = x[ibl].qs;
const int8_t * q8 = y[ibl].qs;
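
For the K-quants the pattern gains a second fp16 factor per super-block: d scales the quantized products and dmin scales the per-sub-block minimums, and the contribution of a super-block is d*isum - dmin*summs, as in the q2_K .. q5_K fallbacks in these files. The combine step with the conversions already applied (parameter names are illustrative):

    /* x_d and x_dmin stand for GGML_CPU_FP16_TO_FP32(x[i].d) and
     * GGML_CPU_FP16_TO_FP32(x[i].dmin); y_d is the fp32 scale of the q8_K block. */
    static float kquant_combine_ref(float y_d, float x_d, float x_dmin,
                                    int isum, int summs) {
        const float dall = y_d * x_d;
        const float dmin = y_d * x_dmin;
        return dall * (float) isum - dmin * (float) summs;
    }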

View File

@ -3,6 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "simd-mappings.h"
#include "../../quants.h"
#include "../../ggml-cpu-impl.h"
@ -65,7 +66,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
for (int j = 0; j < 8; j++) {
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
v128_t accv = wasm_i32x4_splat(0);
@ -126,7 +127,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
accv = wasm_i32x4_add(accv, vi);
}
y[i].s = GGML_FP32_TO_FP16(
y[i].s = GGML_CPU_FP32_TO_FP16(
d * (wasm_i32x4_extract_lane(accv, 0) +
wasm_i32x4_extract_lane(accv, 1) +
wasm_i32x4_extract_lane(accv, 2) +
@ -324,8 +325,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
);
// Accumulate results with scaling
float scale0 = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d);
float scale1 = GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d);
float scale0 = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d);
float scale1 = GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d);
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0)));
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1)));
@ -348,7 +349,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
}
*s = sumf;
@ -428,7 +429,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
}
sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@ -454,7 +455,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
}
*s = sumf;
@ -491,7 +492,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
const v128_t m4b = wasm_i8x16_splat(0x0F);
@ -538,7 +539,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
}
sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@ -564,7 +565,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -620,7 +621,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1));
// Convert to float and accumulate
const float scale = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d);
const float scale = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d);
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale)));
}
@ -635,7 +636,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumi += x[ib].qs[j]*y[ib].qs[j];
}
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
}
*s = sumf;
@ -746,8 +747,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
isum += wasm_i32x4_extract_lane(isum_vec, 0);
}
const float dall = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dall = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf += dall * isum - dmin * summs;
}
@ -768,8 +769,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
summs += y[i].bsums[j] * (sc[j] >> 4);
}
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
int isum = 0;
int is = 0;
@ -880,7 +881,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
// Accumulate results
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const v128_t v_d = wasm_f32x4_splat(d);
v128_t v_sum = wasm_f32x4_add(
wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d),
@ -957,7 +958,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -991,8 +992,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Corrected sign
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -1136,9 +1137,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1170,8 +1171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Fixed sign
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
@ -1331,9 +1332,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1420,7 +1421,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
wasm_v128_store(&aux32[0], acc0);
wasm_v128_store(&aux32[4], acc1);
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) {
sums[l] += d * aux32[l];
}
@ -1470,7 +1471,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
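
The *_1 formats (q4_1, q5_1) add a per-block minimum m on the weight side and a precomputed quant sum s on the q8_1 activation side, so their scalar tails combine two terms: (d_x*d_y)*sumi plus m_x*s_y. The same four fp16 fields go through the renamed macros; as a scalar reference with already-converted values (names are illustrative):

    /* Scalar tail of the q4_1 / q5_1 x q8_1 dot products in these files:
     * dx, mx are the weight block's scale and minimum, dy, sy the activation
     * block's scale and precomputed quant sum, all widened to fp32. */
    static float dot_x1_block_ref(float dx, float mx, float dy, float sy, int sumi) {
        return (dx * dy) * (float) sumi + mx * sy;
    }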

View File

@ -3,6 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "simd-mappings.h"
#include "../../quants.h"
#include "../../ggml-cpu-impl.h"
@ -256,9 +257,9 @@ static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_
// quad fp16 delta calculation
static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
// GGML_FP16_TO_FP32 is faster than Intel F16C
return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
_mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
// GGML_CPU_FP16_TO_FP32 is faster than Intel F16C
return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)),
_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
}
#endif
#elif defined(__SSSE3__)
@ -305,7 +306,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
// Quantize these floats
const float d = maxScalar / 127.f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
const __m256 mul = _mm256_set1_ps( id );
@ -401,7 +402,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
// Quantize these floats
const float d = max_scalar / 127.f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);
const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
const __m256 mul = _mm256_set1_ps( id );
@ -425,7 +426,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
#if defined(__AVX2__)
// Compute the sum of the quants and set y[i].s
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
// Convert int32 to int16
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
@ -455,7 +456,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
// Compute the sum of the quants and set y[i].s
const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
// Convert int32 to int16
ni0 = _mm_packs_epi32( ni0, ni1 );
@ -552,7 +553,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
/* Compute combined scale for the block */
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
const __m256 d = _mm256_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
@ -613,7 +614,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
_mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0);
// Compute combined scale for the block 0 and 1
const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
const __m128 d_0_1 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs);
@ -631,7 +632,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
// Compute combined scale for the block 2 and 3
const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
const __m128 d_2_3 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@ -680,7 +681,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
}
*s = sumf;
@ -711,10 +712,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
const float d0 = GGML_FP16_TO_FP32(x[ib].d);
const float d1 = GGML_FP16_TO_FP32(y[ib].d);
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
const __m256 d0v = _mm256_set1_ps( d0 );
const __m256 d1v = _mm256_set1_ps( d1 );
@ -752,7 +753,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -783,7 +784,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
/* Compute combined scale for the block */
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
__m256i bxhi = bytes_from_bits_32(x[ib].qh);
@ -807,7 +808,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
/* Compute combined scale for the block */
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
__m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
@ -851,7 +852,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
}
*s = sumf;
@ -883,16 +884,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
__m256i bxhi = bytes_from_bits_32(x[ib].qh);
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
qx = _mm256_or_si256(qx, bxhi);
const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d));
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
@ -910,9 +911,9 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
__m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
@ -926,7 +927,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
bxh = _mm_or_si128(bxh, bxhih);
bx_0 = MM256_SET_M128I(bxh, bxl);
const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d));
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
@ -956,7 +957,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -986,7 +987,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
// Main loop
for (; ib < nb; ++ib) {
// Compute combined scale for the block
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
__m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs);
__m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
@ -1025,7 +1026,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumi += x[ib].qs[j]*y[ib].qs[j];
}
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
}
*s = sumf;
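
Every scalar fallback in this file shares the pattern visible above: integer products are accumulated per block, then scaled once by the product of the two per-block FP16 scales, now read through the CPU-local GGML_CPU_FP16_TO_FP32 macro. A minimal standalone sketch of that pattern for the Q8_0 x Q8_0 case, using a toy block with a plain float scale in place of ggml's FP16 field:

#include <stdint.h>

#define TOY_QK8_0 32

typedef struct {
    float  d;                 // per-block scale (FP16 in ggml; float here for illustration)
    int8_t qs[TOY_QK8_0];     // quantized values
} toy_block_q8_0;

// dot product over n values, n a multiple of TOY_QK8_0
static float toy_dot_q8_0(const toy_block_q8_0 *x, const toy_block_q8_0 *y, int n) {
    float sumf = 0.0f;
    for (int ib = 0; ib < n / TOY_QK8_0; ++ib) {
        int sumi = 0;
        for (int j = 0; j < TOY_QK8_0; ++j) {
            sumi += x[ib].qs[j] * y[ib].qs[j];   // integer part of the block
        }
        sumf += sumi * (x[ib].d * y[ib].d);      // combined scale applied once per block
    }
    return sumf;
}
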
@ -1144,7 +1145,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
}
const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
sumi0 = _mm256_sub_epi16(sumi0, ysum);
sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
@ -1190,7 +1191,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
}
}
sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
}
*s = sumf;
@ -1244,7 +1245,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
}
const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
sumi0 = _mm256_add_epi16(sumi0, sumi1);
sumi0 = _mm256_sub_epi16(sumi0, ysum);
@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
}
}
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
sumf += (float) sumi * d;
}
@ -1299,8 +1300,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -1366,8 +1367,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -1477,8 +1478,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
summs += y[i].bsums[j] * (sc[j] >> 4);
}
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
int isum = 0;
int is = 0;
@ -1533,7 +1534,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -1638,7 +1639,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -1824,7 +1825,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -1862,8 +1863,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@ -1928,8 +1929,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -2049,9 +2050,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -2092,8 +2093,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@ -2170,8 +2171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -2311,9 +2312,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -2344,7 +2345,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
@ -2422,7 +2423,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
@ -2555,7 +2556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -2622,7 +2623,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m256i sumi1 = _mm256_setzero_si256();
@ -2663,7 +2664,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
__m128i sumi1_0 = _mm_setzero_si128();
@ -2717,7 +2718,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
int32_t bsum = 0;
@ -2792,7 +2793,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -2913,7 +2914,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -3035,7 +3036,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const uint8_t * GGML_RESTRICT sc = x[i].scales;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -3104,7 +3105,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
@ -3177,7 +3178,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
@ -3253,7 +3254,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
float sumf = 0;
for (int i = 0; i < nb; i++) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const int8_t * q8 = y[i].qs;
const uint8_t * qs = x[i].qs;
const uint8_t * qh = x[i].qh;
@ -3313,7 +3314,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -3358,7 +3359,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -3414,7 +3415,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -3480,7 +3481,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
@ -3565,7 +3566,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
@ -3648,7 +3649,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint8_t * GGML_RESTRICT signs = x[i].signs;
@ -3753,7 +3754,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
}
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
accum1 += d * sumi1;
@ -3801,7 +3802,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
}
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
accum1 += d * sumi1;
@ -3835,7 +3836,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
qs += 4;
}
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
}
*s = sumf;
@ -3947,7 +3948,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
qs += 8; qh += 4;
}
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));
accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
@ -4033,7 +4034,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
qs += 8; qh += 4;
}
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));
accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
@ -4083,7 +4084,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
qh += 2;
}
sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
}
*s = sumf;
@ -4129,9 +4130,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
_mm256_cvtepi32_ps(p_1), accum1);
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
_mm256_cvtepi32_ps(p_2), accum2);
}
@ -4164,7 +4165,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
#endif
for (; ib < nb; ++ib) {
const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
int sumi1 = 0, sumi2 = 0;
for (int j = 0; j < QK4_NL/2; ++j) {
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@ -4219,7 +4220,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
sumi1 = _mm256_add_epi32(p_1, sumi1);
sumi2 = _mm256_add_epi32(p_2, sumi2);
}
accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
_mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
}
@ -4267,7 +4268,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
}
__m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
__m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
_mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
}
@ -4276,7 +4277,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
#else
float sumf = 0;
for (int ibl = 0; ibl < nb; ++ibl) {
const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
uint16_t h = x[ibl].scales_h;
const uint8_t * qs = x[ibl].qs;
const int8_t * q8 = y[ibl].qs;

View File

@ -6,6 +6,7 @@
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-impl.h"
#include "simd-mappings.h"
#include "traits.h"
#include <cmath>
@ -39,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
float tmp[16];
for (int i = 0; i < 8; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
}
for (int i = 0; i < 8; i++) {
tmp[i + 8] = GGML_FP16_TO_FP32(y[i]);
tmp[i + 8] = GGML_CPU_FP16_TO_FP32(y[i]);
}
return _mm512_loadu_ps(tmp);
@ -54,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
_mm_storeu_si128((__m128i*)tmphalf, x);
for (int i = 0; i < 4; i++) {
tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]);
tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]);
tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]);
tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
tmp[i + 4] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
tmp[i + 8] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
tmp[i + 12] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
}
return _mm512_loadu_ps(tmp);
@ -67,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
float tmp[8];
for (int i = 0; i < 8; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
}
return _mm256_loadu_ps(tmp);
@ -76,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
float tmp[8];
for (int i = 0; i < 4; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
tmp[i + 4] = GGML_FP16_TO_FP32(x[i]);
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
tmp[i + 4] = GGML_CPU_FP16_TO_FP32(x[i]);
}
return _mm256_loadu_ps(tmp);
@ -88,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang
_mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
for (int i = 0; i < 8; i++) {
tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
}
return _mm256_loadu_ps(tmp);
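
The AVX helpers in this hunk widen eight FP16 values to FP32 by converting into a small float buffer and loading it with _mm256_loadu_ps. On machines with F16C the same widening is a single hardware conversion; a sketch of that fast path (the fallback is exactly the temp-buffer loop shown above):

#include <immintrin.h>   // build with -mavx -mf16c
#include <stdint.h>

// widen 8 IEEE half-precision values (raw 16-bit patterns) to 8 floats
static inline __m256 widen_f16x8(const uint16_t *h) {
    return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *) h));
}
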
@ -211,7 +212,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
// Store the scale for the individual block
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
// Store the values in blocks of eight values - Aim is to use these later for block interleaving
srcv[row_iter][0] = v0;
@ -297,7 +298,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
const float d = amax / ((1 << 7) - 1);
id[row_iter] = d ? 1.0f / d : 0.0f;
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
}
for (int j = 0; j < QK8_0 * 4; j++) {
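
Both quantization branches above derive the Q8_0 scale the same way: d = amax / 127, the reciprocal id drives the per-value multiply, and d itself is stored back in half precision via GGML_CPU_FP32_TO_FP16. A scalar sketch of that step for one 32-value block (plain float output for the scale, purely for illustration):

#include <math.h>
#include <stdint.h>

#define TOY_QK8_0 32

// quantize one block of 32 floats to int8 with a shared scale
static void toy_quantize_q8_0(const float *x, int8_t *qs, float *d_out) {
    float amax = 0.0f;                       // absolute max of the block
    for (int j = 0; j < TOY_QK8_0; ++j) {
        const float ax = fabsf(x[j]);
        if (ax > amax) amax = ax;
    }
    const float d  = amax / ((1 << 7) - 1);  // map amax onto +/-127
    const float id = d ? 1.0f / d : 0.0f;    // reciprocal used for the multiply
    for (int j = 0; j < TOY_QK8_0; ++j) {
        qs[j] = (int8_t) roundf(x[j] * id);
    }
    *d_out = d;                              // ggml stores this via GGML_CPU_FP32_TO_FP16
}
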
@ -647,7 +648,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
// Load and convert to FP32 scale from block_q8_0
const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d));
const __m256 row_scale_f32 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(a_ptr[b].d));
// Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
__m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
@ -706,7 +707,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@ -972,13 +973,13 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
}
}
for (int sb = 0; sb < 8; sb++) {
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
for (int j = 0; j < ncols_interleaved; j++) {
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
}
}
}
@ -1755,7 +1756,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
@ -3259,7 +3260,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
}
}
}
@ -3268,7 +3269,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
for(int m = 0; m < 4; m++) {
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
for(int j = 0; j < ncols_interleaved; j++) {
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
}
}
}

View File

@ -4,6 +4,7 @@
#include "traits.h"
#include "ggml-cpu-impl.h"
#include "ggml-impl.h"
#include "simd-mappings.h"
#ifdef __cplusplus
@ -12,11 +13,11 @@
// convenience functions/macros for use in template calls
// note: these won't be required after the 'traits' lookup table is used.
static inline ggml_fp16_t f32_to_f16(float x) {
return GGML_FP32_TO_FP16(x);
return GGML_CPU_FP32_TO_FP16(x);
}
static inline float f16_to_f32(ggml_fp16_t x) {
return GGML_FP16_TO_FP32(x);
return GGML_CPU_FP16_TO_FP32(x);
}
static inline ggml_bf16_t f32_to_bf16(float x) {

View File

@ -62,11 +62,17 @@ struct ggml_compute_params {
#if defined(__s390x__) && defined(__VEC__)
#ifndef __VXE__
#define __VXE__
#endif
#endif // __VXE__
#ifndef __VXE2__
#define __VXE2__
#endif
#endif
#endif // __VXE2__
#endif // __s390x__ && __VEC__
#if defined(__s390x__) && defined(GGML_NNPA)
#ifndef __NNPA__
#define __NNPA__
#endif // __NNPA__
#endif // __s390x__ && GGML_NNPA
#if defined(__ARM_FEATURE_SVE)
#include <sys/prctl.h>

View File

@ -72,6 +72,9 @@
#define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)
// precomputed f32 table for f16 (256 KB) (simd-mappings.h)
float ggml_table_f32_f16[1 << 16];
#if defined(__ARM_ARCH)
struct ggml_arm_arch_features_type {
int sve_cnt;
@ -192,6 +195,7 @@ typedef pthread_t ggml_thread_t;
static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = {
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp32,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
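
The new F32 row follows the same contract as the rest of this table: from_float converts rows of the second operand into vec_dot_type, and vec_dot then consumes matching representations. A hedged sketch of how such a trait row could be used, with hypothetical names (the real, more involved dispatch lives in ggml_compute_forward_mul_mat):

#include <stdint.h>

// illustrative stand-ins for the trait function pointers, not the real ggml types
typedef void (*toy_from_float_t)(const float *x, void *y, int64_t n);
typedef void (*toy_vec_dot_t)(int n, float *s, const void *x, const void *y);

struct toy_type_traits {
    toy_from_float_t from_float;   // convert f32 source rows to vec_dot_type
    toy_vec_dot_t    vec_dot;      // kernel operating on the converted representation
};

static float toy_row_dot(const struct toy_type_traits *tr,
                         const void *w_row, const float *x_row,
                         void *scratch, int64_t n) {
    tr->from_float(x_row, scratch, n);   // e.g. f32 -> q8_0, or a memcpy for f32
    float s;
    tr->vec_dot((int) n, &s, w_row, scratch);
    return s;
}
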
@ -736,7 +740,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
{
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
}
} break;
case GGML_TYPE_BF16:
@ -795,7 +799,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
{
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
}
} break;
case GGML_TYPE_BF16:
@ -846,7 +850,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
}
case GGML_TYPE_BF16:
{
@ -891,7 +895,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
} break;
case GGML_TYPE_BF16:
{
@ -920,7 +924,7 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
case GGML_TYPE_I32:
return ((int32_t *) data)[0];
case GGML_TYPE_F16:
return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
case GGML_TYPE_BF16:
return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
case GGML_TYPE_F32:
@ -947,7 +951,7 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
} break;
case GGML_TYPE_F16:
{
((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
} break;
case GGML_TYPE_BF16:
{
@ -985,7 +989,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
}
case GGML_TYPE_F16:
{
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
}
case GGML_TYPE_BF16:
{
@ -1024,7 +1028,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
} break;
case GGML_TYPE_F16:
{
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
} break;
case GGML_TYPE_BF16:
{
@ -1051,7 +1055,7 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
case GGML_TYPE_I32:
return ((int32_t *) data)[0];
case GGML_TYPE_F16:
return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
case GGML_TYPE_BF16:
return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
case GGML_TYPE_F32:
@ -1078,7 +1082,7 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
} break;
case GGML_TYPE_F16:
{
((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
} break;
case GGML_TYPE_BF16:
{
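
All of these accessors apply the same rule to F16 tensors: values go in through GGML_CPU_FP32_TO_FP16 and come back out through GGML_CPU_FP16_TO_FP32, so callers only ever handle float. A small usage sketch with the public 1-D helpers from ggml.h (error handling omitted; the round trip through half precision loses a little accuracy, as expected):

#include "ggml.h"

void f16_accessor_demo(void) {
    struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 8);
    ggml_set_f32_1d(t, 0, 3.14159f);      // stored internally as FP16
    float v = ggml_get_f32_1d(t, 0);      // read back as FP32 (~3.1406)

    (void) v;
    ggml_free(ctx);
}
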
@ -1189,7 +1193,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
}
}
static void ggml_compute_forward_mul_mat(
void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@ -1814,6 +1818,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_get_rows_back(params, tensor);
} break;
case GGML_OP_SET_ROWS:
{
ggml_compute_forward_set_rows(params, tensor);
} break;
case GGML_OP_DIAG:
{
ggml_compute_forward_diag(params, tensor);
@ -1858,6 +1866,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_im2col_back_f32(params, tensor);
} break;
case GGML_OP_CONV_2D:
{
ggml_compute_forward_conv_2d(params, tensor);
} break;
case GGML_OP_CONV_2D_DW:
{
ggml_compute_forward_conv_2d_dw(params, tensor);
@ -1941,6 +1953,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_unary(params, tensor);
} break;
case GGML_OP_GLU:
{
ggml_compute_forward_glu(params, tensor);
} break;
case GGML_OP_GET_REL_POS:
{
ggml_compute_forward_get_rel_pos(params, tensor);
@ -2151,6 +2167,20 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
GGML_ABORT("fatal error");
}
break;
case GGML_OP_GLU:
switch (ggml_get_glu_op(node)) {
case GGML_GLU_OP_REGLU:
case GGML_GLU_OP_GEGLU:
case GGML_GLU_OP_SWIGLU:
case GGML_GLU_OP_GEGLU_ERF:
case GGML_GLU_OP_GEGLU_QUICK:
{
n_tasks = n_threads;
} break;
default:
GGML_ABORT("fatal error");
}
break;
case GGML_OP_SILU_BACK:
case GGML_OP_MUL:
case GGML_OP_DIV:
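
The new GGML_OP_GLU case schedules all five gated-linear-unit variants the same way, one task per thread. For reference, a scalar sketch of what the gating computes, using the standard definitions from the GLU-variants literature (gate activation times value); the erf and "quick" GELU variants differ only in the GELU approximation, and ggml's exact argument convention is not asserted here:

#include <math.h>

static float toy_gelu(float x) { return 0.5f*x*(1.0f + erff(x/sqrtf(2.0f))); }
static float toy_silu(float x) { return x / (1.0f + expf(-x)); }

// a is the gate half, b the value half, applied elementwise
static float toy_reglu (float a, float b) { return (a > 0.0f ? a : 0.0f) * b; }
static float toy_geglu (float a, float b) { return toy_gelu(a) * b; }
static float toy_swiglu(float a, float b) { return toy_silu(a) * b; }
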
@ -2167,6 +2197,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
n_tasks = n_threads;
} break;
case GGML_OP_GET_ROWS:
case GGML_OP_SET_ROWS:
{
// FIXME: get_rows can use additional threads, but the cost of launching additional threads
// decreases performance with GPU offloading
@ -2203,6 +2234,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_BACK:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_TRANSPOSE_2D:
@ -2721,6 +2753,10 @@ struct ggml_cplan ggml_graph_plan(
GGML_ABORT("fatal error");
}
} break;
case GGML_OP_CONV_2D:
{
cur = GGML_IM2COL_WORK_SIZE;
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
const int64_t ne00 = node->src[0]->ne[0]; // W
@ -3121,6 +3157,10 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
return ggml_graph_compute(cgraph, &cplan);
}
void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
memcpy(y, x, n * sizeof(float));
}
void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
int64_t i = 0;
#if defined(__F16C__)
@ -3141,9 +3181,24 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec);
}
#elif defined(__NNPA__)
for (; i + 7 < n; i += 8) {
float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
}
for (; i + 3 < n; i += 4) {
float32x4_t v_x = vec_xl(0, (const float *)(x + i));
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
}
#endif
for (; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(x[i]);
y[i] = GGML_CPU_FP32_TO_FP16(x[i]);
}
}
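
The NNPA branch added here follows the same conversion idiom as the F16C path above it: a pass at the widest vector width, a narrower pass, then a scalar tail so any element count is covered exactly once. The control flow, stripped of intrinsics and with the caller supplying the scalar conversion:

#include <stdint.h>

// skeleton of the loop structure above; the inner loops stand in for the SIMD bodies
static void toy_convert_with_tail(const float *x, uint16_t *y, int64_t n,
                                  uint16_t (*fp32_to_fp16)(float)) {
    int64_t i = 0;
    for (; i + 7 < n; i += 8) {               // widest SIMD chunk (8 at a time)
        for (int k = 0; k < 8; ++k) y[i + k] = fp32_to_fp16(x[i + k]);
    }
    for (; i + 3 < n; i += 4) {               // narrower SIMD chunk (4 at a time)
        for (int k = 0; k < 4; ++k) y[i + k] = fp32_to_fp16(x[i + k]);
    }
    for (; i < n; ++i) {                      // scalar remainder
        y[i] = fp32_to_fp16(x[i]);
    }
}
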
@ -3167,9 +3222,25 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
__m128 y_vec = _mm_cvtph_ps(x_vec);
_mm_storeu_ps(y + i, y_vec);
}
#elif defined(__NNPA__)
for (; i + 7 < n; i += 8) {
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
vec_xst(v_yh, 0, (float *)(y + i + 0));
vec_xst(v_yl, 0, (float *)(y + i + 4));
}
for (; i + 3 < n; i += 4) {
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
vec_xst(v_yh, 0, (float *)(y + i));
}
#endif
for (; i < n; ++i) {
y[i] = GGML_FP16_TO_FP32(x[i]);
y[i] = GGML_CPU_FP16_TO_FP32(x[i]);
}
}
@ -3369,6 +3440,14 @@ int ggml_cpu_has_vxe(void) {
#endif
}
int ggml_cpu_has_nnpa(void) {
#if defined(GGML_NNPA)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_neon(void) {
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
return 1;
@ -3418,7 +3497,7 @@ int ggml_cpu_has_sme(void) {
}
void ggml_cpu_init(void) {
// needed to initialize f16 tables
// needed to initialize ggml_time
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
@ -3439,9 +3518,10 @@ void ggml_cpu_init(void) {
uint16_t u16;
ggml_fp16_t fp16;
} u = {i};
float f = GGML_FP16_TO_FP32(u.fp16);
ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
ggml_table_f32_f16[i] = f;
ggml_table_gelu_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f));
ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f));
}
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
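
ggml_cpu_init now also fills ggml_table_f32_f16 while it builds the GELU tables, so every one of the 65536 half-precision bit patterns has its FP32 value precomputed. Architectures without a cheap hardware conversion then resolve GGML_CPU_FP16_TO_FP32 to a plain indexed load (see ggml_lookup_fp16_to_fp32 in simd-mappings.h further down). The same idea in isolation:

#include <stdint.h>

static float toy_table_f32_f16[1 << 16];

// fill once at startup with any correct half->float conversion
static void toy_init_table(float (*fp16_to_fp32)(uint16_t)) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        toy_table_f32_f16[i] = fp16_to_fp32((uint16_t) i);
    }
}

// afterwards a conversion is a single load indexed by the raw bit pattern
static inline float toy_lookup_fp16(uint16_t h_bits) {
    return toy_table_f32_f16[h_bits];
}
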

View File

@ -416,6 +416,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
switch (op->op) {
case GGML_OP_CPY:
case GGML_OP_SET_ROWS:
return
op->type != GGML_TYPE_IQ3_XXS &&
op->type != GGML_TYPE_IQ3_S &&
@ -578,6 +579,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_vxe()) {
features.push_back({ "VXE", "1" });
}
if (ggml_cpu_has_nnpa()) {
features.push_back({ "NNPA", "1" });
}
if (ggml_cpu_has_wasm_simd()) {
features.push_back({ "WASM_SIMD", "1" });
}

View File

@ -52,6 +52,7 @@
#include "ggml-impl.h"
#include "ggml-cpu-impl.h"
#include "ggml-quants.h"
#include "simd-mappings.h"
#include <array>
#include <type_traits>
@ -73,7 +74,7 @@
namespace {
inline float unhalf(ggml_fp16_t d) {
return GGML_FP16_TO_FP32(d);
return GGML_CPU_FP16_TO_FP32(d);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -252,7 +253,7 @@ template <> inline float32x4_t load(const ggml_fp16_t * p) {
float tmp[4];
for (int i = 0; i < 4; i++) {
tmp[i] = GGML_FP16_TO_FP32(p[i]);
tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]);
}
return vec_xl(0, (const float *)(tmp));

File diff suppressed because it is too large

View File

@ -20,6 +20,9 @@
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
// Work buffer size for im2col operations in CONV2D
#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
#ifdef __cplusplus
extern "C" {
#endif
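
GGML_IM2COL_WORK_SIZE is the scratch size the new CONV_2D path requests from the graph plan (the planner hunk earlier sets cur to exactly this constant); how the kernel chunks work inside that budget is not shown here. The underlying reduction is the usual im2col trick: gather each receptive field into a column of a temporary matrix so the convolution becomes a matrix multiply. A minimal single-channel, stride-1, no-padding sketch of that gathering step (illustrative only, not ggml's kernel):

// input:  H x W floats;  kernel: KH x KW
// col:    (KH*KW) x (OH*OW) with OH = H-KH+1, OW = W-KW+1
static void toy_im2col(const float *in, int H, int W, int KH, int KW, float *col) {
    const int OH = H - KH + 1, OW = W - KW + 1;
    for (int ky = 0; ky < KH; ++ky)
    for (int kx = 0; kx < KW; ++kx)
    for (int oy = 0; oy < OH; ++oy)
    for (int ox = 0; ox < OW; ++ox) {
        col[(ky*KW + kx)*(OH*OW) + oy*OW + ox] = in[(oy + ky)*W + (ox + kx)];
    }
}
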
@ -53,6 +56,7 @@ void ggml_compute_forward_permute(const struct ggml_compute_params * params, str
void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@ -64,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@ -93,6 +98,7 @@ void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, st
void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_glu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@ -105,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru
void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus
}

View File

@ -2,6 +2,7 @@
#include "ggml-common.h"
#include "ggml-cpu-impl.h"
#include "simd-mappings.h"
#include "ggml-quants.h"
#include "quants.h"
@ -137,7 +138,7 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
}
int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
}
*s = sumf;
@ -174,7 +175,7 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -217,7 +218,7 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
}
*s = sumf;
@ -260,7 +261,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}
*s = sumf;
@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
sumi += x[ib].qs[j]*y[ib].qs[j];
}
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
}
*s = sumf;
@ -342,7 +343,7 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
}
*s = sumf;
@ -372,7 +373,7 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
sumf += (float) sumi * d;
}
@ -405,8 +406,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
summs += y[i].bsums[j] * (sc[j] >> 4);
}
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
int isum = 0;
int is = 0;
@ -504,7 +505,7 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -577,9 +578,9 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -657,9 +658,9 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -714,7 +715,7 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
@ -739,7 +740,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
int32_t bsum = 0;
@ -778,7 +779,7 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
const uint8_t * GGML_RESTRICT sc = x[i].scales;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -829,7 +830,7 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
float sumf = 0;
for (int i = 0; i < nb; i++) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const int8_t * q8 = y[i].qs;
const uint8_t * qs = x[i].qs;
const uint8_t * qh = x[i].qh;
@ -882,7 +883,7 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -924,7 +925,7 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
float sumf = 0.f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint8_t * GGML_RESTRICT signs = x[i].signs;
@ -1002,7 +1003,7 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
qs += 4;
}
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
}
*s = sumf;
@ -1063,7 +1064,7 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
qh += 2;
}
sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
}
*s = sumf;
@ -1087,7 +1088,7 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
float sumf = 0;
for (; ib < nb; ++ib) {
const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
int sumi1 = 0, sumi2 = 0;
for (int j = 0; j < QK4_NL/2; ++j) {
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
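
The IQ4_NL kernels decode each 4-bit index through a signed non-linear codebook rather than using the nibble directly: the low nibble of every stored byte covers the first half of the block, the high nibble the second half, and the selected level is scaled by the block's FP16 d. A dequantization sketch built from that access pattern; the codebook values below mirror kvalues_iq4nl from ggml-common.h but should be treated as illustrative here:

#include <stdint.h>

#define TOY_QK4_NL 32

// 16-entry signed non-linear codebook (illustrative copy; the authoritative
// table is kvalues_iq4nl in ggml-common.h)
static const int8_t toy_kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

// d: block scale already widened from FP16; qs: TOY_QK4_NL/2 packed bytes
static void toy_dequant_iq4_nl(float d, const uint8_t *qs, float *y) {
    for (int j = 0; j < TOY_QK4_NL/2; ++j) {
        y[j]                = d * toy_kvalues_iq4nl[qs[j] & 0xf];   // low nibbles: first half
        y[j + TOY_QK4_NL/2] = d * toy_kvalues_iq4nl[qs[j] >>  4];   // high nibbles: second half
    }
}
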
@ -1113,7 +1114,7 @@ void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
float sumf = 0;
for (int ibl = 0; ibl < nb; ++ibl) {
const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
uint16_t h = x[ibl].scales_h;
const uint8_t * qs = x[ibl].qs;
const int8_t * q8 = y[ibl].qs;

View File

@ -6,6 +6,7 @@
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-impl.h"
#include "simd-mappings.h"
#include "traits.h"
#include "arch-fallback.h"
@ -72,7 +73,7 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG
const float d = amax / ((1 << 7) - 1);
id[row_iter] = d ? 1.0f / d : 0.0f;
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
}
for (int j = 0; j < QK8_0 * 4; j++) {
@ -110,7 +111,7 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
const float d = amax / ((1 << 7) - 1);
id[row_iter] = d ? 1.0f / d : 0.0f;
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
}
for (int j = 0; j < QK8_0 * 4; j++) {
@ -236,7 +237,7 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@ -280,7 +281,7 @@ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@ -325,7 +326,7 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@ -396,13 +397,13 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
}
}
for (int sb = 0; sb < 8; sb++) {
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
for (int j = 0; j < ncols_interleaved; j++) {
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
}
}
}
@ -449,7 +450,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@ -500,7 +501,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
@ -555,7 +556,7 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
@ -609,7 +610,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
@ -688,7 +689,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
}
}
}
@ -697,7 +698,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
for(int m = 0; m < 4; m++) {
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
for(int j = 0; j < ncols_interleaved; j++) {
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
}
}
}
@ -753,7 +754,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}

View File

@ -2,10 +2,167 @@
#include "ggml-cpu-impl.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE
#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#endif
#if defined(__F16C__)
#include <immintrin.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
//
// simd mappings
//
// FP16 to FP32 conversion
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
//
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
//
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
__fp16 tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
__fp16 tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#elif defined(__F16C__)
#ifdef _MSC_VER
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
float f;
double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
double d;
ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#elif defined(__riscv) && defined(__riscv_zfhmin)
static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
float f;
__asm__(
"fmv.h.x %[f], %[h]\n\t"
"fcvt.s.h %[f], %[f]"
: [f] "=&f" (f)
: [h] "r" (h)
);
return f;
}
static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
__asm__(
"fcvt.h.s %[f], %[f]\n\t"
"fmv.x.h %[h], %[f]"
: [h] "=&r" (res)
: [f] "f" (f)
);
return res;
}
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
#elif defined(__NNPA__)
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
uint16x8_t v_h = vec_splats(h);
uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
return vec_extend_to_fp32_hi(v_hd, 0)[0];
}
static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
float32x4_t v_f = vec_splats(f);
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
return vec_extract(v_h, 0);
}
#endif
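For context on what the per-architecture conversions above compute, here is a minimal portable scalar sketch of the binary16 -> binary32 expansion. It is illustrative only (the name fp16_bits_to_fp32_ref is hypothetical) and is not the fallback this diff compiles.
#include <stdint.h>
#include <string.h>
// Illustrative scalar reference: expand an IEEE-754 binary16 bit pattern to binary32.
static inline float fp16_bits_to_fp32_ref(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
    const uint32_t exp  = (h >> 10) & 0x1Fu;
    const uint32_t mant =  h        & 0x3FFu;
    uint32_t bits;
    if (exp == 0) {
        if (mant == 0) {
            bits = sign;                                      // signed zero
        } else {
            // subnormal half: renormalize the mantissa, drop the leading 1
            uint32_t m = mant;
            int      e = -1;
            do { m <<= 1; e++; } while ((m & 0x400u) == 0);
            bits = sign | (uint32_t)(127 - 15 - e) << 23 | (m & 0x3FFu) << 13;
        }
    } else if (exp == 0x1F) {
        bits = sign | 0x7F800000u | (mant << 13);             // inf / NaN
    } else {
        bits = sign | (exp - 15 + 127) << 23 | (mant << 13);  // normal number
    }
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}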
// precomputed f32 table for f16 (256 KB)
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to convert FP16 -> FP32 directly instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_CPU_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}
#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
#if !defined(GGML_CPU_FP32_TO_FP16)
#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
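A rough sketch of how such a 1 << 16 entry table can be filled once at startup; ggml does this in ggml_cpu_init(), but the exact code there may differ and the names below are illustrative (GGML_COMPUTE_FP16_TO_FP32 is assumed to be the generic converter from ggml-impl.h, the counterpart of GGML_COMPUTE_FP32_TO_FP16 referenced just above).
#include <stdint.h>
#include <string.h>
// Hypothetical illustration: precompute a float for every possible f16 bit pattern,
// so GGML_CPU_FP16_TO_FP32 becomes a single indexed load on architectures
// without a cheap conversion instruction.
static float table_f32_f16_example[1 << 16];
static void init_f16_table_example(void) {
    for (uint32_t u = 0; u < (1u << 16); ++u) {
        const uint16_t bits = (uint16_t) u;
        ggml_fp16_t h;
        memcpy(&h, &bits, sizeof(h));                              // reinterpret the index as an f16
        table_f32_f16_example[u] = GGML_COMPUTE_FP16_TO_FP32(h);   // arch-independent conversion
    }
}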
// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for new architectures requires defining the corresponding SIMD macros
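To make the abstraction concrete, a hypothetical kernel written only against these macros could look as follows; the real kernels in vec.h follow the same shape, and porting to a new ISA only means supplying the GGML_F32_* definitions (vec_axpy_example is an illustrative name, not ggml code).
// Hypothetical kernel written only against the generic macros: y[i] += x[i]*v.
static void vec_axpy_example(const int n, float * y, const float * x, const float v) {
#if defined(GGML_SIMD)
    const int np = (n & ~(GGML_F32_STEP - 1));
    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay = GGML_F32_VEC_FMA(ay, ax, vx);               // ay += ax*v
            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay);
        }
    }
    // scalar leftovers
    for (int i = np; i < n; ++i) {
        y[i] += x[i]*v;
    }
#else
    // scalar fallback when no SIMD macros are defined
    for (int i = 0; i < n; ++i) {
        y[i] += x[i]*v;
    }
#endif
}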
@ -32,7 +189,7 @@
#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
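Note on the operand swap above: svmad_f32_m(pg, op1, op2, op3) multiplies its first two operands and adds the third, so the new order makes GGML_F32xt_FMA(a, b, c) evaluate to a + b*c (accumulator first), consistent with the non-SVE GGML_F32_VEC_FMA definitions and with the updated SVE call sites later in this diff (e.g. sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1)). A scalar model of that assumed convention:
// Scalar model of the accumulator-first FMA convention (illustrative only):
//   GGML_F32_VEC_FMA(a, b, c) == a + b*c
static inline float f32_vec_fma_model(float a, float b, float c) {
    return a + b*c;
}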
@ -415,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
float tmp[8];
for (int i = 0; i < 8; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
}
return _mm256_loadu_ps(tmp);
@ -426,7 +583,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
_mm256_storeu_ps(arr, y);
for (int i = 0; i < 8; i++)
x[i] = GGML_FP32_TO_FP16(arr[i]);
x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
}
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@ -574,10 +731,10 @@ static inline unsigned char ggml_endian_byte(int i) {
inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
float tmp[4];
tmp[0] = GGML_FP16_TO_FP32(p[0]);
tmp[1] = GGML_FP16_TO_FP32(p[1]);
tmp[2] = GGML_FP16_TO_FP32(p[2]);
tmp[3] = GGML_FP16_TO_FP32(p[3]);
tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);
return wasm_v128_load(tmp);
}
@ -587,10 +744,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
wasm_v128_store(tmp, x);
p[0] = GGML_FP32_TO_FP16(tmp[0]);
p[1] = GGML_FP32_TO_FP16(tmp[1]);
p[2] = GGML_FP32_TO_FP16(tmp[2]);
p[3] = GGML_FP32_TO_FP16(tmp[3]);
p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
}
#define GGML_F16x4 v128_t
@ -690,10 +847,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
float tmp[4];
tmp[0] = GGML_FP16_TO_FP32(x[0]);
tmp[1] = GGML_FP16_TO_FP32(x[1]);
tmp[2] = GGML_FP16_TO_FP32(x[2]);
tmp[3] = GGML_FP16_TO_FP32(x[3]);
tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
return _mm_loadu_ps(tmp);
}
@ -703,10 +860,10 @@ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
_mm_storeu_ps(arr, y);
x[0] = GGML_FP32_TO_FP16(arr[0]);
x[1] = GGML_FP32_TO_FP16(arr[1]);
x[2] = GGML_FP32_TO_FP16(arr[2]);
x[3] = GGML_FP32_TO_FP16(arr[3]);
x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}
#define GGML_F32Cx4 __m128
@ -828,7 +985,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
#define GGML_F32x4_ZERO __lsx_vldi(0)
#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
#define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0)
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD __lsx_vfadd_s
#define GGML_F32x4_MUL __lsx_vfmul_s
@ -874,10 +1031,10 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
float tmp[4];
tmp[0] = GGML_FP16_TO_FP32(x[0]);
tmp[1] = GGML_FP16_TO_FP32(x[1]);
tmp[2] = GGML_FP16_TO_FP32(x[2]);
tmp[3] = GGML_FP16_TO_FP32(x[3]);
tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
return __lsx_vld(tmp, 0);
}
@ -887,10 +1044,10 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
__lsx_vst(y, arr, 0);
x[0] = GGML_FP32_TO_FP16(arr[0]);
x[1] = GGML_FP32_TO_FP16(arr[1]);
x[2] = GGML_FP32_TO_FP16(arr[2]);
x[3] = GGML_FP32_TO_FP16(arr[3]);
x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}
#define GGML_F32Cx4 __m128
@ -922,7 +1079,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
#define GGML_F32_STEP 32
#define GGML_F32_EPR 4
#define GGML_F32x4 __vector float
#define GGML_F32x4 float32x4_t
#define GGML_F32x4_ZERO vec_splats(0.0f)
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
@ -962,28 +1119,45 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR GGML_F32_EPR
static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
#if defined(__NNPA__)
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
return vec_extend_to_fp32_hi(v_xd, 0);
#else
float tmp[4];
for (int i = 0; i < 4; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
}
// note: keep type-cast here to prevent compiler bugs
// see: https://github.com/ggml-org/llama.cpp/issues/12846
return vec_xl(0, (const float *)(tmp));
#endif
}
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
#if defined(__NNPA__)
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
x[0] = vec_extract(v_x, 0);
x[1] = vec_extract(v_x, 1);
x[2] = vec_extract(v_x, 2);
x[3] = vec_extract(v_x, 3);
#else
float arr[4];
// note: keep type-cast here to prevent compiler bugs
// see: https://github.com/ggml-org/llama.cpp/issues/12846
vec_xst(y, 0, (float *)(arr));
vec_xst(v_y, 0, (float *)(arr));
for (int i = 0; i < 4; i++) {
x[i] = GGML_FP32_TO_FP16(arr[i]);
x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
}
#endif
}
#define GGML_F16_VEC GGML_F32x4
@ -1004,3 +1178,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif
#ifdef __cplusplus
}
#endif

View File

@ -37,35 +37,35 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
for (int i = 0; i < np; i += ggml_f32_step) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
}
// leftovers
// Since the above loop is unrolled 8 times, the leftovers lie in the range [0, ggml_f32_step], which is handled in the loop below
@ -73,7 +73,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
for (int i = np; i < np2; i += ggml_f32_epr) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
}
// the maximum number of leftover elements will be less than ggml_f32_epr; apply predicated svmad on the available elements only
if (np2 < n) {
@ -219,11 +219,11 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
// leftovers
for (int i = np; i < n; ++i) {
sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
#else
for (int i = 0; i < n; ++i) {
sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
#endif
@ -254,6 +254,30 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
}
}
void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) {
int i = 0;
#if defined(__AVX512F__) && defined(__AVX512DQ__)
for (; i + 15 < n; i += 16) {
_mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i)));
}
#elif defined(__AVX2__) && defined(__FMA__)
for (; i + 7 < n; i += 8) {
_mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i)));
}
#elif defined(__SSE2__)
for (; i + 3 < n; i += 4) {
_mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
}
#elif defined(__ARM_NEON) && defined(__aarch64__)
for (; i + 3 < n; i += 4) {
vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
}
#endif
for (; i < n; ++i) {
y[i] = ggml_silu_f32(x[i]) * g[i];
}
}
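The new helper fuses SiLU and the gating multiply in a single pass over the buffers. A hypothetical call site (values chosen arbitrarily, swiglu_usage_example is not ggml code):
// Hypothetical usage of the fused kernel declared in vec.h:
//   y[i] = x[i]/(1 + expf(-x[i])) * g[i]
static void swiglu_usage_example(void) {
    float x[4] = { -1.0f, 0.0f, 1.0f, 2.0f };
    float g[4] = {  0.5f, 0.5f, 0.5f, 0.5f };
    float y[4];
    ggml_vec_swiglu_f32(4, y, x, g);
    // whichever path runs (SSE/NEON or the scalar tail), the result is
    // y[i] == ggml_silu_f32(x[i]) * g[i]
}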
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
int i = 0;
ggml_float sum = 0;

View File

@ -58,7 +58,7 @@ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
for (int i = 0; i < n; ++i) {
z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
}
}
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
@ -67,7 +67,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v)
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
for (int i = 0; i < n; ++i) {
z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
}
}
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
@ -75,20 +75,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
}
}
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
for (int i = 0; i < n; ++i) {
z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
}
}
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
for (int i = 0; i < n; ++i) {
z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
}
}
@ -131,13 +131,13 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
// leftovers
for (int i = np; i < n; ++i) {
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
}
#else
for (int i = 0; i < n; ++i) {
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
}
#endif
@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
GGML_F32_VEC_STORE(y + i, ay1);
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
}
@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
for (int i = np; i < np2; i += ggml_f32_epr) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
GGML_F32_VEC_STORE(y + i, ay1);
}
@ -280,12 +280,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
// leftovers
for (int i = np; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
}
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
}
#endif
}
@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
#endif
}
inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
#if defined(GGML_USE_ACCELERATE)
vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
#elif defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
// scalar ; TODO: Write SVE code
for (int i = 0; i < n; ++i) {
y[i] = x[i]*s + b;
}
#else
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] = x[i]*s + b;
}
#endif
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] = x[i]*s + b;
}
#endif
}
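Per its scalar path, the new ggml_vec_mad1_f32 computes y[i] = x[i]*s + b. A hypothetical call with a small n, which falls through to the scalar leftover loop (mad1_usage_example is illustrative, not ggml code):
// Hypothetical usage: scale by 2 and shift by 1.
static void mad1_usage_example(void) {
    const float x[3] = { 1.0f, 2.0f, 3.0f };
    float       y[3];
    ggml_vec_mad1_f32(3, y, x, 2.0f, 1.0f);
    // per the scalar path: y == { 3.0f, 5.0f, 7.0f }
}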
//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
@ -430,12 +469,12 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
// leftovers
for (int i = np; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
}
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
}
#endif
}
@ -444,103 +483,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
float v = GGML_FP16_TO_FP32(x[i]);
y[i] = GGML_FP32_TO_FP16(v*v);
float v = GGML_CPU_FP16_TO_FP32(x[i]);
y[i] = GGML_CPU_FP32_TO_FP16(v*v);
}
}
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
}
}
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
}
}
inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
}
}
inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
}
}
inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
}
}
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
float v = GGML_FP16_TO_FP32(x[i]);
y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
float v = GGML_CPU_FP16_TO_FP32(x[i]);
y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
}
}
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
}
}
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
}
}
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
}
}
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
float v = GGML_FP16_TO_FP32(x[i]);
y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
float v = GGML_CPU_FP16_TO_FP32(x[i]);
y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
}
}
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
for (int i = 0; i < n; ++i) {
float v = GGML_FP16_TO_FP32(x[i]);
y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
float v = GGML_CPU_FP16_TO_FP32(x[i]);
y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
}
}
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
}
}
// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
float v = GGML_FP16_TO_FP32(x[i]);
y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
float v = GGML_CPU_FP16_TO_FP32(x[i]);
y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
}
}
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
}
}
inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
}
}
@ -562,9 +601,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
float xi = GGML_FP16_TO_FP32(x[i]);
float xi = GGML_CPU_FP16_TO_FP32(x[i]);
float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
y[i] = GGML_FP32_TO_FP16(res);
y[i] = GGML_CPU_FP32_TO_FP16(res);
}
}
@ -577,9 +616,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
} else if (x[i] >= 10.0f) {
y[i] = x[i];
} else {
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
memcpy(&t, &fp16, sizeof(uint16_t));
y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
}
}
}
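The GGML_GELU_FP16 path above indexes a precomputed table by the raw f16 bit pattern of the rounded input. A hypothetical sketch of how such a table could be filled; ggml initializes ggml_table_gelu_f16 elsewhere, and the exact code may differ (the names with _example are illustrative).
// Hypothetical illustration: store GELU(x) as f16 for every possible f16 input pattern,
// assuming the f32 <-> f16 conversion itself is already available.
static ggml_fp16_t table_gelu_f16_example[1 << 16];
static void init_gelu_table_example(void) {
    for (uint32_t u = 0; u < (1u << 16); ++u) {
        const uint16_t bits = (uint16_t) u;
        ggml_fp16_t h;
        memcpy(&h, &bits, sizeof(h));                      // reinterpret the index as an f16 value
        const float x = GGML_CPU_FP16_TO_FP32(h);
        table_gelu_f16_example[u] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(x));
    }
}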
@ -613,9 +652,9 @@ inline static float ggml_gelu_quick_f32(float x) {
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
uint16_t t;
for (int i = 0; i < n; ++i) {
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
memcpy(&t, &fp16, sizeof(uint16_t));
y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
}
}
#else
@ -628,8 +667,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
float v = GGML_FP16_TO_FP32(x[i]);
y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
float v = GGML_CPU_FP16_TO_FP32(x[i]);
y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
}
}
@ -638,8 +677,8 @@ inline static float ggml_silu_f32(float x) {
return x/(1.0f + expf(-x));
}
inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
float v = GGML_FP16_TO_FP32(x);
return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
float v = GGML_CPU_FP16_TO_FP32(x);
return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
}
#if __FINITE_MATH_ONLY__
@ -888,9 +927,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) {
}
inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
const float v = GGML_FP16_TO_FP32(x);
const float v = GGML_CPU_FP16_TO_FP32(x);
const float s = 1.0f/(1.0f + expf(-v));
return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
}
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
@ -905,6 +944,100 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con
}
}
inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
for (int i = 0; i < n; ++i) {
y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
}
}
inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
for (int i = 0; i < n; ++i) {
float v = GGML_CPU_FP16_TO_FP32(x[i]);
y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
}
}
#ifdef GGML_GELU_FP16
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
uint16_t t;
for (int i = 0; i < n; ++i) {
if (x[i] <= -10.0f) {
y[i] = 0.0f;
} else if (x[i] >= 10.0f) {
y[i] = x[i] * g[i];
} else {
ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
memcpy(&t, &fp16, sizeof(uint16_t));
y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
}
}
}
#else
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
for (int i = 0; i < n; ++i) {
y[i] = ggml_gelu_f32(x[i]) * g[i];
}
}
#endif
inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
const uint16_t * i16 = (const uint16_t *) x;
for (int i = 0; i < n; ++i) {
float v = GGML_CPU_FP16_TO_FP32(g[i]);
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
}
}
void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
for (int i = 0; i < n; ++i) {
float v = GGML_CPU_FP16_TO_FP32(x[i]);
float w = GGML_CPU_FP16_TO_FP32(g[i]);
y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
}
}
inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
for (int i = 0; i < n; ++i) {
float xi = x[i];
y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
}
}
inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
for (int i = 0; i < n; ++i) {
float xi = GGML_CPU_FP16_TO_FP32(x[i]);
float gi = GGML_CPU_FP16_TO_FP32(g[i]);
y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
}
}
#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
uint16_t t;
for (int i = 0; i < n; ++i) {
ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
memcpy(&t, &fp16, sizeof(uint16_t));
y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
}
}
#else
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
for (int i = 0; i < n; ++i) {
y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
}
}
#endif
inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
const uint16_t * i16 = (const uint16_t *) x;
for (int i = 0; i < n; ++i) {
float v = GGML_CPU_FP16_TO_FP32(g[i]);
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
}
}
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
ggml_float sum = 0.0;
@ -928,7 +1061,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float
inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
float sum = 0.0f;
for (int i = 0; i < n; ++i) {
sum += GGML_FP16_TO_FP32(x[i]);
sum += GGML_CPU_FP16_TO_FP32(x[i]);
}
*s = sum;
}

View File

@ -76,11 +76,9 @@
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
// Moore Threads
#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
@ -177,6 +175,23 @@ static const char * cu_get_error_str(CUresult err) {
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
do { \
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
const int id = ggml_cuda_get_device(); \
if (!shared_memory_limit_raised[id]) { \
CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
shared_memory_limit_raised[id] = true; \
} \
} while (0)
#else
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
do { \
GGML_UNUSED(nbytes); \
} while (0)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
@ -203,9 +218,9 @@ typedef float2 dfloat2;
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
#define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
#define FP16_MMA_AVAILABLE
@ -219,9 +234,9 @@ typedef float2 dfloat2;
#define CP_ASYNC_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
#define FLASH_ATTN_AVAILABLE
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
static bool fp16_available(const int cc) {
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
@ -233,7 +248,8 @@ static bool fast_fp16_available(const int cc) {
// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fast_fp16_hardware_available(const int cc) {
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
}
// Any FP16 tensor core instructions are available for ggml code.
@ -242,7 +258,8 @@ static bool fp16_mma_available(const int cc) {
return false;
#else
if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
GGML_CUDA_CC_IS_MTHREADS(cc)) {
return true;
} else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
@ -259,11 +276,16 @@ static bool fp16_mma_available(const int cc) {
// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fp16_mma_hardware_available(const int cc) {
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
}
static bool bf16_mma_hardware_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE;
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
}
static bool fp32_mma_hardware_available(const int cc) {
return GGML_CUDA_CC_IS_CDNA(cc);
}
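These predicates are meant to drive feature selection at runtime, as the cuBLAS changes later in this diff do. A hypothetical dispatch sketch (not code from this diff) using them:
// Hypothetical dispatch: choose a matrix-multiply path from the packed
// compute-capability value used throughout this header.
static int choose_mma_path_example(const int cc) {
    if (bf16_mma_hardware_available(cc)) {
        return 2; // BF16 tensor cores
    }
    if (fp16_mma_hardware_available(cc)) {
        return 1; // FP16 tensor cores
    }
    return 0;     // plain FP32 path (covers the fp32_mma_hardware_available(cc) case too)
}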
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.

View File

@ -728,3 +728,25 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
return nullptr;
}
}
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_cuda<float, nv_bfloat16>;
case GGML_TYPE_F16:
return convert_unary_cuda<half, nv_bfloat16>;
default:
return nullptr;
}
}
to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F16:
return convert_unary_cuda<half, float>;
case GGML_TYPE_BF16:
return convert_unary_cuda<nv_bfloat16, float>;
default:
return nullptr;
}
}

View File

@ -22,5 +22,10 @@ using to_t_nc_cuda_t = void (*)(const void * x, T * y,
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
typedef to_t_nc_cuda_t<float> to_fp32_nc_cuda_t;
typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
typedef to_t_nc_cuda_t<nv_bfloat16> to_bf16_nc_cuda_t;
to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);

View File

@ -123,13 +123,7 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
if (nbytes_shared <= smpbo) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shared_memory_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
shared_memory_limit_raised[id] = true;
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_f32<true>), smpbo);
cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
} else {
cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
@ -175,13 +169,7 @@ void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_ten
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
if (nbytes_shared <= smpbo) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shared_memory_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
shared_memory_limit_raised[id] = true;
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_back_f32<true>), smpbo);
cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
} else {
cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);

View File

@ -32,7 +32,9 @@ typedef void (* fattn_kernel_t)(
const int ne12,
const int ne13,
const int ne31,
const int ne32,
const int nb31,
const int nb32,
const int nb01,
const int nb02,
const int nb03,
@ -851,7 +853,8 @@ void launch_fattn(
scale, max_bias, m0, m1, n_head_log2, logit_softcap,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0,
mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0,
Q->nb[1], Q->nb[2], Q->nb[3],
nb11, nb12, nb13,
nb21, nb22, nb23,

View File

@ -1223,7 +1223,9 @@ static __global__ void flash_attn_ext_f16(
const int ne12,
const int ne13,
const int ne31,
const int ne32,
const int nb31,
const int nb32,
const int nb01,
const int nb02,
const int nb03,
@ -1288,7 +1290,8 @@ static __global__ void flash_attn_ext_f16(
const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2);
const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio));
const half2 * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr;
const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr :
(const half2 *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1);
float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2);
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio));
@ -1327,7 +1330,8 @@ static __global__ void flash_attn_ext_f16(
const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2);
const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio));
const half2 * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr;
const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr :
(const half2 *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1);
float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2);
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio));
@ -1348,8 +1352,8 @@ static __global__ void flash_attn_ext_f16(
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
GGML_UNUSED(ne2); GGML_UNUSED(ne3);

View File

@ -6,7 +6,7 @@
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
__launch_bounds__(nwarps*WARP_SIZE, 2)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f16(
const char * __restrict__ Q,
@ -30,7 +30,9 @@ static __global__ void flash_attn_tile_ext_f16(
const int ne12,
const int ne13,
const int ne31,
const int ne32,
const int nb31,
const int nb32,
const int nb01,
const int nb02,
const int nb03,
@ -64,7 +66,7 @@ static __global__ void flash_attn_tile_ext_f16(
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.z / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic0;
const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
const int stride_KV2 = nb11 / sizeof(half2);
@ -288,8 +290,8 @@ static __global__ void flash_attn_tile_ext_f16(
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);

View File

@ -6,7 +6,7 @@
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
__launch_bounds__(nwarps*WARP_SIZE, 2)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f32(
const char * __restrict__ Q,
@ -30,7 +30,9 @@ static __global__ void flash_attn_tile_ext_f32(
const int ne12,
const int ne13,
const int ne31,
const int ne32,
const int nb31,
const int nb32,
const int nb01,
const int nb02,
const int nb03,
@ -58,8 +60,8 @@ static __global__ void flash_attn_tile_ext_f32(
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
@ -76,7 +78,7 @@ static __global__ void flash_attn_tile_ext_f32(
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.z / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic0;
const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
const int stride_KV2 = nb11 / sizeof(half2);
@ -297,14 +299,14 @@ static __global__ void flash_attn_tile_ext_f32(
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
GGML_UNUSED(ne31); GGML_UNUSED(ne32);
GGML_UNUSED(nb31); GGML_UNUSED(nb32);
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}

View File

@ -27,7 +27,9 @@ static __global__ void flash_attn_vec_ext_f16(
const int ne12,
const int ne13,
const int ne31,
const int ne32,
const int nb31,
const int nb32,
const int nb01,
const int nb02,
const int nb03,
@ -68,7 +70,7 @@ static __global__ void flash_attn_vec_ext_f16(
K += nb12*(blockIdx.z / gqa_ratio);
V += nb22*(blockIdx.z / gqa_ratio);
const half * maskh = (const half *) mask + ne11*ic0;
const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
const half slopeh = __float2half(slopef);
@ -342,8 +344,8 @@ static __global__ void flash_attn_vec_ext_f16(
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);

View File

@ -27,7 +27,9 @@ static __global__ void flash_attn_vec_ext_f32(
const int ne12,
const int ne13,
const int ne31,
const int ne32,
const int nb31,
const int nb32,
const int nb01,
const int nb02,
const int nb03,
@ -51,8 +53,8 @@ static __global__ void flash_attn_vec_ext_f32(
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
@ -79,7 +81,8 @@ static __global__ void flash_attn_vec_ext_f32(
Q += nb02* blockIdx.z + nb01*ic0;
K += nb12*(blockIdx.z / gqa_ratio);
V += nb22*(blockIdx.z / gqa_ratio); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic0;
const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
@ -334,13 +337,15 @@ static __global__ void flash_attn_vec_ext_f32(
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
GGML_UNUSED(ne31); GGML_UNUSED(ne32);
GGML_UNUSED(nb31); GGML_UNUSED(nb32);
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}

View File

@ -9,7 +9,11 @@
#ifdef FP16_MMA_AVAILABLE
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#include <mma.h>
#ifdef GGML_USE_MUSA
namespace wmma = mtmusa::wmma;
#else // GGML_USE_MUSA
namespace wmma = nvcuda::wmma;
#endif // GGML_USE_MUSA
#elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
#undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
#include <rocwmma/rocwmma.hpp>
@ -42,7 +46,9 @@ static __global__ void flash_attn_ext_f16(
const int ne12,
const int ne13,
const int ne31,
const int ne32,
const int nb31,
const int nb32,
const int nb01,
const int nb02,
const int nb03,
@ -90,11 +96,11 @@ static __global__ void flash_attn_ext_f16(
constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float * Q_f = (const float *) (Q + nb02* blockIdx.z + nb01*ic0);
const half * K_h = (const half *) (K + nb12*(blockIdx.z / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + (nb31/sizeof(half))* ic0;
const half2 * mask2 = (const half2 *) mask + (nb31/sizeof(half))*(ic0/2);
const float * Q_f = (const float *) (Q + nb02* blockIdx.z + nb01*ic0);
const half * K_h = (const half *) (K + nb12*(blockIdx.z / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
const half2 * mask2 = (const half2 *) maskh;
const int stride_Q = nb01 / sizeof(float);
const int stride_KV = nb11 / sizeof(half);
@ -436,7 +442,7 @@ static __global__ void flash_attn_ext_f16(
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
GGML_UNUSED(ne31); GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);

View File

@ -168,6 +168,10 @@ static void ggml_cuda_get_rows_switch_src0_type(
get_rows_cuda_float((const float *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_I32:
get_rows_cuda_float((const int32_t *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_BF16:
get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
@ -210,6 +214,10 @@ void get_rows_cuda(
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_I32:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (int32_t *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_F16:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);

View File

@ -43,6 +43,7 @@
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/wkv.cuh"
#include "ggml-cuda/gla.cuh"
#include "ggml-cuda/set-rows.cuh"
#include "ggml.h"
#include <algorithm>
@ -1227,9 +1228,12 @@ static void ggml_cuda_op_mul_mat_cublas(
const int cc = ggml_cuda_info().devices[id].cc;
const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
if (src1->type != GGML_TYPE_BF16) {
const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
@ -1257,7 +1261,7 @@ static void ggml_cuda_op_mul_mat_cublas(
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
} else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
} else if (fast_fp16_hardware_available(cc) && use_fp16) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
if (src0->type != GGML_TYPE_F16) {
@ -1746,7 +1750,7 @@ static void ggml_cuda_op_mul_mat(
}
static __global__ void k_compute_batched_ptrs(
const half * src0_as_f16, const half * src1_as_f16, char * dst,
const void * src0_as_f16, const void * src1_as_f16, char * dst,
const void ** ptrs_src, void ** ptrs_dst,
int64_t ne12, int64_t ne13,
int64_t ne23,
@ -1769,83 +1773,131 @@ static __global__ void k_compute_batched_ptrs(
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
}
static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
// Type traits for mapping ggml types to CUDA/cuBLAS types
template<ggml_type T>
struct batched_mul_mat_traits;
template<>
struct batched_mul_mat_traits<GGML_TYPE_F32> {
using cuda_type = float;
static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
static inline const cudaDataType_t data_type = CUDA_R_32F;
static inline const ggml_type ggml_type_val = GGML_TYPE_F32;
static inline const float alpha = 1.0f;
static inline const float beta = 0.0f;
static inline const void* get_alpha() { static const float val = alpha; return &val; }
static inline const void* get_beta() { static const float val = beta; return &val; }
static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp32_nc_cuda(src_type); }
};
template<>
struct batched_mul_mat_traits<GGML_TYPE_BF16> {
using cuda_type = nv_bfloat16;
static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
static inline const cudaDataType_t data_type = CUDA_R_16BF;
static inline const ggml_type ggml_type_val = GGML_TYPE_BF16;
static inline const float alpha = 1.0f;
static inline const float beta = 0.0f;
static inline const void* get_alpha() { static const float val = alpha; return &val; }
static inline const void* get_beta() { static const float val = beta; return &val; }
static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_bf16_nc_cuda(src_type); }
};
template<>
struct batched_mul_mat_traits<GGML_TYPE_F16> {
using cuda_type = half;
static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
static inline const cudaDataType_t data_type = CUDA_R_16F;
static inline const ggml_type ggml_type_val = GGML_TYPE_F16;
static inline const half alpha = 1.0;
static inline const half beta = 0.0;
static inline const void* get_alpha() { static const half val = alpha; return &val; }
static inline const void* get_beta() { static const half val = beta; return &val; }
static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp16_nc_cuda(src_type); }
};
template<ggml_type src0_type>
static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
using traits = batched_mul_mat_traits<src0_type>;
using cuda_t = typename traits::cuda_type;
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft));
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src0->type == src0_type);
GGML_ASSERT(ggml_is_contiguous(dst));
// Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
// As long as dst is contiguous this does not matter though.
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t ne_dst = ggml_nelements(dst);
cudaStream_t main_stream = ctx.stream();
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
const half * src0_f16 = (const half *) src0->data;
float * dst_ddf = (float *) dst->data;
const half * src1_f16 = (const half *) src1->data;
const size_t ts_src1 = ggml_type_size(src1->type);
GGML_ASSERT(nb10 == ts_src1);
int64_t s11 = nb11 / ts_src1;
int64_t s12 = nb12 / ts_src1;
int64_t s13 = nb13 / ts_src1;
ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
// convert src1 to fp16
if (src1->type != GGML_TYPE_F16) {
const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
const cuda_t * src0_ptr = nullptr;
const cuda_t * src1_ptr = nullptr;
ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
// Handle src0
src0_ptr = (const cuda_t *) src0->data;
// Handle src1 - convert if necessary
if (src1->type == src0_type) {
src1_ptr = (const cuda_t *) src1->data;
} else {
// Convert src1 to target type using traits conversion functions
const int64_t ne_src1 = ggml_nelements(src1);
src1_f16_alloc.alloc(ne_src1);
GGML_ASSERT(to_fp16_cuda != nullptr);
src1_alloc.alloc(ne_src1);
to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
src1_f16 = src1_f16_alloc.get();
const auto convert_func = traits::get_nc_converter(src1->type);
GGML_ASSERT(convert_func != nullptr);
convert_func(src1->data, src1_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
src1_ptr = src1_alloc.get();
s11 = ne10;
s12 = ne11*s11;
s13 = ne12*s12;
}
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
// Setup destination buffer
ggml_cuda_pool_alloc<cuda_t> dst_temp(ctx.pool());
char * dst_t;
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
cudaDataType_t cu_data_type = CUDA_R_16F;
// dst strides
size_t nbd2 = dst->nb[2];
size_t nbd3 = dst->nb[3];
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
cublasComputeType_t cu_compute_type = traits::compute_type;
cudaDataType_t cu_data_type = traits::data_type;
cudaDataType_t cu_data_type_a = traits::data_type;
cudaDataType_t cu_data_type_b = traits::data_type;
const void * alpha = traits::get_alpha();
const void * beta = traits::get_beta();
const float alpha_f32 = 1.0f;
const float beta_f32 = 0.0f;
const void * alpha = &alpha_f16;
const void * beta = &beta_f16;
const float beta_f32 = 0.0f;
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
dst_t = (char *) dst_f16.alloc(ne_dst);
nbd2 /= sizeof(float) / sizeof(half);
nbd3 /= sizeof(float) / sizeof(half);
if constexpr (src0_type == GGML_TYPE_F32) {
dst_t = (char *) dst_ddf; // Direct F32 output
} else {
dst_t = (char *) dst_temp.alloc(ne_dst);
nbd2 /= sizeof(float) / sizeof(cuda_t);
nbd3 /= sizeof(float) / sizeof(cuda_t);
}
} else {
dst_t = (char *) dst_ddf;
cu_compute_type = CUBLAS_COMPUTE_32F;
cu_data_type = CUDA_R_32F;
cu_data_type = CUDA_R_32F;
alpha = &alpha_f32;
beta = &beta_f32;
beta = &beta_f32;
}
int id = ggml_cuda_get_device();
@ -1853,7 +1905,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
cu_compute_type = CUBLAS_COMPUTE_32F;
alpha = &alpha_f32;
beta = &beta_f32;
beta = &beta_f32;
}
GGML_ASSERT(ne12 % ne02 == 0);
@ -1863,35 +1915,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
#if 0
// use cublasGemmEx
{
for (int i13 = 0; i13 < ne13; ++i13) {
for (int i12 = 0; i12 < ne12; ++i12) {
int i03 = i13 / r3;
int i02 = i12 / r2;
CUBLAS_CHECK(
cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half),
src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11,
beta, ( char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
}
}
#else
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx
CUBLAS_CHECK(
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
src1_f16, CUDA_R_16F, s11, s12, // strideB
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA
src1_ptr, cu_data_type_b, s11, s12, // strideB
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
ne12*ne13,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@ -1902,34 +1934,55 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
size_t src1_stride_size = sizeof(cuda_t);
dim3 block_dims(ne13, ne12);
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
src0_f16, src1_f16, dst_t,
src0_ptr, src1_ptr, dst_t,
ptrs_src.get(), ptrs_dst.get(),
ne12, ne13,
ne23,
nb02, nb03,
src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
(src1->type == src0_type) ? nb12 : s12*src1_stride_size,
(src1->type == src0_type) ? nb13 : s13*src1_stride_size,
nbd2, nbd3,
r2, r3);
CUDA_CHECK(cudaGetLastError());
CUBLAS_CHECK(
cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11,
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
alpha, (const void **) (ptrs_src.get() + 0*ne23), cu_data_type_a, nb01/nb00,
(const void **) (ptrs_src.get() + 1*ne23), cu_data_type_b, s11,
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
ne23,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
#endif
if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type == CUDA_R_16F) {
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
// Convert output back to F32 if needed
if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type != CUDA_R_32F) {
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(traits::ggml_type_val);
to_fp32_cuda(dst_temp.get(), dst_ddf, ne_dst, main_stream);
}
}
static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F32);
switch (src0->type) {
case GGML_TYPE_F32:
ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F32>(ctx, src0, src1, dst);
break;
case GGML_TYPE_BF16:
ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_BF16>(ctx, src0, src1, dst);
break;
case GGML_TYPE_F16:
ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F16>(ctx, src0, src1, dst);
break;
default:
GGML_ABORT("Unsupported type");
}
}
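The batched_mul_mat_traits specializations above centralize the per-type cuBLAS parameters (compute type, data type, alpha/beta, converter) so that one templated implementation covers F32, F16 and BF16, with a small runtime switch picking the instantiation. A stripped-down sketch of that traits-dispatch pattern, using hypothetical names and plain standard C++ rather than the ggml/cuBLAS types:

// Minimal traits-based dispatch sketch (hypothetical types, not the ggml API).
#include <cstdio>

enum class dtype { f32, f16 };

template <dtype T> struct gemm_traits;
template <> struct gemm_traits<dtype::f32> { using scalar = float;          static constexpr const char * name = "f32"; };
template <> struct gemm_traits<dtype::f16> { using scalar = unsigned short; static constexpr const char * name = "f16"; }; // stand-in for half

template <dtype T>
static void run_batched_gemm() {
    using traits = gemm_traits<T>;
    // real code would pick cublasComputeType_t / cudaDataType_t and converters from the traits here
    std::printf("dispatching batched GEMM for %s (%zu-byte scalars)\n", traits::name, sizeof(typename traits::scalar));
}

static void dispatch(dtype t) {
    switch (t) {
        case dtype::f32: run_batched_gemm<dtype::f32>(); break;
        case dtype::f16: run_batched_gemm<dtype::f16>(); break;
    }
}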
@ -1981,6 +2034,12 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
//TODO update for generic tensor parallelism
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
if (!split && use_mul_mat_vec) {
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
@ -1989,8 +2048,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
} else if (!split && use_mul_mat_q) {
ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
!ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
} else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// general KQ + KQV multi-batch without FlashAttention
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_mul_mat_vec) {
@ -2172,6 +2231,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_GET_ROWS_BACK:
ggml_cuda_op_get_rows_back(ctx, dst);
break;
case GGML_OP_SET_ROWS:
ggml_cuda_op_set_rows(ctx, dst);
break;
case GGML_OP_DUP:
ggml_cuda_dup(ctx, dst);
break;
@ -2245,6 +2307,27 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
return false;
}
break;
case GGML_OP_GLU:
switch (ggml_get_glu_op(dst)) {
case GGML_GLU_OP_REGLU:
ggml_cuda_op_reglu(ctx, dst);
break;
case GGML_GLU_OP_GEGLU:
ggml_cuda_op_geglu(ctx, dst);
break;
case GGML_GLU_OP_SWIGLU:
ggml_cuda_op_swiglu(ctx, dst);
break;
case GGML_GLU_OP_GEGLU_ERF:
ggml_cuda_op_geglu_erf(ctx, dst);
break;
case GGML_GLU_OP_GEGLU_QUICK:
ggml_cuda_op_geglu_quick(ctx, dst);
break;
default:
return false;
}
break;
case GGML_OP_NORM:
ggml_cuda_op_norm(ctx, dst);
break;
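The GLU dispatch added in the hunk above (REGLU, GEGLU, SWIGLU, GEGLU_ERF, GEGLU_QUICK) all follow the same gating pattern: one half of the row is passed through an activation and multiplied elementwise with the other half. A CPU sketch of the SwiGLU case follows; the gate/value half ordering and the single-tensor split are assumptions for illustration, not details taken from this diff:

// GLU-family reference: out[i] = act(gate[i]) * value[i], with gate and value the two halves of one row.
// The half ordering is an assumption here, not taken from this diff.
#include <cmath>
#include <vector>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

static std::vector<float> swiglu_row(const std::vector<float> & row) {
    const size_t n = row.size() / 2;                 // split the row into gate and value halves
    std::vector<float> out(n);
    for (size_t i = 0; i < n; ++i) {
        out[i] = silu(row[i]) * row[i + n];          // SwiGLU: silu(gate) * value
    }
    return out;
}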
@ -3038,6 +3121,18 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return false;
}
break;
case GGML_OP_GLU:
switch (ggml_get_glu_op(op)) {
case GGML_GLU_OP_REGLU:
case GGML_GLU_OP_GEGLU:
case GGML_GLU_OP_SWIGLU:
case GGML_GLU_OP_GEGLU_ERF:
case GGML_GLU_OP_GEGLU_QUICK:
return ggml_is_contiguous_1(op->src[0]);
default:
return false;
}
break;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
{
@ -3061,9 +3156,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return false;
}
#ifdef GGML_USE_MUSA
if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
!ggml_is_transposed(a) && !ggml_is_transposed(b)) {
return false;
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT &&
a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) {
return false;
}
if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID &&
a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) {
return false;
}
}
#endif // GGML_USE_MUSA
switch (a->type) {
@ -3090,11 +3192,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_BF16:
#ifdef GGML_USE_MUSA
if (a->type == GGML_TYPE_Q3_K) {
return false;
}
#endif // GGML_USE_MUSA
return true;
default:
return false;
@ -3107,6 +3204,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
switch (op->src[0]->type) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:
case GGML_TYPE_BF16:
case GGML_TYPE_I32:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@ -3121,6 +3220,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
{
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
} break;
case GGML_OP_SET_ROWS:
{
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_I64;
} break;
case GGML_OP_CPY:
{
ggml_type src0_type = op->src[0]->type;
@ -3236,12 +3341,26 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_LOG:
case GGML_OP_SSM_SCAN:
case GGML_OP_SSM_CONV:
return true;
case GGML_OP_SSM_SCAN: {
if (op->src[3]->ne[0] == 1) {
// Mamba2
// (kernel only supports (d_state == 128 || d_state == 256) && d_head % 16 == 0)
return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % 16 == 0;
} else {
// Mamba
// (kernel only supports d_state == 16, d_head == 1, n_head % 128 == 0, n_group == 1)
return op->src[0]->ne[0] == 16 && op->src[0]->ne[1] == 1 && op->src[0]->ne[2] % 128 == 0 && op->src[4]->ne[1] == 1;
}
}
case GGML_OP_SSM_CONV: {
// assumes d_inner % threads == 0
return op->src[0]->ne[1] % 128 == 0;
}
case GGML_OP_CONT:
return op->src[0]->type != GGML_TYPE_BF16;
case GGML_OP_DIAG_MASK_INF:
return true;
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_SOFT_MAX_BACK: {
@ -3266,7 +3385,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
@ -3290,6 +3408,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
if (op->src[0]->ne[0] == 192) {
return false;
}
// TODO: support broadcast
// note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14500, but
// the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505
if (op->src[0]->ne[3] != 1) {
return false;
}

View File

@ -3016,14 +3016,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int nbytes_shared = mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc);
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shared_memory_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared));
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared));
shared_memory_limit_raised[id] = true;
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, MMQ_NWARPS, false>), nbytes_shared);
CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, MMQ_NWARPS, true>), nbytes_shared);
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x;
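The CUDA_SET_SHARED_MEMORY_LIMIT macro that replaces the open-coded block is not shown in this diff; judging from the removed code it presumably wraps the same cudaFuncSetAttribute call behind a per-device "already raised" cache. A hedged sketch of that idea in plain CUDA host code (the real helper in the ggml sources may differ):

// Sketch only: raise the dynamic shared memory limit for a kernel once per device.
#include <cuda_runtime.h>

template <typename Kernel>
static void set_smem_limit_once(Kernel kernel, int nbytes_shared) {
    static bool raised[64] = {false};            // simplistic per-device, per-kernel cache (assumes < 64 devices)
    int dev = 0;
    cudaGetDevice(&dev);
    if (!raised[dev]) {
        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared);
        raised[dev] = true;
    }
}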

View File

@ -456,6 +456,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
return ne11 <= 4;
}
return ne11 <= 3;
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
if (fp32_mma_hardware_available(cc)) {
return ne11 <= 3;
}
return ne11 <= 8;
}
return ne11 <= 8;
case GGML_TYPE_F16:
@ -468,6 +473,14 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
return src0_small && ne11 <= 3;
}
return ne11 <= 8;
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
if (fp16_mma_hardware_available(cc)) {
if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
return ne11 <= 5;
}
return ne11 <= 2;
}
return ne11 <= 8;
}
return ne11 <= 8;
case GGML_TYPE_BF16:
@ -480,6 +493,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
return src0_small && ne11 <= 3;
}
return ne11 <= 8;
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
if (bf16_mma_hardware_available(cc)) {
return ne11 <= 3;
}
return ne11 <= 8;
}
return ne11 <= 8;
default:

View File

@ -50,21 +50,19 @@ static __global__ void rope_norm(
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= n_dims) {
const int i = row_dst*ne0 + i0;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
return;
}
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
const int idst = row_dst*ne0 + i0;
const int ix = channel_x*s2 + row_x*s1 + i0;
if (i0 >= n_dims) {
dst[idst + 0] = x[ix + 0];
dst[idst + 1] = x[ix + 1];
return;
}
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@ -94,21 +92,19 @@ static __global__ void rope_neox(
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= n_dims) {
const int i = row_dst*ne0 + i0;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
return;
}
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
const int idst = row_dst*ne0 + i0/2;
const int ix = channel_x*s2 + row_x*s1 + i0/2;
if (i0 >= n_dims) {
dst[idst + i0/2 + 0] = x[ix + i0/2 + 0];
dst[idst + i0/2 + 1] = x[ix + i0/2 + 1];
return;
}
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@ -138,21 +134,19 @@ static __global__ void rope_multi(
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= n_dims) {
const int i = row_dst*ne0 + i0;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
return;
}
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
const int idst = row_dst*ne0 + i0/2;
const int ix = channel_x*s2 + row_x*s1 + i0/2;
if (i0 >= n_dims) {
dst[idst + i0/2 + 0] = x[ix + i0/2 + 0];
dst[idst + i0/2 + 1] = x[ix + i0/2 + 1];
return;
}
const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
const int sec_w = sections.v[1] + sections.v[0];
const int sector = (i0 / 2) % sect_dims;
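All three kernels above compute the same per-pair rotation once the i0 >= n_dims early-out has been taken care of: theta_base = pos * theta_scale^(i0/2), optionally divided by a frequency factor, then the pair (x0, x1) is rotated by the resulting angle. A scalar sketch of that rotation, with the pair layout left abstract since it differs between the norm and neox/multi variants, and with the attention/extrapolation scaling from the full kernels omitted:

// Rotate one (x0, x1) pair by theta, as the rope_* kernels do after deriving
// theta from theta_base = pos * theta_scale^(i0/2) (further scaling omitted here).
#include <cmath>

static void rope_rotate_pair(float x0, float x1, float theta, float * out0, float * out1) {
    const float c = std::cos(theta);
    const float s = std::sin(theta);
    *out0 = x0 * c - x1 * s;
    *out1 = x0 * s + x1 * c;
}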

View File

@ -1,18 +1,18 @@
#include "scale.cuh"
static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = scale * x[i];
dst[i] = scale * x[i] + bias;
}
static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
}
void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -25,7 +25,9 @@ void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT( dst->type == GGML_TYPE_F32);
float scale;
memcpy(&scale, dst->op_params, sizeof(float));
float bias;
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
memcpy(&bias, (float *) dst->op_params + 1, sizeof(float));
scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
scale_f32_cuda(src0_d, dst_d, scale, bias, ggml_nelements(src0), stream);
}
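With this change GGML_OP_SCALE computes dst = scale * x + bias, with both constants packed into op_params (scale at float index 0, bias at float index 1), exactly as the memcpy reads above expect. A small host-side sketch of packing the two floats the same way (illustrative only, not the ggml public API):

// Illustrative packing of {scale, bias} into a 32-bit op_params array, mirroring the memcpy reads above.
#include <cstdint>
#include <cstring>

static void pack_scale_bias(int32_t op_params[2], float scale, float bias) {
    std::memcpy(&op_params[0], &scale, sizeof(float)); // read back with memcpy(&scale, (float *) params + 0, ...)
    std::memcpy(&op_params[1], &bias,  sizeof(float)); // read back with memcpy(&bias,  (float *) params + 1, ...)
}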

View File

@ -0,0 +1,130 @@
#include "set-rows.cuh"
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
template<typename src_t, typename dst_t>
__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {}
template<>
__device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
*dst_h = __float2half(*src_f);
}
template<>
__device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
*dst_f = *src_f;
}
template<typename src_t, typename dst_t>
static __global__ void k_set_rows(
const src_t * __restrict__ src0, const int64_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t s10, const int64_t s11, const int64_t s12,
const int64_t s1, const int64_t s2, const int64_t s3) {
const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
if (i >= ne_total) {
return;
}
const int64_t i03 = i / (ne00 * ne01 * ne02);
const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
const int64_t i12 = i03 % ne12;
const int64_t i11 = i02 % ne11;
const int64_t i10 = i01;
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3;
const src_t* src_elem = src0_row + i00;
dst_t* dst_elem = dst_row_ptr + i00;
set_rows_1(src_elem, dst_elem);
}
template<typename src_t, typename dst_t>
static void set_rows_cuda(
const src_t * src0_d, const int64_t * src1_d, dst_t * dst_d,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const size_t nb01, const size_t nb02, const size_t nb03,
const size_t nb10, const size_t nb11, const size_t nb12,
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
const int num_blocks = (ne_total + CUDA_SET_ROWS_BLOCK_SIZE - 1) / CUDA_SET_ROWS_BLOCK_SIZE;
const dim3 block_size(CUDA_SET_ROWS_BLOCK_SIZE);
const dim3 grid_size(num_blocks);
const int64_t s01 = nb01/sizeof(src_t);
const int64_t s02 = nb02/sizeof(src_t);
const int64_t s03 = nb03/sizeof(src_t);
const int64_t s10 = nb10/sizeof(int64_t);
const int64_t s11 = nb11/sizeof(int64_t);
const int64_t s12 = nb12/sizeof(int64_t);
const int64_t s1 = nb1/sizeof(dst_t);
const int64_t s2 = nb2/sizeof(dst_t);
const int64_t s3 = nb3/sizeof(dst_t);
if (ne_total > 0) {
k_set_rows<<<grid_size, block_size, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
s01, s02, s03,
s10, s11, s12,
s1, s2, s3);
}
}
void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_I64);
GGML_TENSOR_BINARY_OP_LOCALS
const float * src0_d = (const float *)src0->data;
const int64_t * src1_d = (const int64_t *)src1->data;
cudaStream_t stream = ctx.stream();
if (dst->type == GGML_TYPE_F32) {
set_rows_cuda(
src0_d, src1_d, (float*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
nb01, nb02, nb03,
nb10, nb11, nb12,
nb1, nb2, nb3,
stream
);
} else if (dst->type == GGML_TYPE_F16) {
set_rows_cuda(
src0_d, src1_d, (half*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
nb01, nb02, nb03,
nb10, nb11, nb12,
nb1, nb2, nb3,
stream
);
} else {
GGML_ABORT("unsupported type");
}
}
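The new kernel scatters rows of src0 into dst at positions given by the int64 indices in src1, converting F32 to the destination type element by element. A CPU reference of the contiguous, non-broadcast case (the kernel above additionally handles byte strides and broadcasting of the index tensor over dims 2/3):

// Contiguous set-rows reference: dst row idx[r] = src row r, for every source row r.
#include <cstdint>
#include <vector>

static void set_rows_ref(const std::vector<float> & src, int64_t ncols,
                         const std::vector<int64_t> & idx, std::vector<float> & dst) {
    for (size_t r = 0; r < idx.size(); ++r) {
        const int64_t dst_row = idx[r];
        for (int64_t c = 0; c < ncols; ++c) {
            dst[dst_row*ncols + c] = src[r*ncols + c];   // per-element copy (or F32->F16 conversion on CUDA)
        }
    }
}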

View File

@ -0,0 +1,7 @@
#pragma once
#include "common.cuh"
#define CUDA_SET_ROWS_BLOCK_SIZE 256
void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@ -2,6 +2,7 @@
#include "ggml.h"
#include "softmax.cuh"
#include <cstdint>
#include <utility>
template <typename T>
static __device__ __forceinline__ float t2f32(T val) {
@ -13,6 +14,29 @@ __device__ float __forceinline__ t2f32<half>(half val) {
return __half2float(val);
}
struct soft_max_params {
int64_t nheads;
uint32_t n_head_log2;
int64_t ncols;
int64_t nrows_x;
int64_t nrows_y;
int64_t ne00;
int64_t ne01;
int64_t ne02;
int64_t ne03;
int64_t nb11;
int64_t nb12;
int64_t nb13;
int64_t ne12;
int64_t ne13;
float scale;
float max_bias;
float m0;
float m1;
};
// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
// As we want to keep pragma unroll for all other cases, we suppress the clang transformation warning here.
#ifdef __clang__
@ -21,16 +45,24 @@ __device__ float __forceinline__ t2f32<half>(half val) {
#endif // __clang__
template <bool use_shared, int ncols_template, int block_size_template, typename T>
static __global__ void soft_max_f32(
const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
const float * x, const T * mask, float * dst, const soft_max_params p) {
const int ncols = ncols_template == 0 ? p.ncols : ncols_template;
const int tid = threadIdx.x;
const int rowx = blockIdx.x;
const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
const int64_t i03 = blockIdx.z;
const int64_t i02 = blockIdx.y;
const int64_t i01 = blockIdx.x;
//TODO: noncontiguous inputs/outputs
const int rowx = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
const int64_t i11 = i01;
const int64_t i12 = i02 % p.ne12;
const int64_t i13 = i03 % p.ne13;
x += int64_t(rowx)*ncols;
mask += int64_t(rowy)*ncols * (mask != nullptr);
mask += (i11*p.nb11 + i12*p.nb12 + i13*p.nb13) / sizeof(T) * (mask != nullptr);
dst += int64_t(rowx)*ncols;
const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
@ -38,7 +70,7 @@ static __global__ void soft_max_f32(
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
const float slope = get_alibi_slope(max_bias, rowx/nrows_y, n_head_log2, m0, m1);
const float slope = get_alibi_slope(p.max_bias, i02, p.n_head_log2, p.m0, p.m1);
extern __shared__ float data_soft_max_f32[];
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
@ -55,7 +87,7 @@ static __global__ void soft_max_f32(
break;
}
const float val = x[col]*scale + (mask ? slope*t2f32(mask[col]) : 0.0f);
const float val = x[col]*p.scale + (mask ? slope*t2f32(mask[col]) : 0.0f);
vals[col] = val;
max_val = max(max_val, val);
@ -150,64 +182,58 @@ static __global__ void soft_max_back_f32(
}
}
template<int... Ns, typename T>
static void launch_soft_max_kernels(const float * x, const T * mask, float * dst,
const soft_max_params & p, cudaStream_t stream, dim3 block_dims, dim3 block_nums, size_t nbytes_shared)
{
const int id = ggml_cuda_get_device();
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
auto launch_kernel = [=](auto I) -> bool {
constexpr int ncols = decltype(I)::value;
constexpr int block = (ncols > 1024 ? 1024 : ncols);
if (p.ncols == ncols) {
CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32<true, ncols, block, T>), smpbo);
soft_max_f32<true, ncols, block><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, p);
return true;
}
return false;
};
// unary fold over launch_kernel
if ((launch_kernel(std::integral_constant<int, Ns>{}) || ...)) {
return;
}
//default case
CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32<true, 0, 0, T>), smpbo);
soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>(x, mask, dst, p);
}
template<typename T>
static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const soft_max_params & params, cudaStream_t stream) {
int nth = WARP_SIZE;
const int64_t ncols_x = params.ncols;
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
const dim3 block_dims(nth, 1, 1);
const dim3 block_nums(nrows_x, 1, 1);
const dim3 block_nums(params.ne01, params.ne02, params.ne03);
const size_t nbytes_shared = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
const uint32_t n_head = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
const int id = ggml_cuda_get_device();
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
// FIXME: this limit could be raised by ~2-4x on Ampere or newer
if (nbytes_shared < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
switch (ncols_x) {
case 32:
soft_max_f32<true, 32, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 64:
soft_max_f32<true, 64, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 128:
soft_max_f32<true, 128, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 256:
soft_max_f32<true, 256, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 512:
soft_max_f32<true, 512, 512><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 1024:
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 2048:
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 4096:
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
default:
soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
}
if (nbytes_shared <= smpbo) {
launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(x, mask, dst, params, stream, block_dims, block_nums, nbytes_shared);
} else {
const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, params);
}
}
@ -235,10 +261,11 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
const int64_t ne00 = src0->ne[0];
const int64_t nrows_x = ggml_nrows(src0);
const int64_t nrows_y = src0->ne[1];
const int64_t ne00 = src0->ne[0];
float scale = 1.0f;
float max_bias = 0.0f;
@ -247,10 +274,44 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
const int64_t nb11 = src1 ? src1->nb[1] : 1;
const int64_t nb12 = src1 ? src1->nb[2] : 1;
const int64_t nb13 = src1 ? src1->nb[3] : 1;
const int64_t ne12 = src1 ? src1->ne[2] : 1;
const int64_t ne13 = src1 ? src1->ne[3] : 1;
const uint32_t n_head = src0->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
soft_max_params params = {};
params.nheads = src0->ne[2];
params.n_head_log2 = n_head_log2;
params.ncols = ne00;
params.nrows_x = nrows_x;
params.nrows_y = nrows_y;
params.ne00 = src0->ne[0];
params.ne01 = src0->ne[1];
params.ne02 = src0->ne[2];
params.ne03 = src0->ne[3];
params.nb11 = nb11;
params.nb12 = nb12;
params.nb13 = nb13;
params.ne12 = ne12;
params.ne13 = ne13;
params.scale = scale;
params.max_bias = max_bias;
params.m0 = m0;
params.m1 = m1;
if (use_f16) {
soft_max_f32_cuda(src0_d, (const half *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
soft_max_f32_cuda(src0_d, (const half *) src1_d, dst_d, params, stream);
} else {
soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, params, stream);
}
}
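The launch_soft_max_kernels helper in this file replaces the long switch over column counts with a C++17 fold over std::integral_constant: each compile-time ncols is tried in turn and the generic kernel is used as the fallback. A standalone illustration of that dispatch idiom in plain host code, with a hypothetical handler in place of the kernel launch:

// Standalone illustration of the fold-dispatch used by launch_soft_max_kernels.
#include <cstdio>

template <int N>
static bool try_handle(int ncols) {
    if (ncols != N) {
        return false;            // not this specialization, let the fold keep trying
    }
    std::printf("handled with compile-time ncols = %d\n", N);
    return true;
}

template <int... Ns>
static void dispatch(int ncols) {
    if ((try_handle<Ns>(ncols) || ...)) {    // unary fold over ||, stops at the first match
        return;
    }
    std::printf("fallback: runtime ncols = %d\n", ncols);
}

int main() {
    dispatch<32, 64, 128, 256>(96);   // no match -> fallback
    dispatch<32, 64, 128, 256>(128);  // matches the 128 specialization
}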

View File

@ -107,8 +107,11 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
if (nc == 4) {
ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else if (nc == 3) {
ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else {
GGML_ABORT("Only support kernel size = 4 now.");
GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
}
} else {
if (nc == 4) {
@ -116,8 +119,13 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else if (nc == 3) {
const int64_t split_n_t = 32;
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else {
GGML_ABORT("Only support kernel size = 4 right now.");
GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
}
}
}
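The added branches accept kernel size 3 alongside 4; in both cases the kernel computes a depthwise causal 1D convolution over a rolling window of nc past samples. A CPU sketch of that per-channel computation (strides, batching and the multi-channel layout of the real kernel are omitted):

// Depthwise causal conv1d reference for one channel: y[t] = sum_k w[k] * x[t + k],
// where x carries (nc - 1) leading context samples ahead of the n_t outputs.
#include <vector>

static std::vector<float> ssm_conv_ref(const std::vector<float> & x,   // length n_t + nc - 1
                                       const std::vector<float> & w) { // length nc (3 or 4)
    const int nc  = (int) w.size();
    const int n_t = (int) x.size() - nc + 1;
    std::vector<float> y(n_t, 0.0f);
    for (int t = 0; t < n_t; ++t) {
        for (int k = 0; k < nc; ++k) {
            y[t] += w[k] * x[t + k];
        }
    }
    return y;
}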

Some files were not shown because too many files have changed in this diff.