# Compare commits

80 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 4150da9a95 | |
| | 8e2da778da | |
| | ce3bf9b1a4 | |
| | 2bbe4c2cf8 | |
| | 1051ecd289 | |
| | 0c3b7a9efe | |
| | 0e76501e1d | |
| | 4b060bf240 | |
| | 9789e28459 | |
| | 84ae04f163 | |
| | 506bb6e010 | |
| | 79456a690a | |
| | 28068af789 | |
| | 707cbafcaa | |
| | b137718878 | |
| | d2ff4e23ac | |
| | 657a2e644b | |
| | f307926482 | |
| | 7fdc8c893d | |
| | 23f82f2420 | |
| | 2656c0d265 | |
| | 600a366478 | |
| | ea23c15990 | |
| | 9ac2693a30 | |
| | a61c8bc3bf | |
| | 593da7fa49 | |
| | 9e41884dce | |
| | ec8fd7876b | |
| | a180ba78c7 | |
| | 53eb9435da | |
| | d3435efc8a | |
| | f5f8812f7c | |
| | 8ece3836b4 | |
| | 046d5fd44e | |
| | 480160d472 | |
| | 15bff84bf5 | |
| | 2524c26164 | |
| | cb14b06995 | |
| | 55abc39355 | |
| | f2f6c88067 | |
| | 945bf10627 | |
| | 64848deb18 | |
| | 9a5724dee2 | |
| | 9c142e3a2a | |
| | df7fb92170 | |
| | 2038101bd9 | |
| | 568371a726 | |
| | 5b8844ae53 | |
| | 7e16fef085 | |
| | f5245b5e4e | |
| | ae9f8df778 | |
| | 56d2fed2b3 | |
| | 56426673cb | |
| | bb77764c2d | |
| | 9dfa8ee950 | |
| | ca4a8370bc | |
| | 03023296cf | |
| | 8c77a04cc7 | |
| | ffba4f29e6 | |
| | 3333951d86 | |
| | 193ee38a1b | |
| | 95ea9e0861 | |
| | ccbc84a537 | |
| | 68b4d516c3 | |
| | 24af22fc36 | |
| | 07fbe19f1f | |
| | ea13cba850 | |
| | 090b137e56 | |
| | 968929528c | |
| | 3d26a09dc7 | |
| | bd2a93d475 | |
| | e75ee11024 | |
| | da9b8d3300 | |
| | e443fbcfa5 | |
| | 73d284a250 | |
| | df17a4c94f | |
| | 1871f0ba56 | |
| | f47edb8c19 | |
| | da143b9940 | |
| | f1768d8f03 | |
```diff
@@ -33,6 +33,7 @@ FROM ubuntu:$UBUNTU_VERSION AS base
 RUN apt-get update \
     && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
+       libglvnd0 libgl1 libglx0 libegl1 libgles2 \
     && apt autoremove -y \
     && apt clean -y \
     && rm -rf /tmp/* /var/tmp/* \
```

```diff
@@ -152,13 +152,13 @@ jobs:
           DAWN_VERSION="v2.0.0"
           DAWN_OWNER="reeselevine"
           DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.zip"
-          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
           curl -L -o artifact.zip \
-            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
           mkdir dawn
           unzip artifact.zip
-          tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.tar.gz -C dawn --strip-components=1
+          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1

       - name: Build
         id: cmake_build
```

```diff
@@ -532,13 +532,13 @@ jobs:
           DAWN_VERSION="v2.0.0"
           DAWN_OWNER="reeselevine"
           DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.zip"
-          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
           curl -L -o artifact.zip \
-            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
           mkdir dawn
           unzip artifact.zip
-          tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.tar.gz -C dawn --strip-components=1
+          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1

       - name: Build
         id: cmake_build
```

```diff
@@ -1418,7 +1418,6 @@ jobs:
           echo "FIXME: test on devices"

   openEuler-latest-cmake-cann:
-    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
     defaults:
       run:
         shell: bash -el {0}
```

```diff
@@ -1705,6 +1704,34 @@ jobs:
       run: |
         GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

+  ggml-ci-mac-webgpu:
+    runs-on: [self-hosted, macOS, ARM64]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          DAWN_VERSION="v2.0.0"
+          DAWN_OWNER="reeselevine"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+          curl -L -o artifact.zip \
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+          mkdir dawn
+          unzip artifact.zip
+          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
+            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
   ggml-ci-mac-vulkan:
     runs-on: [self-hosted, macOS, ARM64]
```
```diff
@@ -130,6 +130,7 @@ poetry.toml
 # Local scripts
 /run-vim.sh
 /run-chat.sh
+/run-spec.sh
 /.ccache/

 # IDE
```
```diff
@@ -182,6 +182,9 @@ if (NOT MSVC)
     endif()
 endif()

+include("cmake/license.cmake")
+license_add_file("llama.cpp" "LICENSE")
+
 #
 # 3rd-party
 #
```

```diff
@@ -235,6 +238,19 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
     add_subdirectory(tools)
 endif()

+# Automatically add all files from the 'licenses' directory
+file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
+
+foreach(FILE_PATH ${EXTRA_LICENSES})
+    get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
+    string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
+    license_add_file("${NAME}" "${FILE_PATH}")
+endforeach()
+
+if (LLAMA_BUILD_COMMON)
+    license_generate(common)
+endif()
+
 #
 # install
 #
```
**README.md** (17 changed lines)

```diff
@@ -200,6 +200,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [BonzAI App](https://apps.apple.com/us/app/bonzai-your-local-ai-agent/id6752847988) (proprietary)
 - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
 - [Dot](https://github.com/alexpinel/Dot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
```

````diff
@@ -482,21 +483,6 @@ To learn more about model quantization, [read this documentation](tools/quantize

 </details>

-## [`llama-run`](tools/run)
-
-#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
-
-- <details>
-    <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
-
-    ```bash
-    llama-run granite-code
-    ```
-
-    </details>
-
-[^3]: [RamaLama](https://github.com/containers/ramalama)
-
 ## [`llama-simple`](examples/simple)

 #### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
````

```diff
@@ -600,7 +586,6 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 - [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
 - [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
 - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
-- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
 - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
 - [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
 - [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
```
**SECURITY.md** (58 changed lines)

```diff
@@ -1,12 +1,52 @@
 # Security Policy

+- [**Reporting a vulnerability**](#reporting-a-vulnerability)
+- [**Requirements**](#requirements)
+- [**Covered Topics**](#covered-topics)
 - [**Using llama.cpp securely**](#using-llamacpp-securely)
   - [Untrusted models](#untrusted-models)
   - [Untrusted inputs](#untrusted-inputs)
   - [Data privacy](#data-privacy)
   - [Untrusted environments or networks](#untrusted-environments-or-networks)
   - [Multi-Tenant environments](#multi-tenant-environments)
-- [**Reporting a vulnerability**](#reporting-a-vulnerability)
+
+## Reporting a vulnerability
+
+If you have discovered a security vulnerability in this project that falls inside the [covered topics](#covered-topics), please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
+
+Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
+
+This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
+
+> [!IMPORTANT]
+> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
+
+## Requirements
+
+Before submitting your report, ensure you meet the following requirements:
+
+- You have read this policy and fully understand it.
+- AI is only permitted in an assistive capacity as stated in [AGENTS.md](AGENTS.md). We do not accept reports that are written exclusively by AI.
+- Your report must include a working Proof-of-Concept in the form of a script and/or attached files.
+
+Maintainers reserve the right to close the report if these requirements are not fulfilled.
+
+## Covered Topics
+
+Only vulnerabilities that fall within these parts of the project are considered valid. For problems falling outside of this list, please report them as issues.
+
+- `src/**/*`
+- `ggml/**/*`
+- `gguf-py/**/*`
+- `tools/server/*`, **excluding** the following topics:
+  - Web UI
+  - Features marked as experimental
+  - Features not recommended for use in untrusted environments (e.g., router, MCP)
+  - Bugs that can lead to Denial-of-Service attack
+
+Note that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities in LLaMA C++.
+
+For vulnerabilities that fall within the `vendor` directory, please report them directly to the third-party project.
+
 ## Using llama.cpp securely
```

```diff
@@ -55,19 +95,3 @@ If you intend to run multiple models in parallel with shared memory, it is your
 3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.

 4. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side-channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.

-## Reporting a vulnerability
-
-Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
-
-<!-- normal version -->
-However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
-
-Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
-
-Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
-
-A team of volunteers maintains this project on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
-
-> [!IMPORTANT]
-> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
```
**ci/run.sh** (18 changed lines)

```diff
@@ -105,7 +105,20 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+
+    if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
+        if [ -z "${CMAKE_PREFIX_PATH}" ]; then
+            export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}"
+        else
+            export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}:${CMAKE_PREFIX_PATH}"
+        fi
+    fi
+
+    # For some systems, Dawn_DIR needs to be set explicitly, e.g., the lib64 path
+    if [ ! -z "${GG_BUILD_WEBGPU_DAWN_DIR}" ]; then
+        CMAKE_EXTRA="${CMAKE_EXTRA} -DDawn_DIR=${GG_BUILD_WEBGPU_DAWN_DIR}"
+    fi
 fi

 if [ ! -z ${GG_BUILD_MUSA} ]; then
```

```diff
@@ -284,7 +297,8 @@ function gg_sum_test_scripts {
 }

 function gg_get_model {
-    local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
+    #local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
+    local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-q4_0.gguf"
     if [[ -s $gguf_0 ]]; then
         echo -n "$gguf_0"
     else
```
**cmake/license.cmake** (new file; the path is inferred from the `include("cmake/license.cmake")` added in CMakeLists.txt above)

```diff
@@ -0,0 +1,40 @@
+define_property(GLOBAL PROPERTY LICENSE_TEXT
+    BRIEF_DOCS "Embedded licenses"
+    FULL_DOCS "Global string containing all aggregated licenses"
+)
+
+function(license_add_file NAME FILE)
+    if(NOT IS_ABSOLUTE "${FILE}")
+        set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
+    endif()
+    if(EXISTS "${FILE}")
+        set(TITLE "License for ${NAME}")
+        string(REGEX REPLACE "." "=" UNDERLINE "${TITLE}")
+        file(READ "${FILE}" TEXT)
+        get_property(TMP GLOBAL PROPERTY LICENSE_TEXT)
+        string(APPEND TMP "R\"=L=(${TITLE}\n${UNDERLINE}\n\n${TEXT})=L=\",\n")
+        set_property(GLOBAL PROPERTY LICENSE_TEXT "${TMP}")
+    else()
+        message(WARNING "License file '${FILE}' not found")
+    endif()
+endfunction()
+
+function(license_generate TARGET_NAME)
+    message(STATUS "Generating embedded license file for target: ${TARGET_NAME}")
+    get_property(TEXT GLOBAL PROPERTY LICENSE_TEXT)
+
+    set(CPP_CONTENT "// Generated by CMake\n\n")
+    string(APPEND CPP_CONTENT "const char* LICENSES[] = {\n")
+    string(APPEND CPP_CONTENT "${TEXT}")
+    string(APPEND CPP_CONTENT "nullptr\n")
+    string(APPEND CPP_CONTENT "};\n")
+
+    set(CPP_FILE "${CMAKE_BINARY_DIR}/license.cpp")
+    file(WRITE "${CPP_FILE}" "${CPP_CONTENT}")
+
+    if(TARGET ${TARGET_NAME})
+        target_sources(${TARGET_NAME} PRIVATE "${CPP_FILE}")
+    else()
+        message(FATAL_ERROR "Target '${TARGET_NAME}' does not exist")
+    endif()
+endfunction()
```
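Taken together with the CMakeLists.txt hunks above, `license_add_file()` accumulates each license as a C++ raw-string entry and `license_generate(common)` writes a nullptr-terminated array into `${CMAKE_BINARY_DIR}/license.cpp`. A minimal sketch of the generated file's shape and of the iteration pattern the new `--license` flag uses (the single entry here is illustrative; a real build embeds the full license texts):

```cpp
// Illustrative shape of the CMake-generated license.cpp: one raw-string
// entry per license_add_file() call, terminated by nullptr.
#include <cstdio>

// Generated by CMake (sketch only, not the real embedded text)
const char * LICENSES[] = {
    R"=L=(License for llama.cpp
=====================

MIT License (full text would be embedded here))=L=",
    nullptr
};

int main() {
    // common/arg.cpp declares `extern const char * LICENSES[];` and
    // iterates until the nullptr sentinel, exactly like this
    for (int i = 0; LICENSES[i]; ++i) {
        printf("%s\n", LICENSES[i]);
    }
    return 0;
}
```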
```diff
@@ -155,27 +155,3 @@ if (LLAMA_LLGUIDANCE)
 endif ()

 target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
-
-#
-# copy the license files
-#
-
-# Check if running in GitHub Actions
-if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
-    message(STATUS "Running inside GitHub Actions - copying license files")
-
-    # Copy all files from licenses/ to build/bin/
-    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
-    foreach(LICENSE_FILE ${LICENSE_FILES})
-        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
-        add_custom_command(
-            POST_BUILD
-            TARGET ${TARGET}
-            COMMAND ${CMAKE_COMMAND} -E copy_if_different
-                "${LICENSE_FILE}"
-                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
-            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
-        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
-    endforeach()
-endif()
```
**common/arg.cpp** (370 changed lines)

```diff
@@ -2,10 +2,11 @@

 #include "chat.h"
 #include "common.h"
+#include "download.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "download.h"
+#include "preset.h"

 // fix problem with std::min and std::max
 #if defined(_WIN32)
```

```diff
@@ -47,6 +48,8 @@

 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

+extern const char * LICENSES[];
+
 using json = nlohmann::ordered_json;
 using namespace common_arg_utils;
```

```diff
@@ -268,6 +271,55 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
     }
 }

+static std::string clean_file_name(const std::string & fname) {
+    std::string clean_fname = fname;
+    string_replace_all(clean_fname, "\\", "_");
+    string_replace_all(clean_fname, "/", "_");
+    return clean_fname;
+}
+
+static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+    GGML_ASSERT(!params.model.hf_repo.empty());
+
+    // the returned hf_repo is without tag
+    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
+
+    // "latest" tag (default if not specified) is translated to "default" preset
+    if (hf_tag == "latest") {
+        hf_tag = "default";
+    }
+
+    const bool offline = params.offline;
+    std::string model_endpoint = get_model_endpoint();
+    auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
+
+    // prepare local path for caching
+    auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
+    auto preset_path = fs_get_cache_file(preset_fname);
+    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+    const bool has_preset = status >= 200 && status < 400;
+
+    // remote preset is optional, so we don't error out if not found
+    if (has_preset) {
+        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+        common_preset_context ctx(ex, /* only_remote_allowed */ true);
+        common_preset global;
+        auto remote_presets = ctx.load_from_ini(preset_path, global);
+        remote_presets = ctx.cascade(global, remote_presets);
+        if (remote_presets.find(hf_tag) != remote_presets.end()) {
+            common_preset preset = remote_presets.at(hf_tag);
+            LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
+            preset.apply_to_params(params);
+        } else {
+            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
+        }
+    } else {
+        LOG_INF("%s", "no remote preset found, skipping\n");
+    }
+
+    return has_preset;
+}
+
 struct handle_model_result {
     bool found_mmproj = false;
     common_params_model mmproj;
```

```diff
@@ -309,9 +361,7 @@ static handle_model_result common_params_handle_model(
         // make sure model path is present (for caching purposes)
         if (model.path.empty()) {
             // this is to avoid different repo having same file name, or same file name in different subdirs
-            std::string filename = model.hf_repo + "_" + model.hf_file;
-            // to make sure we don't have any slashes in the filename
-            string_replace_all(filename, "/", "_");
+            std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
             model.path = fs_get_cache_file(filename);
         }
```

```diff
@@ -425,61 +475,87 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     };

-    std::set<std::string> seen_args;
-    for (int i = 1; i < argc; i++) {
-        const std::string arg_prefix = "--";
-        std::string arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-        if (arg_to_options.find(arg) == arg_to_options.end()) {
-            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
-        }
-        if (!seen_args.insert(arg).second) {
-            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
-        }
-        auto & tmp = arg_to_options[arg];
-        auto opt = *tmp.first;
-        bool is_positive = tmp.second;
-        if (opt.has_value_from_env()) {
-            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
-        }
-        try {
-            if (opt.handler_void) {
-                opt.handler_void(params);
-                continue;
-            }
-            if (opt.handler_bool) {
-                opt.handler_bool(params, is_positive);
-                continue;
-            }
-
-            // arg with single value
-            check_arg(i);
-            std::string val = argv[++i];
-            if (opt.handler_int) {
-                opt.handler_int(params, std::stoi(val));
-                continue;
-            }
-            if (opt.handler_string) {
-                opt.handler_string(params, val);
-                continue;
-            }
-
-            // arg with 2 values
-            check_arg(i);
-            std::string val2 = argv[++i];
-            if (opt.handler_str_str) {
-                opt.handler_str_str(params, val, val2);
-                continue;
-            }
-        } catch (std::exception & e) {
-            throw std::invalid_argument(string_format(
-                "error while handling argument \"%s\": %s\n\n"
-                "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), opt.to_string().c_str()));
-        }
-    }
+    auto parse_cli_args = [&]() {
+        std::set<std::string> seen_args;
+
+        for (int i = 1; i < argc; i++) {
+            const std::string arg_prefix = "--";
+            std::string arg = argv[i];
+            if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+                std::replace(arg.begin(), arg.end(), '_', '-');
+            }
+            if (arg_to_options.find(arg) == arg_to_options.end()) {
+                throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+            }
+            if (!seen_args.insert(arg).second) {
+                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+            }
+            auto & tmp = arg_to_options[arg];
+            auto opt = *tmp.first;
+            bool is_positive = tmp.second;
+            if (opt.has_value_from_env()) {
+                fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
+            }
+            try {
+                if (opt.handler_void) {
+                    opt.handler_void(params);
+                    continue;
+                }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, is_positive);
+                    continue;
+                }
+
+                // arg with single value
+                check_arg(i);
+                std::string val = argv[++i];
+                if (opt.handler_int) {
+                    opt.handler_int(params, std::stoi(val));
+                    continue;
+                }
+                if (opt.handler_string) {
+                    opt.handler_string(params, val);
+                    continue;
+                }
+
+                // arg with 2 values
+                check_arg(i);
+                std::string val2 = argv[++i];
+                if (opt.handler_str_str) {
+                    opt.handler_str_str(params, val, val2);
+                    continue;
+                }
+            } catch (std::exception & e) {
+                throw std::invalid_argument(string_format(
+                    "error while handling argument \"%s\": %s\n\n"
+                    "usage:\n%s\n\nto show complete usage, run with -h",
+                    arg.c_str(), e.what(), opt.to_string().c_str()));
+            }
+        }
+    };
+
+    // parse the first time to get -hf option (used for remote preset)
+    parse_cli_args();
+
+    // maybe handle remote preset
+    if (!params.model.hf_repo.empty()) {
+        std::string cli_hf_repo = params.model.hf_repo;
+        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+        std::string preset_hf_repo = params.model.hf_repo;
+        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
+        if (has_preset) {
+            // re-parse CLI args to override preset values
+            parse_cli_args();
+        }
+
+        // preserve hf_repo from preset if needed
+        if (preset_has_hf_repo) {
+            params.model.hf_repo = preset_hf_repo;
+        }
+    }
```

```diff
@@ -679,7 +755,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-quantize",
         "llama-qwen2vl-cli",
         "llama-retrieval",
-        "llama-run",
         "llama-save-load-state",
         "llama-server",
         "llama-simple",
```

```diff
@@ -854,6 +929,54 @@ bool common_arg_utils::is_autoy(const std::string & value) {
     return value == "auto" || value == "-1";
 }

+// Simple CSV parser that handles quoted fields and escaped quotes
+// example:
+//   input:  value1,"value, with, commas","value with ""escaped"" quotes",value4
+//   output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
+static std::vector<std::string> parse_csv_row(const std::string& input) {
+    std::vector<std::string> fields;
+    std::string field;
+    bool in_quotes = false;
+
+    for (size_t i = 0; i < input.length(); ++i) {
+        char ch = input[i];
+
+        if (ch == '"') {
+            if (!in_quotes) {
+                // start of quoted field (only valid if at beginning of field)
+                if (!field.empty()) {
+                    // quote appeared in middle of unquoted field, treat as literal
+                    field += '"';
+                } else {
+                    in_quotes = true; // start
+                }
+            } else {
+                if (i + 1 < input.length() && input[i + 1] == '"') {
+                    // escaped quote: ""
+                    field += '"';
+                    ++i; // skip the next quote
+                } else {
+                    in_quotes = false; // end
+                }
+            }
+        } else if (ch == ',') {
+            if (in_quotes) {
+                field += ',';
+            } else {
+                fields.push_back(std::move(field));
+                field.clear();
+            }
+        } else {
+            field += ch;
+        }
+    }
+
+    // Add the last field
+    fields.push_back(std::move(field));
+
+    return fields;
+}
+
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     // per-example default params
     // we define here to make sure it's included in llama-gen-docs
```
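The hunks that follow switch the former `string_split<std::string>(value, ',')` call sites over to this parser, so values that themselves contain commas can be passed by quoting them. A self-contained check of the documented behavior (the function body is condensed from the hunk above; `main` is added here only for illustration):

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// condensed from the parse_csv_row() added in common/arg.cpp above
static std::vector<std::string> parse_csv_row(const std::string & input) {
    std::vector<std::string> fields;
    std::string field;
    bool in_quotes = false;

    for (size_t i = 0; i < input.length(); ++i) {
        const char ch = input[i];
        if (ch == '"') {
            if (!in_quotes) {
                if (!field.empty()) {
                    field += '"';      // literal quote mid-field
                } else {
                    in_quotes = true;  // opening quote
                }
            } else if (i + 1 < input.length() && input[i + 1] == '"') {
                field += '"';          // escaped quote: ""
                ++i;
            } else {
                in_quotes = false;     // closing quote
            }
        } else if (ch == ',' && !in_quotes) {
            fields.push_back(std::move(field)); // field separator
            field.clear();
        } else {
            field += ch;
        }
    }
    fields.push_back(std::move(field)); // last field
    return fields;
}

int main() {
    const auto fields = parse_csv_row(
        "value1,\"value, with, commas\",\"value with \"\"escaped\"\" quotes\",value4");
    assert(fields.size() == 4);
    assert(fields[0] == "value1");
    assert(fields[1] == "value, with, commas");
    assert(fields[2] == "value with \"escaped\" quotes");
    assert(fields[3] == "value4");
    return 0;
}
```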
```diff
@@ -918,6 +1041,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--license"},
+        "show source code license and dependencies",
+        [](common_params &) {
+            for (int i = 0; LICENSES[i]; ++i) {
+                printf("%s\n", LICENSES[i]);
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-cl", "--cache-list"},
         "show list of models in cache",
```

```diff
@@ -1162,7 +1295,7 @@
         [](common_params & params) {
             params.kv_unified = true;
         }
-    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
     add_opt(common_arg(
         {"--context-shift"},
         {"--no-context-shift"},
```

```diff
@@ -1250,7 +1383,7 @@
         {"--in-file"}, "FNAME",
         "an input file (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 std::ifstream file(item);
                 if (!file) {
                     throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
```

```diff
@@ -1397,7 +1530,7 @@
         [](common_params & params, bool value) {
             params.warmup = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
```

```diff
@@ -1713,7 +1846,7 @@
             else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
     add_opt(common_arg(
         {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
```

```diff
@@ -2002,7 +2135,7 @@
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 params.image.emplace_back(item);
             }
         }
```

```diff
@@ -2041,11 +2174,22 @@
     add_opt(common_arg(
         {"--mmap"},
         {"--no-mmap"},
-        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
         [](common_params & params, bool value) {
             params.use_mmap = value;
+            if (value) {
+                params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
+            }
         }
     ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"-dio", "--direct-io"},
+        {"-ndio", "--no-direct-io"},
+        string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_direct_io = value;
+        }
+    ).set_env("LLAMA_ARG_DIO"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
```

```diff
@@ -2197,7 +2341,7 @@
             std::vector<std::string> split_arg{ it, {} };
             if (split_arg.size() >= llama_max_devices()) {
                 throw std::invalid_argument(
-                    string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
                 );
             }
             for (size_t i = 0; i < llama_max_devices(); ++i) {
```

```diff
@@ -2237,10 +2381,28 @@
         }
     ).set_env("LLAMA_ARG_FIT"));
     add_opt(common_arg(
-        { "-fitt", "--fit-target" }, "MiB",
-        string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
-        [](common_params & params, int value) {
-            params.fit_params_target = value * size_t(1024*1024);
+        { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
+        string_format("target margin per device for --fit, comma-separated list of values, "
+            "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
+        [](common_params & params, const std::string & value) {
+            std::string arg_next = value;
+
+            // split string by , and /
+            const std::regex regex{ R"([,/]+)" };
+            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+            if (split_arg.size() >= llama_max_devices()) {
+                throw std::invalid_argument(
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+                );
+            }
+            if (split_arg.size() == 1) {
+                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+                return;
+            }
+            for (size_t i = 0; i < split_arg.size(); i++) {
+                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+            }
         }
     ).set_env("LLAMA_ARG_FIT_TARGET"));
     add_opt(common_arg(
```
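To make the new `--fit-target` semantics concrete: a single value is broadcast to every device, while a comma- or slash-separated list sets per-device targets. A minimal standalone sketch of the same parsing logic, with `llama_max_devices()` stubbed to 4 purely for illustration:

```cpp
// Demo of the --fit-target broadcast-vs-list behavior; the regex split and
// the fill/assign logic mirror the handler above. Not the real API surface.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

static size_t llama_max_devices() { return 4; } // stub, assumption for the demo

int main() {
    std::vector<size_t> fit_params_target(llama_max_devices(), 0);
    const std::string value = "1024/2048"; // as if passed via --fit-target 1024/2048

    // split string by , and /
    const std::regex regex{ R"([,/]+)" };
    std::sregex_token_iterator it{ value.begin(), value.end(), regex, -1 };
    std::vector<std::string> split_arg{ it, {} };

    if (split_arg.size() == 1) {
        // single value: broadcast to every device
        std::fill(fit_params_target.begin(), fit_params_target.end(),
                  std::stoul(split_arg[0]) * 1024 * 1024);
    } else {
        // per-device values; devices beyond the list keep their previous targets
        for (size_t i = 0; i < split_arg.size(); i++) {
            fit_params_target[i] = std::stoul(split_arg[i]) * 1024 * 1024;
        }
    }

    for (size_t t : fit_params_target) {
        std::cout << t / (1024 * 1024) << " MiB\n"; // prints 1024, 2048, 0, 0
    }
    return 0;
}
```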
```diff
@@ -2259,37 +2421,12 @@
     ));
     add_opt(common_arg(
         {"--override-kv"}, "KEY=TYPE:VALUE,...",
-        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
+        "advanced option to override model metadata by key. to specify multiple overrides, use comma-separated values.\n"
         "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
         [](common_params & params, const std::string & value) {
-            std::vector<std::string> kv_overrides;
-            std::string current;
-            bool escaping = false;
-
-            for (const char c : value) {
-                if (escaping) {
-                    current.push_back(c);
-                    escaping = false;
-                } else if (c == '\\') {
-                    escaping = true;
-                } else if (c == ',') {
-                    kv_overrides.push_back(current);
-                    current.clear();
-                } else {
-                    current.push_back(c);
-                }
-            }
-
-            if (escaping) {
-                current.push_back('\\');
-            }
-
-            kv_overrides.push_back(current);
-
-            for (const auto & kv_override : kv_overrides) {
-                if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
-                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
+            for (const auto & item : parse_csv_row(value)) {
+                if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
+                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
                 }
             }
         }
```

```diff
@@ -2306,7 +2443,7 @@
         {"--lora"}, "FNAME",
         "path to LoRA adapter (use comma-separated values to load multiple adapters)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
             }
         }
```

```diff
@@ -2317,7 +2454,7 @@
         "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
         "note: use comma-separated values",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 auto parts = string_split<std::string>(item, ':');
                 if (parts.size() != 2) {
                     throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
```

```diff
@@ -2331,7 +2468,7 @@
         {"--control-vector"}, "FNAME",
         "add a control vector\nnote: use comma-separated values to add multiple control vectors",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 params.control_vectors.push_back({ 1.0f, item, });
             }
         }
```

```diff
@@ -2341,7 +2478,7 @@
         "add a control vector with user defined scaling SCALE\n"
         "note: use comma-separated values (format: FNAME:SCALE,...)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 auto parts = string_split<std::string>(item, ':');
                 if (parts.size() != 2) {
                     throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
```

```diff
@@ -2439,7 +2576,7 @@
         {"--context-file"}, "FNAME",
         "file to load context from (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item : string_split<std::string>(value, ',')) {
+            for (const auto & item : parse_csv_row(value)) {
                 std::ifstream file(item, std::ios::binary);
                 if (!file) {
                     throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
```

```diff
@@ -2586,7 +2723,7 @@
         [](common_params & params, int value) {
             params.embd_normalize = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
     add_opt(common_arg(
         {"--embd-output-format"}, "FORMAT",
         "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
```

```diff
@@ -2664,7 +2801,7 @@
         [](common_params & params) {
             params.embedding = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--rerank", "--reranking"},
         string_format("enable reranking endpoint on server (default: %s)", "disabled"),
```

```diff
@@ -2675,9 +2812,13 @@
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(common_arg(
         {"--api-key"}, "KEY",
-        "API key to use for authentication (default: none)",
+        "API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
         [](common_params & params, const std::string & value) {
-            params.api_keys.push_back(value);
+            for (const auto & key : parse_csv_row(value)) {
+                if (!key.empty()) {
+                    params.api_keys.push_back(key);
+                }
+            }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
     add_opt(common_arg(
```

```diff
@@ -2691,7 +2832,7 @@
             std::string key;
             while (std::getline(key_file, key)) {
                 if (!key.empty()) {
                     params.api_keys.push_back(key);
                 }
             }
             key_file.close();
```

```diff
@@ -2713,7 +2854,7 @@
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
     add_opt(common_arg(
         {"--chat-template-kwargs"}, "STRING",
-        string_format("sets additional params for the json template parser"),
+        "sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
         [](common_params & params, const std::string & value) {
             auto parsed = json::parse(value);
             for (const auto & item : parsed.items()) {
```

```diff
@@ -3351,6 +3492,27 @@
             }
         }
     ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"--save-logits"},
+        string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
+        [](common_params & params) {
+            params.save_logits = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+    add_opt(common_arg(
+        {"--logits-output-dir"}, "PATH",
+        string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.logits_output_dir = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+    add_opt(common_arg(
+        {"--tensor-filter"}, "REGEX",
+        "filter tensor names for debug output (regex pattern, can be specified multiple times)",
+        [](common_params & params, const std::string & value) {
+            params.tensor_filter.push_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));

     // presets
     add_opt(common_arg(
```
```diff
@@ -129,11 +129,3 @@ void common_params_add_preset_options(std::vector<common_arg> & args);

 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-struct common_remote_params {
-    std::vector<std::string> headers;
-    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
-    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
-};
-// get remote file content, returns <http_code, raw_response_body>
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
```

```diff
@@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
     if (params.fit_params) {
         LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
             params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
     }
```

```diff
@@ -1366,6 +1366,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
+    mparams.use_direct_io = params.use_direct_io;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
```

```diff
@@ -80,6 +80,8 @@ int32_t cpu_get_num_math();
 //

 enum llama_example {
+    LLAMA_EXAMPLE_BATCHED,
+    LLAMA_EXAMPLE_DEBUG,
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_COMPLETION,
```

```diff
@@ -331,12 +333,14 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

     int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     bool fit_params = true; // whether to fit unset model/context parameters to free device memory
```
|
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||||
size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
|
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||||
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
|
||||||
|
// margin per device in bytes for fitting parameters to free memory:
|
||||||
|
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
|
||||||
|
|
||||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||||
|
|
||||||
|
|
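With `fit_params_target` now a per-device vector sized by `llama_max_devices()`, callers can give each device its own fitting margin. A minimal sketch with illustrative values (the earlier `common_init_result` hunk then passes `params.fit_params_target.data()` to `llama_params_fit()`):

    common_params params;
    // keep a 1 GiB margin everywhere, but reserve extra headroom on device 0
    params.fit_params_target.assign(llama_max_devices(), 1024ull * 1024 * 1024);
    params.fit_params_target[0] = 2ull * 1024 * 1024 * 1024;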
@@ -372,6 +376,11 @@ struct common_params {
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = "";          // file for saving *all* logits // NOLINT

+    // llama-debug specific options
+    std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
+    bool save_logits = false;               // whether to save logits to files // NOLINT
+    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
+
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

@@ -422,7 +431,8 @@ struct common_params {
     bool kv_unified = false; // enable unified KV cache

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool use_mmap = true;          // use mmap for faster loads
+    bool use_mmap = true;          // enable mmap to use filesystem cache
+    bool use_direct_io = true;     // read from disk without buffering for faster model loading
     bool use_mlock = false;        // use mlock to keep model in memory
     bool verbose_prompt = false;   // print prompt tokens before generation
     bool display_prompt = true;    // print prompt before generation
@@ -157,6 +157,20 @@ static std::string read_etag(const std::string & path) {
     return none;
 }

+static bool is_http_status_ok(int status) {
+    return status >= 200 && status < 400;
+}
+
+std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+    return {hf_repo, tag};
+}
+
 #ifdef LLAMA_USE_CURL

 //
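A quick usage sketch of the two helpers introduced above (inputs are hypothetical; `is_http_status_ok` is file-local, shown here only to document its contract):

    auto [repo, tag] = common_download_split_repo_tag("user/model:q4_k_m"); // -> {"user/model", "q4_k_m"}
    auto [repo2, tag2] = common_download_split_repo_tag("user/model");      // -> {"user/model", "latest"}
    // a malformed spec such as "not-a-repo" throws std::invalid_argument

    // 2xx and 3xx both count as success, so the synthetic 304 used for cache
    // hits in the hunks below is treated as "ok"
    bool ok = is_http_status_ok(304); // true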
@@ -306,11 +320,14 @@ static bool common_download_head(CURL * curl,
 }

 // download one single file from remote URL to local path
-static bool common_download_file_single_online(const std::string & url,
+// returns status code or -1 on error
+static int common_download_file_single_online(const std::string & url,
     const std::string & path,
-    const std::string & bearer_token) {
+    const std::string & bearer_token,
+    const common_header_list & custom_headers) {
     static const int max_attempts = 3;
     static const int retry_delay_seconds = 2;

     for (int i = 0; i < max_attempts; ++i) {
         std::string etag;

@@ -330,6 +347,11 @@ static bool common_download_file_single_online(const std::string & url,
         common_load_model_from_url_headers headers;
         curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
         curl_slist_ptr http_headers;
+
+        for (const auto & h : custom_headers) {
+            std::string s = h.first + ": " + h.second;
+            http_headers.ptr = curl_slist_append(http_headers.ptr, s.c_str());
+        }
         const bool was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
         if (!was_perform_successful) {
             head_request_ok = false;

@@ -365,7 +387,7 @@ static bool common_download_file_single_online(const std::string & url,
             LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
             if (remove(path.c_str()) != 0) {
                 LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
+                return -1;
             }
         }

@@ -374,14 +396,14 @@ static bool common_download_file_single_online(const std::string & url,
             if (std::filesystem::exists(path_temporary)) {
                 if (remove(path_temporary.c_str()) != 0) {
                     LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
-                    return false;
+                    return -1;
                 }
             }

             if (std::filesystem::exists(path)) {
                 if (remove(path.c_str()) != 0) {
                     LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                    return false;
+                    return -1;
                 }
             }
         }

@@ -408,23 +430,27 @@ static bool common_download_file_single_online(const std::string & url,

             long http_code = 0;
             curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-            if (http_code < 200 || http_code >= 400) {
+            int status = static_cast<int>(http_code);
+            if (!is_http_status_ok(http_code)) {
                 LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-                return false;
+                return status; // TODO: maybe only return on certain codes
             }

             if (rename(path_temporary.c_str(), path.c_str()) != 0) {
                 LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-                return false;
+                return -1;
             }
+
+            return static_cast<int>(http_code);
         } else {
             LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
-        }
-
-        break;
+            return 304; // Not Modified - fake cached response
+        }
     }

-    return true;
+    return -1; // max attempts reached
 }

 std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {

@@ -454,8 +480,10 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
         curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
     }
     http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+
     for (const auto & header : params.headers) {
-        http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
+        std::string header_ = header.first + ": " + header.second;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, header_.c_str());
     }
     curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);

@@ -617,9 +645,11 @@ static bool common_pull_file(httplib::Client & cli,
 }

 // download one single file from remote URL to local path
-static bool common_download_file_single_online(const std::string & url,
+// returns status code or -1 on error
+static int common_download_file_single_online(const std::string & url,
     const std::string & path,
-    const std::string & bearer_token) {
+    const std::string & bearer_token,
+    const common_header_list & custom_headers) {
     static const int max_attempts = 3;
     static const int retry_delay_seconds = 2;

@@ -629,6 +659,9 @@ static bool common_download_file_single_online(const std::string & url,
     if (!bearer_token.empty()) {
         default_headers.insert({"Authorization", "Bearer " + bearer_token});
     }
+    for (const auto & h : custom_headers) {
+        default_headers.emplace(h.first, h.second);
+    }
     cli.set_default_headers(default_headers);

     const bool file_exists = std::filesystem::exists(path);

@@ -647,8 +680,10 @@ static bool common_download_file_single_online(const std::string & url,
         LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
         if (file_exists) {
             LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
-            return true;
+            return 304; // 304 Not Modified - fake cached response
         }
+        return head->status; // cannot use cached file, return raw status code
+        // TODO: maybe retry only on certain codes
     }

     std::string etag;

@@ -680,12 +715,12 @@ static bool common_download_file_single_online(const std::string & url,
     if (file_exists) {
         if (!should_download_from_scratch) {
             LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
-            return true;
+            return 304; // 304 Not Modified - fake cached response
         }
         LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
         if (remove(path.c_str()) != 0) {
             LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-            return false;
+            return -1;
         }
     }

@@ -697,7 +732,7 @@ static bool common_download_file_single_online(const std::string & url,
             existing_size = std::filesystem::file_size(path_temporary);
         } else if (remove(path_temporary.c_str()) != 0) {
             LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
-            return false;
+            return -1;
         }
     }

@@ -718,15 +753,16 @@ static bool common_download_file_single_online(const std::string & url,

         if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
             LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
+            return -1;
         }
         if (!etag.empty()) {
             write_etag(path, etag);
         }
-        break;
+        return head->status; // TODO: use actual GET status?
     }

-    return true;
+    return -1; // max attempts reached
 }

 std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,

@@ -734,13 +770,9 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
     auto [cli, parts] = common_http_client(url);

     httplib::Headers headers = {{"User-Agent", "llama-cpp"}};

     for (const auto & header : params.headers) {
-        size_t pos = header.find(':');
-        if (pos != std::string::npos) {
-            headers.emplace(header.substr(0, pos), header.substr(pos + 1));
-        } else {
-            headers.emplace(header, "");
-        }
+        headers.emplace(header.first, header.second);
     }

     if (params.timeout > 0) {

@@ -769,32 +801,45 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string

 #if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB)

-static bool common_download_file_single(const std::string & url,
+int common_download_file_single(const std::string & url,
     const std::string & path,
     const std::string & bearer_token,
-    bool offline) {
+    bool offline,
+    const common_header_list & headers) {
     if (!offline) {
-        return common_download_file_single_online(url, path, bearer_token);
+        return common_download_file_single_online(url, path, bearer_token, headers);
     }

     if (!std::filesystem::exists(path)) {
         LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
-        return false;
+        return -1;
     }

     LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
-    return true;
+    return 304; // Not Modified - fake cached response
 }

 // download multiple files from remote URLs to local paths
 // the input is a vector of pairs <url, path>
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
+    const std::string & bearer_token,
+    bool offline,
+    const common_header_list & headers) {
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
+    futures_download.reserve(urls.size());

     for (auto const & item : urls) {
-        futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
-            return common_download_file_single(it.first, it.second, bearer_token, offline);
-        }, item));
+        futures_download.push_back(
+            std::async(
+                std::launch::async,
+                [&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
+                    const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, headers);
+                    return is_http_status_ok(http_status);
+                },
+                item
+            )
+        );
     }

     // Wait for all downloads to complete
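The rewritten `common_download_file_multiple` keeps the same fan-out/fan-in shape as before; only the lambda now maps the integer status through `is_http_status_ok`. A self-contained sketch of that pattern, reduced to plain status codes:

    #include <cstdio>
    #include <future>
    #include <vector>

    static bool is_ok(int status) { return status >= 200 && status < 400; }

    int main() {
        std::vector<int> fake_statuses = {200, 304, 404}; // stand-ins for per-file download results
        std::vector<std::future<bool>> futures;
        futures.reserve(fake_statuses.size());
        for (int s : fake_statuses) {
            // each task maps one status to success/failure, like the download lambda above
            futures.push_back(std::async(std::launch::async, [](int st) -> bool {
                return is_ok(st);
            }, s));
        }
        bool all_ok = true;
        for (auto & f : futures) {
            all_ok = f.get() && all_ok; // wait for all tasks to complete
        }
        std::printf("all ok: %s\n", all_ok ? "true" : "false");
        return 0;
    }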
@@ -807,17 +852,18 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
     return true;
 }

-bool common_download_model(
-    const common_params_model & model,
+bool common_download_model(const common_params_model & model,
     const std::string & bearer_token,
-    bool offline) {
+    bool offline,
+    const common_header_list & headers) {
     // Basic validation of the model.url
     if (model.url.empty()) {
         LOG_ERR("%s: invalid model url\n", __func__);
         return false;
     }

-    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
+    const int http_status = common_download_file_single(model.url, model.path, bearer_token, offline, headers);
+    if (!is_http_status_ok(http_status)) {
         return false;
     }

@@ -876,27 +922,26 @@ bool common_download_model(
         }

         // Download in parallel
-        common_download_file_multiple(urls, bearer_token, offline);
+        common_download_file_multiple(urls, bearer_token, offline, headers);
     }

     return true;
 }

-common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
-    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "latest";
-    std::string hf_repo = parts[0];
-    if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
-    }
+common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
+    const std::string & bearer_token,
+    bool offline,
+    const common_header_list & custom_headers) {
+    // the returned hf_repo is without tag
+    auto [hf_repo, tag] = common_download_split_repo_tag(hf_repo_with_tag);

     std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;

     // headers
-    std::vector<std::string> headers;
-    headers.push_back("Accept: application/json");
+    common_header_list headers = custom_headers;
+    headers.push_back({"Accept", "application/json"});
     if (!bearer_token.empty()) {
-        headers.push_back("Authorization: Bearer " + bearer_token);
+        headers.push_back({"Authorization", "Bearer " + bearer_token});
     }
     // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
     // User-Agent header is already set in common_remote_get_content, no need to set it here

@@ -952,7 +997,7 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, cons
     } else if (res_code == 401) {
         throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
     } else {
-        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+        throw std::runtime_error(string_format("error from HF API (%s), response code: %ld, data: %s", url.c_str(), res_code, res_str.c_str()));
     }

     // check response

@@ -1031,9 +1076,10 @@ std::string common_docker_resolve_model(const std::string & docker) {
     const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
     std::string manifest_url = url_prefix + "/manifests/" + tag;
     common_remote_params manifest_params;
-    manifest_params.headers.push_back("Authorization: Bearer " + token);
-    manifest_params.headers.push_back(
-        "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
+    manifest_params.headers.push_back({"Authorization", "Bearer " + token});
+    manifest_params.headers.push_back({"Accept",
+        "application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json"
+    });
     auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
     if (manifest_res.first != 200) {
         throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));

@@ -1070,7 +1116,8 @@ std::string common_docker_resolve_model(const std::string & docker) {
     std::string local_path = fs_get_cache_file(model_filename);

     const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
-    if (!common_download_file_single(blob_url, local_path, token, false)) {
+    const int http_status = common_download_file_single(blob_url, local_path, token, false, {});
+    if (!is_http_status_ok(http_status)) {
         throw std::runtime_error("Failed to download Docker Model");
     }

@@ -1084,11 +1131,11 @@ std::string common_docker_resolve_model(const std::string & docker) {

 #else

-common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
+common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
     throw std::runtime_error("download functionality is not enabled in this build");
 }

-bool common_download_model(const common_params_model &, const std::string &, bool) {
+bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
     throw std::runtime_error("download functionality is not enabled in this build");
 }

@@ -1096,6 +1143,14 @@ std::string common_docker_resolve_model(const std::string &) {
     throw std::runtime_error("download functionality is not enabled in this build");
 }

+int common_download_file_single(const std::string &,
+    const std::string &,
+    const std::string &,
+    bool,
+    const common_header_list &) {
+    throw std::runtime_error("download functionality is not enabled in this build");
+}
+
 #endif // LLAMA_USE_CURL || LLAMA_USE_HTTPLIB

 std::vector<common_cached_model_info> common_list_cached_models() {

@@ -1,12 +1,27 @@
 #pragma once

 #include <string>
+#include <vector>

 struct common_params_model;

-//
-// download functionalities
-//
+using common_header = std::pair<std::string, std::string>;
+using common_header_list = std::vector<common_header>;
+
+struct common_remote_params {
+    common_header_list headers;
+    long timeout = 0; // in seconds, 0 means no timeout
+    long max_size = 0; // unlimited if 0
+};
+
+// get remote file content, returns <http_code, raw_response_body>
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
+
+// split HF repo with tag into <repo, tag>
+// for example: "user/model:tag" -> <"user/model", "tag">
+// if tag is not present, default to "latest"
+// example: "user/model" -> <"user/model", "latest">
+std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag);

 struct common_cached_model_info {
     std::string manifest_path;
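A minimal sketch of the new typed header list in use (endpoint and token are placeholders):

    common_remote_params params;
    params.headers.push_back({"Accept", "application/json"});
    params.headers.push_back({"Authorization", "Bearer <token>"});
    params.timeout = 30; // seconds

    auto [http_code, body] = common_remote_get_content("https://example.com/manifest", params);

Keeping each header as a <name, value> pair lets the curl backend join them with ": " and the httplib backend pass them through directly, instead of each backend re-parsing "Name: value" strings as before.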
@@ -41,17 +56,29 @@ struct common_hf_file_res {
 common_hf_file_res common_get_hf_file(
     const std::string & hf_repo_with_tag,
     const std::string & bearer_token,
-    bool offline);
+    bool offline,
+    const common_header_list & headers = {}
+);

 // returns true if download succeeded
 bool common_download_model(
     const common_params_model & model,
     const std::string & bearer_token,
-    bool offline);
+    bool offline,
+    const common_header_list & headers = {}
+);

 // returns list of cached models
 std::vector<common_cached_model_info> common_list_cached_models();

+// download single file from url to local path
+// returns status code or -1 on error
+int common_download_file_single(const std::string & url,
+    const std::string & path,
+    const std::string & bearer_token,
+    bool offline,
+    const common_header_list & headers = {});
+
 // resolve and download model from Docker registry
 // return local path to downloaded model file
 std::string common_docker_resolve_model(const std::string & docker);
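A hedged caller sketch for the now-public, status-returning entry point (URL and path are placeholders):

    const int status = common_download_file_single(
        "https://example.com/model.gguf", "/tmp/model.gguf",
        /*bearer_token =*/ "", /*offline =*/ false);
    if (status == 304) {
        // a cached copy was reused, including the synthetic offline/HEAD-failure cases
    } else if (status < 200 || status >= 400) {
        // -1 on local I/O failure, otherwise the raw HTTP error code to report
    }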
@@ -16,6 +16,48 @@ static std::string rm_leading_dashes(const std::string & str) {
     return str.substr(pos);
 }

+// only allow a subset of args for remote presets for security reasons
+// do not add more args unless absolutely necessary
+// args that output to files are strictly prohibited
+static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
+    static const std::set<std::string> allowed_options = {
+        "model-url",
+        "hf-repo",
+        "hf-repo-draft",
+        "hf-repo-v", // vocoder
+        "hf-file-v", // vocoder
+        "mmproj-url",
+        "pooling",
+        "jinja",
+        "batch-size",
+        "ubatch-size",
+        "cache-reuse",
+        "chat-template-kwargs",
+        "mmap",
+        // note: sampling params are automatically allowed by default
+        // negated args will be added automatically if the positive arg is specified above
+    };
+
+    std::set<std::string> allowed_keys;
+
+    for (const auto & it : key_to_opt) {
+        const std::string & key = it.first;
+        const common_arg & opt = it.second;
+        if (allowed_options.find(key) != allowed_options.end() || opt.is_sparam) {
+            allowed_keys.insert(key);
+            // also add variant keys (args without leading dashes and env vars)
+            for (const auto & arg : opt.get_args()) {
+                allowed_keys.insert(rm_leading_dashes(arg));
+            }
+            for (const auto & env : opt.get_env()) {
+                allowed_keys.insert(env);
+            }
+        }
+    }
+
+    return allowed_keys;
+}
+
 std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
     std::vector<std::string> args;

@@ -121,6 +163,29 @@ void common_preset::merge(const common_preset & other) {
     }
 }

+void common_preset::apply_to_params(common_params & params) const {
+    for (const auto & [opt, val] : options) {
+        // apply each option to params
+        if (opt.handler_string) {
+            opt.handler_string(params, val);
+        } else if (opt.handler_int) {
+            opt.handler_int(params, std::stoi(val));
+        } else if (opt.handler_bool) {
+            opt.handler_bool(params, common_arg_utils::is_truthy(val));
+        } else if (opt.handler_str_str) {
+            // not supported yet
+            throw std::runtime_error(string_format(
+                "%s: option with two values is not supported yet",
+                __func__
+            ));
+        } else if (opt.handler_void) {
+            opt.handler_void(params);
+        } else {
+            GGML_ABORT("unknown handler type");
+        }
+    }
+}
+
 static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
     std::map<std::string, std::map<std::string, std::string>> parsed;

@@ -230,10 +295,16 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
     return value;
 }

-common_preset_context::common_preset_context(llama_example ex)
+common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
     : ctx_params(common_params_parser_init(default_params, ex)) {
     common_params_add_preset_options(ctx_params.options);
     key_to_opt = get_map_key_opt(ctx_params);
+
+    // setup allowed keys if only_remote_allowed is true
+    if (only_remote_allowed) {
+        filter_allowed_keys = true;
+        allowed_keys = get_remote_preset_whitelist(key_to_opt);
+    }
 }

 common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
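A minimal end-to-end sketch of the remote-preset flow added here (the INI file name is hypothetical):

    // opt in to the whitelist with the new constructor flag
    common_preset_context ctx(LLAMA_EXAMPLE_SERVER, /*only_remote_allowed =*/ true);

    common_preset global;
    common_presets presets = ctx.load_from_ini("remote-preset.ini", global); // throws on non-whitelisted keys

    common_params params;
    global.apply_to_params(params); // replays each stored option through its arg handler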
@@ -249,7 +320,18 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
         }
         LOG_DBG("loading preset: %s\n", preset.name.c_str());
         for (const auto & [key, value] : section.second) {
+            if (key == "version") {
+                // skip version key (reserved for future use)
+                continue;
+            }
+
             LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
+            if (filter_allowed_keys && allowed_keys.find(key) == allowed_keys.end()) {
+                throw std::runtime_error(string_format(
+                    "option '%s' is not allowed in remote presets",
+                    key.c_str()
+                ));
+            }
             if (key_to_opt.find(key) != key_to_opt.end()) {
                 const auto & opt = key_to_opt.at(key);
                 if (is_bool_arg(opt)) {

@@ -259,7 +341,10 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
                 }
                 LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
             } else {
-                // TODO: maybe warn about unknown key?
+                throw std::runtime_error(string_format(
+                    "option '%s' not recognized in preset '%s'",
+                    key.c_str(), preset.name.c_str()
+                ));
             }
         }

@@ -6,6 +6,7 @@
 #include <string>
 #include <vector>
 #include <map>
+#include <set>

 //
 // INI preset parser and writer

@@ -40,6 +41,9 @@ struct common_preset {
     // merge another preset into this one, overwriting existing options
     void merge(const common_preset & other);

+    // apply preset options to common_params
+    void apply_to_params(common_params & params) const;
 };

 // interface for multiple presets in one file

@@ -50,7 +54,12 @@ struct common_preset_context {
     common_params default_params; // unused for now
     common_params_context ctx_params;
     std::map<std::string, common_arg> key_to_opt;
-    common_preset_context(llama_example ex);
+
+    bool filter_allowed_keys = false;
+    std::set<std::string> allowed_keys;
+
+    // if only_remote_allowed is true, only accept whitelisted keys
+    common_preset_context(llama_example ex, bool only_remote_allowed = false);

     // load presets from INI file
     common_presets load_from_ini(const std::string & path, common_preset & global) const;

@@ -528,7 +528,11 @@ class ModelBase:
         return ()

     def prepare_tensors(self):
-        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
+        # Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
+        if self.tensor_map.mapping:
+            max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
+        else:
+            max_name_len = len("vision_encoder.weight,")  # Default reasonable length

         for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
             # we don't need these

@@ -771,8 +775,8 @@ class TextModel(ModelBase):
         self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}

-        rope_theta = self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)
-        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "swa_rope_theta", "rope_local_base_freq"], optional=True)
+        rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
+        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)

         # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
         if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:

@@ -4363,7 +4367,37 @@ class Qwen3NextModel(Qwen2MoeModel):
         elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
             data_torch = data_torch + 1

-        yield from super().modify_tensors(data_torch, name, bid)
+        if "in_proj_qkvz.weight" in name:
+            # original order: [q, k, v, z] * head_count
+            # corrected order: [q * head_count, k * head_count, v * head_count, z * head_count]
+            head_k_dim = self.hparams["linear_key_head_dim"]
+            head_v_dim = self.hparams["linear_value_head_dim"]
+            num_v_heads = self.hparams["linear_num_value_heads"]
+            num_k_heads = self.hparams["linear_num_key_heads"]
+            hidden_size = self.hparams["hidden_size"]
+            split_arg_list_qkvz = [
+                head_k_dim,                                 # q partition
+                head_k_dim,                                 # k partition
+                (num_v_heads // num_k_heads * head_v_dim),  # v partition
+                (num_v_heads // num_k_heads * head_v_dim),  # z partition
+            ]
+            # view as (n_embd, head_count, [q+k+v+z])
+            data_torch = data_torch.permute(1, 0).contiguous()
+            data_torch = data_torch.view(-1, num_k_heads, sum(split_arg_list_qkvz))
+            # split into q, k, v, z
+            q, k, v, z = torch.split(data_torch, split_arg_list_qkvz, dim=-1)
+            # flatten dim + head_count
+            q = q.contiguous().view(hidden_size, -1)
+            k = k.contiguous().view(hidden_size, -1)
+            v = v.contiguous().view(hidden_size, -1)
+            z = z.contiguous().view(hidden_size, -1)
+            # stack back
+            qkv = torch.cat([q, k, v], dim=-1).permute(1, 0).contiguous()
+            z = z.permute(1, 0).contiguous()
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, ".weight"), qkv)
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid, ".weight"), z)
+        else:
+            yield from super().modify_tensors(data_torch, name, bid)


 @ModelBase.register("RND1")
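The "original order / corrected order" comment pair above is easiest to see on a toy case. A self-contained sketch (independent of torch) of the same regrouping, with two heads and one row per q/k/v/z partition:

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        const int num_heads = 2;
        const std::vector<std::string> parts = {"q", "k", "v", "z"};

        // original order: one [q, k, v, z] block per head
        std::vector<std::string> rows;
        for (int h = 0; h < num_heads; ++h) {
            for (const auto & p : parts) {
                rows.push_back(p + std::to_string(h));
            }
        }

        // corrected order: for each partition, gather that row from every head
        std::vector<std::string> grouped;
        for (size_t i = 0; i < parts.size(); ++i) {
            for (int h = 0; h < num_heads; ++h) {
                grouped.push_back(rows[h * parts.size() + i]);
            }
        }

        for (const auto & r : grouped) {
            std::printf("%s ", r.c_str()); // prints: q0 q1 k0 k1 v0 v1 z0 z1
        }
        std::printf("\n");
        return 0;
    }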
@@ -6038,7 +6072,175 @@ class Gemma3VisionModel(MmprojModel):
         return []  # skip other tensors


+class ConformerAudioModel(MmprojModel):
+    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def is_audio_tensor(name: str):
+        return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ConformerAudioModel.is_audio_tensor(name):
+            if ".conv" in name or "_conv" in name and ".weight" in name:
+                return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # fold running_mean, running_var and eps into weight and bias for batch_norm
+        if "batch_norm" in name:
+            if self._batch_norm_tensors is None:
+                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
+            assert bid is not None
+            self._batch_norm_tensors[bid][name] = data_torch
+
+            if len(self._batch_norm_tensors[bid]) < 5:
+                return []
+
+            weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
+            bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
+            running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
+            running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
+            eps = 1e-5  # default value
+
+            a = weight / torch.sqrt(running_var + eps)
+            b = bias - running_mean * a
+            return [
+                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
+                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
+            ]
+
+        # reshape conv weights
+        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
+            data_torch = data_torch[:, None, None]
+        if "conv.depthwise_conv" in name and name.endswith(".weight"):
+            assert data_torch.shape[1] == 1
+            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
+        if "conv.pointwise_conv" in name and name.endswith(".weight"):
+            assert data_torch.shape[2] == 1
+            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
+
+        return [(self.map_tensor_name(name), data_torch)]
+
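The batch-norm handling in `ConformerAudioModel.modify_tensors` is the standard inference-time rewrite of batch norm as an affine map. With weight γ, bias β, running mean μ, running variance σ² and epsilon ε:

    y = γ · (x − μ) / sqrt(σ² + ε) + β = a·x + b,   where   a = γ / sqrt(σ² + ε),   b = β − μ·a

which is exactly `a = weight / torch.sqrt(running_var + eps)` and `b = bias - running_mean * a` above; only the folded `a` and `b` are written to the GGUF.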
@ModelBase.register("Gemma3nForConditionalGeneration")
|
@ModelBase.register("Gemma3nForConditionalGeneration")
|
||||||
|
class Gemma3nVisionAudioModel(ConformerAudioModel):
|
||||||
|
has_audio_encoder = True
|
||||||
|
has_vision_encoder = True
|
||||||
|
|
||||||
|
# Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py)
|
||||||
|
# This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py
|
||||||
|
block_tensor_mapping = {
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.blk.{bid}.{sid}.dw_mid.conv.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight",
|
||||||
|
"model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight",
|
||||||
|
}
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
# Parent init will call find_hparam which now returns 0 for empty keys
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
assert self.hparams_vision is not None
|
||||||
|
self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it
|
||||||
|
self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4
|
||||||
|
self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8)
|
||||||
|
|
||||||
|
# MobileNetV5 does not use image_mean/std
|
||||||
|
self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0]
|
||||||
|
self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0]
|
||||||
|
self.hparams_vision["image_size"] = self.preprocessor_config.get(
|
||||||
|
"size", {"height": 768, "width": 768}
|
||||||
|
)["height"]
|
||||||
|
|
||||||
|
# Image sequence length (256 tokens = 16x16 for Gemma3n)
|
||||||
|
image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
|
||||||
|
image_size = self.hparams_vision["image_size"]
|
||||||
|
self.hparams_vision["patch_size"] = image_size // image_seq_length
|
||||||
|
|
||||||
|
# remap audio hparams
|
||||||
|
assert self.hparams_audio is not None
|
||||||
|
self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"]
|
||||||
|
self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"]
|
||||||
|
self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"]
|
||||||
|
self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144)
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
|
||||||
|
# vision params
|
||||||
|
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV)
|
||||||
|
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
|
||||||
|
|
||||||
|
# audio params
|
||||||
|
assert self.hparams_audio is not None
|
||||||
|
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA)
|
||||||
|
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
|
||||||
|
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
|
||||||
|
|
||||||
|
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||||
|
# Force quantization settings for specific tensor types
|
||||||
|
if "input_projection" in name or "input_proj" in name:
|
||||||
|
return gguf.GGMLQuantizationType.F16
|
||||||
|
if ".embeddings." in name or "stem" in name:
|
||||||
|
return gguf.GGMLQuantizationType.F32
|
||||||
|
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||||
|
|
||||||
|
def custom_map(self, name: str) -> str:
|
||||||
|
"""Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping."""
|
||||||
|
parts = name.split(".")
|
||||||
|
# MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix
|
||||||
|
if len(parts) >= 7:
|
||||||
|
bid, sid = parts[4], parts[5]
|
||||||
|
suffix = ".".join(parts[6:])
|
||||||
|
template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}"
|
||||||
|
if template in self.block_tensor_mapping:
|
||||||
|
return self.block_tensor_mapping[template].format(bid=bid, sid=sid)
|
||||||
|
|
||||||
|
raise ValueError(f"Unknown name: {name}")
|
||||||
|
|
||||||
|
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if ConformerAudioModel.is_audio_tensor(name):
+            name = name.replace("model.audio_tower.conformer.", "conformer.layers.")
+            return super().modify_tensors(data_torch, name, bid)
+
+        # Gemma3n uses
+        # - model.embed_vision.* for projection layers
+        # - model.vision_tower.* for the vision encoder
+        # Skip non-vision tensors
+        if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")):
+            return []
+
+        if name.startswith("model.vision_tower.timm_model.blocks."):
+            # Double-indexed block tensors go through the custom logic
+            new_name = self.custom_map(name)
+        else:
+            # Route non-repeating tensors (conv_stem, msfa, embedding, etc.) and anything uncaught through tensor_mapping.py
+            new_name = self.map_tensor_name(name)
+
+        if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"):
+            data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)  # [1, C, 1, 1]
+
+        return [(new_name, data_torch)]
+
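Note the unsqueeze chain converts a 1-D per-channel tensor of shape [C] into [1, C, 1, 1], the shape that broadcasts across NCHW activations. A standalone sanity check of that reshape (toy sizes, not converter code):

    import torch

    gamma = torch.randn(32)                               # per-channel scale, shape [C]
    x = torch.randn(2, 32, 8, 8)                          # NCHW activations
    g4 = gamma.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)   # [1, C, 1, 1]
    assert g4.shape == (1, 32, 1, 1)
    assert torch.equal(x * g4, x * gamma.view(1, -1, 1, 1))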
@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
|
||||||
class Gemma3NModel(Gemma3Model):
|
class Gemma3NModel(Gemma3Model):
|
||||||
model_arch = gguf.MODEL_ARCH.GEMMA3N
|
model_arch = gguf.MODEL_ARCH.GEMMA3N
|
||||||
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
|
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
|
||||||
|
|
@@ -6061,8 +6263,25 @@ class Gemma3NModel(Gemma3Model):
     ]

     def set_vocab(self):
+        # For Gemma3n multimodal models, we need the FULL vocab_size (262400)
+        # which includes special tokens from 262144-262399 for vision/audio.
+        # The vocab_size_per_layer_input (262144) is only the embedding size per layer.
+        # Temporarily override the hparams lookup order to prioritize vocab_size.
+
+        # Store original vocab_size_per_layer_input if it exists
+        vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input")
+
+        # Temporarily remove vocab_size_per_layer_input to force using vocab_size
+        if vocab_size_per_layer_input is not None:
+            del self.hparams["vocab_size_per_layer_input"]
+
+        # Call parent set_vocab which will now use vocab_size (262400)
         super().set_vocab()
+
+        # Restore vocab_size_per_layer_input for later use
+        if vocab_size_per_layer_input is not None:
+            self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
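The hide-then-restore dance works because the vocab-size lookup prefers vocab_size_per_layer_input when it is present. A minimal sketch of the pattern (find_vocab_size and its lookup order are illustrative stand-ins for the converter's actual helper):

    # Assumed lookup order: the per-layer key wins when both are present.
    hparams = {"vocab_size_per_layer_input": 262144, "vocab_size": 262400}

    def find_vocab_size(h):
        for key in ("vocab_size_per_layer_input", "vocab_size"):
            if key in h:
                return h[key]

    saved = hparams.pop("vocab_size_per_layer_input", None)
    assert find_vocab_size(hparams) == 262400  # the parent now sees the full vocab
    if saved is not None:
        hparams["vocab_size_per_layer_input"] = saved  # restored for later use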
@@ -6098,8 +6317,32 @@ class Gemma3NModel(Gemma3Model):
         if "language_model." not in name:
             return []  # skip non-language model tensors

+        # Pad token embeddings for vision/audio special tokens (262144-262399)
+        if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name:
+            # Move to CPU to avoid meta device issues during padding
+            data_torch = data_torch.to(device="cpu")
+
+            vocab_size = self.hparams.get("vocab_size", 262400)
+            current_size = data_torch.shape[0]  # First dimension is vocab_size
+
+            if current_size < vocab_size:
+                # Pad with zeros for vision/audio tokens (they get embeddings from the vision tower)
+                padding_size = vocab_size - current_size
+                tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings"
+                logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)")
+
+                # Create padding with zeros (vision tokens won't use these embeddings)
+                padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device)
+                data_torch = torch.cat([data_torch, padding], dim=0)
+
+            # Continue with normal processing
+            name = name.replace("language_model.", "")
+            return [(self.map_tensor_name(name), data_torch)]
+
         if "altup_unembed_projections" in name:
             data_torch = data_torch.to(device="cpu")
+            # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based
+            # They should NOT be padded
             if ".0." in name:
                 self._altup_unembd[0] = data_torch
             elif ".1." in name:
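The padding arithmetic is small: 262400 - 262144 leaves exactly 256 zero rows for the vision/audio token ids. A standalone check of the torch.cat padding (toy hidden size, independent of the converter classes):

    import torch

    vocab_size, current_size, hidden = 262400, 262144, 8  # hidden is a toy value
    emb = torch.randn(current_size, hidden)
    pad = torch.zeros(vocab_size - current_size, hidden, dtype=emb.dtype)
    emb = torch.cat([emb, pad], dim=0)
    assert emb.shape == (262400, 8)
    assert emb[262144:].abs().sum() == 0  # the 256 new rows are all zeros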
@@ -7212,6 +7455,7 @@ class DeepseekModel(TextModel):
     "DeepseekV3ForCausalLM",
     "KimiVLForConditionalGeneration",
     "YoutuForCausalLM",
+    "YoutuVLForConditionalGeneration"
 )
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -9935,7 +10179,7 @@ class LFM2Model(TextModel):
         self._add_feed_forward_length()

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if self._is_vision_tensor(name) or self._is_audio_tensor(name):
+        if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name):
             # skip multimodal tensors
             return []
@@ -9951,8 +10195,26 @@ class LFM2Model(TextModel):
     def _is_vision_tensor(self, name: str) -> bool:
         return "vision_tower" in name or "multi_modal_projector" in name

-    def _is_audio_tensor(self, name: str):
-        return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
+
+@ModelBase.register("Lfm2Model")
+class LFM2ColBertModel(LFM2Model):
+    model_arch = gguf.MODEL_ARCH.LFM2
+    dense_tensor_name = "dense_2"
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if not name.startswith(self.dense_tensor_name):
+            name = "model." + name
+
+        return super().modify_tensors(data_torch, name, bid)
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        # dense tensor is stored in a separate safetensors file
+        from safetensors.torch import load_file
+        tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
+        assert tensors_file.is_file()
+        tensor = load_file(tensors_file)["linear.weight"]
+        self.gguf_writer.add_embedding_length_out(tensor.shape[0])
+        yield f"{self.dense_tensor_name}.weight", tensor.clone()
+
+
 @ModelBase.register("Lfm2MoeForCausalLM")
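For context, sentence-transformers checkpoints ship extra Dense modules as separate safetensors files, which is why generate_extra_tensors reads 1_Dense/model.safetensors directly. A standalone sketch of that read (the model directory is a hypothetical local checkout; the layout and the linear.weight key match what the method above assumes):

    from pathlib import Path
    from safetensors.torch import load_file

    dir_model = Path("./embeddinggemma-300m")  # hypothetical local checkout
    tensors_file = dir_model / "1_Dense" / "model.safetensors"
    if tensors_file.is_file():
        dense = load_file(str(tensors_file))["linear.weight"]  # [out_features, in_features]
        print(dense.shape[0])  # the output embedding length recorded in the GGUF header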
@@ -10060,13 +10322,11 @@ class LFM2VLModel(MmprojModel):


 @ModelBase.register("Lfm2AudioForConditionalGeneration")
-class LFM2AudioModel(MmprojModel):
+class LFM2AudioModel(ConformerAudioModel):
     has_vision_encoder = False
     has_audio_encoder = True
     model_name = "Lfm2AudioEncoder"

-    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
-
     def get_audio_config(self) -> dict[str, Any] | None:
         return self.global_config.get("encoder")
@@ -10080,12 +10340,7 @@ class LFM2AudioModel(MmprojModel):
         self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
         self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)

-    def tensor_force_quant(self, name, new_name, bid, n_dims):
-        if ".conv" in name and ".weight" in name:
-            return gguf.GGMLQuantizationType.F32
-        return super().tensor_force_quant(name, new_name, bid, n_dims)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+    def modify_tensors(self, data_torch, name, bid):
         # skip language model tensors
         if name.startswith("lfm."):
             return []
@@ -10098,40 +10353,7 @@ class LFM2AudioModel(MmprojModel):
         if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
             return []

-        # fold running_mean, running_var and eps into weight and bias for batch_norm
-        if "batch_norm" in name:
-            if self._batch_norm_tensors is None:
-                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
-            assert bid is not None
-            self._batch_norm_tensors[bid][name] = data_torch
-
-            if len(self._batch_norm_tensors[bid]) < 5:
-                return []
-
-            weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
-            bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
-            running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
-            running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
-            eps = 1e-5  # default value
-
-            a = weight / torch.sqrt(running_var + eps)
-            b = bias - running_mean * a
-            return [
-                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
-                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
-            ]
-
-        # reshape conv weights
-        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
-            data_torch = data_torch[:, None, None]
-        if "conv.depthwise_conv" in name and name.endswith(".weight"):
-            assert data_torch.shape[1] == 1
-            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
-        if "conv.pointwise_conv" in name and name.endswith(".weight"):
-            assert data_torch.shape[2] == 1
-            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
-
-        return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)


 @ModelBase.register("SmallThinkerForCausalLM")
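The deleted block folded inference-time batch norm into one affine pair: y = gamma * (x - mean) / sqrt(var + eps) + beta collapses to a * x + b with a = gamma / sqrt(var + eps) and b = beta - mean * a. That folding (and the conv reshapes) presumably now live in the shared ConformerAudioModel base that both this class and the Gemma3n converter delegate to. A standalone check of the identity:

    import torch

    C = 16
    gamma, beta = torch.randn(C), torch.randn(C)
    mean, var = torch.randn(C), torch.rand(C)  # running stats; var >= 0
    eps = 1e-5
    x = torch.randn(4, C)

    a = gamma / torch.sqrt(var + eps)
    b = beta - mean * a
    y_folded = a * x + b
    y_bn = gamma * (x - mean) / torch.sqrt(var + eps) + beta
    assert torch.allclose(y_folded, y_bn, atol=1e-5)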
@@ -10674,8 +10896,8 @@ class JanusProVisionModel(MmprojModel):
         return []


-@ModelBase.register("YOUTUVLForConditionalGeneration", "YOUTUVLForCausalLM")
-class YOUTUVLVisionModel(MmprojModel):
+@ModelBase.register("YoutuVLForConditionalGeneration")
+class YoutuVLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         assert self.hparams_vision is not None
@@ -10952,8 +11174,8 @@ def parse_args() -> argparse.Namespace:

     parser.add_argument(
         "--sentence-transformers-dense-modules", action="store_true",
-        help=("Whether to include sentence-transformers dense modules."
-              "It can be used for sentence-transformers models, like google/embeddinggemma-300m"
+        help=("Whether to include sentence-transformers dense modules. "
+              "It can be used for sentence-transformers models, like google/embeddinggemma-300m. "
               "Default these modules are not included.")
     )
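The help-text fix works because adjacent Python string literals concatenate with no separator, so the missing trailing spaces ran the sentences together. A quick check:

    broken = ("Whether to include sentence-transformers dense modules."
              "It can be used for sentence-transformers models, like google/embeddinggemma-300m"
              "Default these modules are not included.")
    fixed = ("Whether to include sentence-transformers dense modules. "
             "It can be used for sentence-transformers models, like google/embeddinggemma-300m. "
             "Default these modules are not included.")
    print(broken)  # "...modules.It can be used ...embeddinggemma-300mDefault these..."
    print(fixed)   # sentences now separated by spaces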
@@ -22,7 +22,7 @@ Legend:
 | ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
-| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 | CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
 | CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 | CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
@@ -57,7 +57,6 @@ Legend:
 | GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 | GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 | GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 | HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -71,10 +70,9 @@ Legend:
 | MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 | NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 | OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 | OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
+| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
 | PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 | PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 | POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -99,7 +97,6 @@ Legend:
 | SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 | SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 | SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 | SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 | SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
@@ -965,6 +965,7 @@
 "BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,1,2560],ne_kernel=[3,3,1,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
 "BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
 "BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
+"BLAS","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
 "BLAS","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
 "BLAS","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
 "BLAS","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
@@ -4964,6 +4965,7 @@
 "BLAS","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","0","no","BLAS"
 "BLAS","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","0","no","BLAS"
 "BLAS","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","0","no","BLAS"
+"BLAS","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","0","no","BLAS"
 "BLAS","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","0","no","BLAS"
 "BLAS","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","0","no","BLAS"
 "BLAS","ARGMAX","type=f32,ne=[32,1,1,1]","support","0","no","BLAS"
@@ -5715,15 +5717,15 @@
 "BLAS","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","BLAS"
 "BLAS","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","0","no","BLAS"
 "BLAS","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
-"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,1024,4,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[6,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,1536,4,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[6,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[3,2048,4,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
@@ -5733,6 +5735,15 @@
 "BLAS","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,1024,1,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[18,1024,1,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,1024,4,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,1536,1,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[18,1536,1,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,1536,4,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,2048,1,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[18,2048,1,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
+"BLAS","SSM_CONV","type=f32,ne_a=[9,2048,4,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
 "BLAS","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
 "BLAS","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
 "BLAS","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
@@ -6592,6 +6603,30 @@
 "BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=67,bs=[1,1],nr=[4,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=f32,type_b=f32,m=64,n=77,k=77,bs=[12,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","BLAS"
 "BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=576,n=512,k=576,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","BLAS"
+"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=f32,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=bf16,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q4_1,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q5_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q5_1,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q8_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=mxfp4,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q2_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q3_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q4_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q5_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=q6_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq2_xxs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq2_xs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq2_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq3_xxs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq1_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq1_m,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq4_nl,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq3_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
+"BLAS","MUL_MAT","type_a=iq4_xs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=f16,type_b=f32,m=128,n=1,k=1056,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=2112,o=1","support","0","no","BLAS"
 "BLAS","MUL_MAT","type_a=bf16,type_b=f32,m=1056,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
@@ -8916,6 +8951,11 @@
 "BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=0,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
 "BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
 "BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
+"BLAS","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
+"BLAS","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
+"BLAS","SOFT_MAX","type=f32,ne=[200000,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
+"BLAS","SOFT_MAX","type=f32,ne=[200000,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
+"BLAS","SOFT_MAX","type=f32,ne=[643251,3,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
 "BLAS","SOFT_MAX_BACK","type=f32,ne=[16,16,1,1],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
 "BLAS","SOFT_MAX_BACK","type=f32,ne=[15,15,1,1],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
 "BLAS","SOFT_MAX_BACK","type=f32,ne=[16,16,2,3],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
@ -8968,6 +9008,7 @@
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
|
|
@ -8977,6 +9018,7 @@
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
|
|
@ -8987,11 +9029,13 @@
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
|
|
@ -9001,6 +9045,7 @@
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
|
|
@ -9011,11 +9056,13 @@
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
|
|
@ -9025,6 +9072,7 @@
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
|
|
@ -9035,11 +9083,13 @@
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
|
|
@ -9049,6 +9099,7 @@
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
|
|
@ -9059,6 +9110,7 @@
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
|
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
"BLAS","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
"BLAS","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
|
||||||
|
|
@ -9184,6 +9236,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9193,6 +9246,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9203,11 +9257,13 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9217,6 +9273,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9227,11 +9284,13 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9241,6 +9300,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9251,11 +9311,13 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9265,6 +9327,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9275,6 +9338,7 @@
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
+"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
 "BLAS","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9542,333 +9606,333 @@
 "BLAS","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","0","no","BLAS"
 "BLAS","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","0","no","BLAS"
 "BLAS","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","0","no","BLAS"
|
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest,flags=none","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=0","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=0","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=1","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=1","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=0","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=0","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=0","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=1","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|antialias","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear|antialias","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|align_corners","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear|align_corners","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear|align_corners","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=align_corners","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic|align_corners","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic,flags=align_corners","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic|align_corners","support","0","no","BLAS"
|
||||||
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic,flags=align_corners","support","0","no","BLAS"
|
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic|align_corners","support","0","no","BLAS"
|
||||||
"BLAS","SUM","type=f32,ne=[10,5,4,3]","support","0","no","BLAS"
|
"BLAS","SUM","type=f32,ne=[10,5,4,3]","support","0","no","BLAS"
|
||||||
"BLAS","SUM_ROWS","type=f32,ne=[10,5,4,3],permute=0,slice=0","support","0","no","BLAS"
|
"BLAS","SUM_ROWS","type=f32,ne=[10,5,4,3],permute=0,slice=0","support","0","no","BLAS"
|
||||||
"BLAS","SUM","type=f32,ne=[11,5,6,3],permute=[0,2,1,3]","support","0","no","BLAS"
|
"BLAS","SUM","type=f32,ne=[11,5,6,3],permute=[0,2,1,3]","support","0","no","BLAS"
|
||||||
|
|
@ -9891,8 +9955,9 @@
"BLAS","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","0","no","BLAS"
"BLAS","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","0","no","BLAS"
"BLAS","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","0","no","BLAS"
"BLAS","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","no","BLAS"
"BLAS","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","BLAS"
"BLAS","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","0","no","BLAS"

@ -9914,6 +9979,7 @@
"BLAS","CUMSUM","type=f32,ne=[2048,5,4,3]","support","0","no","BLAS"
"BLAS","CUMSUM","type=f32,ne=[242004,1,1,1]","support","0","no","BLAS"
"BLAS","CUMSUM","type=f32,ne=[375960,1,1,1]","support","0","no","BLAS"
"BLAS","CUMSUM","type=f32,ne=[20481,4,1,1]","support","0","no","BLAS"
"BLAS","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","BLAS"
"BLAS","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","BLAS"
"BLAS","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","BLAS"

@ -9923,17 +9989,41 @@
"BLAS","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","BLAS"
"BLAS","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","BLAS"
"BLAS","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","0","no","BLAS"
"BLAS","DIAG","type=f32,ne=[10,1,4,3]","support","0","no","BLAS"
"BLAS","DIAG","type=f32,ne=[79,1,19,13]","support","0","no","BLAS"
"BLAS","DIAG","type=f32,ne=[256,1,8,16]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[64,64,2,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[79,79,5,3],ne_rhs=[417,79,5,3]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,2],ne_rhs=[32,128,4,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[80,80,2,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[79,80,2,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[81,80,2,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[80,80,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[79,80,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[81,80,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[84,84,4,4],ne_rhs=[32,84,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[95,95,8,8],ne_rhs=[40,95,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[32,128,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,3,4],ne_rhs=[32,128,3,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,1],ne_rhs=[32,128,4,1]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[200,64,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[384,64,4,4]","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","BLAS"
"BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","0","no","BLAS"
"BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","BLAS"
"BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","BLAS"

Can't render this file because it is too large.

docs/ops/zDNN.csv (13483 lines): diff suppressed because it is too large.

@ -0,0 +1,97 @@
# llama.cpp INI Presets

## Introduction

The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/llama.cpp/pull/17859), allows users to create reusable and shareable parameter configurations for llama.cpp.

### Using Presets with the Server

When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details.

### Using a Remote Preset

> [!NOTE]
>
> This feature is currently only supported via the `-hf` option.

For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model.

Example:

```ini
hf-repo-draft = username/my-draft-model-GGUF
temp = 0.5
top-k = 20
top-p = 0.95
```

For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options.

Example usage, assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above:

```sh
llama-cli -hf username/my-model-with-preset

# This is equivalent to:
llama-cli -hf username/my-model-with-preset \
    --hf-repo-draft username/my-draft-model-GGUF \
    --temp 0.5 \
    --top-k 20 \
    --top-p 0.95
```

You can also override preset arguments by specifying them on the command line:

```sh
# Force temp = 0.1, overriding the preset value
llama-cli -hf username/my-model-with-preset --temp 0.1
```

If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each of these repos should contain a `preset.ini` file that references the actual model(s):

```ini
hf-repo = user/my-model-main
hf-repo-draft = user/my-model-draft
temp = 0.8
ctx-size = 1024
; (and other configurations)
```

### Named presets

Alternatively, a single blank HF repo can hold multiple preset configurations: its `preset.ini` file defines one named section per preset, each referencing the actual model(s):

```ini
[*]
mmap = 1

[gpt-oss-20b-hf]
hf = ggml-org/gpt-oss-20b-GGUF
batch-size = 2048
ubatch-size = 2048
top-p = 1.0
top-k = 0
min-p = 0.01
temp = 1.0
chat-template-kwargs = {"reasoning_effort": "high"}

[gpt-oss-120b-hf]
hf = ggml-org/gpt-oss-120b-GGUF
batch-size = 2048
ubatch-size = 2048
top-p = 1.0
top-k = 0
min-p = 0.01
temp = 1.0
chat-template-kwargs = {"reasoning_effort": "high"}
```

You can then use it via `llama-cli` or `llama-server`, for example:

```sh
llama-server -hf user/repo:gpt-oss-120b-hf
```

Please make sure to provide the correct `hf-repo` for each child preset. Otherwise, you may get the error: `The specified tag is not a valid quantization scheme.`
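
For scripting around presets, here is a minimal sketch of how a named preset could be resolved. It assumes that `[*]` acts as a global-defaults section which the named section overrides, and that every key maps 1:1 to a `--<key> <value>` flag; the helper `preset_to_args` is hypothetical, and llama.cpp's actual resolution logic lives in [preset.cpp](../common/preset.cpp):

```py
import configparser

def preset_to_args(ini_text: str, name: str) -> list[str]:
    """Hypothetical helper: merge '[*]' defaults with a named section
    and render the result as command-line flags."""
    cp = configparser.ConfigParser()
    cp.read_string(ini_text)
    merged = dict(cp["*"]) if cp.has_section("*") else {}
    merged.update(cp[name])  # the named section wins over '[*]' defaults
    args = []
    for key, value in merged.items():
        args += [f"--{key}", value]
    return args

ini = """
[*]
mmap = 1

[gpt-oss-20b-hf]
hf = ggml-org/gpt-oss-20b-GGUF
temp = 1.0
"""
print(preset_to_args(ini, "gpt-oss-20b-hf"))
# ['--mmap', '1', '--hf', 'ggml-org/gpt-oss-20b-GGUF', '--temp', '1.0']
```
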
|
||||||
|
|
@ -15,6 +15,7 @@ llama_add_compile_flags()
if (EMSCRIPTEN)
else()
    add_subdirectory(batched)
    add_subdirectory(debug)
    add_subdirectory(embedding)
    add_subdirectory(eval-callback)

@ -34,7 +35,6 @@ else()
    add_subdirectory(gen-docs)
    add_subdirectory(training)
    add_subdirectory(diffusion)
    add_subdirectory(model-conversion)
    if (NOT GGML_BACKEND_DL)
        add_subdirectory(convert-llama2c-to-ggml)
        # these examples use the backends directly and cannot be built with dynamic loading

@ -21,7 +21,7 @@ int main(int argc, char ** argv) {
    params.prompt = "Hello my name is";
    params.n_predict = 32;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BATCHED, print_usage)) {
        return 1;
    }

@ -1,5 +1,5 @@
set(TARGET llama-logits)
set(TARGET llama-debug)
add_executable(${TARGET} logits.cpp)
add_executable(${TARGET} debug.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@ -0,0 +1,54 @@
# llama.cpp/examples/debug

This is a utility intended to help debug a model by registering a callback that
logs GGML operations and tensor data. It can also store the generated logits or
embeddings, as well as the prompt and token ids, for comparison with the
original model.

### Usage

```shell
llama-debug \
    --hf-repo ggml-org/models \
    --hf-file phi-2/ggml-model-q4_0.gguf \
    --model phi-2-q4_0.gguf \
    --prompt hello \
    --save-logits \
    --verbose
```

The tensor data is logged at debug level and requires the `--verbose` flag. The
reason for this is that, while useful, a model with many layers can produce a
lot of output. You can filter the tensor names using the `--tensor-filter`
option.

A recommended approach is to first run without `--verbose` and see if the
generated logits/embeddings are close to the original model's. If they are not,
it might be necessary to inspect the model tensor by tensor, and in that case
it is useful to enable the `--verbose` flag along with `--tensor-filter` to
focus on specific tensors, as in the sketch below.
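
Each `--tensor-filter` pattern is anchored to the start of the tensor name
before matching (the implementation in `debug.cpp` below prepends `^` to the
pattern and then uses `regex_search`). A small Python sketch of the same
matching semantics; the tensor names here are illustrative, not taken from any
specific model:

```py
import re

# Patterns behave like in debug.cpp: "^" is prepended, so a pattern
# matches only tensors whose name starts with it.
patterns = [r"blk\.0\.", r"result_"]
filters  = [re.compile("^" + p) for p in patterns]

names = ["blk.0.attn_norm", "blk.10.attn_norm", "result_output", "inp_embd"]
for name in names:
    if any(f.search(name) for f in filters):
        print("match:", name)
# match: blk.0.attn_norm
# match: result_output
```
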
### Options

This example supports all standard `llama.cpp` options and also accepts the
following options:

```console
$ llama-debug --help
...

----- example-specific params -----

--save-logits              save final logits to files for verification (default: false)
--logits-output-dir PATH   directory for saving logits output files (default: data)
--tensor-filter REGEX      filter tensor names for debug output (regex pattern, can be specified multiple times)
```

### Output Files

When `--save-logits` is enabled, the following files are created in the output
directory:

* `llamacpp-<model>[-embeddings].bin` - Binary output (logits or embeddings)
* `llamacpp-<model>[-embeddings].txt` - Text output (logits or embeddings, one per line)
* `llamacpp-<model>[-embeddings]-prompt.txt` - Prompt text and token IDs
* `llamacpp-<model>[-embeddings]-tokens.bin` - Binary token IDs for programmatic comparison

These files can be compared against the original model's output to verify the
converted model.
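
As a sketch of what that comparison could look like: `debug.cpp` writes the
logits/embeddings as raw `float` values and the token ids as 32-bit integers,
so the files can be loaded directly with NumPy. The file names and the
reference dump `reference.bin` below are assumptions for illustration, not
files produced by this example:

```py
import numpy as np

# Outputs written by llama-debug --save-logits (paths are illustrative).
logits = np.fromfile("data/llamacpp-phi-2-q4_0.bin",        dtype=np.float32)
tokens = np.fromfile("data/llamacpp-phi-2-q4_0-tokens.bin", dtype=np.int32)

# Hypothetical dump from the original model, in the same float32 layout.
ref = np.fromfile("reference.bin", dtype=np.float32)

assert logits.shape == ref.shape, "output size mismatch"
print("tokens:         ", tokens.tolist())
print("max abs diff:   ", np.max(np.abs(logits - ref)))
print("top-5 ids agree:", np.array_equal(np.argsort(logits)[-5:], np.argsort(ref)[-5:]))
```
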
@ -0,0 +1,439 @@
|
||||||
|
#include "arg.h"
|
||||||
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <fstream>
|
||||||
|
#include <regex>
|
||||||
|
|
||||||
|
static void print_usage(int, char ** argv) {
|
||||||
|
const std::string usage_template = R"(
|
||||||
|
example usage:
|
||||||
|
|
||||||
|
Print tensors:
|
||||||
|
|
||||||
|
{prog} -m model.gguf -p "Hello my name is" --verbose
|
||||||
|
|
||||||
|
The tensors to be printed can be filtered with --tensor-filter option.
|
||||||
|
|
||||||
|
Save logits/embeddings:
|
||||||
|
|
||||||
|
{prog} -m model.gguf -p "Hello my name is" --save-logits
|
||||||
|
|
||||||
|
Add --embedding to save embeddings)" "\n";
|
||||||
|
|
||||||
|
// Fix the source code indentation above that is introduced by the raw string literal.
|
||||||
|
std::string usage = std::regex_replace(usage_template, std::regex("\\n {8}"), "\n");
|
||||||
|
usage = std::regex_replace(usage, std::regex("\\{prog\\}"), argv[0]);
|
||||||
|
LOG("%s\n", usage.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data);
|
||||||
|
|
||||||
|
struct callback_data {
|
||||||
|
std::vector<uint8_t> data;
|
||||||
|
std::vector<std::regex> tensor_filters;
|
||||||
|
|
||||||
|
callback_data() = default;
|
||||||
|
|
||||||
|
callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
|
||||||
|
for (const auto & pattern : filter_patterns) {
|
||||||
|
try {
|
||||||
|
std::string anchored_pattern = "^" + pattern;
|
||||||
|
tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
|
||||||
|
} catch (const std::regex_error & e) {
|
||||||
|
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
params.cb_eval = ggml_debug;
|
||||||
|
params.cb_eval_user_data = this;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static bool has_pooling(llama_context * ctx) {
|
||||||
|
switch (llama_pooling_type(ctx)) {
|
||||||
|
case LLAMA_POOLING_TYPE_NONE:
|
||||||
|
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
||||||
|
return false;
|
||||||
|
default:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct output_data {
|
||||||
|
float * data_ptr = nullptr;
|
||||||
|
int data_size = 0;
|
||||||
|
std::string type_suffix;
|
||||||
|
std::vector<float> embd_norm;
|
||||||
|
std::string prompt;
|
||||||
|
std::vector<llama_token> tokens;
|
||||||
|
|
||||||
|
output_data(llama_context * ctx, const llama_model * model, const common_params & params) {
|
||||||
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||||
|
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||||
|
|
||||||
|
tokens = common_tokenize(ctx, params.prompt, add_bos);
|
||||||
|
prompt = params.prompt;
|
||||||
|
|
||||||
|
if (params.embedding) {
|
||||||
|
const int n_embd = llama_model_n_embd_out(model);
|
||||||
|
const bool pooling = has_pooling(ctx);
|
||||||
|
const int n_embd_count = pooling ? 1 : tokens.size();
|
||||||
|
const int n_floats = n_embd * n_embd_count;
|
||||||
|
|
||||||
|
float * embd_raw = pooling ? llama_get_embeddings_seq(ctx, 0) : llama_get_embeddings(ctx);
|
||||||
|
if (embd_raw == nullptr) {
|
||||||
|
throw std::runtime_error("failed to get embeddings from the model");
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_DBG("pooling_enabled: %s\n", pooling ? "true" : "false");
|
||||||
|
LOG_DBG("n_embd: %d\n", n_embd);
|
||||||
|
LOG_DBG("n_floats: %d\n", n_floats);
|
||||||
|
LOG_DBG("n_embd_count: %d\n", n_embd_count);
|
||||||
|
|
||||||
|
data_ptr = embd_raw;
|
||||||
|
data_size = n_floats;
|
||||||
|
type_suffix = "-embeddings";
|
||||||
|
|
||||||
|
if (params.embd_normalize >= 0) {
|
||||||
|
embd_norm.resize(n_floats);
|
||||||
|
for (int i = 0; i < n_embd_count; i++) {
|
||||||
|
common_embd_normalize(embd_raw+i*n_embd, embd_norm.data()+i*n_embd, n_embd, params.embd_normalize);
|
||||||
|
}
|
||||||
|
data_ptr = embd_norm.data();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const float * logits = llama_get_logits_ith(ctx, tokens.size() - 1);
|
||||||
|
const int n_logits = llama_vocab_n_tokens(vocab);
|
||||||
|
|
||||||
|
data_ptr = const_cast<float*>(logits);
|
||||||
|
data_size = n_logits;
|
||||||
|
type_suffix = "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||||
|
std::string str;
|
||||||
|
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||||
|
str += std::to_string(t->ne[i]);
|
||||||
|
if (i + 1 < GGML_MAX_DIMS) {
|
||||||
|
str += ", ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||||
|
union {
|
||||||
|
float f;
|
||||||
|
uint32_t i;
|
||||||
|
} u;
|
||||||
|
u.i = (uint32_t)h.bits << 16;
|
||||||
|
return u.f;
|
||||||
|
}
|
||||||
|
|
||||||
|
static float ggml_get_float_value(const uint8_t * data, ggml_type type,
|
||||||
|
const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
|
||||||
|
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||||
|
switch (type) {
|
||||||
|
case GGML_TYPE_F16:
|
||||||
|
return ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
|
||||||
|
case GGML_TYPE_F32:
|
||||||
|
return *(const float *) &data[i];
|
||||||
|
case GGML_TYPE_I64:
|
||||||
|
return (float) *(const int64_t *) &data[i];
|
||||||
|
case GGML_TYPE_I32:
|
||||||
|
return (float) *(const int32_t *) &data[i];
|
||||||
|
case GGML_TYPE_I16:
|
||||||
|
return (float) *(const int16_t *) &data[i];
|
||||||
|
case GGML_TYPE_I8:
|
||||||
|
return (float) *(const int8_t *) &data[i];
|
||||||
|
case GGML_TYPE_BF16:
|
||||||
|
return ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
|
||||||
|
default:
|
||||||
|
GGML_ABORT("fatal error");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||||
|
GGML_ASSERT(n > 0);
|
||||||
|
float sum = 0;
|
||||||
|
float sum_sq = 0.0;
|
||||||
|
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||||
|
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||||
|
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||||
|
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||||
|
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||||
|
sum += v;
|
||||||
|
sum_sq += v * v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||||
|
LOG_DBG(" [\n");
|
||||||
|
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||||
|
if (i2 == n && ne[2] > 2*n) {
|
||||||
|
LOG_DBG(" ..., \n");
|
||||||
|
i2 = ne[2] - n;
|
||||||
|
}
|
||||||
|
LOG_DBG(" [\n");
|
||||||
|
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||||
|
if (i1 == n && ne[1] > 2*n) {
|
||||||
|
LOG_DBG(" ..., \n");
|
||||||
|
i1 = ne[1] - n;
|
||||||
|
}
|
||||||
|
LOG_DBG(" [");
|
||||||
|
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||||
|
if (i0 == n && ne[0] > 2*n) {
|
||||||
|
LOG_DBG("..., ");
|
||||||
|
i0 = ne[0] - n;
|
||||||
|
}
|
||||||
|
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||||
|
LOG_DBG("%12.4f", v);
|
||||||
|
if (i0 < ne[0] - 1) {
|
||||||
|
LOG_DBG(", ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LOG_DBG("],\n");
|
||||||
|
}
|
||||||
|
LOG_DBG(" ],\n");
|
||||||
|
}
|
||||||
|
LOG_DBG(" ]\n");
|
||||||
|
LOG_DBG(" sum = %f\n", sum);
|
||||||
|
LOG_DBG(" sum_sq = %f\n", sum_sq);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (std::isnan(sum)) {
|
||||||
|
LOG_ERR("encountered NaN - aborting\n");
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GGML operations callback during the graph execution.
|
||||||
|
*
|
||||||
|
* @param t current tensor
|
||||||
|
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||||
|
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
|
||||||
|
* see ggml_backend_sched_eval_callback
|
||||||
|
* @param user_data user data to pass at each call back
|
||||||
|
* @return true to receive data or continue the graph, false otherwise
|
||||||
|
*/
|
||||||
|
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
|
auto * cb_data = (callback_data *) user_data;
|
||||||
|
|
||||||
|
const struct ggml_tensor * src0 = t->src[0];
|
||||||
|
const struct ggml_tensor * src1 = t->src[1];
|
||||||
|
|
||||||
|
if (ask) {
|
||||||
|
return true; // Always retrieve data
|
||||||
|
}
|
||||||
|
|
||||||
|
bool matches_filter = cb_data->tensor_filters.empty();
|
||||||
|
|
||||||
|
if (!matches_filter) {
|
||||||
|
for (const auto & filter : cb_data->tensor_filters) {
|
||||||
|
if (std::regex_search(t->name, filter)) {
|
||||||
|
matches_filter = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
char src1_str[128] = {0};
|
||||||
|
if (src1) {
|
||||||
|
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (matches_filter) {
|
||||||
|
LOG_DBG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||||
|
t->name,
|
||||||
|
ggml_type_name(t->type),
|
||||||
|
ggml_op_desc(t),
|
||||||
|
src0->name,
|
||||||
|
ggml_ne_string(src0).c_str(),
|
||||||
|
src1 ? src1_str : "",
|
||||||
|
ggml_ne_string(t).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
|
||||||
|
|
||||||
|
if (!is_host) {
|
||||||
|
auto n_bytes = ggml_nbytes(t);
|
||||||
|
cb_data->data.resize(n_bytes);
|
||||||
|
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ggml_is_quantized(t->type) && matches_filter) {
|
||||||
|
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
|
||||||
|
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void save_output_data(const output_data & output, const std::string & model_name, const std::string & output_dir) {
|
||||||
|
std::filesystem::create_directory(output_dir);
|
||||||
|
auto base_path = std::filesystem::path{output_dir} / ("llamacpp-" + model_name + output.type_suffix);
|
||||||
|
|
||||||
|
// Save logits/embeddings to binary file.
|
||||||
|
{
|
||||||
|
std::filesystem::path filepath{base_path.string() + ".bin"};
|
||||||
|
std::ofstream file{filepath, std::ios::binary};
|
||||||
|
if (!file) {
|
||||||
|
throw std::runtime_error("failed to open binary output file: " + filepath.string());
|
||||||
|
}
|
||||||
|
file.write(reinterpret_cast<const char*>(output.data_ptr), output.data_size * sizeof(float));
|
||||||
|
LOG("Data saved to %s\n", filepath.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save logits/embeddings to text file.
|
||||||
|
{
|
||||||
|
+        std::filesystem::path filepath{base_path.string() + ".txt"};
+        std::ofstream file{filepath};
+        if (!file) {
+            throw std::runtime_error("failed to open text output file: " + filepath.string());
+        }
+        for (int i = 0; i < output.data_size; i++) {
+            file << i << ": " << output.data_ptr[i] << '\n';
+        }
+        LOG("Data saved to %s\n", filepath.c_str());
+    }
+
+    // Save prompt and tokens to text file.
+    {
+        std::filesystem::path filepath{base_path.string() + "-prompt.txt"};
+        std::ofstream file{filepath};
+        if (!file) {
+            throw std::runtime_error("failed to open prompt output file: " + filepath.string());
+        }
+
+        file << "prompt: " << output.prompt << '\n';
+        file << "n_tokens: " << output.tokens.size() << '\n';
+
+        file << "token ids: ";
+        for (size_t i = 0; i < output.tokens.size(); i++) {
+            file << output.tokens[i];
+            if (i + 1 < output.tokens.size()) {
+                file << ", ";
+            }
+        }
+        file << '\n';
+        LOG("Prompt saved to %s\n", filepath.c_str());
+    }
+
+    // Save token ids to binary file.
+    {
+        std::filesystem::path filepath{base_path.string() + "-tokens.bin"};
+        std::ofstream file{filepath, std::ios::binary};
+        if (!file) {
+            throw std::runtime_error("failed to open tokens binary file: " + filepath.string());
+        }
+        file.write(reinterpret_cast<const char*>(output.tokens.data()), output.tokens.size() * sizeof(llama_token));
+        LOG("Tokens saved to %s\n", filepath.c_str());
+    }
+}
+
+static void print_tokenized_prompt(llama_context * ctx, const std::vector<llama_token> & tokens, const std::string & prompt) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    LOG("Model add_bos: %s\n", llama_vocab_get_add_bos(vocab) ? "true" : "false");
+    LOG("Input prompt: \"%s\"\n", prompt.c_str());
+    LOG("Token ids (%zu):\n", tokens.size());
+
+    for (auto id : tokens) {
+        std::string piece(128, '\0');
+        int n = llama_token_to_piece(vocab, id, piece.data(), piece.size(), 0, true);
+        if (n < 0) {
+            LOG_ERR("failed to convert token %d to piece\n", id);
+            continue;
+        }
+        piece.resize(n);
+        LOG("%s(%d) ", piece.c_str(), id);
+    }
+    LOG("\n");
+}
+
+static bool run(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
+
+    if (tokens.empty()) {
+        LOG_ERR("%s : there are no input tokens to process (try to provide a prompt with '-p')\n", __func__);
+        return false;
+    }
+
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
+        LOG_ERR("%s : failed to eval\n", __func__);
+        return false;
+    }
+
+    print_tokenized_prompt(ctx, tokens, params.prompt);
+
+    if (params.save_logits) {
+        output_data output {ctx, model, params};
+        std::filesystem::path model_path{params.model.path};
+        std::string model_name{model_path.stem().string()};
+        save_output_data(output, model_name, params.logits_output_dir);
+    }
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DEBUG, print_usage)) {
+        return 1;
+    }
+
+    common_init();
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    callback_data cb_data(params, params.tensor_filter);
+
+    auto llama_init = common_init_from_params(params);
+
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();
+
+    if (model == nullptr || ctx == nullptr) {
+        LOG_ERR("%s : failed to init\n", __func__);
+        return 1;
+    }
+
+    {
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+    }
+
+    if (!run(ctx, params)) {
+        return 1;
+    }
+
+    LOG("\n");
+    llama_perf_context_print(ctx);
+
+    llama_backend_free();
+
+    return 0;
+}
@@ -553,6 +553,7 @@ int main(int argc, char ** argv) {
     model_params.n_gpu_layers = params.n_gpu_layers;
     model_params.devices = params.devices.data();
     model_params.use_mmap = params.use_mmap;
+    model_params.use_direct_io = params.use_direct_io;
     model_params.use_mlock = params.use_mlock;
     model_params.check_tensors = params.check_tensors;
 
@@ -33,7 +33,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
     }
 }
 
-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd_out, int embd_norm) {
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
 
     // clear previous kv_cache values (irrelevant for embeddings)
@@ -65,8 +65,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
             GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
         }
 
-        float * out = output + embd_pos * n_embd;
-        common_embd_normalize(embd, out, n_embd, embd_norm);
+        float * out = output + embd_pos * n_embd_out;
+        common_embd_normalize(embd, out, n_embd_out, embd_norm);
     }
 }
@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
     }
 
     // allocate output
-    const int n_embd = llama_model_n_embd(model);
-    std::vector<float> embeddings(n_embd_count * n_embd, 0);
+    const int n_embd_out = llama_model_n_embd_out(model);
+    std::vector<float> embeddings(n_embd_count * n_embd_out, 0);
     float * emb = embeddings.data();
 
     // break into batches
@@ -267,8 +267,8 @@ int main(int argc, char ** argv) {
 
         // encode if at capacity
        if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
-            float * out = emb + e * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
+            float * out = emb + e * n_embd_out;
+            batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
             e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
             s = 0;
             common_batch_clear(batch);
@@ -280,8 +280,8 @@ int main(int argc, char ** argv) {
     }
 
     // final batch
-    float * out = emb + e * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
+    float * out = emb + e * n_embd_out;
+    batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
 
     if (params.embd_out.empty()) {
         LOG("\n");
@@ -289,19 +289,19 @@ int main(int argc, char ** argv) {
         if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
             for (int j = 0; j < n_embd_count; j++) {
                 LOG("embedding %d: ", j);
-                for (int i = 0; i < std::min(3, n_embd); i++) {
+                for (int i = 0; i < std::min(3, n_embd_out); i++) {
                     if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd + i]);
+                        LOG("%6.0f ", emb[j * n_embd_out + i]);
                     } else {
-                        LOG("%9.6f ", emb[j * n_embd + i]);
+                        LOG("%9.6f ", emb[j * n_embd_out + i]);
                     }
                 }
                 LOG(" ... ");
-                for (int i = n_embd - 3; i < n_embd; i++) {
+                for (int i = n_embd_out - 3; i < n_embd_out; i++) {
                     if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd + i]);
+                        LOG("%6.0f ", emb[j * n_embd_out + i]);
                     } else {
-                        LOG("%9.6f ", emb[j * n_embd + i]);
+                        LOG("%9.6f ", emb[j * n_embd_out + i]);
                     }
                 }
                 LOG("\n");
@@ -320,9 +320,9 @@ int main(int argc, char ** argv) {
                 for (uint32_t i = 0; i < n_cls_out; i++) {
                     // NOTE: if you change this log - update the tests in ci/run.sh
                     if (n_cls_out == 1) {
-                        LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+                        LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd_out]);
                     } else {
-                        LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
+                        LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd_out + i], cls_out_labels[i].c_str());
                     }
                 }
             }
@@ -330,11 +330,11 @@ int main(int argc, char ** argv) {
             // print the first part of the embeddings or for a single prompt, the full embedding
             for (int j = 0; j < n_prompts; j++) {
                 LOG("embedding %d: ", j);
-                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd_out) : n_embd_out); i++) {
                     if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd + i]);
+                        LOG("%6.0f ", emb[j * n_embd_out + i]);
                     } else {
-                        LOG("%9.6f ", emb[j * n_embd + i]);
+                        LOG("%9.6f ", emb[j * n_embd_out + i]);
                     }
                 }
                 LOG("\n");
@@ -350,7 +350,7 @@ int main(int argc, char ** argv) {
             LOG("\n");
             for (int i = 0; i < n_prompts; i++) {
                 for (int j = 0; j < n_prompts; j++) {
-                    float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
                     LOG("%6.2f ", sim);
                 }
                 LOG("%1.10s", prompts[i].c_str());
@@ -368,9 +368,9 @@ int main(int argc, char ** argv) {
                 if (notArray) LOG("  {\n    \"object\": \"embedding\",\n    \"index\": %d,\n    \"embedding\": ",j);
                 LOG("[");
                 for (int i = 0;;) { // at least one iteration (n_embd > 0)
-                    LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                    LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd_out + i]);
                     i++;
-                    if (i < n_embd) LOG(","); else break;
+                    if (i < n_embd_out) LOG(","); else break;
                 }
                 LOG(notArray ? "]\n  }" : "]");
                 j++;
@@ -383,7 +383,7 @@ int main(int argc, char ** argv) {
             for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
                 LOG("  [");
                 for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
-                    float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
                     LOG("%6.2f", sim);
                     j++;
                     if (j < n_embd_count) LOG(", "); else break;
@@ -397,7 +397,7 @@ int main(int argc, char ** argv) {
 
         if (notArray) LOG("\n}\n");
     } else if (params.embd_out == "raw") {
-        print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
+        print_raw_embeddings(emb, n_embd_count, n_embd_out, model, pooling_type, params.embd_normalize);
     }
 
     LOG("\n");
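
The `n_embd` → `n_embd_out` rename in the hunks above exists because the embedding a converted model returns no longer has to match the model's hidden size: projection layers (for example a SentenceTransformer `02_Dense` module, as discussed in the README changes further down) can change the output dimension, so output buffers and loop bounds must use the output size. A minimal sketch of the distinction; the sizes and the zero-filled weight are illustrative assumptions, not values from this diff:

```python
import numpy as np

# Illustrative only: a Dense projection maps the hidden size to a different
# output size, so buffers must be allocated with n_embd_out, not n_embd.
n_embd     = 768    # hypothetical hidden size of the base model
n_embd_out = 1024   # hypothetical size after the projection layer

hidden = np.zeros(n_embd, dtype=np.float32)                # last hidden state
W      = np.zeros((n_embd, n_embd_out), dtype=np.float32)  # projection weight

out = hidden @ W    # what the caller actually receives
assert out.shape == (n_embd_out,)
```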
@@ -61,7 +61,7 @@ causal-run-converted-model:
 	@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh
 
 causal-verify-logits: causal-run-original-model causal-run-converted-model
-	@./scripts/causal/compare-logits.py
+	@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/compare-logits.py
 	@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}
 
 causal-run-original-embeddings:
@@ -138,16 +138,13 @@ embedding-run-original-model-st: embedding-run-original-model
 embedding-run-converted-model:
 	@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
 		$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
-		$(if $(USE_POOLING),--pooling)
-
-embedding-run-converted-model-st: USE_POOLING=1
-embedding-run-converted-model-st: embedding-run-converted-model
+		$(if $(EMBD_NORMALIZE),--embd-normalize "$(EMBD_NORMALIZE)")
 
 embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
 	@./scripts/embedding/compare-embeddings-logits.sh \
 		$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
 
-embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model-st
+embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model
 	@./scripts/embedding/compare-embeddings-logits.sh \
 		$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
 
@@ -198,14 +198,13 @@ model, and the other is a text file which allows for manual visual inspection.
 
 #### Using SentenceTransformer with numbered layers
 For models that have numbered SentenceTransformer layers (01_Pooling, 02_Dense,
-03_Dense, 04_Normalize), use the `-st` targets to apply all these layers:
+03_Dense, 04_Normalize), these will be applied automatically when running the
+converted model, but currently there is a separate target to run the original
+version:
 
 ```console
 # Run original model with SentenceTransformer (applies all numbered layers)
 (venv) $ make embedding-run-original-model-st
-
-# Run converted model with pooling enabled
-(venv) $ make embedding-run-converted-model-st
 ```
 
 This will use the SentenceTransformer library to load and run the model, which
@@ -213,6 +212,17 @@ automatically applies all the numbered layers in the correct order. This is
 particularly useful when comparing with models that should include these
 additional transformation layers beyond just the base model output.
 
+The type of normalization can be specified for the converted model but is not
+strictly necessary: the verification uses cosine similarity, and the magnitude
+of the output vectors does not affect it. The normalization type can still be
+passed as an argument to the target, which can be useful for manual
+inspection:
+```console
+(venv) $ make embedding-verify-logits-st EMBD_NORMALIZE=1
+```
+The original model will apply the normalization according to the normalization
+layer specified in the modules.json configuration file.
+
 ### Model conversion
 After updates have been made to [gguf-py](../../gguf-py) to add support for the
 new model the model can be converted to GGUF format using the following command:
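
A quick way to convince yourself of the claim in the README text above: cosine similarity is invariant to the scale of its inputs, so an L2-normalized embedding (`EMBD_NORMALIZE=2`) scores identically against a reference as the raw embedding does. A small self-contained sketch, not part of the repository scripts:

```python
import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

rng = np.random.default_rng(0)
emb = rng.standard_normal(768).astype(np.float32)   # stand-in embedding
ref = rng.standard_normal(768).astype(np.float32)   # stand-in reference

l2 = emb / np.linalg.norm(emb)                      # what EMBD_NORMALIZE=2 produces
assert abs(cosine(emb, ref) - cosine(l2, ref)) < 1e-5
```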
@@ -1,268 +0,0 @@
-#include "llama.h"
-#include "common.h"
-
-#include <cstdio>
-#include <cstring>
-#include <string>
-#include <vector>
-#include <ctype.h>
-#include <filesystem>
-
-static void print_usage(int, char ** argv) {
-    printf("\nexample usage:\n");
-    printf("\n    %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [-pooling] [-embd-norm <norm>] [prompt]\n", argv[0]);
-    printf("\n");
-    printf("    -embd-norm: normalization type for pooled embeddings (default: 2)\n");
-    printf("                -1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm\n");
-    printf("\n");
-}
-
-int main(int argc, char ** argv) {
-    std::string model_path;
-    std::string prompt = "Hello, my name is";
-    int ngl = 0;
-    bool embedding_mode = false;
-    bool pooling_enabled = false;
-    int32_t embd_norm = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
-
-    {
-        int i = 1;
-        for (; i < argc; i++) {
-            if (strcmp(argv[i], "-m") == 0) {
-                if (i + 1 < argc) {
-                    model_path = argv[++i];
-                } else {
-                    print_usage(argc, argv);
-                    return 1;
-                }
-            } else if (strcmp(argv[i], "-ngl") == 0) {
-                if (i + 1 < argc) {
-                    try {
-                        ngl = std::stoi(argv[++i]);
-                    } catch (...) {
-                        print_usage(argc, argv);
-                        return 1;
-                    }
-                } else {
-                    print_usage(argc, argv);
-                    return 1;
-                }
-            } else if (strcmp(argv[i], "-embd-mode") == 0) {
-                embedding_mode = true;
-            } else if (strcmp(argv[i], "-pooling") == 0) {
-                pooling_enabled = true;
-            } else if (strcmp(argv[i], "-embd-norm") == 0) {
-                if (i + 1 < argc) {
-                    try {
-                        embd_norm = std::stoi(argv[++i]);
-                    } catch (...) {
-                        print_usage(argc, argv);
-                        return 1;
-                    }
-                } else {
-                    print_usage(argc, argv);
-                    return 1;
-                }
-            } else {
-                // prompt starts here
-                break;
-            }
-        }
-
-        if (model_path.empty()) {
-            print_usage(argc, argv);
-            return 1;
-        }
-
-        if (i < argc) {
-            prompt = argv[i++];
-            for (; i < argc; i++) {
-                prompt += " ";
-                prompt += argv[i];
-            }
-        }
-    }
-
-    ggml_backend_load_all();
-    llama_model_params model_params = llama_model_default_params();
-    model_params.n_gpu_layers = ngl;
-
-    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
-
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    // Extract basename from model_path
-    const char * basename = strrchr(model_path.c_str(), '/');
-    basename = (basename == NULL) ? model_path.c_str() : basename + 1;
-
-    char model_name[256];
-    strncpy(model_name, basename, 255);
-    model_name[255] = '\0';
-
-    char * dot = strrchr(model_name, '.');
-    if (dot != NULL && strcmp(dot, ".gguf") == 0) {
-        *dot = '\0';
-    }
-    printf("Model name: %s\n", model_name);
-
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-    const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
-
-    std::vector<llama_token> prompt_tokens(n_prompt);
-    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
-        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
-        return 1;
-    }
-
-    llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.n_ctx = n_prompt;
-    ctx_params.n_batch = n_prompt;
-    ctx_params.no_perf = false;
-    if (embedding_mode) {
-        ctx_params.embeddings = true;
-        ctx_params.pooling_type = pooling_enabled ? LLAMA_POOLING_TYPE_MEAN : LLAMA_POOLING_TYPE_NONE;
-        ctx_params.n_ubatch = ctx_params.n_batch;
-    }
-
-    llama_context * ctx = llama_init_from_model(model, ctx_params);
-    if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
-        return 1;
-    }
-
-    printf("Input prompt: \"%s\"\n", prompt.c_str());
-    printf("Tokenized prompt (%d tokens): ", n_prompt);
-    for (auto id : prompt_tokens) {
-        char buf[128];
-        int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
-        if (n < 0) {
-            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
-            return 1;
-        }
-        std::string s(buf, n);
-        printf("%s (%d)", s.c_str(), id);
-    }
-    printf("\n");
-
-    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
-
-    if (llama_decode(ctx, batch)) {
-        fprintf(stderr, "%s : failed to eval\n", __func__);
-        return 1;
-    }
-
-    float * data_ptr;
-    int data_size;
-    const char * type;
-    std::vector<float> embd_out;
-
-    if (embedding_mode) {
-        const int n_embd = llama_model_n_embd(model);
-        const int n_embd_count = pooling_enabled ? 1 : batch.n_tokens;
-        const int n_embeddings = n_embd * n_embd_count;
-        float * embeddings;
-        type = "-embeddings";
-
-        if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
-            embeddings = llama_get_embeddings_seq(ctx, 0);
-            embd_out.resize(n_embeddings);
-            printf("Normalizing embeddings using norm: %d\n", embd_norm);
-            common_embd_normalize(embeddings, embd_out.data(), n_embeddings, embd_norm);
-            embeddings = embd_out.data();
-        } else {
-            embeddings = llama_get_embeddings(ctx);
-        }
-
-        printf("Embedding dimension: %d\n", n_embd);
-        printf("\n");
-
-        // Print embeddings in the specified format
-        for (int j = 0; j < n_embd_count; j++) {
-            printf("embedding %d: ", j);
-
-            // Print first 3 values
-            for (int i = 0; i < 3 && i < n_embd; i++) {
-                printf("%9.6f ", embeddings[j * n_embd + i]);
-            }
-
-            printf(" ... ");
-
-            // Print last 3 values
-            for (int i = n_embd - 3; i < n_embd; i++) {
-                if (i >= 0) {
-                    printf("%9.6f ", embeddings[j * n_embd + i]);
-                }
-            }
-
-            printf("\n");
-        }
-        printf("\n");
-
-        printf("Embeddings size: %d\n", n_embeddings);
-
-        data_ptr = embeddings;
-        data_size = n_embeddings;
-    } else {
-        float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
-        const int n_logits = llama_vocab_n_tokens(vocab);
-        type = "";
-        printf("Vocab size: %d\n", n_logits);
-
-        data_ptr = logits;
-        data_size = n_logits;
-    }
-
-    std::filesystem::create_directory("data");
-
-    // Save data to binary file
-    char bin_filename[512];
-    snprintf(bin_filename, sizeof(bin_filename), "data/llamacpp-%s%s.bin", model_name, type);
-    printf("Saving data to %s\n", bin_filename);
-
-    FILE * f = fopen(bin_filename, "wb");
-    if (f == NULL) {
-        fprintf(stderr, "%s: error: failed to open binary output file\n", __func__);
-        return 1;
-    }
-    fwrite(data_ptr, sizeof(float), data_size, f);
-    fclose(f);
-
-    // Also save as text for debugging
-    char txt_filename[512];
-    snprintf(txt_filename, sizeof(txt_filename), "data/llamacpp-%s%s.txt", model_name, type);
-    f = fopen(txt_filename, "w");
-    if (f == NULL) {
-        fprintf(stderr, "%s: error: failed to open text output file\n", __func__);
-        return 1;
-    }
-    for (int i = 0; i < data_size; i++) {
-        fprintf(f, "%d: %.6f\n", i, data_ptr[i]);
-    }
-    fclose(f);
-
-    if (!embedding_mode) {
-        printf("First 10 logits: ");
-        for (int i = 0; i < 10 && i < data_size; i++) {
-            printf("%.6f ", data_ptr[i]);
-        }
-        printf("\n");
-
-        printf("Last 10 logits: ");
-        for (int i = data_size - 10; i < data_size; i++) {
-            if (i >= 0) printf("%.6f ", data_ptr[i]);
-        }
-        printf("\n\n");
-    }
-
-    printf("Data saved to %s\n", bin_filename);
-    printf("Data saved to %s\n", txt_filename);
-
-    llama_free(ctx);
-    llama_model_free(model);
-
-    return 0;
-}
@@ -3,10 +3,11 @@
 import sys
 import numpy as np
 from pathlib import Path
+import os
 
 # Add utils directory to path for direct script execution
 sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
-from common import get_model_name_from_env_path  # type: ignore[import-not-found]
+from common import get_model_name_from_env_path, compare_tokens, exit_with_warning  # type: ignore[import-not-found]
 
 def quick_logits_check(pytorch_file, llamacpp_file):
     """Lightweight sanity check before NMSE"""
@@ -38,6 +39,7 @@ def quick_logits_check(pytorch_file, llamacpp_file):
     return True
 
 def main():
+    model_path = os.environ.get('MODEL_PATH')
     model_name = get_model_name_from_env_path('MODEL_PATH')
     data_dir = Path("data")
     pytorch_file = data_dir / f"pytorch-{model_name}.bin"
@@ -58,6 +60,12 @@ def main():
 
     print("Checked all required files were found. Proceeding...\n")
 
+    # Verify tokens as they are a prerequisite for logits comparison.
+    print("🔍 Token Comparison Check")
+    print("=" * 40)
+    if not compare_tokens(f"pytorch-{model_name}", f"llamacpp-{llamacpp_model_name}"):
+        exit_with_warning("\n❌ Token mismatch detected", model_path)
+    print()
+
     print("🔍 GGML Model Validation for model ", model_name)
     print("=" * 40)
@@ -73,8 +81,7 @@ def main():
         print("   Ok to proceed with NMSE check...")
         sys.exit(0)
     else:
-        print(f"❌ NOK: Top 10 predictions don't match - generation will differ")
-        sys.exit(1)
+        exit_with_warning(f"❌ NOK: Top 10 predictions don't match - generation will differ", model_path)
 
 if __name__ == "__main__":
     main()
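
The quick top-10 check above only gates the heavier NMSE comparison run by `check-nmse.py`, whose implementation is not part of this diff. As a hedged sketch of what a normalized-MSE gate conventionally looks like (the formula, file names, and threshold here are assumptions, not taken from the repository):

```python
import numpy as np

def nmse(reference: np.ndarray, test: np.ndarray) -> float:
    # Normalized MSE: squared error relative to the reference signal's energy.
    return float(np.mean((reference - test) ** 2) / np.mean(reference ** 2))

ref  = np.fromfile("data/pytorch-model.bin", dtype=np.float32)    # hypothetical dump
test = np.fromfile("data/llamacpp-model.bin", dtype=np.float32)   # hypothetical dump

assert ref.shape == test.shape
print("NMSE:", nmse(ref, test))   # a gate such as nmse < 1e-3 is an assumed choice
```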
@@ -67,7 +67,7 @@ with torch.no_grad():
     last_hidden_states = outputs.hidden_states[-1]
 
     # Get embeddings for all tokens
-    token_embeddings = last_hidden_states[0].cpu().numpy()  # Remove batch dimension
+    token_embeddings = last_hidden_states[0].float().cpu().numpy()  # Remove batch dimension
 
     print(f"Hidden states shape: {last_hidden_states.shape}")
     print(f"Token embeddings shape: {token_embeddings.shape}")
@@ -13,6 +13,6 @@ if [ -z "$CONVERTED_MODEL" ]; then
     exit 1
 fi
 
-cmake --build ../../build --target llama-logits -j8
+cmake --build ../../build --target llama-debug -j8
 
-../../build/bin/llama-logits -m $CONVERTED_MODEL -embd-mode "Hello world today"
+../../build/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world today" --save-logits
@@ -21,6 +21,6 @@ fi
 echo $CONVERTED_MODEL
 echo $MODEL_TESTING_PROMPT
 
-cmake --build ../../build --target llama-logits -j8
+cmake --build ../../build --target llama-debug -j8
 
-../../build/bin/llama-logits -m "$CONVERTED_MODEL" "$MODEL_TESTING_PROMPT"
+../../build/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
@@ -7,12 +7,11 @@ import importlib
 import torch
 import numpy as np
 
-from pathlib import Path
 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
 
 # Add parent directory to path for imports
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-from utils.common import debug_hook
+from utils.common import debug_hook, save_output_data
 
 def parse_arguments():
     parser = argparse.ArgumentParser(description="Process model with specified path")
@@ -126,6 +125,7 @@ def main():
     device = next(model.parameters()).device
     prompt = get_prompt(args)
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+    token_ids = input_ids[0].cpu().tolist()
 
     print(f"Input tokens: {input_ids}")
     print(f"Input text: {repr(prompt)}")
@@ -151,19 +151,6 @@ def main():
     print(f"Last token logits shape: {last_logits.shape}")
     print(f"Vocab size: {len(last_logits)}")
 
-    data_dir = Path("data")
-    data_dir.mkdir(exist_ok=True)
-    bin_filename = data_dir / f"pytorch-{model_name}.bin"
-    txt_filename = data_dir / f"pytorch-{model_name}.txt"
-
-    # Save to file for comparison
-    last_logits.astype(np.float32).tofile(bin_filename)
-
-    # Also save as text file for easy inspection
-    with open(txt_filename, "w") as f:
-        for i, logit in enumerate(last_logits):
-            f.write(f"{i}: {logit:.6f}\n")
-
     # Print some sample logits for quick verification
     print(f"First 10 logits: {last_logits[:10]}")
     print(f"Last 10 logits: {last_logits[-10:]}")
@@ -175,8 +162,7 @@ def main():
         token = tokenizer.decode([idx])
         print(f"  Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
 
-    print(f"Saved bin logits to: {bin_filename}")
-    print(f"Saved txt logist to: {txt_filename}")
+    save_output_data(last_logits, token_ids, prompt, model_name)
 
 if __name__ == "__main__":
     main()
@@ -5,7 +5,7 @@ set -e
 # Parse command line arguments
 CONVERTED_MODEL=""
 PROMPTS_FILE=""
-USE_POOLING=""
+EMBD_NORMALIZE="2"
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -13,9 +13,9 @@ while [[ $# -gt 0 ]]; do
             PROMPTS_FILE="$2"
             shift 2
             ;;
-        --pooling)
-            USE_POOLING="1"
-            shift
+        --embd-normalize)
+            EMBD_NORMALIZE="$2"
+            shift 2
             ;;
         *)
             if [ -z "$CONVERTED_MODEL" ]; then
@@ -50,10 +50,5 @@ fi
 
 echo $CONVERTED_MODEL
 
-cmake --build ../../build --target llama-logits -j8
-# TODO: update logits.cpp to accept a --file/-f option for the prompt
-if [ -n "$USE_POOLING" ]; then
-    ../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode -pooling "$PROMPT"
-else
-    ../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
-fi
+cmake --build ../../build --target llama-debug -j8
+../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE
@@ -3,13 +3,15 @@
 import argparse
 import os
 import sys
-import numpy as np
 import importlib
-from pathlib import Path
 
 from transformers import AutoTokenizer, AutoConfig, AutoModel
 import torch
 
+# Add parent directory to path for imports
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+from utils.common import save_output_data
+
 
 def parse_arguments():
     parser = argparse.ArgumentParser(description='Run original embedding model')
@@ -169,6 +171,7 @@ def main():
         return_tensors="pt"
     )
     tokens = encoded['input_ids'][0]
+    token_ids = tokens.cpu().tolist()
     token_strings = tokenizer.convert_ids_to_tokens(tokens)
     for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
         print(f"{token_id:6d} -> '{token_str}'")
@@ -185,6 +188,7 @@ def main():
     )
 
     tokens = encoded['input_ids'][0]
+    token_ids = tokens.cpu().tolist()
     token_strings = tokenizer.convert_ids_to_tokens(tokens)
     for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
         print(f"{token_id:6d} -> '{token_str}'")
@@ -228,24 +232,11 @@ def main():
 
     print()
 
-    data_dir = Path("data")
-    data_dir.mkdir(exist_ok=True)
-    bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
-    txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
-
     flattened_embeddings = all_embeddings.flatten()
-    flattened_embeddings.astype(np.float32).tofile(bin_filename)
-
-    with open(txt_filename, "w") as f:
-        idx = 0
-        for j in range(n_embd_count):
-            for value in all_embeddings[j]:
-                f.write(f"{idx}: {value:.6f}\n")
-                idx += 1
     print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
     print("")
-    print(f"Saved bin embeddings to: {bin_filename}")
-    print(f"Saved txt embeddings to: {txt_filename}")
+    save_output_data(flattened_embeddings, token_ids, prompt_text, model_name, type_suffix="-embeddings")
 
 
 if __name__ == "__main__":
@@ -3,6 +3,11 @@
 import os
 import sys
 import torch
+import transformers
+import json
+import textwrap
+import numpy as np
+from pathlib import Path
 
 
 def get_model_name_from_env_path(env_path_name):
@@ -148,3 +153,147 @@ def setup_rope_debug(model_module_path: str, function_name: str = "apply_rotary_
     # Patch it
     setattr(module, function_name, debug_rope)
     print(f"RoPE debug patching applied to {model_module_path}.{function_name}")
+
+
+def save_output_data(data, tokens, prompt, model_name, type_suffix="", output_dir="data"):
+    """
+    Save output data (logits/embeddings), tokens, and prompt to files.
+
+    Args:
+        data: numpy array of floats (logits or embeddings)
+        tokens: list or array of token IDs
+        prompt: string containing the input prompt
+        model_name: name of the model
+        type_suffix: optional suffix like "-embeddings" (default: "")
+        output_dir: directory to save files (default: "data")
+
+    Creates the following files in output_dir:
+    - pytorch-{model_name}{type_suffix}.bin
+    - pytorch-{model_name}{type_suffix}.txt
+    - pytorch-{model_name}{type_suffix}-prompt.txt
+    - pytorch-{model_name}{type_suffix}-tokens.bin
+    """
+    data_dir = Path(output_dir)
+    data_dir.mkdir(exist_ok=True)
+    base_path = data_dir / f"pytorch-{model_name}{type_suffix}"
+
+    # Convert and flatten logits/embeddings
+    data = data.cpu().numpy() if isinstance(data, torch.Tensor) else np.asarray(data)
+    data = data.flatten() if data.ndim > 1 else data
+
+    # Save logits/embedding files
+    data.astype(np.float32).tofile(f"{base_path}.bin")
+    print(f"Data saved to {base_path}.bin")
+
+    with open(f"{base_path}.txt", "w") as f:
+        f.writelines(f"{i}: {value:.6f}\n" for i, value in enumerate(data))
+    print(f"Data saved to {base_path}.txt")
+
+    # Convert and flatten tokens
+    tokens = tokens.cpu().numpy() if isinstance(tokens, torch.Tensor) else np.asarray(tokens)
+    tokens = tokens.flatten() if tokens.ndim > 1 else tokens
+
+    # Save token binary file
+    tokens.astype(np.int32).tofile(f"{base_path}-tokens.bin")
+    print(f"Tokens saved to {base_path}-tokens.bin")
+
+    # Save prompt file
+    with open(f"{base_path}-prompt.txt", "w") as f:
+        f.write(f"prompt: {prompt}\n")
+        f.write(f"n_tokens: {len(tokens)}\n")
+        f.write(f"token ids: {', '.join(str(int(tid)) for tid in tokens)}\n")
+    print(f"Prompt saved to {base_path}-prompt.txt")
+
+
+def compare_tokens(original, converted, type_suffix="", output_dir="data"):
+    data_dir = Path(output_dir)
+
+    # Read tokens from both models
+    tokens1_file = data_dir / f"{original}{type_suffix}-tokens.bin"
+    tokens2_file = data_dir / f"{converted}{type_suffix}-tokens.bin"
+
+    if not tokens1_file.exists():
+        print(f"Error: Token file not found: {tokens1_file}")
+        return False
+
+    if not tokens2_file.exists():
+        print(f"Error: Token file not found: {tokens2_file}")
+        return False
+
+    tokens1 = np.fromfile(tokens1_file, dtype=np.int32)
+    tokens2 = np.fromfile(tokens2_file, dtype=np.int32)
+
+    print(f"\nComparing tokens between:")
+    print(f"  Original : {original} ({len(tokens1)} tokens)")
+    print(f"  Converted: {converted} ({len(tokens2)} tokens)")
+
+    if len(tokens1) != len(tokens2):
+        print(f"\n❌ Token count mismatch: {len(tokens1)} vs {len(tokens2)}")
+        return False
+
+    if np.array_equal(tokens1, tokens2):
+        print(f"\n✅ All {len(tokens1)} tokens match!")
+        return True
+
+    mismatches = np.where(tokens1 != tokens2)[0]
+    print(f"\n❌ Found {len(mismatches)} mismatched tokens:")
+
+    num_to_show = min(len(mismatches), 10)
+    for idx in mismatches[:num_to_show]:
+        print(f"  Position {idx}: {tokens1[idx]} vs {tokens2[idx]}")
+
+    if len(mismatches) > num_to_show:
+        print(f"  ... and {len(mismatches) - num_to_show} more mismatches")
+
+    return False
+
+
+def show_version_warning(current_version, model_version):
+    if not model_version:
+        return False
+
+    try:
+        from packaging.version import parse, InvalidVersion
+        try:
+            return parse(current_version) < parse(model_version)
+        except InvalidVersion:
+            return current_version != model_version
+    except ImportError:
+        return current_version != model_version
+
+def get_model_transformers_version(model_path):
+    if not model_path:
+        return None
+
+    config_path = Path(model_path) / "config.json"
+    if not config_path.is_file():
+        return None
+
+    try:
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = json.load(f)
+        return config.get("transformers_version")
+    except (IOError, json.JSONDecodeError) as e:
+        print(f"Warning: Could not read or parse {config_path}: {e}", file=sys.stderr)
+        return None
+
+def exit_with_warning(message, model_path):
+    print(message)
+
+    if model_path and transformers is not None:
+        model_transformers_version = get_model_transformers_version(model_path)
+        transformers_version = transformers.__version__
+        if show_version_warning(transformers_version, model_transformers_version):
+            warning_message = f"""
+            =====================================================================
+            Verification failure might be due to a transformers version mismatch:
+
+            Current transformers version: {transformers_version}
+            Model's required version    : {model_transformers_version}
+
+            Consider installing the version specified by the model's config:
+               pip install transformers=={model_transformers_version}
+            =====================================================================
+            """
+            print(textwrap.dedent(warning_message))
+    sys.exit(1)
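
The two helpers added above are built to round-trip: `save_output_data` writes the token ids as int32 to `*-tokens.bin`, and `compare_tokens` reads the same files back with `np.fromfile(..., dtype=np.int32)`. A minimal usage sketch under those conventions; the model name, prompt, and token values are placeholders:

```python
import numpy as np
from common import save_output_data, compare_tokens  # helpers added in this diff

logits = np.zeros(32000, dtype=np.float32)   # placeholder logits vector
tokens = [2, 9906, 1917]                     # placeholder token ids

# Writes data/pytorch-demo.bin, .txt, -prompt.txt, and -tokens.bin.
save_output_data(logits, tokens, "Hello world", "demo")

# Returns True only if a matching data/llamacpp-demo-tokens.bin exists and agrees.
ok = compare_tokens("pytorch-demo", "llamacpp-demo")
```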
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+
+import argparse
+import sys
+from common import compare_tokens  # type: ignore
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description='Compare tokens between two models',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  %(prog)s pytorch-gemma-3-270m-it llamacpp-gemma-3-270m-it-bf16
+        """
+    )
+    parser.add_argument(
+        'original',
+        help='Original model name'
+    )
+    parser.add_argument(
+        'converted',
+        help='Converted model name'
+    )
+    parser.add_argument(
+        '-s', '--suffix',
+        default='',
+        help='Type suffix (e.g., "-embeddings")'
+    )
+    parser.add_argument(
+        '-d', '--data-dir',
+        default='data',
+        help='Directory containing token files (default: data)'
+    )
+    parser.add_argument(
+        '-v', '--verbose',
+        action='store_true',
+        help='Print prompts from both models'
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_arguments()
+
+    if args.verbose:
+        from pathlib import Path
+        data_dir = Path(args.data_dir)
+
+        prompt1_file = data_dir / f"{args.original}{args.suffix}-prompt.txt"
+        prompt2_file = data_dir / f"{args.converted}{args.suffix}-prompt.txt"
+
+        if prompt1_file.exists():
+            print(f"\nOriginal model prompt ({args.original}):")
+            print(f"  {prompt1_file.read_text().strip()}")
+
+        if prompt2_file.exists():
+            print(f"\nConverted model prompt ({args.converted}):")
+            print(f"  {prompt2_file.read_text().strip()}")
+
+        print()
+
+    result = compare_tokens(
+        args.original,
+        args.converted,
+        type_suffix=args.suffix,
+        output_dir=args.data_dir
+    )
+
+    # Enable the script to be used in shell scripts so that they can check
+    # the exit code for success/failure.
+    sys.exit(0 if result else 1)
+
+
+if __name__ == "__main__":
+    main()
@@ -4,8 +4,10 @@ import numpy as np
 import argparse
 import os
 import importlib
+from pathlib import Path
 
 from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
+from common import compare_tokens, exit_with_warning  # type: ignore[import-not-found]
 
 unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
 
@@ -157,9 +159,24 @@ def main():
     else:
         prompt = args.prompt
 
+    python_emb_path = Path(args.python_embeddings)
+    cpp_emb_path = Path(args.cpp_embeddings)
+
+    # Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name")
+    python_model_name = python_emb_path.stem.replace("-embeddings", "")
+    cpp_model_name = cpp_emb_path.stem.replace("-embeddings", "")
+
     print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
     print("=" * 70)
 
+    # First verify tokens match before comparing embeddings
+    print("\n🔍 Token Comparison Check")
+    print("=" * 70)
+    data_dir = python_emb_path.parent
+    if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)):
+        exit_with_warning("\n❌ Token mismatch detected", args.model_path)
+    print()
+
     # Single prompt detailed comparison
     print(f"\nTesting with prompt: '{prompt}'")
 
@@ -219,7 +236,7 @@ def main():
     elif avg_cross_sim > 0.70:
         print("⚠️  FAIR: Models have some differences")
     else:
-        print("❌ POOR: Models are significantly different")
+        exit_with_warning("❌ POOR: Models are significantly different", args.model_path)
 
 if __name__ == "__main__":
     main()
@@ -217,8 +217,8 @@ int main(int argc, char ** argv) {
     struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
     // allocate output
-    const int n_embd = llama_model_n_embd(model);
-    std::vector<float> embeddings(n_chunks * n_embd, 0);
+    const int n_embd_out = llama_model_n_embd_out(model);
+    std::vector<float> embeddings(n_chunks * n_embd_out, 0);
     float * emb = embeddings.data();
 
     // break into batches
@@ -232,8 +232,8 @@ int main(int argc, char ** argv) {
 
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch || s >= llama_n_seq_max(ctx)) {
-            float * out = emb + p * n_embd;
-            batch_process(ctx, batch, out, s, n_embd);
+            float * out = emb + p * n_embd_out;
+            batch_process(ctx, batch, out, s, n_embd_out);
             common_batch_clear(batch);
             p += s;
             s = 0;
@@ -245,12 +245,12 @@ int main(int argc, char ** argv) {
     }
 
     // final batch
-    float * out = emb + p * n_embd;
-    batch_process(ctx, batch, out, s, n_embd);
+    float * out = emb + p * n_embd_out;
+    batch_process(ctx, batch, out, s, n_embd_out);
 
     // save embeddings to chunks
     for (int i = 0; i < n_chunks; i++) {
-        chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
+        chunks[i].embedding = std::vector<float>(emb + i * n_embd_out, emb + (i + 1) * n_embd_out);
         // clear tokens as they are no longer needed
         chunks[i].tokens.clear();
     }
@@ -266,8 +266,8 @@ int main(int argc, char ** argv) {
 
     batch_add_seq(query_batch, query_tokens, 0);
 
-    std::vector<float> query_emb(n_embd, 0);
-    batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);
+    std::vector<float> query_emb(n_embd_out, 0);
+    batch_process(ctx, query_batch, query_emb.data(), 1, n_embd_out);
 
     common_batch_clear(query_batch);
 
@@ -275,7 +275,7 @@ int main(int argc, char ** argv) {
     {
         std::vector<std::pair<int, float>> similarities;
         for (int i = 0; i < n_chunks; i++) {
-            float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
+            float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd_out);
             similarities.push_back(std::make_pair(i, sim));
         }
 
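
For reference, the ranking step in the retrieval hunks above (score each chunk embedding against the query with `common_embd_similarity_cos` over `n_embd_out` values, then order the `(index, sim)` pairs) can be sketched in vectorized Python; this mirrors, and is not, the repository's code:

```python
import numpy as np

def rank_chunks(chunk_embs: np.ndarray, query_emb: np.ndarray):
    """chunk_embs: (n_chunks, n_embd_out); query_emb: (n_embd_out,)."""
    sims = chunk_embs @ query_emb / (
        np.linalg.norm(chunk_embs, axis=1) * np.linalg.norm(query_emb)
    )
    # Highest cosine similarity first, like the sort over (index, sim) pairs.
    return sorted(enumerate(sims.tolist()), key=lambda p: p[1], reverse=True)
```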
@@ -234,6 +234,11 @@
 
 #if UINTPTR_MAX == 0xFFFFFFFF
     #define GGML_MEM_ALIGN 4
+#elif defined(__EMSCRIPTEN__)
+    // emscripten uses max_align_t == 8, so we need GGML_MEM_ALIGN == 8 for 64-bit wasm.
+    // (for 32-bit wasm, the first conditional is true and GGML_MEM_ALIGN stays 4.)
+    // ref: https://github.com/ggml-org/llama.cpp/pull/18628
+    #define GGML_MEM_ALIGN 8
 #else
     #define GGML_MEM_ALIGN 16
 #endif
@@ -144,7 +144,7 @@ extern "C" {
         // device description: short informative description of the device, could be the model name
         const char * (*get_description)(ggml_backend_dev_t dev);
 
-        // device memory in bytes
+        // device memory in bytes: 0 bytes to indicate no memory to report
         void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
 
         // device type
@@ -32,14 +32,12 @@ if (BLAS_FOUND)
             pkg_check_modules(DepBLAS openblas)
         endif()
     elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
-        add_compile_definitions(GGML_BLAS_USE_BLIS)
         pkg_check_modules(DepBLAS blis)
     elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
         pkg_check_modules(DepBLAS blas-atlas)
     elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
         pkg_check_modules(DepBLAS flexiblas_api)
     elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
-        add_compile_definitions(GGML_BLAS_USE_MKL)
         # all Intel* libraries share the same include path
         pkg_check_modules(DepBLAS mkl-sdl)
     elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")

@@ -74,10 +72,26 @@ if (BLAS_FOUND)
     target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})

-    if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+    if ("${GGML_BLAS_VENDOR}" STREQUAL "")
+        message(WARNING "GGML_BLAS_VENDOR is not set; some methods may not link properly.")
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "Intel" OR ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND "${GGML_BLAS_VENDOR}" MATCHES "Generic"))
         add_compile_definitions(GGML_BLAS_USE_MKL)
     endif()

+    if ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS")
+        add_compile_definitions(GGML_BLAS_USE_OPENBLAS)
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "FLAME" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL_mt")
+        add_compile_definitions(GGML_BLAS_USE_BLIS)
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "NVPL")
+        add_compile_definitions(GGML_BLAS_USE_NVPL)
+    endif()
+
     target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
     target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
 else()

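With the vendor-specific compile definitions now set in one block keyed off GGML_BLAS_VENDOR, exactly one of GGML_BLAS_USE_MKL / GGML_BLAS_USE_OPENBLAS / GGML_BLAS_USE_BLIS / GGML_BLAS_USE_NVPL is active per build, which is what the #elif chain in the next hunk relies on. A typical configure invocation, assuming an OpenBLAS install visible to pkg-config:

    cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
    cmake --build build --config Release

Leaving GGML_BLAS_VENDOR unset now produces the warning above instead of silently building without any vendor define.
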
@@ -115,15 +115,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 #endif
 }

-#if defined(OPENBLAS_VERSION)
+#if defined(GGML_BLAS_USE_OPENBLAS)
     openblas_set_num_threads(ctx->n_threads);
-#endif
-
-#if defined(GGML_BLAS_USE_BLIS)
+#elif defined(GGML_BLAS_USE_BLIS)
     bli_thread_set_num_threads(ctx->n_threads);
-#endif
-
-#if defined(GGML_BLAS_USE_NVPL)
+#elif defined(GGML_BLAS_USE_NVPL)
     nvpl_blas_set_num_threads(ctx->n_threads);
 #endif

@@ -288,7 +284,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
         /* .context = */ ctx,
     };

-#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+#if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP)
     if (openblas_get_parallel() != OPENBLAS_OPENMP) {
         GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
     }

@@ -329,7 +325,7 @@ static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t
     return "BLIS";
 #elif defined(GGML_BLAS_USE_NVPL)
     return "NVPL";
-#elif defined(OPENBLAS_VERSION)
+#elif defined(GGML_BLAS_USE_OPENBLAS)
     return "OpenBLAS";
 #else
     return "BLAS";

@@ -1963,7 +1963,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context & ctx, ggml_tensor *
     acl_tensor_ptr acl_weight_tensor;

     // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
+    static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
     if (weight_to_nz && is_matmul_weight(weight)) {
         acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
     } else {

@@ -103,7 +103,7 @@ const ggml_cann_device_info & ggml_cann_info();
 void ggml_cann_set_device(int32_t device);
 int32_t ggml_cann_get_device();

-std::optional<std::string> get_env(const std::string & name);
+std::optional<std::string> get_env_as_lowercase(const std::string & name);
 bool parse_bool(const std::string & value);
 int parse_integer(const std::string & value);

@@ -105,10 +105,10 @@ int32_t ggml_cann_get_device() {
 }

 /**
- * @brief Get the value of the specified environment variable (name).
+ * @brief Get the value of the specified environment variable (name) as lowercase.
  * if not empty, return a std::string object
  */
-std::optional<std::string> get_env(const std::string & name) {
+std::optional<std::string> get_env_as_lowercase(const std::string & name) {
     const char * val = std::getenv(name.c_str());
     if (!val) {
         return std::nullopt;

@@ -122,7 +122,7 @@ std::optional<std::string> get_env(const std::string & name) {
  * @brief Verify whether the environment variable is a valid value.
  */
 bool parse_bool(const std::string & value) {
-    std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
+    static const std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
     return valid_values.find(value) != valid_values.end();
 }

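The rename from get_env to get_env_as_lowercase makes the helper's contract explicit: the value is case-folded once at read time, so GGML_CANN_WEIGHT_NZ=ON and =on are equivalent by the time parse_bool compares against the accepted set, and the now-static set is built once rather than per call. A minimal self-contained sketch of the pair — the std::tolower loop is my assumption of how the lowercasing is done, not a copy of the tree's definition:

    #include <cctype>
    #include <cstdlib>
    #include <optional>
    #include <string>
    #include <unordered_set>

    std::optional<std::string> get_env_as_lowercase(const std::string & name) {
        const char * val = std::getenv(name.c_str());
        if (!val) {
            return std::nullopt;
        }
        std::string res(val);
        for (char & c : res) {
            c = (char) std::tolower((unsigned char) c); // normalize case once, at read time
        }
        return res; // e.g. "ON" -> "on"
    }

    bool parse_bool(const std::string & value) {
        // constructed once, shared across calls
        static const std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
        return valid_values.count(value) != 0;
    }
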
@@ -259,7 +259,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
      * @param device The device ID to associate with this buffer pool.
      */
     explicit ggml_cann_pool_buf_prio(int device) : device(device) {
-        disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
+        disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
     }

     /**

@@ -452,7 +452,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
      * @param device The device ID to associate with this buffer pool.
      */
     explicit ggml_cann_pool_buf(int device) : device(device) {
-        disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
+        disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
     }

     /**

@@ -764,7 +764,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
  * @return A unique pointer to the created CANN pool.
  */
 std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(int device) {
-    std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
+    std::string mem_pool_type = get_env_as_lowercase("GGML_CANN_MEM_POOL").value_or("");

     if (mem_pool_type == "prio") {
         GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);

@@ -1217,7 +1217,7 @@ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
     // Why aclrtSynchronizeDevice?

     // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
+    static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
     if (!need_transform(tensor->type)) {
         ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
         if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {

@@ -1442,7 +1442,7 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_t
     int64_t ne0 = tensor->ne[0];

     // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
+    static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));

     // last line must bigger than 32, because every single op deal at
     // least 32 bytes.

@@ -2136,7 +2136,7 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
 #endif // USE_ACL_GRAPH
     // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
     // With the use of CANN graphs, the execution will be performed by the graph launch.
-    static bool opt_fusion = parse_bool(get_env("GGML_CANN_OPERATOR_FUSION").value_or(""));
+    static bool opt_fusion = parse_bool(get_env_as_lowercase("GGML_CANN_OPERATOR_FUSION").value_or(""));

     if (!use_cann_graph || cann_graph_capture_required) {
         for (int i = 0; i < cgraph->n_nodes; i++) {

@@ -2201,7 +2201,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
 #ifdef USE_ACL_GRAPH
     bool use_cann_graph = true;

-    static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
+    static bool prefill_use_graph = parse_bool(get_env_as_lowercase("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
     if (!prefill_use_graph) {
         // Do not use acl_graph for prefill.
         for (int i = 0; i < cgraph->n_nodes; i++) {

@@ -2541,27 +2541,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
 }

-/**
- * @brief Determines if a tensor operation should be offloaded to the CANN
- * backend.
- *
- * This function checks if a given tensor operation should be offloaded to the
- * CANN backend based on the operation type and the size of the tensor. It
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
- *
- * @param backend Pointer to the CANN backend.
- * @param op Pointer to the tensor operation to check.
- * @return bool Returns true if the operation should be offloaded, otherwise
- * false.
- */
-static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    GGML_UNUSED(dev);
-
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
-}
-
 /**
  * @brief Records an event on the CANN backend stream.
  *

@@ -2637,6 +2616,7 @@ struct ggml_backend_cann_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };

 static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {

@@ -2713,6 +2693,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
     return ggml_backend_cann_host_buffer_type();
 }

+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
+ * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
 /**
  * @brief Creates a new event for the CANN backend device.
  *

@@ -2829,12 +2829,14 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
     if (!initialized) {
         aclInit(nullptr);
         ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;

         for (int i = 0; i < ggml_cann_info().device_count; i++) {
             ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
             dev_ctx->description = aclrtGetSocName();
             dev_ctx->device = i;
             dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+            dev_ctx->op_offload_min_batch_size = min_batch_size;
             ggml_cann_set_device(i);
             ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
                                                               /* .reg   = */ &reg,

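Both the CANN registration above and the CUDA registration later in this diff read the new GGML_OP_OFFLOAD_MIN_BATCH variable once and stash it per device, replacing the previously hard-coded minimum batch size of 32. A sketch of the read-with-default pattern in isolation (read_env_int is a name made up for illustration, not a helper from the tree):

    #include <cstdio>
    #include <cstdlib>

    // mirrors the getenv()/atoi() pattern used at backend registration
    int read_env_int(const char * name, int def) {
        const char * v = std::getenv(name);
        return v ? std::atoi(v) : def;
    }

    int main() {
        // run as: GGML_OP_OFFLOAD_MIN_BATCH=64 ./demo
        std::printf("min batch for offload: %d\n", read_env_int("GGML_OP_OFFLOAD_MIN_BATCH", 32));
    }
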
@@ -47,7 +47,10 @@ if (CUDAToolkit_FOUND)
             # check Modules/Internal/CMakeCUDAArchitecturesValidate.cmake in the CMake git repository instead.
             # However, the architectures 120a-real and 121a-real should work with basically any CMake version and
             # until the release of e.g. Rubin there is no benefit to shipping virtual architectures for Blackwell.
-            list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real 121a-real)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real)
+            endif()
+            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.9")
+                list(APPEND CMAKE_CUDA_ARCHITECTURES 121a-real)
             endif()
         endif()
     endif()

@@ -1036,7 +1036,7 @@ struct ggml_tensor_extra_gpu {
 #define USE_CUDA_GRAPH
 #endif

-struct ggml_graph_node_properties {
+struct ggml_cuda_graph_node_properties {
     void * node_address;
     ggml_op node_op;
     int64_t ne[GGML_MAX_DIMS];

@@ -1061,10 +1061,25 @@ struct ggml_cuda_graph {
     std::vector<cudaGraphNode_t> nodes;
     bool disable_due_to_gpu_arch = false;
     bool disable_due_to_too_many_updates = false;
-    bool disable_due_to_failed_graph_capture = false;
     int number_consecutive_updates = 0;
-    bool cuda_graphs_enabled = false;
-    std::vector<ggml_graph_node_properties> ggml_graph_properties;
+    std::vector<ggml_cuda_graph_node_properties> props;
+
+    void record_update(bool use_graph, bool update_required) {
+        if (use_graph && update_required) {
+            number_consecutive_updates++;
+        } else {
+            number_consecutive_updates = 0;
+        }
+        if (number_consecutive_updates >= 4) {
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            disable_due_to_too_many_updates = true;
+        }
+    }
+
+    bool is_enabled() const {
+        static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
+        return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
+    }
 #endif
 };

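The per-flag bookkeeping that used to be spread through graph_compute is now folded into two methods on ggml_cuda_graph: record_update() counts consecutive graph updates and trips disable_due_to_too_many_updates at four, and is_enabled() combines that flag with the architecture check and the GGML_CUDA_DISABLE_GRAPHS environment override. A minimal standalone sketch of the same back-off heuristic (struct and names stubbed down for illustration; the threshold of 4 matches the diff):

    #include <cstdio>

    struct graph_state {
        int  consecutive_updates = 0;
        bool disabled            = false;

        void record_update(bool use_graph, bool update_required) {
            consecutive_updates = (use_graph && update_required) ? consecutive_updates + 1 : 0;
            if (consecutive_updates >= 4) {
                disabled = true; // topology churns on every launch: re-capturing costs more than graphs save
            }
        }
    };

    int main() {
        graph_state g;
        for (int i = 0; i < 6 && !g.disabled; i++) {
            g.record_update(/*use_graph=*/true, /*update_required=*/true);
        }
        std::printf("disabled after repeated updates: %s\n", g.disabled ? "yes" : "no");
    }
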
@@ -11,10 +11,12 @@
 #define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.

 // log(2) = 0.6931, by adding this to the KQ maximum used for the softmax the numerical range representable
-// by the VKQ accumulators is effectively being shifted up by a factor of 8.
+// by the VKQ accumulators is effectively being shifted up by a factor of 2.
 // This reduces issues with numerical overflow but also causes larger values to be flushed to zero.
 // However, as the output from FlashAttention will usually be used as an input for a matrix multiplication this should be negligible.
-#define FATTN_KQ_MAX_OFFSET 0.6931f
+// Still, the value range should be shifted as much as necessary but as little as possible.
+// The macro on the following line shifts it by a factor of 2**3=8, as was needed to fix https://github.com/ggml-org/llama.cpp/issues/18606 .
+#define FATTN_KQ_MAX_OFFSET (3.0f*0.6931f)

 typedef void (* fattn_kernel_t)(
     const char * __restrict__ Q,

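The arithmetic behind the new constant: softmax terms are evaluated as expf(x - (kq_max + FATTN_KQ_MAX_OFFSET)), and since exp(x - m - k*ln2) = exp(x - m) / 2^k, every multiple of ln 2 ≈ 0.6931 added to the maximum scales each accumulator down by a factor of 2. The old offset of a single ln 2 shifted the range by 2x; 3.0f*0.6931f shifts it by 2^3 = 8x. A quick numerical check of the identity:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float x = 3.5f, m = 1.25f, ln2 = 0.6931f;
        const float shifted  = std::exp(x - (m + 3.0f*ln2));
        const float expected = std::exp(x - m) / 8.0f;
        // agree to ~4 decimals; 0.6931 is log(2) rounded, so the match is not bit-exact
        std::printf("%.6f vs %.6f\n", shifted, expected);
    }
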
@@ -2853,9 +2853,9 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 }

 #ifdef USE_CUDA_GRAPH
-static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
-    bool use_cuda_graph) {
+static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
+    bool use_cuda_graph = true;
+
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph

     const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";

@@ -2915,41 +2915,41 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
     return use_cuda_graph;
 }

-static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
-    graph_node_properties->node_address = node->data;
-    graph_node_properties->node_op = node->op;
+static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
+    props->node_address = node->data;
+    props->node_op = node->op;
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        graph_node_properties->ne[i] = node->ne[i];
-        graph_node_properties->nb[i] = node->nb[i];
+        props->ne[i] = node->ne[i];
+        props->nb[i] = node->nb[i];
     }
     for (int i = 0; i < GGML_MAX_SRC; i++) {
-        graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
+        props->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
     }
-    memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
+    memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS);
 }

-static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
-    if (node->data != graph_node_properties->node_address &&
+static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) {
+    if (node->data != props->node_address &&
         node->op != GGML_OP_VIEW) {
         return false;
     }

-    if (node->op != graph_node_properties->node_op) {
+    if (node->op != props->node_op) {
         return false;
     }

     for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (node->ne[i] != graph_node_properties->ne[i]) {
+        if (node->ne[i] != props->ne[i]) {
             return false;
         }
-        if (node->nb[i] != graph_node_properties->nb[i]) {
+        if (node->nb[i] != props->nb[i]) {
             return false;
         }
     }

     for (int i = 0; i < GGML_MAX_SRC; i++) {
         if (node->src[i] &&
-            node->src[i]->data != graph_node_properties->src_address[i] &&
+            node->src[i]->data != props->src_address[i] &&
             node->op != GGML_OP_VIEW
         ) {
             return false;

@@ -2957,44 +2957,55 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
     }

     if ((node->op == GGML_OP_SCALE || node->op == GGML_OP_GLU) &&
-        memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
+        memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
         return false;
     }

     return true;
 }

-static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
+static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {

-    bool cuda_graph_update_required = false;
+    bool res = false;

     if (cuda_ctx->cuda_graph->instance == nullptr) {
-        cuda_graph_update_required = true;
+        res = true;
     }

     // Check if the graph size has changed
-    if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
-        cuda_graph_update_required = true;
-        cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+    if (cuda_ctx->cuda_graph->props.size() != (size_t)cgraph->n_nodes + cgraph->n_leafs) {
+        res = true;
+        cuda_ctx->cuda_graph->props.resize(cgraph->n_nodes + cgraph->n_leafs);
     }

     // Loop over nodes in GGML graph to determine if CUDA graph update is required
     // and store properties to allow this comparison for the next token
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        bool has_matching_properties = true;
-        if (!cuda_graph_update_required) {
-            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+        bool props_match = true;
+        if (!res) {
+            props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &cuda_ctx->cuda_graph->props[i]);
         }
-        if (!has_matching_properties) {
-            cuda_graph_update_required = true;
+        if (!props_match) {
+            res = true;
         }
-        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+        ggml_cuda_graph_node_set_properties(&cuda_ctx->cuda_graph->props[i], cgraph->nodes[i]);
     }

-    return cuda_graph_update_required;
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        bool props_match= true;
+        if (!res) {
+            props_match = ggml_cuda_graph_node_properties_match(cgraph->leafs[i], &cuda_ctx->cuda_graph->props[cgraph->n_nodes + i]);
+        }
+        if (!props_match) {
+            res = true;
+        }
+        ggml_cuda_graph_node_set_properties(&cuda_ctx->cuda_graph->props[cgraph->n_nodes + i], cgraph->leafs[i]);
+    }
+
+    return res;
 }

-static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
+static void ggml_cuda_graph_update_executable(ggml_backend_cuda_context * cuda_ctx) {

 #if CUDART_VERSION >= 12000
     cudaGraphExecUpdateResultInfo result_info;

@@ -3225,10 +3236,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     return false;
 }

-static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, const bool use_cuda_graph, const bool cuda_graph_update_required) {
+    bool graph_evaluated_or_captured = false;

     // flag used to determine whether it is an integrated_gpu
     const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;

     ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
     bool is_concurrent_event_active = false;

@@ -3698,7 +3710,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
         }
         if (cuda_graph_update_required) { // Update graph executable
-            update_cuda_graph_executable(cuda_ctx);
+            ggml_cuda_graph_update_executable(cuda_ctx);
         }
         // Launch graph
         CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));

@@ -3708,43 +3720,25 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     }
 }

-static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
+static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx) {

 #ifdef USE_CUDA_GRAPH
-    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
-
-    // Objects required for CUDA Graph
     if (cuda_ctx->cuda_graph == nullptr) {
         cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
     }

-    bool use_cuda_graph = true;
-
     if (cuda_ctx->cuda_graph->graph == nullptr) {
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
-#ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
-#endif
         }
     }

-    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
-    // or previous graph capture failure.
-    // Also disable for multi-gpu for now. TO DO investigate
-    if (disable_cuda_graphs_due_to_env
-        || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
-        || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
-        || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
-        use_cuda_graph = false;
-    }
-
-    cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph;
+    return cuda_ctx->cuda_graph->is_enabled();
 #else
-    bool use_cuda_graph = false;
+    return false;
 #endif // USE_CUDA_GRAPH
-
-    return use_cuda_graph;
 }

 static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {

@@ -3755,30 +3749,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     bool use_cuda_graph = false;
     bool cuda_graph_update_required = false;

-    // graph_optimize calls set_cuda_graph_enabled, in-case it not called (i.e. graph_compute is directly called)
-    // we call it here instead.
 #ifdef USE_CUDA_GRAPH
-    use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+    use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx);

-    if (use_cuda_graph) {
-        cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
-
-        use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
-
-        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (use_cuda_graph && cuda_graph_update_required) {
-            cuda_ctx->cuda_graph->number_consecutive_updates++;
-        } else {
-            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
-        }
-
-        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
-            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
-            cuda_ctx->cuda_graph->cuda_graphs_enabled = false;
-#ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
-#endif
-        }
+    if (cuda_ctx->cuda_graph->is_enabled()) {
+        cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
+        use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
+
+        cuda_ctx->cuda_graph->record_update(use_cuda_graph, cuda_graph_update_required);
     }
 #endif // USE_CUDA_GRAPH

@@ -3792,9 +3770,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }

-    bool graph_evaluated_or_captured = false;
-
-    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+    ggml_cuda_graph_evaluate_and_capture(cuda_ctx, cgraph, use_cuda_graph, cuda_graph_update_required);

     return GGML_STATUS_SUCCESS;
 }

@@ -3827,7 +3803,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;

-    const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
+    const bool use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx);

     static bool enable_graph_optimization = [] {
         const char * env = getenv("GGML_CUDA_GRAPH_OPT");

@@ -4146,6 +4122,7 @@ struct ggml_backend_cuda_device_context {
     std::string name;
     std::string description;
    std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {

@@ -4700,11 +4677,9 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }

 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;

-    return get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
+    return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
 }

 static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {

@@ -4872,6 +4847,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
             ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;

             for (int i = 0; i < ggml_cuda_info().device_count; i++) {
                 ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;

@@ -4885,6 +4861,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                 char pci_bus_id[16] = {};
                 snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
                 dev_ctx->pci_bus_id = pci_bus_id;
+                dev_ctx->op_offload_min_batch_size = min_batch_size;

                 ggml_backend_dev_t dev = new ggml_backend_device {
                     /* .iface = */ ggml_backend_cuda_device_interface,

@@ -34,13 +34,11 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         // CUDA_GRAPHS_DISABLED
         ((ncols > 65536) &&
          ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
-          ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
-          ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
+          ctx.cuda_graph->is_enabled())) ||
         // CUDA_GRAPHS ENABLED
         ((ncols > 32768) &&
          !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
-           ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
-           ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
+           ctx.cuda_graph->is_enabled()))) {
 #else
         (ncols > 65536)) {
 #endif // USE_CUDA_GRAPH

@@ -190,7 +190,7 @@ void ggml_cuda_mul_mat_q(
     {
         const int64_t s11 = src1->nb[1] / ts_src1;
         const int64_t s12 = src1->nb[2] / ts_src1;
-        const int64_t s13 = src1->nb[2] / ts_src1;
+        const int64_t s13 = src1->nb[3] / ts_src1;

         if (use_native_mxfp4) {
             quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,

@@ -333,6 +333,31 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }

     if (amd_wmma_available(cc)) {
+        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+            // High expert counts are almost always better on MMQ due to
+            // the synchronization overhead in the cuBLAS/hipBLAS path:
+            // https://github.com/ggml-org/llama.cpp/pull/18202
+            if (n_experts >= 64) {
+                return true;
+            }
+
+            // For some quantization types MMQ can have lower peak TOPS than hipBLAS
+            // so it's only faster for sufficiently small batch sizes:
+            switch (type) {
+                case GGML_TYPE_Q2_K:
+                    return ne11 <= 128;
+                case GGML_TYPE_Q6_K:
+                    return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 128 : 256);
+                case GGML_TYPE_IQ2_XS:
+                case GGML_TYPE_IQ2_S:
+                    return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128;
+                default:
+                    return true;
+            }
+        }
+
+        // For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
         return true;
     }

@@ -114,7 +114,7 @@ __global__ void __launch_bounds__(splitD, 1)
 #endif // __clang__

 // assumes as many threads as d_state
-template <int splitH, int d_state>
+template <int c_factor, int d_state>
 __global__ void __launch_bounds__(d_state, 1)
     ssm_scan_f32_group(
         const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,

@@ -125,20 +125,25 @@ __global__ void __launch_bounds__(d_state, 1)
         const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3,
         const int64_t s_off, const int64_t n_head, const int64_t d_head, const int64_t n_group, const int64_t n_tok) {

-    const int head_idx = (blockIdx.x * splitH) / d_head;
-    const int head_off = ((blockIdx.x * splitH) % d_head) * sizeof(float);
-    const int seq_idx  = blockIdx.y;
+    const int warp     = threadIdx.x / WARP_SIZE;
+    const int lane     = threadIdx.x % WARP_SIZE;
+    const int warp_idx = blockIdx.x * c_factor + warp;
+
+    const int head_idx = warp_idx / d_head;
+    const int head_off = (warp_idx % d_head) * sizeof(float);
+    const int seq_idx  = blockIdx.y;

     const int group_off = (head_idx / (n_head / n_group)) * d_state * sizeof(float);

-    const float * s0_block = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
-    const float * x_block  = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + blockIdx.x * splitH * sizeof(float));
-    const float * dt_block = (const float *) ((const char *) src2 + (seq_idx * src2_nb2) + head_idx * sizeof(float));
-    const float * A_block  = (const float *) ((const char *) src3 + head_idx * src3_nb1);
-    const float * B_block  = (const float *) ((const char *) src4 + (seq_idx * src4_nb3) + (group_off));
-    const float * C_block  = (const float *) ((const char *) src5 + (seq_idx * src5_nb3) + (group_off));
-    float * y_block = dst + (seq_idx * n_tok * n_head * d_head) + blockIdx.x * splitH;
-    float * s_block = (float *) ((char *) dst + s_off + seq_idx * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
+    // TODO: refactor strides to be in elements/floats instead of bytes to be cleaner and consistent with the rest of the codebase
+    const float * s0_warp = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
+    const float * x_warp  = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + (warp_idx * sizeof(float)));
+    const float * dt_warp = (const float *) ((const char *) src2 + (seq_idx * src2_nb2) + head_idx * sizeof(float));
+    const float * A_warp  = (const float *) ((const char *) src3 + head_idx * src3_nb1);
+    const float * B_warp  = (const float *) ((const char *) src4 + (seq_idx * src4_nb3) + (group_off));
+    const float * C_warp  = (const float *) ((const char *) src5 + (seq_idx * src5_nb3) + (group_off));
+    float * y_warp = dst + (seq_idx * n_tok * n_head * d_head) + warp_idx;
+    float * s_warp = (float *) ((char *) dst + s_off + seq_idx * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);

     // strides across n_seq_tokens
     const int stride_x = src1_nb2 / sizeof(float);

@@ -147,80 +152,42 @@ __global__ void __launch_bounds__(d_state, 1)
     const int stride_C = src5_nb2 / sizeof(float);
     const int stride_y = n_head * d_head;

-    float state[splitH];
-    // for the parallel accumulation
-    __shared__ float stateC[splitH * d_state];
+    float state[c_factor];
+    float state_sum = 0.0f;

 #pragma unroll
-    for (int j = 0; j < splitH; j++) {
-        state[j] = s0_block[j * d_state + threadIdx.x];
+    for (int j = 0; j < c_factor; j++) {
+        state[j] = s0_warp[WARP_SIZE * j + lane];
     }

     for (int64_t i = 0; i < n_tok; i++) {
-        // TODO: only calculate dA and dt_soft_plus once per head instead of every splitH head elements
-        // TODO: only calculate B and C once per head group
-        // NOTE: dt_soft_plus, dA and x_dt have the same value across threads here.
-        float dt_soft_plus = dt_block[i * stride_dt];
-        if (dt_soft_plus <= 20.0f) {
-            dt_soft_plus = log1pf(expf(dt_soft_plus));
-        }
-        const float dA = expf(dt_soft_plus * A_block[0]);
-        const float B  = B_block[i * stride_B + threadIdx.x];
-        const float C  = C_block[i * stride_C + threadIdx.x];
-
-        // across d_head
+        // NOTE: dt_soft_plus, dA and x_dt have the same value for a warp here.
+        // Recalculation is intentional; sharing via shuffles/smem proved slower due to sync overhead.
+        const float dt_soft_plus = (dt_warp[i * stride_dt] <= 20.0f ? log1pf(expf(dt_warp[i * stride_dt])) : dt_warp[i * stride_dt]);
+
+        state_sum = 0.0f;
+        const float dA   = expf(dt_soft_plus * A_warp[0]);
+        const float x_dt = x_warp[i * stride_x] * dt_soft_plus;
 #pragma unroll
-        for (int j = 0; j < splitH; j++) {
-            const float x_dt = x_block[i * stride_x + j] * dt_soft_plus;
-            state[j] = (state[j] * dA) + (B * x_dt);
-            stateC[j * d_state + threadIdx.x] = state[j] * C;
+        for (int j = 0; j < c_factor; j++) {
+            const float B_val = B_warp[i * stride_B + WARP_SIZE * j + lane];
+            const float C_val = C_warp[i * stride_C + WARP_SIZE * j + lane];
+            state[j] = (state[j] * dA) + (B_val * x_dt);
+            state_sum += state[j] * C_val;
         }

-        __syncthreads();
-
-        // parallel accumulation for stateC
-        // TODO: simplify
-        {
-            static_assert((d_state & -d_state) == d_state, "the state size has to be a power of 2");
-            static_assert((splitH & -splitH) == splitH, "splitH has to be a power of 2");
-
-            // reduce until w matches the warp size
-            // TODO: does this work even when the physical warp size is 64?
-#pragma unroll
-            for (int w = d_state; w > WARP_SIZE; w >>= 1) {
-                // (assuming there are d_state threads)
-#pragma unroll
-                for (int j = 0; j < ((w >> 1) * splitH + d_state - 1) / d_state; j++) {
-                    // TODO: check for bank conflicts
-                    const int k = (threadIdx.x % (w >> 1)) + (d_state * (threadIdx.x / (w >> 1))) + j * d_state * (d_state / (w >> 1));
-                    stateC[k] += stateC[k + (w >> 1)];
-                }
-                __syncthreads();
-            }
-
-            static_assert(splitH >= d_state / WARP_SIZE);
-
-#pragma unroll
-            for (int j = 0; j < splitH / (d_state / WARP_SIZE); j++) {
-                float y = stateC[(threadIdx.x % WARP_SIZE) + d_state * (threadIdx.x / WARP_SIZE) + j * d_state * (d_state / WARP_SIZE)];
-                y = warp_reduce_sum(y);
-
-                // store the above accumulations
-                if (threadIdx.x % WARP_SIZE == 0) {
-                    const int k = threadIdx.x / WARP_SIZE + j * (d_state / WARP_SIZE);
-                    y_block[i * stride_y + k] = y;
-                }
-            }
+        // parallel accumulation for output
+        state_sum = warp_reduce_sum(state_sum);
+
+        if (lane == 0) {
+            y_warp[i * stride_y] = state_sum;
         }
     }

     // write back the state
 #pragma unroll
-    for (int j = 0; j < splitH; j++) {
-        s_block[j * d_state + threadIdx.x] = state[j];
+    for (int j = 0; j < c_factor; j++) {
+        s_warp[WARP_SIZE * j + lane] = state[j];
     }
 }

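The rewrite drops the shared-memory tree reduction entirely: each warp now owns a single head element, walks its d_state-wide slice in registers, and collapses the per-token dot product with a single warp_reduce_sum, so no __shared__ buffer or __syncthreads() is needed. warp_reduce_sum is defined elsewhere in the CUDA backend's common header; a typical XOR-shuffle reduction of this kind looks like the following sketch (not the tree's exact definition):

    // Butterfly (XOR) reduction across a 32-lane warp: after the loop every
    // lane holds the sum of the values all lanes started with.
    static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
    #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, 32);
        }
        return x;
    }

    // usage inside a kernel, one partial value per lane:
    //   float total = warp_reduce_sum_sketch(partial);
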
@@ -231,27 +198,24 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
     const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim,
     const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq,
     cudaStream_t stream) {
-    const int threads = 128;
     // NOTE: if you change conditions here, be sure to update the corresponding supports_op condition!
     if (src3_nb1 == sizeof(float)) {
         // Mamba-2
         if (d_state == 128) {
-            GGML_ASSERT(d_state % threads == 0);
-            // NOTE: can be any power of two between 4 and 64
-            const int splitH = 16;
-            GGML_ASSERT(head_dim % splitH == 0);
-            const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1);
-            ssm_scan_f32_group<16, 128><<<blocks, threads, 0, stream>>>(
+            constexpr int threads = 128;
+            constexpr int num_warps = threads/WARP_SIZE;
+            const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
+            ssm_scan_f32_group<128/WARP_SIZE, 128><<<blocks, threads, 0, stream>>>(
                 src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                 src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
         } else if (d_state == 256) { // Falcon-H1
-            const int threads = 256;
-            // NOTE: can be any power of two between 8 and 64
-            const int splitH = 16;
-            GGML_ASSERT(head_dim % splitH == 0);
-            const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1);
-            ssm_scan_f32_group<16, 256><<<blocks, threads, 0, stream>>>(
+            constexpr int threads = 256;
+            constexpr int num_warps = threads/WARP_SIZE;
+            const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
+            ssm_scan_f32_group<256/WARP_SIZE, 256><<<blocks, threads, 0, stream>>>(
                 src0, src1, src2, src3, src4, src5, src6, dst,
                 src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                 src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);

@@ -260,6 +224,7 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
         }
     } else {
         // Mamba-1
+        constexpr int threads = 128;
         GGML_ASSERT(n_head % threads == 0);
         GGML_ASSERT(head_dim == 1);
         GGML_ASSERT(n_group == 1);

@@ -1773,6 +1773,37 @@ static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_
     return true;
 }

+static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * src1 = op->src[1];
+    const struct ggml_tensor * src2 = op->src[2];
+    const struct ggml_tensor * src3 = op->src[3];
+    const struct ggml_tensor * src4 = op->src[4];
+    const struct ggml_tensor * dst  = op;
+
+    // Check for F16 support only as requested
+    if ((src0->type != GGML_TYPE_F16 && src0->type != GGML_TYPE_F32) || src1->type != GGML_TYPE_F16 || src2->type != GGML_TYPE_F16) {
+        return false;
+    }
+
+    if (src3 && src3->type != GGML_TYPE_F16) { // mask
+        return false;
+    }
+
+    if (src4 && src4->type != GGML_TYPE_F32) { // sinks
+        return false;
+    }
+
+    // For now we support F32 or F16 output as htp backend often converts output on the fly if needed,
+    // but the op implementation writes to F16 or F32.
+    // Let's assume dst can be F32 or F16.
+    if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
+        return false;
+    }
+
+    return opt_experimental;
+}
+
 static bool hex_supported_src0_type(ggml_type t) {
     return t == GGML_TYPE_F32;
 }

@@ -1815,12 +1846,11 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];

-    if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
+    if (dst->type != GGML_TYPE_F32) {
         return false;
     }

-    // TODO: add support for non-cont tensors
-    if (!ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
+    if (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) {
         return false;
     }

@@ -1836,7 +1866,6 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
         return false; // typically the lm-head which would be too large for VTCM
     }

-    // if ((src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3])) return false;
     if ((src1->ne[2] != 1 || src1->ne[3] != 1)) {
         return false;
     }

@@ -1885,21 +1914,10 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
             }
             break;

-        case GGML_TYPE_F16:
-            if (!opt_experimental) {
-                return false;
-            }
-            break;
-
         default:
             return false;
     }

-    // TODO: add support for non-cont tensors
-    if (!ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
-        return false;
-    }
-
     return true;
 }

@@ -2060,6 +2078,46 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
     return true;
 }

+static bool ggml_hexagon_supported_set_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0]; // values
+    const struct ggml_tensor * src1 = op->src[1]; // indices
+    const struct ggml_tensor * dst = op;
+
+    if (src0->type != GGML_TYPE_F32) {
+        return false;
+    }
+
+    if (src1->type != GGML_TYPE_I32 && src1->type != GGML_TYPE_I64) {
+        return false;
+    }
+
+    if (dst->type != GGML_TYPE_F16) {
+        return false;
+    }
+
+    return true;
+}
+
+static bool ggml_hexagon_supported_get_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0]; // values
+    const struct ggml_tensor * src1 = op->src[1]; // indices
+    const struct ggml_tensor * dst = op;
+
+    if (src0->type != GGML_TYPE_F32) {
+        return false;
+    }
+
+    if (src1->type != GGML_TYPE_I32 && src1->type != GGML_TYPE_I64) {
+        return false;
+    }
+
+    if (dst->type != GGML_TYPE_F32) {
+        return false;
+    }
+
+    return true;
+}
+
 static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
     const int32_t * op_params = &op->op_params[0];

@@ -2154,6 +2212,11 @@ static size_t htp_req_buff_init(htp_tensor *h, dspqueue_buffer * d, const ggml_t
     d->offset = (uint8_t *) t->data - buf->base;
     d->size = ggml_nbytes(t);

+    if (!d->size) {
+        // Some requests contain srcs where ggml_nbytes() returns 0 but the rest of the op is non-empty
+        d->size = 64;
+    }
+
     switch (type) {
         case DSPQBUF_TYPE_DSP_WRITE_CPU_READ:
             // Flush CPU
@@ -2239,6 +2302,17 @@ static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bu
     return n_bufs;
 }

+static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
+    req->op = HTP_OP_GET_ROWS;
+
+    size_t n_bufs = 0;
+    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
+
+    return n_bufs;
+}
+
 template <bool _is_src0_constant>
 static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
     switch (t->op) {
@@ -2266,6 +2340,17 @@ static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer *
     return n_bufs;
 }

+static inline size_t init_set_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
+    req->op = HTP_OP_SET_ROWS;
+
+    size_t n_bufs = 0;
+    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
+
+    return n_bufs;
+}
+
 static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
     memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));

@@ -2277,6 +2362,11 @@ static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * buf
             supported = true;
             break;

+        case GGML_OP_SCALE:
+            req->op = HTP_OP_SCALE;
+            supported = true;
+            break;
+
         case GGML_OP_UNARY:
             if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) {
                 req->op = HTP_OP_UNARY_SILU;
@@ -2331,6 +2421,21 @@ static inline size_t init_rope_req(htp_general_req * req, dspqueue_buffer * bufs
     return n_bufs;
 }

+static inline size_t init_flash_attn_ext_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
+    memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
+    req->op = HTP_OP_FLASH_ATTN_EXT;
+
+    size_t n_bufs = 0;
+    n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src3, &bufs[n_bufs], t->src[3], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->src4, &bufs[n_bufs], t->src[4], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
+    n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
+
+    return n_bufs;
+}
+
 static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
     auto sess = static_cast<ggml_hexagon_session *>(backend->context);
     return sess->name.c_str();
@@ -2417,6 +2522,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
             ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
             break;
         case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
             ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
             break;
         case GGML_OP_UNARY:
@@ -2439,6 +2545,18 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
             ggml_hexagon_dispatch_op<init_rope_req>(sess, node, flags);
             break;

+        case GGML_OP_FLASH_ATTN_EXT:
+            ggml_hexagon_dispatch_op<init_flash_attn_ext_req>(sess, node, flags);
+            break;
+
+        case GGML_OP_SET_ROWS:
+            ggml_hexagon_dispatch_op<init_set_rows_req>(sess, node, flags);
+            break;
+
+        case GGML_OP_GET_ROWS:
+            ggml_hexagon_dispatch_op<init_get_rows_req>(sess, node, flags);
+            break;
+
         default:
             GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
     }
@@ -2778,6 +2896,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             break;

         case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
             supp = ggml_hexagon_supported_unary(sess, op);
             break;

@@ -2805,6 +2924,18 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
             supp = ggml_hexagon_supported_rope(sess, op);
             break;

+        case GGML_OP_FLASH_ATTN_EXT:
+            supp = ggml_hexagon_supported_flash_attn_ext(sess, op);
+            break;
+
+        case GGML_OP_SET_ROWS:
+            supp = ggml_hexagon_supported_set_rows(sess, op);
+            break;
+
+        case GGML_OP_GET_ROWS:
+            supp = ggml_hexagon_supported_get_rows(sess, op);
+            break;
+
         default:
             break;
     }

@@ -28,6 +28,9 @@ add_library(${HTP_LIB} SHARED
     softmax-ops.c
     act-ops.c
     rope-ops.c
+    flash-attn-ops.c
+    set-rows-ops.c
+    get-rows-ops.c
 )

 target_compile_definitions(${HTP_LIB} PRIVATE

@@ -0,0 +1,566 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+
+#ifdef HTP_DEBUG
+# define FARF_HIGH 1
+#endif
+#include <HAP_farf.h>
+#include <HAP_mem.h>
+#include <HAP_perf.h>
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
+#include <math.h>
+#include <string.h>
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-dma.h"
+#include "htp-msg.h"
+#include "htp-ops.h"
+#include "hvx-utils.h"
+#include "ops-utils.h"
+
+// Dot product of FP32 and FP16 vectors, accumulating to float
+static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict y, const void * restrict x, unsigned int n, float s) {
+    const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp32
+    const HVX_Vector * restrict vx = (const HVX_Vector * restrict) x; // fp16
+
+    uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
+    uint32_t nloe = n % VLEN_FP16; // leftover elements
+
+    const HVX_Vector zero = Q6_V_vsplat_R(0);
+    HVX_Vector rsum = Q6_V_vsplat_R(0);
+
+    uint32_t i = 0;
+
+#pragma unroll(4)
+    for (i = 0; i < nvec; i++) {
+        // Load y (fp32) and convert into fp16
+        HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
+        HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
+        HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
+
+        // Load x (fp16)
+        HVX_Vector x_hf = vx[i];
+
+        HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
+
+        rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
+    }
+
+    if (nloe) {
+        // Load y (fp32) and convert into fp16
+        HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
+        HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
+        HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
+
+        // Load x (fp16)
+        HVX_Vector x_hf = vx[i];
+
+        // Zero-out unused elements
+        // Note that we need to clear both x and y because they may contain NANs
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
+        x_hf = Q6_V_vand_QV(bmask, x_hf);
+        y_hf = Q6_V_vand_QV(bmask, y_hf);
+
+        HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
+
+        rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
+    }
+
+    rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_fp32(s));
+    rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum));
+
+    hvx_vec_store_u(r, 4, rsum);
+}
+
+// Dot product of two F16 vectors, accumulating to float
+static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict x, const void * restrict y, unsigned int n, float s) {
+    const HVX_Vector * restrict vx = (const HVX_Vector * restrict) x; // fp16
+    const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp16
+
+    uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
+    uint32_t nloe = n % VLEN_FP16; // leftover elements
+
+    const HVX_Vector zero = Q6_V_vsplat_R(0);
+    HVX_Vector rsum = Q6_V_vsplat_R(0);
+
+    uint32_t i = 0;
+
+#pragma unroll(4)
+    for (i = 0; i < nvec; i++) {
+        HVX_Vector y_hf = vy[i];
+        HVX_Vector x_hf = vx[i];
+
+        HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
+
+        rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
+    }
+
+    if (nloe) {
+        HVX_Vector y_hf = vy[i];
+
+        // Load x (fp16) and zero-out unused elements
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
+        HVX_Vector x_hf = Q6_V_vand_QV(bmask, vx[i]);
+
+        HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
+
+        rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
+    }
+
+    rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_fp32(s));
+    rsum = Q6_Vsf_equals_Vqf32(hvx_vec_qf32_reduce_sum(rsum));
+    hvx_vec_store_u(r, 4, rsum);
+}
+
+// MAD: y (F32) += x (F16) * v (float)
+static inline void hvx_mad_f32_f16_aa(float * restrict y, const void * restrict x, int n, float s) {
+    const HVX_Vector * restrict ptr_x = (const HVX_Vector *) x;
+    HVX_Vector * restrict ptr_y = (HVX_Vector *) y;
+
+    uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
+    uint32_t nloe = n % VLEN_FP16; // leftover elements
+
+    HVX_Vector S = hvx_vec_splat_fp16(s);
+
+    uint32_t i = 0;
+#pragma unroll(4)
+    for (i = 0; i < nvec; ++i) {
+        // Multiply x * s -> pair of F32 vectors
+        HVX_VectorPair xs_p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(ptr_x[i]), S);
+        ptr_y[i*2] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(xs_p), ptr_y[i*2]));
+        ptr_y[i*2+1] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(xs_p), ptr_y[i*2+1]));
+    }
+
+    if (nloe) {
+        HVX_VectorPair xs_p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(ptr_x[i]), S);
+
+        HVX_Vector xs = Q6_V_lo_W(xs_p);
+        i = 2 * i; // index for ptr_y
+
+        if (nloe >= 32) {
+            ptr_y[i] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs, ptr_y[i]));
+            nloe -= 32; ++i; xs = Q6_V_hi_W(xs_p);
+        }
+
+        if (nloe) {
+            HVX_Vector xy = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs, ptr_y[i]));
+            hvx_vec_store_u(&ptr_y[i], nloe * 4, xy);
+        }
+    }
+}
+
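Aside (not part of the diff): the HVX intrinsics above are easier to audit against a scalar model. The hypothetical `ref_dot_f32_f16` below is illustrative only, assuming nothing beyond `VLEN_FP16` being the fp16 lane count of one HVX vector:

```c
// Illustrative scalar model of hvx_dot_f32_f16_aa: r = s * sum_i x[i]*y[i].
// y (fp32) is narrowed to fp16 first (the Q6_Vqf32_vsub_VsfVsf(v, zero) step
// converts IEEE fp32 to qf32 so Q6_Vhf_equals_Wqf32 can narrow it), so the
// products carry fp16 precision while the accumulation stays 32-bit (qf32).
static void ref_dot_f32_f16(float * r, const float * y, const __fp16 * x, unsigned int n, float s) {
    float sum = 0.0f;
    for (unsigned int i = 0; i < n; i++) {
        sum += (float) x[i] * (float) (__fp16) y[i]; // narrow y, multiply, widen
    }
    *r = sum * s;
}
```

The `Q6_Vh_vdeal_Vh` after the narrow restores element order, since packing a pair of qf32 vectors into one fp16 vector leaves the lanes interleaved.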
+#define FLASH_ATTN_BLOCK_SIZE 128
+
+static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, int nth) {
+    const struct htp_tensor * q = &octx->src0;
+    const struct htp_tensor * k = &octx->src1;
+    const struct htp_tensor * v = &octx->src2;
+    const struct htp_tensor * mask = (octx->src3.data) ? &octx->src3 : NULL;
+    const struct htp_tensor * sinks = (octx->src4.data) ? &octx->src4 : NULL;
+    struct htp_tensor * dst = &octx->dst;
+
+    const uint32_t neq0 = q->ne[0];
+    const uint32_t neq1 = q->ne[1];
+    const uint32_t neq2 = q->ne[2];
+    const uint32_t neq3 = q->ne[3];
+
+    const uint32_t nek0 = k->ne[0];
+    const uint32_t nek1 = k->ne[1];
+    const uint32_t nek2 = k->ne[2];
+    const uint32_t nek3 = k->ne[3];
+
+    const uint32_t nev0 = v->ne[0];
+    const uint32_t nev1 = v->ne[1];
+    const uint32_t nev2 = v->ne[2];
+    const uint32_t nev3 = v->ne[3];
+
+    const uint32_t nbq1 = q->nb[1];
+    const uint32_t nbq2 = q->nb[2];
+    const uint32_t nbq3 = q->nb[3];
+
+    const uint32_t nbk1 = k->nb[1];
+    const uint32_t nbk2 = k->nb[2];
+    const uint32_t nbk3 = k->nb[3];
+
+    const uint32_t nbv1 = v->nb[1];
+    const uint32_t nbv2 = v->nb[2];
+    const uint32_t nbv3 = v->nb[3];
+
+    const uint32_t ne1 = dst->ne[1];
+    const uint32_t ne2 = dst->ne[2];
+    const uint32_t ne3 = dst->ne[3];
+
+    const uint32_t nb1 = dst->nb[1];
+    const uint32_t nb2 = dst->nb[2];
+    const uint32_t nb3 = dst->nb[3];
+
+    float scale = 1.0f;
+    float max_bias = 0.0f;
+    float logit_softcap = 0.0f;
+
+    memcpy(&scale, (float *) octx->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) octx->op_params + 1, sizeof(float));
+    memcpy(&logit_softcap, (float *) octx->op_params + 2, sizeof(float));
+
+    if (logit_softcap != 0) {
+        scale /= logit_softcap;
+    }
+
+    // total rows in q
+    const uint32_t nr = neq1*neq2*neq3;
+
+    const uint32_t dr = (nr + nth - 1) / nth;
+    const uint32_t ir0 = dr * ith;
+    const uint32_t ir1 = MIN(ir0 + dr, nr);
+
+    if (ir0 >= ir1) return;
+
+    dma_queue * dma = octx->ctx->dma[ith];
+
+    const uint32_t DK = nek0;
+    const uint32_t DV = nev0;
+
+    const size_t size_q_row = DK * ((q->type == HTP_TYPE_F32) ? 4 : 2);
+    const size_t size_q_row_padded = htp_round_up(size_q_row, 128);
+
+    const size_t size_k_row = DK * sizeof(__fp16);
+    const size_t size_v_row = DV * sizeof(__fp16);
+    const size_t size_m_row = FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16); // Treat block as one row for mask
+
+    const size_t size_k_row_padded = htp_round_up(size_k_row, 128);
+    const size_t size_v_row_padded = htp_round_up(size_v_row, 128);
+
+    const size_t size_k_block = size_k_row_padded * FLASH_ATTN_BLOCK_SIZE;
+    const size_t size_v_block = size_v_row_padded * FLASH_ATTN_BLOCK_SIZE;
+    const size_t size_m_block = htp_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128);
+
+    // Scratchpad buffers for Q, K, V, Mask, and VKQ32 accumulator
+    uint8_t * spad_q = octx->src0_spad.data + octx->src0_spad.size_per_thread * ith;
+    uint8_t * spad_k = octx->src1_spad.data + octx->src1_spad.size_per_thread * ith;
+    uint8_t * spad_v = octx->src2_spad.data + octx->src2_spad.size_per_thread * ith;
+    uint8_t * spad_m = octx->src3_spad.data + octx->src3_spad.size_per_thread * ith;
+    uint8_t * spad_a = octx->dst_spad.data + octx->dst_spad.size_per_thread * ith;
+
+    const uint32_t n_head = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+    const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    for (uint32_t ir = ir0; ir < ir1; ++ir) {
+        const uint32_t iq3 = fastdiv(ir, &octx->src0_div21);
+        const uint32_t iq2 = fastdiv(ir - iq3*neq2*neq1, &octx->src0_div1);
+        const uint32_t iq1 = (ir - iq3*neq2*neq1 - iq2 * neq1);
+
+        const uint32_t ik3 = fastdiv(iq3, &octx->broadcast_rk3);
+        const uint32_t ik2 = fastdiv(iq2, &octx->broadcast_rk2);
+
+        const uint32_t iv3 = fastdiv(iq3, &octx->broadcast_rv3);
+        const uint32_t iv2 = fastdiv(iq2, &octx->broadcast_rv2);
+
+        // Fetch Q row
+        const uint8_t * q_row_ptr = (const uint8_t *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3);
+        dma_queue_push(dma, dma_make_ptr(spad_q, q_row_ptr), size_q_row_padded, nbq1, size_q_row, 1);
+
+        const uint32_t h = iq2; // head index
+        const float slope = (max_bias > 0.0f) ? (h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1)) : 1.0f;
+
+        float S = 0.0f;      // sum
+        float M = -INFINITY; // maximum KQ value
+
+        // Clear accumulator
+        float * VKQ32 = (float *) spad_a;
+        memset(VKQ32, 0, DV * sizeof(float));
+
+        const __fp16 * mp_base = NULL;
+        if (mask) {
+            const uint32_t im2 = fastmodulo(iq2, mask->ne[2], &octx->src3_div2);
+            const uint32_t im3 = fastmodulo(iq3, mask->ne[3], &octx->src3_div3);
+            mp_base = (const __fp16 *) ((const uint8_t *) mask->data + iq1*mask->nb[1] + im2*mask->nb[2] + im3*mask->nb[3]);
+        }
+
+        const uint32_t n_blocks = (nek1 + FLASH_ATTN_BLOCK_SIZE - 1) / FLASH_ATTN_BLOCK_SIZE;
+
+        // Prefetch first two blocks
+        for (uint32_t ib = 0; ib < MIN(n_blocks, 2); ++ib) {
+            const uint32_t ic_start = ib * FLASH_ATTN_BLOCK_SIZE;
+            const uint32_t current_block_size = MIN(FLASH_ATTN_BLOCK_SIZE, nek1 - ic_start);
+
+            // K
+            const uint8_t * k_src = (const uint8_t *) k->data + (ic_start*nbk1 + ik2*nbk2 + ik3*nbk3);
+            uint8_t * k_dst = spad_k + (ib % 2) * size_k_block;
+            dma_queue_push(dma, dma_make_ptr(k_dst, k_src), size_k_row_padded, nbk1, size_k_row, current_block_size);
+
+            // V
+            const uint8_t * v_src = (const uint8_t *) v->data + (ic_start*nbv1 + iv2*nbv2 + iv3*nbv3);
+            uint8_t * v_dst = spad_v + (ib % 2) * size_v_block;
+            dma_queue_push(dma, dma_make_ptr(v_dst, v_src), size_v_row_padded, nbv1, size_v_row, current_block_size);
+
+            // Mask
+            if (mask) {
+                const uint8_t * m_src = (const uint8_t *) (mp_base + ic_start);
+                uint8_t * m_dst = spad_m + (ib % 2) * size_m_block;
+                // Mask is 1D contiguous for this row
+                dma_queue_push(dma, dma_make_ptr(m_dst, m_src), current_block_size * 2, current_block_size * 2, current_block_size * 2, 1);
+            }
+        }
+
+        const uint8_t * q_ptr_vtcm = dma_queue_pop(dma).dst;
+
+        for (uint32_t ib = 0; ib < n_blocks; ++ib) {
+            const uint32_t ic_start = ib * FLASH_ATTN_BLOCK_SIZE;
+            const uint32_t current_block_size = MIN(FLASH_ATTN_BLOCK_SIZE, nek1 - ic_start);
+
+            // Wait for DMA
+            uint8_t * k_base = dma_queue_pop(dma).dst; // K
+            uint8_t * v_base = dma_queue_pop(dma).dst; // V
+            __fp16 * m_base = mask ? dma_queue_pop(dma).dst : NULL; // M
+
+            // Inner loop processing the block from VTCM
+            uint32_t ic = 0;
+
+            // Process in blocks of 32 (VLEN_FP32)
+            for (; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32) {
+                // 1. Compute scores
+                float __attribute__((aligned(VLEN))) scores_arr[VLEN_FP32];
+                for (int j = 0; j < VLEN_FP32; ++j) {
+                    const uint32_t cur_ic = ic + j;
+                    const uint8_t * k_ptr = k_base + cur_ic * size_k_row_padded;
+                    if (q->type == HTP_TYPE_F32) {
+                        hvx_dot_f32_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
+                    } else {
+                        hvx_dot_f16_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
+                    }
+                }
+
+                HVX_Vector scores = *(HVX_Vector *) scores_arr;
+
+                // 2. Softcap
+                if (logit_softcap != 0.0f) {
+                    scores = hvx_vec_tanh_fp32(scores);
+                    scores = Q6_Vqf32_vmpy_VsfVsf(scores, hvx_vec_splat_fp32(logit_softcap));
+                    scores = Q6_Vsf_equals_Vqf32(scores);
+                }
+
+                // 3. Mask
+                if (mask) {
+                    const __fp16 * mp = m_base + ic;
+                    HVX_Vector m_vals_fp16 = *(const HVX_UVector *) mp;
+
+                    HVX_Vector one_fp16 = Q6_Vh_vsplat_R(0x3c00);
+                    HVX_VectorPair m_vals_fp32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_fp16), one_fp16);
+
+                    HVX_Vector m_vals_fp32 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(m_vals_fp32_pair));
+
+                    HVX_Vector slope_vec = hvx_vec_splat_fp32(slope);
+                    HVX_Vector add_val = Q6_Vqf32_vmpy_VsfVsf(m_vals_fp32, slope_vec);
+                    scores = Q6_Vqf32_vadd_VsfVsf(scores, Q6_Vsf_equals_Vqf32(add_val));
+                    scores = Q6_Vsf_equals_Vqf32(scores);
+                }
+
+                // 4. Online Softmax Update
+                HVX_Vector v_max = hvx_vec_reduce_max_fp32(scores);
+                float m_block = hvx_vec_get_fp32(v_max);
+
+                float M_old = M;
+                float M_new = (m_block > M) ? m_block : M;
+                M = M_new;
+
+                float ms = expf(M_old - M_new);
+
+                hvx_scale_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms);
+                S = S * ms;
+
+                HVX_Vector M_new_vec = hvx_vec_splat_fp32(M_new);
+                HVX_Vector scores_shifted = Q6_Vqf32_vsub_VsfVsf(scores, M_new_vec);
+                HVX_Vector P = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(scores_shifted));
+
+                HVX_Vector p_sum_vec = hvx_vec_fp32_reduce_sum(P);
+                float p_sum = hvx_vec_get_fp32(p_sum_vec);
+                S += p_sum;
+
+                // 5. Accumulate V
+                float __attribute__((aligned(VLEN))) p_arr[VLEN_FP32];
+                *(HVX_Vector*)p_arr = P;
+
+                for (int j = 0; j < VLEN_FP32; ++j) {
+                    const uint32_t cur_ic = ic + j;
+                    const uint8_t * v_ptr = v_base + cur_ic * size_v_row_padded;
+                    hvx_mad_f32_f16_aa(VKQ32, v_ptr, DV, p_arr[j]);
+                }
+            }
+
+            // Leftover
+            for (; ic < current_block_size; ++ic) {
+                float s_val;
+                const uint8_t * k_ptr = k_base + ic * size_k_row_padded;
+
+                if (q->type == HTP_TYPE_F32) {
+                    hvx_dot_f32_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);
+                } else {
+                    hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);
+                }
+
+                if (logit_softcap != 0.0f) {
+                    s_val = logit_softcap * tanhf(s_val);
+                }
+
+                if (mask) {
+                    const float m_val = m_base[ic];
+                    s_val += slope * m_val;
+                }
+
+                const float Mold = M;
+                float ms = 1.0f;
+                float vs = 1.0f;
+
+                if (s_val > M) {
+                    M = s_val;
+                    ms = expf(Mold - M);
+                    hvx_scale_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms);
+                } else {
+                    vs = expf(s_val - M);
+                }
+
+                const uint8_t * v_ptr = v_base + ic * size_v_row_padded;
+
+                hvx_mad_f32_f16_aa(VKQ32, v_ptr, DV, vs);
+
+                S = S * ms + vs;
+            }
+
+            // Issue DMA for next+1 block (if exists)
+            if (ib + 2 < n_blocks) {
+                const uint32_t next_ib = ib + 2;
+                const uint32_t next_ic_start = next_ib * FLASH_ATTN_BLOCK_SIZE;
+                const uint32_t next_block_size = MIN(FLASH_ATTN_BLOCK_SIZE, nek1 - next_ic_start);
+
+                // K
+                const uint8_t * k_src = (const uint8_t *) k->data + (next_ic_start*nbk1 + ik2*nbk2 + ik3*nbk3);
+                dma_queue_push(dma, dma_make_ptr(k_base, k_src), size_k_row_padded, nbk1, size_k_row, next_block_size);
+
+                // V
+                const uint8_t * v_src = (const uint8_t *) v->data + (next_ic_start*nbv1 + iv2*nbv2 + iv3*nbv3);
+                dma_queue_push(dma, dma_make_ptr(v_base, v_src), size_v_row_padded, nbv1, size_v_row, next_block_size);
+
+                // Mask
+                if (mask) {
+                    const uint8_t * m_src = (const uint8_t *) (mp_base + next_ic_start);
+                    dma_queue_push(dma, dma_make_ptr(m_base, m_src), next_block_size * 2, next_block_size * 2, next_block_size * 2, 1);
+                }
+            }
+        }
+
+        // sinks
+        if (sinks) {
+            const float s = ((float *)((char *) sinks->data))[h];
+
+            float ms = 1.0f;
+            float vs = 1.0f;
+
+            if (s > M) {
+                ms = expf(M - s);
+                hvx_scale_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms);
+            } else {
+                vs = expf(s - M);
+            }
+
+            S = S * ms + vs;
+        }
+
+        const float S_inv = S == 0.0f ? 0.0f : 1.0f/S;
+        hvx_scale_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, S_inv);
+
+        // Store result
+        // dst indices
+        const int i1 = iq1;
+        const int i2 = iq2;
+        const int i3 = iq3;
+
+        // dst is permuted
+        uint8_t * dst_ptr = (uint8_t *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1) * nb1;
+
+        if (dst->type == HTP_TYPE_F32) {
+            hvx_copy_fp32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
+        } else if (dst->type == HTP_TYPE_F16) {
+            hvx_copy_fp16_fp32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
+        }
+    }
+}
+
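Editorial note: the vectorized block loop and the scalar leftover loop above are two renderings of the same streaming ("online") softmax recurrence. With running maximum $M$, running denominator $S$, and accumulator $\mathrm{VKQ}$, each group of new scores $s_i$ with value rows $v_i$ applies:

```latex
M' = \max\Bigl(M,\ \max_i s_i\Bigr), \qquad
\mathrm{VKQ} \leftarrow e^{M-M'}\,\mathrm{VKQ} + \sum_i e^{s_i-M'}\,v_i, \qquad
S \leftarrow e^{M-M'}\,S + \sum_i e^{s_i-M'} .
```

The final `S_inv` multiply then yields exactly softmax(scores) · V without materializing the full score row. A sink enters the recurrence as one extra score that contributes to $S$ (and rescales $\mathrm{VKQ}$) but adds no value vector, which is what the `sinks` block implements.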
+static void htp_flash_attn_ext_job(unsigned int n, unsigned int i, void * data) {
+    struct htp_ops_context * octx = data;
+    flash_attn_ext_f16_thread(octx, i, n);
+}
+
+int op_flash_attn_ext(struct htp_ops_context * octx) {
+    const struct htp_tensor * q = &octx->src0;
+    const struct htp_tensor * k = &octx->src1;
+    const struct htp_tensor * v = &octx->src2;
+    const struct htp_tensor * mask = (octx->src3.type != HTP_TYPE_COUNT) ? &octx->src3 : NULL;
+    struct htp_tensor * dst = &octx->dst;
+
+    // Check support
+    if ((q->type != HTP_TYPE_F16 && q->type != HTP_TYPE_F32) ||
+        k->type != HTP_TYPE_F16 ||
+        v->type != HTP_TYPE_F16) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    octx->src0_div21 = init_fastdiv_values(q->ne[2] * q->ne[1]);
+    octx->src0_div1 = init_fastdiv_values(q->ne[1]);
+
+    octx->broadcast_rk2 = init_fastdiv_values(q->ne[2]/k->ne[2]);
+    octx->broadcast_rk3 = init_fastdiv_values(q->ne[3]/k->ne[3]);
+    octx->broadcast_rv2 = init_fastdiv_values(q->ne[2]/v->ne[2]);
+    octx->broadcast_rv3 = init_fastdiv_values(q->ne[3]/v->ne[3]);
+
+    if (mask) {
+        octx->src3_div2 = init_fastdiv_values(mask->ne[2]);
+        octx->src3_div3 = init_fastdiv_values(mask->ne[3]);
+    }
+
+    size_t size_q_row_padded = htp_round_up(q->ne[0] * (q->type == HTP_TYPE_F32 ? 4 : 2), 128);
+    size_t size_k_row_padded = htp_round_up(k->ne[0] * sizeof(__fp16), 128);
+    size_t size_v_row_padded = htp_round_up(v->ne[0] * sizeof(__fp16), 128);
+
+    size_t size_q_block = size_q_row_padded * 1; // single row for now
+    size_t size_k_block = size_k_row_padded * FLASH_ATTN_BLOCK_SIZE;
+    size_t size_v_block = size_v_row_padded * FLASH_ATTN_BLOCK_SIZE;
+    size_t size_m_block = htp_round_up(FLASH_ATTN_BLOCK_SIZE * sizeof(__fp16), 128);
+
+    size_t size_vkq_acc = htp_round_up(v->ne[0] * sizeof(float), 128); // VKQ32
+
+    octx->src0_spad.size_per_thread = size_q_block * 1;
+    octx->src1_spad.size_per_thread = size_k_block * 2;
+    octx->src2_spad.size_per_thread = size_v_block * 2;
+    octx->src3_spad.size_per_thread = mask ? size_m_block * 2 : 0;
+    octx->dst_spad.size_per_thread = size_vkq_acc;
+
+    octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
+    octx->src1_spad.size = octx->src1_spad.size_per_thread * octx->n_threads;
+    octx->src2_spad.size = octx->src2_spad.size_per_thread * octx->n_threads;
+    octx->src3_spad.size = octx->src3_spad.size_per_thread * octx->n_threads;
+    octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads;
+
+    size_t total_spad = octx->src0_spad.size + octx->src1_spad.size + octx->src2_spad.size + octx->src3_spad.size + octx->dst_spad.size;
+
+    if (octx->ctx->vtcm_size < total_spad) {
+        return HTP_STATUS_VTCM_TOO_SMALL;
+    }
+
+    octx->src0_spad.data = octx->ctx->vtcm_base;
+    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+    octx->src2_spad.data = octx->src1_spad.data + octx->src1_spad.size;
+    octx->src3_spad.data = octx->src2_spad.data + octx->src2_spad.size;
+    octx->dst_spad.data = octx->src3_spad.data + octx->src3_spad.size;
+
+    if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
+        worker_pool_run_func(octx->ctx->worker_pool, htp_flash_attn_ext_job, octx, octx->n_threads);
+    }
+
+    return HTP_STATUS_OK;
+}

@@ -0,0 +1,112 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+
+#ifdef HTP_DEBUG
+# define FARF_HIGH 1
+#endif
+#include <HAP_farf.h>
+#include <HAP_mem.h>
+#include <HAP_perf.h>
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
+#include <math.h>
+#include <string.h>
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-msg.h"
+#include "htp-ops.h"
+#include "hvx-utils.h"
+#include "ops-utils.h"
+
+#define get_rows_preamble \
+    const uint32_t ne00 = octx->src0.ne[0]; \
+    const uint32_t ne01 = octx->src0.ne[1]; \
+    const uint32_t ne02 = octx->src0.ne[2]; \
+    const uint32_t ne03 = octx->src0.ne[3]; \
+    \
+    const uint32_t ne10 = octx->src1.ne[0]; \
+    const uint32_t ne11 = octx->src1.ne[1]; \
+    const uint32_t ne12 = octx->src1.ne[2]; \
+    \
+    const uint32_t nb01 = octx->src0.nb[1]; \
+    const uint32_t nb02 = octx->src0.nb[2]; \
+    const uint32_t nb03 = octx->src0.nb[3]; \
+    \
+    const uint32_t nb10 = octx->src1.nb[0]; \
+    const uint32_t nb11 = octx->src1.nb[1]; \
+    const uint32_t nb12 = octx->src1.nb[2]; \
+    \
+    const uint32_t nb1 = octx->dst.nb[1]; \
+    const uint32_t nb2 = octx->dst.nb[2]; \
+    const uint32_t nb3 = octx->dst.nb[3]; \
+    \
+    const uint32_t nr = ne10 * ne11 * ne12;
+
+static int get_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const int ith) {
+    get_rows_preamble;
+
+    // parallelize by src1 elements (which correspond to dst rows)
+    const uint32_t dr = octx->src1_nrows_per_thread;
+    const uint32_t ir0 = dr * ith;
+    const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
+
+    const bool is_i32 = (octx->src1.type == HTP_TYPE_I32);
+
+    for (uint32_t i = ir0; i < ir1; ++i) {
+        const uint32_t i12 = fastdiv(i, &octx->get_rows_div_ne10_ne11);
+        const uint32_t rem = i - i12 * ne11 * ne10;
+        const uint32_t i11 = fastdiv(rem, &octx->get_rows_div_ne10);
+        const uint32_t i10 = rem - i11 * ne10;
+
+        const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
+
+        uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
+
+        if (i01 >= ne01) {
+            // invalid index, skip for now to avoid crash
+            continue;
+        }
+
+        const uintptr_t src0_ptr = octx->src0.data + i01*nb01 + i11*nb02 + i12*nb03;
+        const uintptr_t dst_ptr = octx->dst.data + i10*nb1 + i11*nb2 + i12*nb3;
+        hvx_copy_fp32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
+    }
+
+    return HTP_STATUS_OK;
+}
+
+static void get_rows_work_f32_f32(unsigned int n, unsigned int i, void *data) {
+    get_rows_thread_f32_f32((struct htp_ops_context *) data, n, i);
+}
+
+int op_get_rows(struct htp_ops_context * octx) {
+    get_rows_preamble;
+
+    if (octx->src0.type != HTP_TYPE_F32) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    if (octx->dst.type != HTP_TYPE_F32) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    if (octx->src1.type != HTP_TYPE_I32 && octx->src1.type != HTP_TYPE_I64) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
+        return HTP_STATUS_OK;
+    }
+
+    octx->get_rows_div_ne10 = init_fastdiv_values(octx->src1.ne[0]);
+    octx->get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src1.ne[0] * octx->src1.ne[1]);
+
+    const uint32_t n_jobs = MIN(nr, octx->n_threads);
+    octx->src1_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
+
+    worker_pool_run_func(octx->ctx->worker_pool, get_rows_work_f32_f32, octx, n_jobs);
+    return HTP_STATUS_OK;
+}
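Editorial note: the fastdiv pair in the thread function just undoes the row flattening. For src1 extents $(ne_{10}, ne_{11}, ne_{12})$, the flat index decomposes as

```latex
i = (i_{12}\,ne_{11} + i_{11})\,ne_{10} + i_{10},
\qquad
\mathrm{dst}[:, i_{10}, i_{11}, i_{12}] = \mathrm{src0}[:,\ \mathrm{src1}[i_{10}, i_{11}, i_{12}],\ i_{11},\ i_{12}],
```

so each destination row is one gathered source row of ne00 floats, hence the single `hvx_copy_fp32_uu` per index.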

@@ -11,11 +11,6 @@

 #define HTP_MAX_NTHREADS 10

-// FIXME: move these into matmul-ops
-#define HTP_SPAD_SRC0_NROWS 16
-#define HTP_SPAD_SRC1_NROWS 16
-#define HTP_SPAD_DST_NROWS 2
-
 // Main context for htp DSP backend
 struct htp_context {
     dspqueue_t queue;

@@ -36,6 +36,8 @@ enum htp_data_type {
     HTP_TYPE_F16 = 1,
     HTP_TYPE_Q4_0 = 2,
     HTP_TYPE_Q8_0 = 8,
+    HTP_TYPE_I32 = 26,
+    HTP_TYPE_I64 = 27,
     HTP_TYPE_MXFP4 = 39,
     HTP_TYPE_COUNT
 };
@@ -57,6 +59,10 @@ enum htp_op {
     HTP_OP_SOFTMAX = 11,
     HTP_OP_ADD_ID = 12,
     HTP_OP_ROPE = 13,
+    HTP_OP_FLASH_ATTN_EXT = 14,
+    HTP_OP_SET_ROWS = 15,
+    HTP_OP_SCALE = 16,
+    HTP_OP_GET_ROWS = 17,
     INVALID
 };
@@ -137,6 +143,8 @@ struct htp_general_req {
     struct htp_tensor src0; // Input0 tensor
     struct htp_tensor src1; // Input1 tensor
     struct htp_tensor src2; // Input2 tensor
+    struct htp_tensor src3; // Input3 tensor
+    struct htp_tensor src4; // Input4 tensor
     struct htp_tensor dst; // Output tensor

     // should be multiple of 64 bytes (cacheline)
@@ -152,6 +160,6 @@ struct htp_general_rsp {
 };

 #define HTP_MAX_MESSAGE_SIZE sizeof(struct htp_general_req)
-#define HTP_MAX_PACKET_BUFFERS 4
+#define HTP_MAX_PACKET_BUFFERS 8

 #endif /* HTP_MSG_H */
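Editorial note connecting this to the earlier hunks: `init_flash_attn_ext_req` attaches six dspqueue buffers to a single request (src0 through src4 plus dst), which no longer fits the old `HTP_MAX_PACKET_BUFFERS` limit of 4; raising it to 8 presumably also leaves headroom for future multi-source ops.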

@@ -13,6 +13,7 @@

 struct htp_spad {
     uint8_t * data;
+    size_t stride;
     size_t size;
     size_t size_per_thread;
 };
@@ -26,11 +27,14 @@ struct htp_ops_context {
     struct htp_tensor src0;
     struct htp_tensor src1;
     struct htp_tensor src2;
+    struct htp_tensor src3;
+    struct htp_tensor src4;
     struct htp_tensor dst;

     struct htp_spad src0_spad;
     struct htp_spad src1_spad;
     struct htp_spad src2_spad;
+    struct htp_spad src3_spad;
     struct htp_spad dst_spad;

     worker_pool_context_t * wpool; // worker pool
@@ -49,6 +53,27 @@ struct htp_ops_context {
     struct fastdiv_values src1_div3; // fastdiv values for ne3
     struct fastdiv_values src1_div21; // fastdiv values for ne2 * ne1

+    struct fastdiv_values src3_div1; // fastdiv values for ne1
+    struct fastdiv_values src3_div2; // fastdiv values for ne2
+    struct fastdiv_values src3_div3; // fastdiv values for ne3
+    struct fastdiv_values src3_div21; // fastdiv values for ne2 * ne1
+
+    struct fastdiv_values broadcast_rk2;
+    struct fastdiv_values broadcast_rk3;
+    struct fastdiv_values broadcast_rv2;
+    struct fastdiv_values broadcast_rv3;
+
+    struct fastdiv_values mm_div_ne12_ne1; // fastdiv values for ne12 * ne1
+    struct fastdiv_values mm_div_ne1; // fastdiv values for ne1
+    struct fastdiv_values mm_div_r2; // fastdiv values for ne12 / ne02
+    struct fastdiv_values mm_div_r3; // fastdiv values for ne13 / ne03
+
+    struct fastdiv_values set_rows_div_ne12; // fastdiv values for ne12
+    struct fastdiv_values set_rows_div_ne11; // fastdiv values for ne11
+
+    struct fastdiv_values get_rows_div_ne10; // fastdiv values for ne10
+    struct fastdiv_values get_rows_div_ne10_ne11; // fastdiv values for ne10 * ne11
+
     uint32_t flags;
 };
@@ -60,5 +85,8 @@ int op_activations(struct htp_ops_context * octx);
 int op_softmax(struct htp_ops_context * octx);
 int op_add_id(struct htp_ops_context * octx);
 int op_rope(struct htp_ops_context * octx);
+int op_flash_attn_ext(struct htp_ops_context * octx);
+int op_set_rows(struct htp_ops_context * octx);
+int op_get_rows(struct htp_ops_context * octx);

 #endif /* HTP_OPS_H */
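Editorial note: every `fastdiv_values` field above caches constants that turn a runtime division by a fixed tensor extent into a multiply and a shift. The standard construction (the backend's own `init_fastdiv_values` may differ in encoding): for a divisor $d$, pick

```latex
s = \lceil \log_2 d \rceil, \qquad
m = \left\lceil \frac{2^{32+s}}{d} \right\rceil,
\qquad\Longrightarrow\qquad
\left\lfloor \frac{n\,m}{2^{32+s}} \right\rfloor = \left\lfloor \frac{n}{d} \right\rfloor
\quad \text{for all } 0 \le n < 2^{32},
```

which is why the init call is done once per op while `fastdiv`/`fastmodulo` run in the per-row loops.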

@@ -848,55 +848,6 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) {
     return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v));
 }

-void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, const float scale) {
-    int left_over = num_elems & (VLEN_FP32 - 1);
-    int num_elems_whole = num_elems - left_over;
-
-    int unaligned_addr = 0;
-    int unaligned_loop = 0;
-    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
-        FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n");
-        unaligned_addr = 1;
-    }
-
-    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
-        unaligned_loop = 1;
-        FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n");
-    }
-
-    HVX_Vector scale_vec = hvx_vec_splat_fp32(scale);
-
-    if (0 == unaligned_loop) {
-        HVX_Vector * vec_in1 = (HVX_Vector *) src;
-        HVX_Vector * vec_out = (HVX_Vector *) dst;
-
-#pragma unroll(4)
-        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec);
-            *vec_out++ = Q6_Vsf_equals_Vqf32(v);
-        }
-    } else {
-#pragma unroll(4)
-        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
-
-            HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec);
-
-            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
-        }
-    }
-
-    if (left_over > 0) {
-        const float * srcf = (const float *) src + num_elems_whole;
-        float * dstf = (float *) dst + num_elems_whole;
-
-        HVX_Vector in = *(HVX_UVector *) srcf;
-
-        HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec);
-        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
-    }
-}
-
 float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) {
     int left_over = num_elems & (VLEN_FP32 - 1);
     int num_elems_whole = num_elems - left_over;
@@ -1065,3 +1016,5 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src,
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, in_vec);
     }
 }
+
+

@@ -41,15 +41,24 @@ static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
 }
 #endif

-static inline HVX_Vector hvx_vec_splat_fp32(float i) {
+static inline HVX_Vector hvx_vec_splat_fp32(float v) {
     union {
         float f;
-        int32_t i;
-    } fp32 = { .f = i };
+        uint32_t i;
+    } fp32 = { .f = v };

     return Q6_V_vsplat_R(fp32.i);
 }

+static inline HVX_Vector hvx_vec_splat_fp16(float v) {
+    union {
+        __fp16 f;
+        uint16_t i;
+    } fp16 = { .f = v };
+
+    return Q6_Vh_vsplat_R(fp16.i);
+}
+
 static inline void hvx_vec_store_u(void * addr, uint32_t n, HVX_Vector v) {
     // Rotate as needed.
     v = Q6_V_vlalign_VVR(v, v, (size_t) addr);
@@ -242,6 +251,120 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest
     }
 }

+// copy n fp32 elements : source is unaligned, destination unaligned
+static inline void hvx_copy_fp32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
+    HVX_UVector * restrict vdst = (HVX_UVector *) dst;
+    HVX_UVector * restrict vsrc = (HVX_UVector *) src;
+
+    assert((unsigned long) dst % 128 == 0);
+
+    uint32_t nvec = n / 32;
+    uint32_t nloe = n % 32;
+
+    uint32_t i = 0;
+
+#pragma unroll(4)
+    for (; i < nvec; i++) {
+        HVX_Vector v = vsrc[i];
+        vdst[i] = v;
+    }
+
+    if (nloe) {
+        HVX_Vector v = vsrc[i];
+        hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
+    }
+}
+
+// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is unaligned
+static inline void hvx_copy_fp16_fp32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
+    HVX_UVector * restrict vdst = (HVX_UVector *) dst; // fp16
+    HVX_UVector * restrict vsrc = (HVX_UVector *) src; // fp32
+
+    const HVX_Vector zero = Q6_V_vsplat_R(0);
+
+    uint32_t nvec = n / 64;
+    uint32_t nloe = n % 64;
+
+    uint32_t i = 0;
+
+#pragma unroll(4)
+    for (; i < nvec; i++) {
+        // Load y (fp32) and convert into fp16
+        HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
+        HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
+        HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
+        vdst[i] = Q6_Vh_vdeal_Vh(s_hf);
+    }
+
+    if (nloe) {
+        // Load y (fp32) and convert into fp16
+        HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
+        HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
+        HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
+        hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf));
+    }
+}
+
+// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is unaligned
+static inline void hvx_copy_fp16_fp32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
+    HVX_UVector * restrict vdst = (HVX_UVector *) dst; // fp16
+    HVX_Vector * restrict vsrc = (HVX_Vector *) src; // fp32
+
+    const HVX_Vector zero = Q6_V_vsplat_R(0);
+
+    uint32_t nvec = n / 64;
+    uint32_t nloe = n % 64;
+
+    uint32_t i = 0;
+
+#pragma unroll(4)
+    for (; i < nvec; i++) {
+        // Load y (fp32) and convert into fp16
+        HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
+        HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
+        HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
+        vdst[i] = Q6_Vh_vdeal_Vh(s_hf);
+    }
+
+    if (nloe) {
+        // Load y (fp32) and convert into fp16
+        HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
+        HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
+        HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
+        hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf));
+    }
+}
+
+// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is aligned
+static inline void hvx_copy_fp16_fp32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
+    HVX_Vector * restrict vdst = (HVX_Vector *) dst; // fp16
+    HVX_UVector * restrict vsrc = (HVX_UVector *) src; // fp32
+
+    const HVX_Vector zero = Q6_V_vsplat_R(0);
+
+    uint32_t nvec = n / 64;
+    uint32_t nloe = n % 64;
+
+    uint32_t i = 0;
+
+#pragma unroll(4)
+    for (; i < nvec; i++) {
+        // Load y (fp32) and convert into fp16
+        HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
+        HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
+        HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
+        vdst[i] = Q6_Vh_vdeal_Vh(s_hf);
+    }
+
+    if (nloe) {
+        // Load y (fp32) and convert into fp16
+        HVX_Vector s0_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+0], zero); // 32 elements
+        HVX_Vector s1_qf = Q6_Vqf32_vsub_VsfVsf(vsrc[i*2+1], zero); // 32 elements
+        HVX_Vector s_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(s1_qf, s0_qf));
+        hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), Q6_Vh_vdeal_Vh(s_hf));
+    }
+}
+
 // bcast 1 fp32 element from source to n fp32 elements in destination : destination is aligned
 static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t n) {
     HVX_Vector * restrict vdst = (HVX_Vector *) dst;
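Aside (not part of the diff): the `_uu`/`_ua`/`_au` convert-copy variants differ only in which pointer is allowed to be unaligned (`HVX_UVector` accesses tolerate any address). Elementwise they all reduce to the narrowing copy below, shown as a hypothetical scalar model:

```c
// Scalar model of hvx_copy_fp16_fp32_{uu,ua,au} (illustration only): each HVX
// iteration converts 64 fp32 elements via vsub-with-zero (fp32 -> qf32),
// Q6_Vhf_equals_Wqf32 (a pair of qf32 vectors -> one fp16 vector), and
// Q6_Vh_vdeal_Vh (restore lane order); per element it is a narrowing cast.
static void ref_copy_fp16_fp32(__fp16 * dst, const float * src, uint32_t n) {
    for (uint32_t i = 0; i < n; i++) {
        dst[i] = (__fp16) src[i];
    }
}
```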
@@ -273,8 +396,6 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3
     return right_off <= chunk_size;
 }

-
-
 static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
     HVX_VectorAlias u = { .v = v };

@@ -531,13 +652,13 @@ static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) {
 }

 static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) {
-#if __HTP_ARCH__ > 75
+#if __HVX_ARCH__ > 75
     return Q6_Vsf_vfneg_Vsf(v);
 #else
     // neg by setting the fp32 sign bit
     HVX_Vector mask = Q6_V_vsplat_R(0x80000000);
     return Q6_V_vxor_VV(v, mask);
-#endif // __HTP_ARCH__ > 75
+#endif // __HVX_ARCH__ > 75
 }

 // ====================================================
@@ -976,6 +1097,24 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v,
     return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
 }

+static inline HVX_Vector hvx_vec_tanh_fp32(HVX_Vector x) {
+    // tanh(x) = 2 * sigmoid(2x) - 1
+    HVX_Vector two = hvx_vec_splat_fp32(2.0f);
+    HVX_Vector one = hvx_vec_splat_fp32(1.0f);
+    HVX_Vector x2  = Q6_Vqf32_vmpy_VsfVsf(x, two);
+
+    static const float kMinExp = -87.f; // 0
+    static const float kMaxExp = 87.f;  // 1
+    HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
+    HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
+
+    HVX_Vector sig2x = hvx_vec_fast_sigmoid_fp32_guard(Q6_Vsf_equals_Vqf32(x2), one, max_exp, min_exp);
+
+    HVX_Vector res = Q6_Vqf32_vmpy_VsfVsf(sig2x, two);
+    res = Q6_Vqf32_vsub_Vqf32Vsf(res, one);
+    return Q6_Vsf_equals_Vqf32(res);
+}
 static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
     int step_of_1 = num_elems >> 5;
     int remaining = num_elems - step_of_1 * VLEN_FP32;
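The new `hvx_vec_tanh_fp32` leans on the identity tanh(x) = 2 * sigmoid(2x) - 1, clamping the sigmoid argument to roughly +/-87 so that the fp32 exponential stays finite (below the bound it saturates to 0, above it to 1, as the kMinExp/kMaxExp comments note). A scalar sketch of the same computation (illustrative only, not part of the patch):

    #include <math.h>

    // Scalar reference for hvx_vec_tanh_fp32: tanh via the sigmoid identity.
    static float tanh_via_sigmoid_ref(float x) {
        float sig2x = 1.0f / (1.0f + expf(-2.0f * x)); // sigmoid(2x)
        return 2.0f * sig2x - 1.0f;                    // tanh(x)
    }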
@@ -1056,6 +1195,115 @@ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restr
     }
 }

+static inline void hvx_scale_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
+    int nvec = n / VLEN_FP32;
+    int nloe = n % VLEN_FP32;
+
+    HVX_Vector vs = hvx_vec_splat_fp32(scale);
+
+    HVX_Vector * vsrc = (HVX_Vector *) src;
+    HVX_Vector * vdst = (HVX_Vector *) dst;
+
+    uint32_t i = 0;
+
+    #pragma unroll(4)
+    for (i = 0; i < nvec; ++i) {
+        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
+        vdst[i] = Q6_Vsf_equals_Vqf32(v);
+    }
+
+    if (nloe) {
+        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
+        hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
+    }
+}
+
+static inline void hvx_scale_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
+    int nvec = n / VLEN_FP32;
+    int nloe = n % VLEN_FP32;
+
+    HVX_Vector vs = hvx_vec_splat_fp32(scale);
+
+    HVX_UVector * vsrc = (HVX_UVector *) src;
+    HVX_UVector * vdst = (HVX_UVector *) dst;
+
+    uint32_t i = 0;
+
+    #pragma unroll(4)
+    for (i = 0; i < nvec; ++i) {
+        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
+        vdst[i] = Q6_Vsf_equals_Vqf32(v);
+    }
+
+    if (nloe) {
+        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
+        hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
+    }
+}
+
+static inline void hvx_scale_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
+    if (htp_is_aligned((void *) src, VLEN) && htp_is_aligned((void *) dst, VLEN)) {
+        hvx_scale_f32_aa(dst, src, n, scale);
+    } else {
+        hvx_scale_f32_uu(dst, src, n, scale);
+    }
+}
+
+static inline void hvx_scale_offset_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
+    int nvec = n / VLEN_FP32;
+    int nloe = n % VLEN_FP32;
+
+    HVX_Vector vs = hvx_vec_splat_fp32(scale);
+    HVX_Vector vo = hvx_vec_splat_fp32(offset);
+
+    HVX_Vector * vsrc = (HVX_Vector *) src;
+    HVX_Vector * vdst = (HVX_Vector *) dst;
+
+    uint32_t i = 0;
+
+    #pragma unroll(4)
+    for (i = 0; i < nvec; ++i) {
+        HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
+        vdst[i] = Q6_Vsf_equals_Vqf32(v);
+    }
+
+    if (nloe) {
+        HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
+        hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
+    }
+}
+
+static inline void hvx_scale_offset_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
+    int nvec = n / VLEN_FP32;
+    int nloe = n % VLEN_FP32;
+
+    HVX_Vector vs = hvx_vec_splat_fp32(scale);
+    HVX_Vector vo = hvx_vec_splat_fp32(offset);
+
+    HVX_UVector * vsrc = (HVX_UVector *) src;
+    HVX_UVector * vdst = (HVX_UVector *) dst;
+
+    uint32_t i = 0;
+
+    #pragma unroll(4)
+    for (i = 0; i < nvec; ++i) {
+        HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
+        vdst[i] = Q6_Vsf_equals_Vqf32(v);
+    }
+
+    if (nloe) {
+        HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo);
+        hvx_vec_store_u((void *) &vdst[i], nloe * 4, Q6_Vsf_equals_Vqf32(v));
+    }
+}
+
+static inline void hvx_scale_offset_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
+    if (htp_is_aligned((void *) src, VLEN) && htp_is_aligned((void *) dst, VLEN)) {
+        hvx_scale_offset_f32_aa(dst, src, n, scale, offset);
+    } else {
+        hvx_scale_offset_f32_uu(dst, src, n, scale, offset);
+    }
+}
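All five helpers implement the same elementwise affine transform; the `_aa`/`_uu` split only selects aligned (`HVX_Vector`) versus unaligned (`HVX_UVector`) loads and stores, and the unprefixed wrappers dispatch on pointer alignment. A scalar reference (illustrative only):

    // Scalar reference for hvx_scale_offset_f32: dst[i] = src[i] * scale + offset.
    // hvx_scale_f32 is the same loop with offset == 0.
    static void scale_offset_f32_ref(float * dst, const float * src, int n, float scale, float offset) {
        for (int i = 0; i < n; i++) {
            dst[i] = src[i] * scale + offset;
        }
    }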
 float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
 void hvx_mul_f32(const uint8_t * restrict src0,

@@ -1090,7 +1338,6 @@ void hvx_sub_f32_opt(const uint8_t * restrict src0,
                      uint8_t * restrict dst,
                      const int num_elems);
 void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
-void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, const float scale);
 void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems);
 void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems);
 void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate);
@@ -443,6 +443,45 @@ static void proc_matmul_req(struct htp_context * ctx,
     send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
 }

+static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
+    struct dspqueue_buffer rsp_bufs[1];
+
+    // We had written to the output buffer, we'd also need to flush it
+    rsp_bufs[0].fd     = bufs[2].fd;
+    rsp_bufs[0].ptr    = bufs[2].ptr;
+    rsp_bufs[0].offset = bufs[2].offset;
+    rsp_bufs[0].size   = bufs[2].size;
+    rsp_bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush HTP
+                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
+
+    // Setup Op context
+    struct htp_ops_context octx = { 0 };
+    octx.ctx   = ctx;
+    octx.src0  = req->src0;
+    octx.src1  = req->src1;
+    octx.dst   = req->dst;
+    octx.flags = req->flags;
+    octx.op    = req->op;
+
+    // Update data pointers
+    octx.src0.data = (uint32_t) bufs[0].ptr;
+    octx.src1.data = (uint32_t) bufs[1].ptr;
+    octx.dst.data  = (uint32_t) bufs[2].ptr;
+    octx.n_threads = ctx->n_threads;
+
+    struct profile_data prof;
+    profile_start(&prof);
+
+    uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
+    if (vtcm_acquire(ctx) == AEE_SUCCESS) {
+        rsp_status = op_get_rows(&octx);
+        vtcm_release(ctx);
+    }
+
+    profile_stop(&prof);
+    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
+}
+
 static void proc_matmul_id_req(struct htp_context * ctx,
                                struct htp_general_req * req,
                                struct dspqueue_buffer * bufs,
@@ -668,7 +707,7 @@ static void proc_rope_req(struct htp_context * ctx,
                           uint32_t n_bufs) {
     struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];

-    int write_idx = (n_bufs == 4) ? 3 : 2;
+    int write_idx = n_bufs - 1;

     // We had written to the output buffer, we'd also need to flush it
     rsp_bufs[0].fd = bufs[write_idx].fd;
@@ -716,6 +755,102 @@ static void proc_rope_req(struct htp_context * ctx,
     send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
 }

+static void proc_set_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
+    struct dspqueue_buffer rsp_bufs[1];
+
+    // We had written to the output buffer, we'd also need to flush it
+    rsp_bufs[0].fd     = bufs[2].fd;
+    rsp_bufs[0].ptr    = bufs[2].ptr;
+    rsp_bufs[0].offset = bufs[2].offset;
+    rsp_bufs[0].size   = bufs[2].size;
+    rsp_bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush HTP
+                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
+
+    // Setup Op context
+    struct htp_ops_context octx = { 0 };
+    octx.ctx   = ctx;
+    octx.src0  = req->src0;
+    octx.src1  = req->src1;
+    octx.dst   = req->dst;
+    octx.flags = req->flags;
+    octx.op    = req->op;
+
+    // Update data pointers
+    octx.src0.data = (uint32_t) bufs[0].ptr;
+    octx.src1.data = (uint32_t) bufs[1].ptr;
+    octx.dst.data  = (uint32_t) bufs[2].ptr;
+    octx.n_threads = ctx->n_threads;
+
+    struct profile_data prof;
+    profile_start(&prof);
+
+    uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
+    if (vtcm_acquire(ctx) == AEE_SUCCESS) {
+        rsp_status = op_set_rows(&octx);
+        vtcm_release(ctx);
+    }
+
+    profile_stop(&prof);
+    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
+}
+
+static void proc_flash_attn_ext_req(struct htp_context * ctx,
+                                    struct htp_general_req * req,
+                                    struct dspqueue_buffer * bufs,
+                                    uint32_t n_bufs) {
+    // Setup Op context
+    struct htp_ops_context octx;
+    memset(&octx, 0, sizeof(octx));
+
+    octx.ctx       = ctx;
+    octx.n_threads = ctx->n_threads;
+
+    octx.src0  = req->src0;
+    octx.src1  = req->src1;
+    octx.src2  = req->src2;
+    octx.src3  = req->src3;
+    octx.src4  = req->src4;
+    octx.dst   = req->dst;
+    octx.flags = req->flags;
+    octx.op    = req->op;
+
+    memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
+
+    // Update data pointers
+    octx.src0.data = (uint32_t) bufs[0].ptr;
+    octx.src1.data = (uint32_t) bufs[1].ptr;
+    octx.src2.data = (uint32_t) bufs[2].ptr;
+
+    int last_buf = 3;
+
+    if (octx.src3.ne[0]) {
+        octx.src3.data = (uint32_t) bufs[last_buf++].ptr; // mask is valid
+    }
+
+    if (octx.src4.ne[0]) {
+        octx.src4.data = (uint32_t) bufs[last_buf++].ptr; // sinks is valid
+    }
+
+    octx.dst.data = (uint32_t) bufs[last_buf].ptr;
+
+    struct profile_data prof;
+    profile_start(&prof);
+
+    uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
+    if (vtcm_acquire(ctx) == AEE_SUCCESS) {
+        rsp_status = op_flash_attn_ext(&octx);
+        vtcm_release(ctx);
+    }
+
+    profile_stop(&prof);
+
+    struct dspqueue_buffer rsp_buf = bufs[last_buf];
+    rsp_buf.flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush HTP
+                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
+
+    send_htp_rsp(ctx, req->op, rsp_status, &rsp_buf, 1, &prof);
+}
+
 static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
     struct htp_context * ctx = (struct htp_context *) context;
@@ -790,6 +925,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
                 break;

             case HTP_OP_RMS_NORM:
+            case HTP_OP_SCALE:
                 if (n_bufs != 2) {
                     FARF(ERROR, "Bad unary-req buffer list");
                     continue;
@@ -833,6 +969,30 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
                 proc_rope_req(ctx, &req, bufs, n_bufs);
                 break;

+            case HTP_OP_FLASH_ATTN_EXT:
+                if (!(n_bufs >= 4 && n_bufs <= 6)) {
+                    FARF(ERROR, "Bad flash-attn-ext-req buffer list");
+                    continue;
+                }
+                proc_flash_attn_ext_req(ctx, &req, bufs, n_bufs);
+                break;
+
+            case HTP_OP_SET_ROWS:
+                if (n_bufs != 3) {
+                    FARF(ERROR, "Bad set-rows-req buffer list");
+                    continue;
+                }
+                proc_set_rows_req(ctx, &req, bufs);
+                break;
+
+            case HTP_OP_GET_ROWS:
+                if (n_bufs != 3) {
+                    FARF(ERROR, "Bad get-rows-req buffer list");
+                    continue;
+                }
+                proc_get_rows_req(ctx, &req, bufs);
+                break;
+
             default:
                 FARF(ERROR, "Unknown Op %u", req.op);
                 break;
File diff suppressed because it is too large
@@ -0,0 +1,168 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+
+#ifdef HTP_DEBUG
+# define FARF_HIGH 1
+#endif
+#include <HAP_farf.h>
+#include <HAP_mem.h>
+#include <HAP_perf.h>
+#include <hexagon_protos.h>
+#include <hexagon_types.h>
+#include <math.h>
+#include <string.h>
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-msg.h"
+#include "htp-ops.h"
+#include "hvx-utils.h"
+#include "ops-utils.h"
+
+#define set_rows_preamble \
+    const uint32_t ne00 = octx->src0.ne[0]; \
+    const uint32_t ne01 = octx->src0.ne[1]; \
+    const uint32_t ne02 = octx->src0.ne[2]; \
+    const uint32_t ne03 = octx->src0.ne[3]; \
+    \
+    const uint32_t ne10 = octx->src1.ne[0]; \
+    const uint32_t ne11 = octx->src1.ne[1]; \
+    const uint32_t ne12 = octx->src1.ne[2]; \
+    \
+    const uint32_t nb01 = octx->src0.nb[1]; \
+    const uint32_t nb02 = octx->src0.nb[2]; \
+    const uint32_t nb03 = octx->src0.nb[3]; \
+    \
+    const uint32_t nb10 = octx->src1.nb[0]; \
+    const uint32_t nb11 = octx->src1.nb[1]; \
+    const uint32_t nb12 = octx->src1.nb[2]; \
+    \
+    const uint32_t nb1 = octx->dst.nb[1]; \
+    const uint32_t nb2 = octx->dst.nb[2]; \
+    const uint32_t nb3 = octx->dst.nb[3]; \
+    \
+    const uint32_t ne1 = octx->dst.ne[1]; \
+    \
+    const uint32_t nr = ne01;
+
+static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const int ith) {
+    set_rows_preamble;
+
+    // parallelize by rows of src0
+    const uint32_t dr  = octx->src0_nrows_per_thread;
+    const uint32_t ir0 = dr * ith;
+    const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
+
+    const bool is_i32 = (octx->src1.type == HTP_TYPE_I32);
+
+    for (uint32_t i03 = 0; i03 < ne03; ++i03) {
+        for (uint32_t i02 = 0; i02 < ne02; ++i02) {
+            for (uint32_t i = ir0; i < ir1; ++i) {
+                const uint32_t i12 = fastmodulo(i03, ne12, &octx->set_rows_div_ne12);
+                const uint32_t i11 = fastmodulo(i02, ne11, &octx->set_rows_div_ne11);
+                const uint32_t i10 = i;
+
+                const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
+
+                uint32_t i1 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
+                if (i1 >= ne1) {
+                    // ignore invalid indices
+                    continue;
+                }
+
+                const uintptr_t src0_ptr = octx->src0.data + i*nb01 + i02*nb02 + i03*nb03;
+                const uintptr_t dst_ptr  = octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3;
+
+                // copy row
+                hvx_copy_fp32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
+            }
+        }
+    }
+
+    return HTP_STATUS_OK;
+}
+
+static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth, const int ith) {
+    set_rows_preamble;
+
+    // parallelize by rows of src0
+    const uint32_t dr  = octx->src0_nrows_per_thread;
+    const uint32_t ir0 = dr * ith;
+    const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
+
+    const bool is_i32 = (octx->src1.type == HTP_TYPE_I32);
+
+    for (uint32_t i03 = 0; i03 < ne03; ++i03) {
+        for (uint32_t i02 = 0; i02 < ne02; ++i02) {
+            for (uint32_t i = ir0; i < ir1; ++i) {
+                const uint32_t i12 = fastmodulo(i03, ne12, &octx->set_rows_div_ne12);
+                const uint32_t i11 = fastmodulo(i02, ne11, &octx->set_rows_div_ne11);
+                const uint32_t i10 = i;
+
+                const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12;
+
+                uint32_t i1 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
+                if (i1 >= ne1) {
+                    // ignore invalid indices
+                    continue;
+                }
+
+                const uint8_t * src0_ptr = (const uint8_t *) octx->src0.data + i*nb01 + i02*nb02 + i03*nb03;
+                uint8_t * dst_ptr        = (uint8_t *) octx->dst.data + i1*nb1 + i02*nb2 + i03*nb3;
+
+                hvx_copy_fp16_fp32_uu(dst_ptr, src0_ptr, ne00);
+            }
+        }
+    }
+
+    return HTP_STATUS_OK;
+}
+
+static void set_rows_work_f16_f32(unsigned int n, unsigned int i, void *data) {
+    set_rows_thread_f16_f32((struct htp_ops_context *) data, n, i);
+}
+
+static void set_rows_work_f32_f32(unsigned int n, unsigned int i, void *data) {
+    set_rows_thread_f32_f32((struct htp_ops_context *) data, n, i);
+}
+
+int op_set_rows(struct htp_ops_context * octx) {
+    set_rows_preamble;
+
+    if (octx->src0.type != HTP_TYPE_F32) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    if (octx->dst.type != HTP_TYPE_F32 && octx->dst.type != HTP_TYPE_F16) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    if (octx->src1.type != HTP_TYPE_I32 && octx->src1.type != HTP_TYPE_I64) {
+        return HTP_STATUS_NO_SUPPORT;
+    }
+
+    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
+        return HTP_STATUS_OK;
+    }
+
+    octx->set_rows_div_ne12 = init_fastdiv_values(ne12);
+    octx->set_rows_div_ne11 = init_fastdiv_values(ne11);
+
+    const uint32_t n_jobs = MIN(nr, octx->n_threads);
+    octx->src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
+
+    switch (octx->dst.type) {
+        case HTP_TYPE_F32:
+            worker_pool_run_func(octx->ctx->worker_pool, set_rows_work_f32_f32, octx, n_jobs);
+            break;
+        case HTP_TYPE_F16:
+            worker_pool_run_func(octx->ctx->worker_pool, set_rows_work_f16_f32, octx, n_jobs);
+            break;
+        default:
+            return HTP_STATUS_NO_SUPPORT;
+    }
+
+    return HTP_STATUS_OK;
+}
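In ggml terms, SET_ROWS scatters rows of src0 into dst at the row indices held in src1; `fastmodulo` with the precomputed `init_fastdiv_values` tables replaces per-element integer division for the broadcast indices. Stripped of striding and threading, the core semantics are roughly as follows (illustrative sketch only, assuming <stdint.h> and <string.h>):

    // Illustrative sketch: scatter nr rows of ne00 floats from src into dst
    // at the row positions in idx; out-of-range indices are skipped, as in
    // the HTP kernels above.
    static void set_rows_ref(float * dst, uint32_t dst_rows,
                             const float * src, const int32_t * idx,
                             uint32_t nr, uint32_t ne00) {
        for (uint32_t r = 0; r < nr; ++r) {
            uint32_t i1 = (uint32_t) idx[r];
            if (i1 >= dst_rows) {
                continue;
            }
            memcpy(dst + (size_t) i1 * ne00, src + (size_t) r * ne00, ne00 * sizeof(float));
        }
    }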
@@ -238,7 +238,7 @@ static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ct
         hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale,
                                   (const uint8_t *) mp_f32, slope);
     } else {
-        hvx_scale_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale);
+        hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, softmax_ctx->scale);
         if (mp_f32) {
             if (softmax_ctx->use_f16) {
                 for (int i = 0; i < ne00; ++i) {

@@ -258,7 +258,7 @@ static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ct
             float max = hvx_self_max_f32((const uint8_t *) wp0, ne00);
             float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max);
             sum = sum > 0.0 ? (1.0 / sum) : 1;
-            hvx_scale_f32((const uint8_t *) wp2, (uint8_t *) dp, ne00, sum);
+            hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum);
         }
     }
 }
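Both call sites here (and the rms-norm one below) simply adopt the new destination-first signature of `hvx_scale_f32`. For context, the surrounding code is the standard numerically stable softmax: subtract the row max, exponentiate, then scale by the reciprocal of the sum. A scalar sketch (illustrative only):

    #include <math.h>

    // Scalar reference for the steps around the hvx_scale_f32 calls.
    static void softmax_ref(float * dst, const float * src, int n) {
        float max = src[0];
        for (int i = 1; i < n; i++) {
            max = src[i] > max ? src[i] : max;
        }

        float sum = 0.0f;
        for (int i = 0; i < n; i++) {
            dst[i] = expf(src[i] - max); // shift by max to avoid overflow
            sum += dst[i];
        }

        float inv = sum > 0.0f ? 1.0f / sum : 1.0f;
        for (int i = 0; i < n; i++) {
            dst[i] *= inv;
        }
    }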
@@ -83,6 +83,31 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
     }
 }

+static void scale_htp_f32(const float * restrict src,
+                          float * restrict dst,
+                          uint8_t * restrict spad,
+                          const uint32_t num_rows,
+                          const uint32_t row_elems,
+                          const size_t row_size,
+                          int32_t * op_params,
+                          int opt_path) {
+    float scale = 0.f;
+    float bias  = 0.f;
+    memcpy(&scale, &op_params[0], sizeof(float));
+    memcpy(&bias,  &op_params[1], sizeof(float));
+
+    for (uint32_t ir = 0; ir < num_rows; ir++) {
+        const float * restrict src_local = src + (ir * row_elems);
+        float * restrict dst_local       = dst + (ir * row_elems);
+
+        if (ir + 1 < num_rows) {
+            htp_l2fetch(src_local + row_elems, 1, row_size, row_size);
+        }
+
+        hvx_scale_offset_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale, bias);
+    }
+}
+
 static void rms_norm_htp_f32(const float * restrict src,
                              float * restrict dst,
                              uint8_t * restrict spad,

@@ -110,7 +135,7 @@ static void rms_norm_htp_f32(const float * restrict src,
             const float mean  = sum / row_elems;
             const float scale = 1.0f / sqrtf(mean + epsilon);

-            hvx_scale_f32((const uint8_t *) src_local, (uint8_t *) dst_local, row_elems, scale);
+            hvx_scale_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale);
         }
     }
 }

@@ -162,6 +187,9 @@ static void unary_job_f32_per_thread(const struct htp_tensor * src,
         case HTP_OP_RMS_NORM:
             rms_norm_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
            break;
+        case HTP_OP_SCALE:
+            scale_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
+            break;

         default:
             break;

@@ -195,6 +223,10 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
             unary_op_func = unary_job_dispatcher_f32;
             op_type = "rmsnorm-f32";
             break;
+        case HTP_OP_SCALE:
+            unary_op_func = unary_job_dispatcher_f32;
+            op_type = "scale-f32";
+            break;

         default:
             FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
@@ -219,6 +219,8 @@ struct ggml_metal_device_props {
     bool use_shared_buffers;

     bool supports_gpu_family_apple7;
+
+    int op_offload_min_batch_size;
 };

 ggml_metal_device_t ggml_metal_device_init(void);

@@ -782,6 +782,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
     dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];

+    dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+
     dev->props.max_buffer_size            = dev->mtl_device.maxBufferLength;
     dev->props.max_working_set_size       = dev->mtl_device.recommendedMaxWorkingSetSize;
     dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;

@@ -625,14 +625,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }

 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;

     return (op->op == GGML_OP_MUL_MAT ||
             op->op == GGML_OP_MUL_MAT_ID) &&
-           get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(op);
+           get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
 }

 static ggml_backend_device_i ggml_backend_metal_device_i = {
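Both the Metal change above and the SYCL change at the end of this diff replace the hard-coded batch threshold with a `GGML_OP_OFFLOAD_MIN_BATCH` environment override that defaults to the old value of 32. A minimal sketch of the pattern, with a hypothetical helper name (illustrative only):

    #include <stdlib.h>

    // Hypothetical helper showing the env-override-with-default pattern used
    // for GGML_OP_OFFLOAD_MIN_BATCH (falls back to 32 when the variable is unset).
    static int env_int_or_default(const char * name, int def) {
        const char * s = getenv(name);
        return s ? atoi(s) : def;
    }

Setting the variable to a larger value raises the batch size below which ops stay on the CPU; leaving it unset keeps the previous behavior.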
@@ -9148,6 +9148,7 @@ typedef decltype(kernel_mul_mm_id_map0<1>) kernel_mul_mm_id_map0_t;
 template [[host_name("kernel_mul_mm_id_map0_ne20_1" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<1>;
 template [[host_name("kernel_mul_mm_id_map0_ne20_2" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<2>;
 template [[host_name("kernel_mul_mm_id_map0_ne20_4" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<4>;
+template [[host_name("kernel_mul_mm_id_map0_ne20_5" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<5>;
 template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<6>;
 template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
 template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
@@ -57,6 +57,7 @@ set(GGML_OPENCL_KERNELS
     add
     add_id
     argsort
+    fill
     clamp
     cpy
     cvt

@@ -120,6 +121,8 @@ set(GGML_OPENCL_KERNELS
     tsembd
     upscale
     tanh
+    expm1
+    softplus
    pad
    repeat
    mul_mat_f16_f32
@@ -489,6 +489,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
     cl_kernel kernel_relu;
     cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
+    cl_kernel kernel_fill;
     cl_kernel kernel_clamp;
     cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_swiglu_oai, kernel_geglu_erf, kernel_geglu_quick,
               kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;

@@ -537,6 +538,10 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_pad;
     cl_kernel kernel_tanh_f32_nd;
     cl_kernel kernel_tanh_f16_nd;
+    cl_kernel kernel_expm1_f32_nd;
+    cl_kernel kernel_expm1_f16_nd;
+    cl_kernel kernel_softplus_f32_nd;
+    cl_kernel kernel_softplus_f16_nd;
     cl_kernel kernel_upscale;
     cl_kernel kernel_upscale_bilinear;
     cl_kernel kernel_concat_f32_contiguous;
@@ -787,6 +792,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // fill
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "fill.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("fill.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_fill = clCreateKernel(prog, "kernel_fill_f32", &err), err));
+        GGML_LOG_CONT(".");
+
+        CL_CHECK(clReleaseProgram(prog));
+    }
+
     // clamp
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1780,6 +1803,56 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         }
     }

+    // expm1
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "expm1.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("expm1.cl");
+#endif
+        cl_program prog;
+        if (!kernel_src.empty()) {
+            prog =
+                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+            CL_CHECK((backend_ctx->kernel_expm1_f32_nd = clCreateKernel(prog, "kernel_expm1_f32_nd", &err), err));
+            CL_CHECK((backend_ctx->kernel_expm1_f16_nd = clCreateKernel(prog, "kernel_expm1_f16_nd", &err), err));
+            GGML_LOG_CONT(".");
+        } else {
+            GGML_LOG_WARN("ggml_opencl: expm1 kernel source not found or empty. Expm1 operation will not be available.\n");
+            prog = nullptr;
+            backend_ctx->kernel_expm1_f32_nd = nullptr;
+            backend_ctx->kernel_expm1_f16_nd = nullptr;
+        }
+        if (prog) {
+            CL_CHECK(clReleaseProgram(prog));
+        }
+    }
+
+    // softplus
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "softplus.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("softplus.cl");
+#endif
+        cl_program prog;
+        if (!kernel_src.empty()) {
+            prog =
+                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+            CL_CHECK((backend_ctx->kernel_softplus_f32_nd = clCreateKernel(prog, "kernel_softplus_f32_nd", &err), err));
+            CL_CHECK((backend_ctx->kernel_softplus_f16_nd = clCreateKernel(prog, "kernel_softplus_f16_nd", &err), err));
+            GGML_LOG_CONT(".");
+        } else {
+            GGML_LOG_WARN("ggml_opencl: softplus kernel source not found or empty. Softplus operation will not be available.\n");
+            prog = nullptr;
+            backend_ctx->kernel_softplus_f32_nd = nullptr;
+            backend_ctx->kernel_softplus_f16_nd = nullptr;
+        }
+        if (prog) {
+            CL_CHECK(clReleaseProgram(prog));
+        }
+    }
+
     // upscale
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -3089,6 +3162,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             case GGML_UNARY_OP_TANH:
                 return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
                        (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
+            case GGML_UNARY_OP_EXPM1:
+                return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
+                       (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
+            case GGML_UNARY_OP_SOFTPLUS:
+                return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
+                       (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
             default:
                 return false;
         }
@@ -3104,6 +3183,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             default:
                 return false;
         }
+        case GGML_OP_FILL:
+            return op->type == GGML_TYPE_F32 && ggml_is_contiguous(op);
         case GGML_OP_CLAMP:
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_SOFT_MAX:
@@ -4266,8 +4347,8 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
 }

 static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    *free = 1;
-    *total = 1;
+    *free = 0;
+    *total = 0;

     GGML_UNUSED(dev);
 }
@@ -5860,6 +5941,36 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
     backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }

+static void ggml_cl_fill(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src0);
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    float v = 0.0f;
+    memcpy(&v, ((int32_t *) dst->op_params), sizeof(float));
+
+    const int n = ggml_nelements(dst);
+
+    cl_kernel kernel = backend_ctx->kernel_fill;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(float),    &v));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),      &n));
+
+    size_t local_work_size[1]  = { 256 };
+    size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] };
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
+}
+
 static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -6413,6 +6524,210 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
     backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }

+static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0_abs = extra0->offset + src0->view_offs;
+    cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+    if (dst->type == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_expm1_f32_nd;
+    } else if (dst->type == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_expm1_f16_nd;
+    } else {
+        GGML_ASSERT(false && "Unsupported type for ggml_cl_expm1");
+    }
+    GGML_ASSERT(kernel != nullptr);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = dst->ne[0];
+    const int ne11 = dst->ne[1];
+    const int ne12 = dst->ne[2];
+    const int ne13 = dst->ne[3];
+
+    const cl_ulong nb10 = dst->nb[0];
+    const cl_ulong nb11 = dst->nb[1];
+    const cl_ulong nb12 = dst->nb[2];
+    const cl_ulong nb13 = dst->nb[3];
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
+
+    CL_CHECK(clSetKernelArg(kernel, 4,  sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5,  sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6,  sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7,  sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8,  sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 9,  sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne11));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne12));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne13));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
+
+    size_t global_work_size[3];
+    if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
+        return;
+    }
+    global_work_size[0] = (size_t)ne10;
+    global_work_size[1] = (size_t)ne11;
+    global_work_size[2] = (size_t)ne12;
+
+    size_t lws0 = 16, lws1 = 4, lws2 = 1;
+    if (ne10 < 16) lws0 = ne10;
+    if (ne11 < 4)  lws1 = ne11;
+    if (ne12 < 1)  lws2 = ne12 > 0 ? ne12 : 1;
+
+    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
+    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
+    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
+
+    size_t local_work_size[] = {lws0, lws1, lws2};
+
+    size_t* local_work_size_ptr = local_work_size;
+    if (!backend_ctx->non_uniform_workgroups) {
+        if (global_work_size[0] % local_work_size[0] != 0 ||
+            global_work_size[1] % local_work_size[1] != 0 ||
+            global_work_size[2] % local_work_size[2] != 0) {
+            local_work_size_ptr = NULL;
+        }
+    }
+    if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+}
+
+static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0_abs = extra0->offset + src0->view_offs;
+    cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+    if (dst->type == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_softplus_f32_nd;
+    } else if (dst->type == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_softplus_f16_nd;
+    } else {
+        GGML_ASSERT(false && "Unsupported type for ggml_cl_softplus");
+    }
+    GGML_ASSERT(kernel != nullptr);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = dst->ne[0];
+    const int ne11 = dst->ne[1];
+    const int ne12 = dst->ne[2];
+    const int ne13 = dst->ne[3];
+
+    const cl_ulong nb10 = dst->nb[0];
+    const cl_ulong nb11 = dst->nb[1];
+    const cl_ulong nb12 = dst->nb[2];
+    const cl_ulong nb13 = dst->nb[3];
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
+
+    CL_CHECK(clSetKernelArg(kernel, 4,  sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5,  sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6,  sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7,  sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8,  sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 9,  sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne11));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne12));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne13));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
+
+    size_t global_work_size[3];
+    if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
+        return;
+    }
+    global_work_size[0] = (size_t)ne10;
+    global_work_size[1] = (size_t)ne11;
+    global_work_size[2] = (size_t)ne12;
+
+    size_t lws0 = 16, lws1 = 4, lws2 = 1;
+    if (ne10 < 16) lws0 = ne10;
+    if (ne11 < 4)  lws1 = ne11;
+    if (ne12 < 1)  lws2 = ne12 > 0 ? ne12 : 1;
+
+    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
+    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
+    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
+
+    size_t local_work_size[] = {lws0, lws1, lws2};
+
+    size_t* local_work_size_ptr = local_work_size;
+    if (!backend_ctx->non_uniform_workgroups) {
+        if (global_work_size[0] % local_work_size[0] != 0 ||
+            global_work_size[1] % local_work_size[1] != 0 ||
+            global_work_size[2] % local_work_size[2] != 0) {
+            local_work_size_ptr = NULL;
+        }
+    }
+    if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+}
+
 static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -9586,6 +9901,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_tanh;
             break;
+        case GGML_UNARY_OP_EXPM1:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_expm1;
+            break;
+        case GGML_UNARY_OP_SOFTPLUS:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_softplus;
+            break;
         default:
             return false;
     } break;
@@ -9595,6 +9922,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_glu;
             break;
+        case GGML_OP_FILL:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_fill;
+            break;
         case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;
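The new kernel files that follow are straightforward elementwise ops. One numerical note on the first: `kernel_expm1_*_nd` computes `exp(x) - 1` directly, which cancels for |x| near zero where the true result is approximately x; C99 and OpenCL C both provide an expm1 function that avoids this. A hedged sketch of the difference (not part of the patch):

    #include <math.h>

    // C99 expm1f (OpenCL C has an analogous expm1 builtin) keeps precision
    // where exp(x) - 1 cancels; for tiny x the direct form collapses to 0.
    static float expm1_direct(float x)   { return expf(x) - 1.0f; }
    static float expm1_accurate(float x) { return expm1f(x); }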
@ -0,0 +1,82 @@
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// expm1
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
kernel void kernel_expm1_f32_nd(
|
||||||
|
global void * p_src0_base,
|
||||||
|
ulong off_src0_abs,
|
||||||
|
global void * p_dst_base,
|
||||||
|
ulong off_dst_abs,
|
||||||
|
int ne00,
|
||||||
|
int ne01,
|
||||||
|
int ne02,
|
||||||
|
int ne03,
|
||||||
|
ulong nb00,
|
||||||
|
ulong nb01,
|
||||||
|
ulong nb02,
|
||||||
|
ulong nb03,
|
||||||
|
int ne10,
|
||||||
|
int ne11,
|
||||||
|
int ne12,
|
||||||
|
int ne13,
|
||||||
|
ulong nb10,
|
||||||
|
ulong nb11,
|
||||||
|
ulong nb12,
|
||||||
|
ulong nb13
|
||||||
|
) {
|
||||||
|
int i0 = get_global_id(0);
|
||||||
|
int i1 = get_global_id(1);
|
||||||
|
int i2 = get_global_id(2);
|
||||||
|
|
||||||
|
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
|
||||||
|
for (int i3 = 0; i3 < ne13; ++i3) {
|
||||||
|
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
|
||||||
|
global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
|
||||||
|
|
||||||
|
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
|
||||||
|
global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
|
||||||
|
|
||||||
|
*dst_val_ptr = exp(*src_val_ptr) - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
kernel void kernel_expm1_f16_nd(
|
||||||
|
global void * p_src0_base,
|
||||||
|
ulong off_src0_abs,
|
||||||
|
global void * p_dst_base,
|
||||||
|
ulong off_dst_abs,
|
||||||
|
int ne00,
|
||||||
|
int ne01,
|
||||||
|
int ne02,
|
||||||
|
int ne03,
|
||||||
|
ulong nb00,
|
||||||
|
ulong nb01,
|
||||||
|
ulong nb02,
|
||||||
|
ulong nb03,
|
||||||
|
int ne10,
|
||||||
|
int ne11,
|
||||||
|
int ne12,
|
||||||
|
int ne13,
|
||||||
|
ulong nb10,
|
||||||
|
ulong nb11,
|
||||||
|
ulong nb12,
|
||||||
|
ulong nb13
|
||||||
|
) {
|
||||||
|
int i0 = get_global_id(0);
|
||||||
|
int i1 = get_global_id(1);
|
||||||
|
int i2 = get_global_id(2);
|
||||||
|
|
||||||
|
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
|
||||||
|
for (int i3 = 0; i3 < ne13; ++i3) {
|
||||||
|
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
|
||||||
|
global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
|
||||||
|
|
||||||
|
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
|
||||||
|
global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
|
||||||
|
|
||||||
|
*dst_val_ptr = exp(*src_val_ptr) - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,17 @@
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// fill
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
__kernel void kernel_fill_f32(
|
||||||
|
__global float *dst,
|
||||||
|
ulong offsetd,
|
||||||
|
float v,
|
||||||
|
int n
|
||||||
|
|
||||||
|
) {
|
||||||
|
dst = (global float*)((global char*)dst + offsetd);
|
||||||
|
if(get_global_id(0) < n){
|
||||||
|
dst[get_global_id(0)] = v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@@ -0,0 +1,88 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

//------------------------------------------------------------------------------
// softplus
//------------------------------------------------------------------------------
inline float softplus_f32(float x) {
    float ax = fabs(x);
    float m = fmax(x, 0.0f);
    return log1p(exp(-ax)) + m;
}

kernel void kernel_softplus_f32_nd(
    global void * p_src0_base,
    ulong off_src0_abs,
    global void * p_dst_base,
    ulong off_dst_abs,
    int ne00,
    int ne01,
    int ne02,
    int ne03,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne10,
    int ne11,
    int ne12,
    int ne13,
    ulong nb10,
    ulong nb11,
    ulong nb12,
    ulong nb13
) {
    int i0 = get_global_id(0);
    int i1 = get_global_id(1);
    int i2 = get_global_id(2);

    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
        for (int i3 = 0; i3 < ne13; ++i3) {
            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
            global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);

            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
            global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);

            *dst_val_ptr = softplus_f32(*src_val_ptr);
        }
    }
}

kernel void kernel_softplus_f16_nd(
    global void * p_src0_base,
    ulong off_src0_abs,
    global void * p_dst_base,
    ulong off_dst_abs,
    int ne00,
    int ne01,
    int ne02,
    int ne03,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne10,
    int ne11,
    int ne12,
    int ne13,
    ulong nb10,
    ulong nb11,
    ulong nb12,
    ulong nb13
) {
    int i0 = get_global_id(0);
    int i1 = get_global_id(1);
    int i2 = get_global_id(2);

    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
        for (int i3 = 0; i3 < ne13; ++i3) {
            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
            global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);

            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
            global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);

            *dst_val_ptr = (half)(softplus_f32((float)(*src_val_ptr)));
        }
    }
}
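The `softplus_f32` helper uses the standard overflow-safe rearrangement of softplus. For x >= 0, factor e^x out of the logarithm; for x < 0 the naive form is already safe, and the two branches combine into

$$ \operatorname{softplus}(x) = \ln\!\left(1 + e^{x}\right) = \max(x, 0) + \operatorname{log1p}\!\left(e^{-|x|}\right) $$

so `exp` is only ever applied to a non-positive argument (no overflow for large x), and `log1p` preserves precision when $e^{-|x|}$ is tiny.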
@@ -4286,6 +4286,7 @@ struct ggml_backend_sycl_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) {

@@ -4674,9 +4675,8 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    return get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
+    ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return get_op_batch_size(op) >= sycl_ctx->op_offload_min_batch_size;
 }
 
 static ggml_backend_event_t

@@ -4799,6 +4799,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
             ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
             for (int i = 0; i < ggml_sycl_info().device_count; i++) {
                 ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context;

@@ -4812,6 +4813,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
                     prop, dpct::dev_mgr::instance().get_device(i))));
 
                 dev_ctx->description = prop.get_name();
+                dev_ctx->op_offload_min_batch_size = min_batch_size;
 
                 ggml_backend_dev_t dev = new ggml_backend_device {
                     /* .iface = */ ggml_backend_sycl_device_interface,
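A caveat on the `GGML_OP_OFFLOAD_MIN_BATCH` override introduced above: `atoi` returns 0 for non-numeric input, so a malformed value would make every batch eligible for offload. A more defensive parse, as an illustrative sketch (not part of the diff):

// Hypothetical helper: parse GGML_OP_OFFLOAD_MIN_BATCH, falling back to the
// default when the variable is unset or not a valid positive integer.
#include <cstdlib>

static int get_offload_min_batch(int fallback = 32) {
    const char * s = std::getenv("GGML_OP_OFFLOAD_MIN_BATCH");
    if (!s) {
        return fallback;
    }
    char * end = nullptr;
    const long v = std::strtol(s, &end, 10);
    return (end != s && *end == '\0' && v > 0) ? (int) v : fallback;
}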
@@ -119,6 +119,8 @@ struct ggml_backend_vk_context;
 // Max number of adds that can be fused without exceeding MAX_PARAMETER_COUNT.
 #define MAX_FUSED_ADDS (MAX_PARAMETER_COUNT - 3)
 
+typedef std::shared_ptr<struct vk_pipeline_struct> vk_pipeline;
+
 struct vk_pipeline_struct {
     std::string name;
     vk::ShaderModule shader_module;

@@ -136,9 +138,15 @@ struct vk_pipeline_struct {
     std::atomic<bool> compiled {};
     // number of registers used, extracted from pipeline executable properties
     uint32_t register_count {};
+
+#if defined(VK_EXT_shader_64bit_indexing)
+    bool is_64b_indexing {};
+#endif
+    // linked list of pipelines for multiple compilation variants.
+    // currently only used to compile a 64-bit indexing variant.
+    vk_pipeline next;
 };
 
-typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
 typedef std::weak_ptr<vk_pipeline_struct> vk_pipeline_ref;
 
 static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);

@@ -230,9 +238,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .is_host = */ NULL,
 };
 
-#ifdef GGML_VULKAN_MEMORY_DEBUG
 class vk_memory_logger;
-#endif
 class vk_perf_logger;
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
 static void ggml_vk_synchronize(ggml_backend_vk_context * ctx);

@@ -550,6 +556,8 @@ struct vk_device_struct {
     uint64_t max_memory_allocation_size;
     uint64_t max_buffer_size;
     uint64_t suballocation_block_size;
+    uint64_t min_imported_host_pointer_alignment;
+    bool external_memory_host {};
     bool fp16;
     bool bf16;
     bool pipeline_robustness;

@@ -568,6 +576,7 @@ struct vk_device_struct {
     bool uma;
     bool prefer_host_memory;
     bool float_controls_rte_fp16;
+    bool subgroup_basic;
     bool subgroup_arithmetic;
     bool subgroup_shuffle;
     bool subgroup_ballot;

@@ -581,6 +590,8 @@ struct vk_device_struct {
     bool add_rms_fusion;
     uint32_t partials_binding_alignment;
 
+    bool shader_64b_indexing;
+
     bool integer_dot_product;
     // 0: default, 1: force mmvq, -1: disable mmvq
     int32_t mmvq_mode;

@@ -812,9 +823,7 @@ struct vk_device_struct {
     bool allow_sysmem_fallback;
     bool disable_graph_optimize;
 
-#ifdef GGML_VULKAN_MEMORY_DEBUG
     std::unique_ptr<vk_memory_logger> memory_logger;
-#endif
 
     ~vk_device_struct() {
         VK_LOG_DEBUG("destroy device " << name);

@@ -1502,6 +1511,11 @@ template <> void init_pushconst_fastdiv(vk_op_sum_rows_push_constants &p) {
     init_fastdiv_values(p.ne01, p.ne0_1mp, p.ne0_1L);
 }
 
+struct vk_quantize_q8_1_push_constants {
+    uint32_t ne;
+    uint32_t num_blocks;
+};
+
 // Allow pre-recording command buffers
 struct vk_staging_memcpy {
     vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}

@@ -1545,8 +1559,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
 static void ggml_vk_load_shaders(vk_device& device);
 static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx);
 
-#if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
-#define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
+static bool vk_memory_logger_enabled = false;
+
+#define VK_LOG_MEMORY(msg) if (vk_memory_logger_enabled) { std::cerr << "ggml_vulkan memory: " << msg << std::endl; }
 
 static std::string format_size(size_t size) {
     const size_t kib = 1024;

@@ -1579,10 +1594,10 @@ private:
     std::map<vk::Buffer, size_t> allocations; // Track allocations
     size_t total_device;
     size_t total_host;
+    static std::mutex log_mutex;
 };
-#else
-#define VK_LOG_MEMORY(msg) ((void) 0)
-#endif // GGML_VULKAN_MEMORY_DEBUG
+std::mutex vk_memory_logger::log_mutex;
 
 static bool vk_perf_logger_enabled = false;
 static bool vk_perf_logger_concurrent = false;

@@ -1889,10 +1904,10 @@ struct ggml_backend_vk_buffer_context {
     }
 };
 
-#ifdef GGML_VULKAN_MEMORY_DEBUG
-static std::mutex log_mutex;
-
 void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
+    if (!vk_memory_logger_enabled) {
+        return;
+    }
     std::lock_guard<std::mutex> guard(log_mutex);
     vk_buffer buf = buf_ref.lock();
     const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);

@@ -1904,7 +1919,7 @@ void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
 }
 
 void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
-    if (buf_ref.expired() || buf_ref.lock()->size == 0) {
+    if (buf_ref.expired() || buf_ref.lock()->size == 0 || !vk_memory_logger_enabled) {
         return;
     }
 

@@ -1922,7 +1937,6 @@ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
         VK_LOG_MEMORY("ERROR " << buf->device->name << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
     }
 }
-#endif // GGML_VULKAN_MEMORY_DEBUG
 
 struct vk_instance_t {
     vk::Instance instance;

@@ -2072,6 +2086,19 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
         compute_pipeline_create_info.setPNext(&rci);
     }
 
+#if defined(VK_EXT_shader_64bit_indexing)
+    vk::PipelineCreateFlags2CreateInfo pipelineFlags2CreateInfo;
+    if (pipeline->is_64b_indexing)
+    {
+        pipelineFlags2CreateInfo.flags = vk::PipelineCreateFlagBits2::e64BitIndexingEXT;
+        if (device->pipeline_executable_properties_support) {
+            pipelineFlags2CreateInfo.flags |= vk::PipelineCreateFlagBits2::eCaptureStatisticsKHR;
+        }
+        pipelineFlags2CreateInfo.setPNext(compute_pipeline_create_info.pNext);
+        compute_pipeline_create_info.setPNext(&pipelineFlags2CreateInfo);
+    }
+#endif
+
     try {
         pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
     } catch (const vk::SystemError& e) {

@@ -2410,7 +2437,8 @@ static std::vector<uint32_t> ggml_vk_find_memory_properties(const vk::PhysicalDe
     return indices;
 }
 
-static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list) {
+static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list,
+                                       void *import_ptr = nullptr) {
     VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")");
     if (size > device->max_buffer_size) {
         throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit");

@@ -2439,6 +2467,12 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
         nullptr,
     };
 
+    vk::ExternalMemoryBufferCreateInfo external_memory_bci;
+    if (import_ptr) {
+        external_memory_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
+        buffer_create_info.setPNext(&external_memory_bci);
+    }
+
     buf->buffer = device->device.createBuffer(buffer_create_info);
 
     vk::MemoryRequirements mem_req = device->device.getBufferMemoryRequirements(buf->buffer);

@@ -2453,35 +2487,80 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
         mem_flags_info.setPNext(&mem_priority_info);
     }
 
-    for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) {
-        const auto & req_flags = *it;
-
-        const std::vector<uint32_t> memory_type_indices = ggml_vk_find_memory_properties(&mem_props, &mem_req, req_flags);
-
-        if (memory_type_indices.empty()) {
-            continue;
-        }
-        buf->memory_property_flags = req_flags;
-
-        bool done = false;
-
-        for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) {
-            try {
-                buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info });
-                done = true;
-                break;
-            } catch (const vk::SystemError& e) {
-                // loop and retry
-                // during last attempt throw the exception
-                if (it + 1 == req_flags_list.end() && mtype_it + 1 == memory_type_indices.end()) {
-                    device->device.destroyBuffer(buf->buffer);
-                    throw e;
-                }
-            }
-        }
-
-        if (done) {
-            break;
+    if (import_ptr) {
+        vk::MemoryHostPointerPropertiesEXT host_pointer_props;
+        try {
+            host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, import_ptr);
+        } catch (vk::SystemError& e) {
+            GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what());
+            device->device.destroyBuffer(buf->buffer);
+            return {};
+        }
+        vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
+
+        uint32_t memory_type_idx;
+        vk::MemoryPropertyFlags property_flags = *req_flags_list.begin();
+        for (memory_type_idx = 0; memory_type_idx < 32; ++memory_type_idx) {
+            if (!(host_pointer_props.memoryTypeBits & (1u << memory_type_idx))) {
+                continue;
+            }
+            if (!(mem_req.memoryTypeBits & (1u << memory_type_idx))) {
+                continue;
+            }
+
+            vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx];
+            // check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed
+            if ((memory_type.propertyFlags & property_flags) == property_flags) {
+                property_flags = memory_type.propertyFlags;
+                break;
+            }
+        }
+        if (memory_type_idx == 32) {
+            GGML_LOG_WARN("ggml_vulkan: Memory type for host allocation not found\n");
+            device->device.destroyBuffer(buf->buffer);
+            return {};
+        }
+
+        buf->memory_property_flags = mem_props.memoryTypes[memory_type_idx].propertyFlags;
+        try {
+            vk::ImportMemoryHostPointerInfoEXT import_info;
+            import_info.handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
+            import_info.pHostPointer = import_ptr;
+            import_info.setPNext(&mem_flags_info);
+            buf->device_memory = device->device.allocateMemory({ size, memory_type_idx, &import_info });
+        } catch (const vk::SystemError& e) {
+        }
+    } else {
+        for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) {
+            const auto & req_flags = *it;
+
+            const std::vector<uint32_t> memory_type_indices = ggml_vk_find_memory_properties(&mem_props, &mem_req, req_flags);
+
+            if (memory_type_indices.empty()) {
+                continue;
+            }
+            buf->memory_property_flags = req_flags;
+
+            bool done = false;
+
+            for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) {
+                try {
+                    buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info });
+                    done = true;
+                    break;
+                } catch (const vk::SystemError& e) {
+                    // loop and retry
+                    // during last attempt throw the exception
+                    if (it + 1 == req_flags_list.end() && mtype_it + 1 == memory_type_indices.end()) {
+                        device->device.destroyBuffer(buf->buffer);
+                        throw e;
+                    }
+                }
+            }
+
+            if (done) {
+                break;
+            }
         }
     }

@@ -2492,8 +2571,12 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
 
     buf->ptr = nullptr;
 
-    if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
-        buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
+    if (import_ptr) {
+        buf->ptr = import_ptr;
+    } else {
+        if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+            buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
+        }
     }
 
     device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);

@@ -2506,9 +2589,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
         buf->bda_addr = device->device.getBufferAddress(addressInfo);
     }
 
-#ifdef GGML_VULKAN_MEMORY_DEBUG
     device->memory_logger->log_allocation(buf, size);
-#endif
 
     return buf;
 }
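Taken together, the changes to ggml_vk_create_buffer above follow the usual VK_EXT_external_memory_host import sequence: opt the buffer into external memory, query which memory types can back the host pointer, intersect that with the buffer's requirements, then allocate with an import-info chained in. A condensed sketch of that flow, assuming Vulkan-Hpp and a pre-validated pointer; all names here are illustrative, not the actual ggml code:

// Illustrative sketch of the VK_EXT_external_memory_host flow used above
// (not the exact ggml code). host_ptr and size must both be multiples of
// the device's minImportedHostPointerAlignment.
vk::Buffer import_host_buffer(vk::Device device, vk::DeviceSize size, void * host_ptr,
                              vk::DeviceMemory & out_memory) {
    vk::ExternalMemoryBufferCreateInfo ext_info;
    ext_info.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;

    vk::BufferCreateInfo buf_info({}, size, vk::BufferUsageFlagBits::eStorageBuffer);
    buf_info.pNext = &ext_info;                          // opt the buffer into external memory
    vk::Buffer buffer = device.createBuffer(buf_info);

    // Which memory types can back this particular host allocation?
    vk::MemoryHostPointerPropertiesEXT props = device.getMemoryHostPointerPropertiesEXT(
        vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, host_ptr);

    // Intersect with the buffer's own requirements, then take the first usable
    // type (the real code above additionally filters by property flags).
    uint32_t type_bits = props.memoryTypeBits & device.getBufferMemoryRequirements(buffer).memoryTypeBits;
    uint32_t mem_type = 0;
    while (mem_type < 32 && !(type_bits & (1u << mem_type))) {
        ++mem_type;
    }

    vk::ImportMemoryHostPointerInfoEXT import_info(
        vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, host_ptr);
    out_memory = device.allocateMemory({ size, mem_type, &import_info });
    device.bindBufferMemory(buffer, out_memory, 0);      // the mapped pointer is host_ptr itself
    return buffer;
}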
@ -2565,11 +2646,9 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
|
||||||
if (buf->device != nullptr) {
|
if (buf->device != nullptr) {
|
||||||
buf->device->memory_logger->log_deallocation(buf);
|
buf->device->memory_logger->log_deallocation(buf);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
buf.reset();
|
buf.reset();
|
||||||
}
|
}
|
||||||
|
|
@ -2938,6 +3017,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
|
if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
|
||||||
m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
|
m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
|
||||||
m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
|
m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
|
||||||
|
} else if (device->vendor_id == VK_VENDOR_ID_AMD && device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary) {
|
||||||
|
// This is intentionally using tx_m values, slight performance increase
|
||||||
|
l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||||
|
l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||||
|
l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 };
|
||||||
|
} else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) {
|
||||||
|
// Xe2/Xe3 with coopmat enabled - warptile performance tuning
|
||||||
|
l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||||
|
l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||||
}
|
}
|
||||||
|
|
||||||
l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
|
l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
|
||||||
|
|
@ -2993,7 +3081,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::future<void>> compiles;
|
std::vector<std::future<void>> compiles;
|
||||||
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const char *name, size_t spv_size, const void* spv_data, const char *entrypoint,
|
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& base_pipeline, const char *name, size_t spv_size, const void* spv_data, const char *entrypoint,
|
||||||
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
|
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
|
||||||
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
|
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
|
||||||
|
|
||||||
|
|
@ -3001,35 +3089,49 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
required_subgroup_size = get_subgroup_size(name, device->architecture);
|
required_subgroup_size = get_subgroup_size(name, device->architecture);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!pipeline) {
|
vk_pipeline *ptr = &base_pipeline;
|
||||||
pipeline = std::make_shared<vk_pipeline_struct>();
|
|
||||||
}
|
|
||||||
if (!pipeline->initialized) {
|
|
||||||
pipeline->name = name;
|
|
||||||
pipeline->parameter_count = parameter_count;
|
|
||||||
pipeline->push_constant_size = push_constant_size;
|
|
||||||
pipeline->wg_denoms = wg_denoms;
|
|
||||||
pipeline->align = align;
|
|
||||||
pipeline->initialized = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!pipeline->needed || pipeline->compiled) {
|
int num_pipelines = 1;
|
||||||
return;
|
#if defined(VK_EXT_shader_64bit_indexing)
|
||||||
|
if (device->shader_64b_indexing) {
|
||||||
|
num_pipelines = 2;
|
||||||
}
|
}
|
||||||
// TODO: We're no longer benefitting from the async compiles (shaders are
|
#endif
|
||||||
// compiled individually, as needed) and this complexity can be removed.
|
for (int i = 0; i < num_pipelines; ++i, ptr = &(*ptr)->next) {
|
||||||
{
|
vk_pipeline &pipeline = *ptr;
|
||||||
// wait until fewer than N compiles are in progress
|
if (!pipeline) {
|
||||||
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
pipeline = std::make_shared<vk_pipeline_struct>();
|
||||||
std::unique_lock<std::mutex> guard(compile_count_mutex);
|
}
|
||||||
while (compile_count >= N) {
|
if (!pipeline->initialized) {
|
||||||
compile_count_cond.wait(guard);
|
pipeline->name = name;
|
||||||
|
pipeline->parameter_count = parameter_count;
|
||||||
|
pipeline->push_constant_size = push_constant_size;
|
||||||
|
pipeline->wg_denoms = wg_denoms;
|
||||||
|
pipeline->align = align;
|
||||||
|
pipeline->initialized = true;
|
||||||
|
#if defined(VK_EXT_shader_64bit_indexing)
|
||||||
|
pipeline->is_64b_indexing = (i == 1);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
compile_count++;
|
|
||||||
}
|
|
||||||
|
|
||||||
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint,
|
if (!pipeline->needed || pipeline->compiled) {
|
||||||
parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
|
continue;
|
||||||
|
}
|
||||||
|
// TODO: We're no longer benefitting from the async compiles (shaders are
|
||||||
|
// compiled individually, as needed) and this complexity can be removed.
|
||||||
|
{
|
||||||
|
// wait until fewer than N compiles are in progress
|
||||||
|
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
||||||
|
std::unique_lock<std::mutex> guard(compile_count_mutex);
|
||||||
|
while (compile_count >= N) {
|
||||||
|
compile_count_cond.wait(guard);
|
||||||
|
}
|
||||||
|
compile_count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint,
|
||||||
|
parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
auto const &ggml_vk_create_pipeline2 = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const char *entrypoint,
|
auto const &ggml_vk_create_pipeline2 = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const char *entrypoint,
|
||||||
|
|
@ -3278,12 +3380,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
|
|
||||||
GGML_ASSERT(device->subgroup_ballot);
|
GGML_ASSERT(device->subgroup_ballot);
|
||||||
|
|
||||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||||
#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
|
#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
|
||||||
if (device->coopmat_bf16_support) {
|
if (device->coopmat_bf16_support) {
|
||||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id);
|
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
@ -3391,9 +3493,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) {
|
if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) {
|
||||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||||
|
|
||||||
CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||||
|
|
@ -3435,9 +3537,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||||
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||||
|
|
||||||
CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||||
|
|
@ -3552,9 +3654,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) {
|
if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) {
|
||||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_subgroup_f16, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_subgroup_f16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_subgroup_f16_f32, , wg_denoms, warptile_id, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_subgroup_f16_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16);
|
||||||
|
|
||||||
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_subgroup_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_subgroup_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||||
|
|
@ -3578,9 +3680,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_subgroup_iq4_nl_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_subgroup_iq4_nl_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||||
CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f32acc, matmul_id_subgroup_mxfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f32acc, matmul_id_subgroup_mxfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size);
|
||||||
} else {
|
} else {
|
||||||
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||||
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, mul_mat_id_param_count, _id, 0);
|
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||||
|
|
||||||
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||||
|
|
@ -3620,6 +3722,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
m_wg_denoms = { 64, 64, 1 };
|
m_wg_denoms = { 64, 64, 1 };
|
||||||
s_wg_denoms = { 32, 32, 1 };
|
s_wg_denoms = { 32, 32, 1 };
|
||||||
|
|
||||||
|
if (device->vendor_id == VK_VENDOR_ID_INTEL && device->architecture == INTEL_XE2) {
|
||||||
|
// Xe2/Xe3 - bf16 warptile performance tuning
|
||||||
|
l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, 4, 4, 1, subgroup_size_8 };
|
||||||
|
}
|
||||||
|
|
||||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0);
|
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0);
|
||||||
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0);
|
||||||
}
|
}
|
||||||
|
|
@ -3773,22 +3880,22 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size;
|
const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size;
|
||||||
const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4);
|
const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_q8_1_f32", arr_dmmv_id_q4_1_q8_1_f32_len[reduc], arr_dmmv_id_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_q8_1_f32", arr_dmmv_id_q4_1_q8_1_f32_len[reduc], arr_dmmv_id_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_q8_1_f32", arr_dmmv_id_q5_0_q8_1_f32_len[reduc], arr_dmmv_id_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_q8_1_f32", arr_dmmv_id_q5_0_q8_1_f32_len[reduc], arr_dmmv_id_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_q8_1_f32", arr_dmmv_id_q5_1_q8_1_f32_len[reduc], arr_dmmv_id_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_q8_1_f32", arr_dmmv_id_q5_1_q8_1_f32_len[reduc], arr_dmmv_id_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_q8_1_f32", arr_dmmv_id_q8_0_q8_1_f32_len[reduc], arr_dmmv_id_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_q8_1_f32", arr_dmmv_id_q8_0_q8_1_f32_len[reduc], arr_dmmv_id_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_q8_1_f32", arr_dmmv_id_mxfp4_q8_1_f32_len[reduc], arr_dmmv_id_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_q8_1_f32", arr_dmmv_id_mxfp4_q8_1_f32_len[reduc], arr_dmmv_id_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_q8_1_f32", arr_dmmv_id_q2_k_q8_1_f32_len[reduc], arr_dmmv_id_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_q8_1_f32", arr_dmmv_id_q2_k_q8_1_f32_len[reduc], arr_dmmv_id_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_q8_1_f32", arr_dmmv_id_q3_k_q8_1_f32_len[reduc], arr_dmmv_id_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_q8_1_f32", arr_dmmv_id_q3_k_q8_1_f32_len[reduc], arr_dmmv_id_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
|
||||||
}
|
}
|
||||||
#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
|
#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
|
||||||
}
|
}
|
||||||
|
|
@ -3876,9 +3983,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, 5 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, 5 * sizeof(uint32_t), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
|
||||||
|
|
||||||
if (device->subgroup_clustered && device->subgroup_require_full_support) {
|
if (device->subgroup_clustered && device->subgroup_require_full_support) {
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
|
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
|
||||||
} else {
|
} else {
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
|
for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
|
||||||
|
|
@ -4086,9 +4193,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f32, "add1_f16_f32", add1_f16_f32_len, add1_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f32, "add1_f16_f32", add1_f16_f32_len, add1_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_add1_f32_f32, "add1_f32_f32", add1_f32_f32_len, add1_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_add1_f32_f32, "add1_f32_f32", add1_f32_f32_len, add1_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
||||||
#define CREATE_GLU(name) \
|
#define CREATE_GLU(name) \
|
||||||
if (device->float_controls_rte_fp16) { \
|
if (device->float_controls_rte_fp16) { \
|
||||||
|
|
@@ -4234,8 +4341,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);

     if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
-        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true);
-        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
+        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size}, 1, true, true);
+        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size}, 1, true, true);
     } else {
         ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true);
         ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
@@ -4362,9 +4469,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
     vk_device device = std::make_shared<vk_device_struct>();
     vk_instance.devices[idx] = device;

-#ifdef GGML_VULKAN_MEMORY_DEBUG
     device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
-#endif

     size_t dev_num = vk_instance.device_indices[idx];

@@ -4402,6 +4507,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
     bool pipeline_executable_properties_support = false;
     device->coopmat_support = false;
     device->integer_dot_product = false;
+    device->shader_64b_indexing = false;
     bool bfloat16_support = false;

     for (const auto& properties : ext_props) {
@@ -4447,6 +4553,12 @@ static vk_device ggml_vk_get_device(size_t idx) {
         } else if (strcmp("VK_EXT_memory_priority", properties.extensionName) == 0 &&
                    getenv("GGML_VK_ENABLE_MEMORY_PRIORITY")) {
             device->memory_priority = true;
+        } else if (strcmp("VK_EXT_external_memory_host", properties.extensionName) == 0) {
+            device->external_memory_host = true;
+#if defined(VK_EXT_shader_64bit_indexing)
+        } else if (strcmp("VK_EXT_shader_64bit_indexing", properties.extensionName) == 0) {
+            device->shader_64b_indexing = true;
+#endif
         }
     }

@@ -4461,6 +4573,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
     vk::PhysicalDeviceVulkan12Properties vk12_props;
     vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
     vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR shader_integer_dot_product_props;
+    vk::PhysicalDeviceExternalMemoryHostPropertiesEXT external_memory_host_props;

     props2.pNext = &props3;
     props3.pNext = &subgroup_props;
@@ -4500,11 +4613,22 @@ static vk_device ggml_vk_get_device(size_t idx) {
         last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_props;
     }

+    if (device->external_memory_host) {
+        last_struct->pNext = (VkBaseOutStructure *)&external_memory_host_props;
+        last_struct = (VkBaseOutStructure *)&external_memory_host_props;
+    }
+
     device->physical_device.getProperties2(&props2);
     device->properties = props2.properties;
     device->vendor_id = device->properties.vendorID;
     device->driver_id = driver_props.driverID;

+    if (device->driver_id == vk::DriverId::eMoltenvk) {
+        // Disable external_memory_host until https://github.com/KhronosGroup/MoltenVK/pull/2622
+        // is available in the Vulkan SDK.
+        device->external_memory_host = false;
+    }
+
     // Implementing the async backend interfaces seems broken on older Intel HW,
     // see https://github.com/ggml-org/llama.cpp/issues/17302.
     device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL ||
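The additions above follow the usual Vulkan pNext idiom: each optional property struct is appended to a tail pointer, then a single `getProperties2` call fills the whole chain. A self-contained toy version of the pattern (plain structs standing in for the Vulkan types):

```cpp
#include <cstdio>

// Toy re-creation of the VkBaseOutStructure chaining idiom:
// keep a tail pointer, append each optional struct, then one query fills all.
struct base_out { int sType; base_out * pNext; };
struct ext_props { base_out hdr; unsigned long long min_alignment; };

int main() {
    base_out props2 { 0, nullptr };
    base_out * last_struct = &props2;

    ext_props external_memory_host { {1, nullptr}, 0 };
    bool have_ext = true;              // stands in for device->external_memory_host
    if (have_ext) {
        last_struct->pNext = &external_memory_host.hdr;
        last_struct = &external_memory_host.hdr;
    }

    // a driver would now walk props2.pNext and fill each struct it recognizes
    for (base_out * p = props2.pNext; p; p = p->pNext) {
        std::printf("chained sType %d\n", p->sType);
    }
}
```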
@@ -4557,6 +4681,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
     }
     device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16;

+    device->subgroup_basic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
+                             (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eBasic);
     device->subgroup_arithmetic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
                                   (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic);
 #ifdef __APPLE__
@@ -4586,6 +4712,8 @@ static vk_device ggml_vk_get_device(size_t idx) {

     device->integer_dot_product = device->integer_dot_product && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated;

+    device->min_imported_host_pointer_alignment = external_memory_host_props.minImportedHostPointerAlignment;
+
     device->max_workgroup_size_log2 = uint32_t(log2f(float(device->properties.limits.maxComputeWorkGroupInvocations)));

     std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device.getQueueFamilyProperties();
@@ -4717,6 +4845,20 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device_extensions.push_back("VK_KHR_pipeline_executable_properties");
     }

+    if (device->external_memory_host) {
+        device_extensions.push_back("VK_EXT_external_memory_host");
+    }
+
+#if defined(VK_EXT_shader_64bit_indexing)
+    VkPhysicalDeviceShader64BitIndexingFeaturesEXT shader_64bit_indexing_features {};
+    shader_64bit_indexing_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_64_BIT_INDEXING_FEATURES_EXT;
+    if (device->shader_64b_indexing) {
+        last_struct->pNext = (VkBaseOutStructure *)&shader_64bit_indexing_features;
+        last_struct = (VkBaseOutStructure *)&shader_64bit_indexing_features;
+        device_extensions.push_back("VK_EXT_shader_64bit_indexing");
+    }
+#endif
+
     vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);

     device->pipeline_executable_properties_support = pipeline_executable_properties_support;
@@ -4983,11 +5125,23 @@ static vk_device ggml_vk_get_device(size_t idx) {
         switch (device->vendor_id) {
 #ifndef GGML_VULKAN_RUN_TESTS
         case VK_VENDOR_ID_AMD:
+            device->mul_mat_l[i] = device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary;
+            device->mul_mat_m[i] = true;
+            device->mul_mat_s[i] = true;
+            device->mul_mat_id_l[i] = false;
+            device->mul_mat_id_m[i] = true;
+            device->mul_mat_id_s[i] = true;
+            break;
         case VK_VENDOR_ID_INTEL:
-            device->mul_mat_l[i] = false;
+            if (!device->coopmat_support || device->architecture != INTEL_XE2) {
+                device->mul_mat_l[i] = false;
+                device->mul_mat_id_l[i] = false;
+            } else {
+                device->mul_mat_l[i] = true; // if coopmat & XE2+, allow large matmul warptile config for Intel
+                device->mul_mat_id_l[i] = true;
+            }
             device->mul_mat_m[i] = true;
             device->mul_mat_s[i] = true;
-            device->mul_mat_id_l[i] = false;
             device->mul_mat_id_m[i] = true;
             device->mul_mat_id_s[i] = true;
             break;
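For context on the l/m/s flags being toggled here: they gate which warptile tiers the matmul pipeline chooser may pick for a given quant type. A standalone sketch of that fallback order (the tier names and the consuming logic are assumptions about how `ggml_vk_guess_matmul_pipeline` uses the flags, not its actual implementation):

```cpp
#include <cstdio>

// Minimal stand-in for the per-type capability flags set in the diff above.
struct device_caps {
    bool mul_mat_l, mul_mat_m, mul_mat_s;
};

// Pick the largest warptile tier the device opted into.
const char * pick_warptile(const device_caps & d) {
    if (d.mul_mat_l) return "large";
    if (d.mul_mat_m) return "medium";
    return "small"; // mul_mat_s is the universal fallback
}

int main() {
    device_caps intel_xe2 { true, true, true };   // coopmat + Xe2: large tier now allowed
    device_caps intel_old { false, true, true };  // pre-Xe2: large tier still disabled
    std::printf("%s / %s\n", pick_warptile(intel_xe2), pick_warptile(intel_old));
}
```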
@@ -5312,6 +5466,7 @@ static void ggml_vk_instance_init() {
     vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
     vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr;
     vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr;
+    vk_memory_logger_enabled = getenv("GGML_VK_MEMORY_LOGGER") != nullptr;
    const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");

     if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
@@ -5998,6 +6153,7 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx,
     GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
     GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
     GGML_ASSERT(pipeline->parameter_count == descriptor_buffer_infos.size());
+    GGML_ASSERT(pipeline->push_constant_size == push_constant_size(push_constants));

     vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
     vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
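The new assert catches pipelines whose declared push-constant range disagrees with what the caller actually pushes, which is what makes the earlier `vk_op_unary_push_constants` → `vk_op_push_constants` fixes load-bearing. A hedged sketch of what a `push_constant_size()` helper of this shape can look like (the real overload set lives in the backend; this only illustrates the idea):

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <type_traits>
#include <vector>

// Trivially-copyable structs report their own size...
template <typename T, typename = std::enable_if_t<std::is_trivially_copyable_v<T>>>
constexpr size_t push_constant_size(const T &) { return sizeof(T); }

// ...while contiguous containers report element count times element size.
template <typename T>
size_t push_constant_size(const std::vector<T> & v) { return v.size() * sizeof(T); }

template <typename T, size_t N>
constexpr size_t push_constant_size(const std::array<T, N> &) { return N * sizeof(T); }

int main() {
    struct pc { uint32_t ne, num_blocks; };            // hypothetical push-constant struct
    std::printf("%zu %zu\n", push_constant_size(pc{1, 2}),
                             push_constant_size(std::vector<int>(3)));
}
```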
@@ -6780,10 +6936,29 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx,
     const uint64_t max_elements = std::min<uint64_t>(uint64_t{ctx->device->properties.limits.maxComputeWorkGroupCount[0]} * pipeline->wg_denoms[0], std::numeric_limits<uint32_t>::max());
     const uint32_t elements = std::min(ne, static_cast<uint32_t>(max_elements));

-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 2>{ ne, num_blocks }, { elements, 1, 1 });
+    const vk_quantize_q8_1_push_constants pc = {
+        ne,
+        num_blocks,
+    };
+
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, { elements, 1, 1 });
     ggml_vk_sync_buffers(ctx, subctx);
 }

+static vk_pipeline ggml_vk_get_64b_indexing_pipeline(ggml_backend_vk_context * ctx, vk_pipeline &pipeline) {
+    GGML_UNUSED(ctx);
+#if defined(VK_EXT_shader_64bit_indexing)
+    vk_pipeline *ptr = &pipeline;
+    while (*ptr) {
+        if ((*ptr)->is_64b_indexing) {
+            return *ptr;
+        }
+        ptr = &(*ptr)->next;
+    }
+#endif
+    return pipeline;
+}
+
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k) {
     VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
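`ggml_vk_get_64b_indexing_pipeline` walks the pipeline's variant list and returns the one compiled with 64-bit indexing when available; the callers in the hunks below switch to it whenever `src0` exceeds `maxStorageBufferRange`. A standalone illustration of that selection criterion (the limit value is a typical desktop one, not queried from a device):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Storage-buffer descriptors are limited to maxStorageBufferRange bytes, so any
    // tensor bigger than that must use a shader variant with 64-bit buffer indices.
    const uint64_t max_storage_buffer_range = 4294967295ull; // assumed 2^32 - 1 limit
    const uint64_t tensor_bytes = 6ull * 1024 * 1024 * 1024; // a 6 GiB weight tensor
    std::printf("needs 64-bit indexing: %s\n",
                tensor_bytes > max_storage_buffer_range ? "yes" : "no");
}
```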
@@ -6867,6 +7042,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx,

     vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? f16_type : src0->type, quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type));

+    if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+        pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline);
+    }
+
     // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking
     uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) : ne11;
     const uint64_t x_ne = ggml_nelements(src0);
@@ -7176,6 +7355,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx,
         to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
     }

+    if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+        dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv);
+    }
+
     const bool qx_needs_dequant = x_non_contig;
     const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig);
@@ -7371,9 +7554,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx,
         gqa_ratio = 1;
     }

+    vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1];
+
+    if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+        pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline);
+    }
+
     {
         // Request descriptor sets
-        ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
     }

     vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops], true);
@@ -7415,7 +7604,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx,
         workgroups_z /= gqa_ratio;
     }

-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1],
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
         {
             d_Qx,
             d_Qy,
@@ -7465,9 +7654,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx,
     const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
     const uint32_t channel_stride_y = nb12 / sizeof(float);

+    vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_nc_f16_f32;
+    if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+        pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline);
+    }
+
     {
         // Request descriptor sets
-        ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
     }

     vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops], true);
@@ -7504,7 +7698,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx,

     init_pushconst_tensor_offsets(ctx, pc, src0, src1, nullptr, nullptr, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]);

-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
         {
             d_Qx,
             d_Qy,
@@ -7523,8 +7717,9 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx,
     // Handle huge A matrix by splitting the M dimensions. This works well for convolution use cases
     // where the M dimension is very large.
     // Split_k doesn't work with M splitting.
+    // This only supports batchsize == 1.
     const size_t nbytes = ggml_nbytes(src0);
-    const bool needs_split = nbytes > ctx->device->properties.limits.maxStorageBufferRange;
+    const bool needs_split = dst->ne[2] == 1 && dst->ne[3] == 1 && nbytes > ctx->device->properties.limits.maxStorageBufferRange;
     if (needs_split) {
         // Choose the number of rows that can fit (and divide by two, to allow for any additional offsets)
         const uint32_t M_split = ctx->device->properties.limits.maxStorageBufferRange / (2 * src0->nb[1]);
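A worked example of the `M_split` arithmetic above, with assumed numbers (4 GiB - 1 range, 16 KiB row stride): roughly 131 k rows fit in half the addressable range, so a one-million-row A matrix would be processed in eight chunks. Batched cases (`ne[2]`/`ne[3]` > 1) are now excluded from splitting and fall through to the 64-bit-indexing pipelines instead.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t max_range = 4294967295ull;   // assumed maxStorageBufferRange
    const uint64_t nb1       = 16384;           // assumed bytes per row of src0
    // Divide by two to allow for any additional offsets, as the comment above notes.
    const uint32_t M_split   = (uint32_t)(max_range / (2 * nb1));
    std::printf("rows per chunk: %u\n", M_split); // 131071
}
```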
@@ -7666,6 +7861,9 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx,

     vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? f16_type : src0->type);

+    if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+        pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline);
+    }
     // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking
     uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11;
     const uint64_t x_ne = ggml_nelements(src0);
@@ -7927,6 +8125,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx,
     const bool qx_needs_dequant = x_non_contig;
     const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig);

+    if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) {
+        dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv);
+    }
+
     // Not implemented
     GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT
     GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
@@ -9771,8 +9973,9 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx,
     std::array<uint32_t, 3> elements;

-    const int splitH = 16;
-    const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, splitH);
+    const uint32_t d_state = src0->ne[0];
+    uint32_t num_subgroups = d_state / ctx->device->subgroup_size;
+    const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, num_subgroups);
     const uint32_t num_workgroups_y = n_seq;
     elements = { num_workgroups_x, num_workgroups_y, 1 };
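The dispatch now derives its x-dimension from `d_state` and the subgroup size instead of the fixed `splitH = 16`, matching the ssm_scan pipelines earlier in this diff that dropped the `16` specialization constant. A quick standalone check of the arithmetic with assumed shapes:

```cpp
#include <cstdint>
#include <cstdio>

constexpr uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

int main() {
    // Assumed example: d_state 128 on a subgroup-size-32 device gives 4 subgroups
    // per workgroup, so each workgroup covers 4 of the n_head * head_dim rows.
    const uint32_t d_state = 128, subgroup_size = 32;
    const uint32_t n_head = 64, head_dim = 64, n_seq = 2;
    const uint32_t num_subgroups = d_state / subgroup_size;
    const uint32_t workgroups_x  = ceil_div(n_head * head_dim, num_subgroups);
    std::printf("dispatch: %u x %u x 1\n", workgroups_x, n_seq); // 1024 x 2 x 1
}
```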
@@ -14150,6 +14353,7 @@ struct ggml_backend_vk_device_context {
     std::string description;
     bool is_integrated_gpu;
     std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };

 static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
@@ -14206,6 +14410,19 @@ static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const
 }

 static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    const vk_device& device = ggml_vk_get_device(ctx->device);
+
+    // reject any tensors larger than the max buffer size
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && ggml_nbytes(op->src[i]) > device->max_buffer_size) {
+            return false;
+        }
+    }
+    if (ggml_nbytes(op) > device->max_buffer_size) {
+        return false;
+    }
+
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
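The new early-out rejects any op whose destination or sources cannot fit in a single device buffer, and hoisting `ctx`/`device` to the top lets the per-case lookups disappear in the hunks below. A toy version of the same gate (simplified tensor type; `GGML_MAX_SRC` is 10 in ggml):

```cpp
#include <cstdint>
#include <cstdio>

constexpr int GGML_MAX_SRC = 10;

struct toy_tensor { uint64_t nbytes; const toy_tensor * src[GGML_MAX_SRC]; };

// An op is only claimed if the destination and every source fits in one buffer.
static bool fits_device(const toy_tensor & op, uint64_t max_buffer_size) {
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (op.src[i] && op.src[i]->nbytes > max_buffer_size) {
            return false;
        }
    }
    return op.nbytes <= max_buffer_size;
}

int main() {
    toy_tensor a { 6ull << 30, {} };             // 6 GiB source tensor
    toy_tensor d { 1ull << 20, { &a } };         // small destination
    std::printf("%d\n", fits_device(d, 4ull << 30)); // 0: source exceeds 4 GiB buffer
}
```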
@@ -14254,8 +14471,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
         case GGML_OP_MUL_MAT_ID:
             {
                 ggml_type src0_type = op->src[0]->type;
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                const vk_device& device = ggml_vk_get_device(ctx->device);
                 if (op->op == GGML_OP_MUL_MAT_ID) {
                     if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
                         // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
@@ -14316,8 +14531,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
             }
         case GGML_OP_FLASH_ATTN_EXT:
             {
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                auto device = ggml_vk_get_device(ctx->device);
                 bool coopmat2 = device->coopmat2;
                 uint32_t HSK = op->src[1]->ne[0];
                 uint32_t HSV = op->src[2]->ne[0];
@@ -14539,8 +14752,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
                 if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) {
                     return false;
                 }
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                auto device = ggml_vk_get_device(ctx->device);
                 // pipeline_argsort_large_f32 requires vulkan memory model.
                 if (device->vulkan_memory_model) {
                     return true;
@@ -14553,8 +14764,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
                 if (!ggml_is_contiguous(op) || !ggml_is_contiguous(op->src[0])) {
                     return false;
                 }
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                auto device = ggml_vk_get_device(ctx->device);
                 // We could potentially support larger, using argsort to sort the
                 // whole thing. Not clear if this is needed.
                 uint32_t min_pipeline = (uint32_t)log2f(float(op->ne[0])) + 1;
@@ -14601,8 +14810,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
             return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous_rows(op->src[0]);
         case GGML_OP_CUMSUM:
             {
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                auto device = ggml_vk_get_device(ctx->device);
                 if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
                     return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous_rows(op->src[0]);
                 }
@@ -14610,9 +14817,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
             }
         case GGML_OP_SOLVE_TRI:
             {
-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                const vk_device& device = ggml_vk_get_device(ctx->device);
-
                 if (op->type != GGML_TYPE_F32 || op->src[0]->type != GGML_TYPE_F32) {
                     return false;
                 }
@@ -14677,14 +14881,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
                     return false;
                 }

-                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                const vk_device& device = ggml_vk_get_device(ctx->device);
-
-                const uint32_t SPLIT_H = 16;
-
-                size_t stateC_size = SPLIT_H * d_state * sizeof(float);
-
-                if (stateC_size > device->properties.limits.maxComputeSharedMemorySize) {
+                size_t shmem_size = d_state * sizeof(float);
+
+                if (shmem_size > device->properties.limits.maxComputeSharedMemorySize) {
+                    return false;
+                }
+
+                if (!device->subgroup_basic) {
                     return false;
                 }

@@ -14724,12 +14927,10 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
 }

 static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;

-    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
-
-    UNUSED(dev);
+    return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
 }

 static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
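The hard-coded batch threshold becomes per-device state, populated from a new `GGML_OP_OFFLOAD_MIN_BATCH` environment variable at registry init (see the hunks further below). A standalone sketch of the override logic with the same default:

```cpp
#include <cstdio>
#include <cstdlib>

int main() {
    // Same shape as the registry code: env override if present, otherwise 32.
    const char * env = std::getenv("GGML_OP_OFFLOAD_MIN_BATCH");
    const int min_batch_size = env ? std::atoi(env) : 32;
    std::printf("op offload min batch size: %d\n", min_batch_size);
}
```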
@@ -14773,6 +14974,51 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
     VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
 }

+static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) {
+    if (!device->external_memory_host) {
+        return {};
+    }
+
+    uintptr_t uptr = reinterpret_cast<uintptr_t>(ptr);
+    if (uptr & (device->min_imported_host_pointer_alignment - 1)) {
+        return {};
+    }
+    if (size & (device->min_imported_host_pointer_alignment - 1)) {
+        return {};
+    }
+
+    const vk::MemoryPropertyFlags property_flags = vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached;
+
+    vk_buffer buf {};
+    try {
+        buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr);
+    } catch (vk::SystemError& e) {
+        GGML_LOG_WARN("ggml_vulkan: Failed ggml_vk_create_buffer (%s)\n", e.what());
+    }
+
+    return buf;
+}
+
+static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")");
+    GGML_UNUSED(max_tensor_size);
+
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    auto device = ggml_vk_get_device(ctx->device);
+
+    vk_buffer buf = ggml_vk_buffer_from_host_ptr(device, ptr, size);
+
+    if (!buf) {
+        return {};
+    }
+
+    ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(device, std::move(buf), device->name);
+
+    ggml_backend_buffer_t ret = ggml_backend_buffer_init(ggml_backend_vk_device_get_buffer_type(dev), ggml_backend_vk_buffer_interface, bufctx, size);
+
+    return ret;
+}
+
 static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
     /* .get_name = */ ggml_backend_vk_device_get_name,
     /* .get_description = */ ggml_backend_vk_device_get_description,
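`ggml_vk_buffer_from_host_ptr` above only imports a host allocation when both the pointer and the size are multiples of `minImportedHostPointerAlignment` (which the Vulkan spec guarantees is a power of two), falling back to an empty buffer otherwise. An isolated version of that alignment gate, with an assumed typical alignment of 4096:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Host-pointer import requires pointer and size to be multiples of the
// device's minImportedHostPointerAlignment; power-of-two masks test both.
static bool importable(uintptr_t ptr, size_t size, uint64_t align) {
    return (ptr & (align - 1)) == 0 && (size & (align - 1)) == 0;
}

int main() {
    const uint64_t align = 4096; // assumed typical minImportedHostPointerAlignment
    std::printf("%d %d\n", importable(0x10000, 8192, align),   // 1: both aligned
                           importable(0x10010, 8192, align));  // 0: pointer misaligned
}
```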
@@ -14782,7 +15028,7 @@ static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
     /* .init_backend = */ ggml_backend_vk_device_init,
     /* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type,
     /* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
-    /* .buffer_from_host_ptr = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_vk_device_buffer_from_host_ptr,
     /* .supports_op = */ ggml_backend_vk_device_supports_op,
     /* .supports_buft = */ ggml_backend_vk_device_supports_buft,
     /* .offload_op = */ ggml_backend_vk_device_offload_op,
@@ -14810,6 +15056,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
     if (!initialized) {
+        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
         for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
             ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
             char desc[256];
@@ -14819,6 +15066,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
             ctx->description = desc;
             ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
             ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+            ctx->op_offload_min_batch_size = min_batch_size;
             devices.push_back(new ggml_backend_device {
                 /* .iface = */ ggml_backend_vk_device_i,
                 /* .reg = */ reg,
@@ -462,7 +462,8 @@ vec2 get_dm(uint ib, uint a_offset) {

 #if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
 vec2 get_dm(uint ib, uint a_offset) {
-    return vec2(float(data_a[a_offset + ib].d), float(data_a[a_offset + ib].m));
+    const vec2 dm = vec2(data_a_packed32[a_offset + ib].dm);
+    return dm;
 }
 #endif
@@ -87,7 +87,6 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     const uint tid = gl_LocalInvocationID.x;

     get_offsets(a_offset, b_offset, d_offset);
-    a_offset /= QUANT_K;

     y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
@@ -65,9 +65,9 @@ void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {

     a_offset =
 #ifdef MUL_MAT_ID
-        expert_id * p.batch_stride_a;
+        expert_id * (p.batch_stride_a / QUANT_K);
 #else
-        batch_idx_a * p.batch_stride_a;
+        batch_idx_a * (p.batch_stride_a / QUANT_K);
 #endif
     b_offset =
 #ifdef MUL_MAT_ID
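With `get_offsets` now returning `a_offset` in block units, the `/ QUANT_K` in every consumer (the hunks below) disappears. The regrouping is value-preserving because `batch_stride_a` is a multiple of `QUANT_K` for block-quantized tensors, and dividing early also keeps intermediate products smaller, which matters for the huge tensors this diff targets. A quick check with assumed numbers:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t QUANT_K = 256, batch_stride_a = 1048576, batch_idx_a = 3;
    const uint32_t old_ibi = (batch_idx_a * batch_stride_a) / QUANT_K; // divide late
    const uint32_t new_ibi = batch_idx_a * (batch_stride_a / QUANT_K); // divide early
    std::printf("%u == %u\n", old_ibi, new_ibi); // 12288 == 12288
}
```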
@@ -11,7 +11,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i,
                      const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
     // Compute starting index in matrix B for this superblock
     const uint y_idx = i * QUANT_K + 32 * ib32;
-    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    uint ibi = a_offset + first_row * num_blocks_per_row + i;

     // Precompute indices for quantization lookup tables
     const uint qh_base = 2 * ib32;

@@ -17,7 +17,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32,
     const vec4 b_val_1 = vec4(data_b_v4[base_b_idx + 2 * l + 1]);

     // index for data_a
-    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    uint ibi = a_offset + first_row * num_blocks_per_row + i;

     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
         const float d = float(data_a[ibi].d);

@@ -12,7 +12,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
     const uint nibble_shift = 4 * (itid & 1);
     const uint ib32 = itid / 2; // 0..7

-    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    uint ibi = a_offset + first_row * num_blocks_per_row + i;
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
         const float d = float(data_a[ibi].d);
         const uint scale = (data_a[ibi].scales[ib32] >> nibble_shift) & 0xF;

@@ -11,7 +11,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
     const uint y_idx = i * QUANT_K + 16 * itid;
     const uint nibble_shift = 4 * (itid & 1);
     const uint ib32 = itid / 2; // 0..7
-    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    uint ibi = a_offset + first_row * num_blocks_per_row + i;
     // Precompute db multiplication factors
     float db_vals[NUM_ROWS];
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {

@@ -22,7 +22,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
         db_vals[n] = d * (0.125f + float(scale) * 0.25f);
         ibi += num_blocks_per_row;
     }
-    ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    ibi = a_offset + first_row * num_blocks_per_row + i;
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
         // Preload grid and sign data for all l values
         vec4 grid0_vals[2], grid1_vals[2];

@@ -11,7 +11,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
     const uint y_idx = i * QUANT_K + 16 * itid;
     const uint ib32 = itid / 2; // 0..7

-    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    uint ibi = a_offset + first_row * num_blocks_per_row + i;
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
         const float d = float(data_a[ibi].d);
         const uint signscale = pack32(u16vec2(

@@ -10,7 +10,7 @@ FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
 void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
     const uint y_idx = i * QUANT_K + 32 * ib32;

-    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    uint ibi = a_offset + first_row * num_blocks_per_row + i;
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
         const float d = float(data_a[ibi].d);
         const uint scale = (data_a[ibi].scales[ib32/2] >> (4 * (ib32 & 1))) & 0xF;

@@ -11,7 +11,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
     const uint y_idx = i * QUANT_K + 16 * itid;
     const uint ib32 = itid / 2; // 0..7

-    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+    uint ibi = a_offset + first_row * num_blocks_per_row + i;
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
         const float d = float(data_a[ibi].d);
         const uint signscale = pack32(u16vec2(

@@ -15,7 +15,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
     const uint y_idx = i * QUANT_K + y_offset;

     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+        const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
         csel ^= 1;

         if (!all_threads) { // when we don't have enough blocks to use all threads

@@ -14,7 +14,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix,
     const uint y_idx = i * QUANT_K + y_offset;

     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+        const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
         csel ^= 1;

         if (!all_threads) { // when we don't have enough blocks to use all threads

@@ -13,7 +13,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
     const uint y2_idx = y1_idx + 128;

     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+        const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
         const FLOAT_TYPE_VEC2 dm = FLOAT_TYPE_VEC2(data_a[ib0 + i].dm);

         const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];

@@ -13,7 +13,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
     const uint y2_idx = y1_idx + 128;

     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+        const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
         const FLOAT_TYPE_VEC2 dm = FLOAT_TYPE_VEC2(data_a[ib0 + i].dm);

         const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];

@@ -15,7 +15,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
     const uint y_idx = i * QUANT_K + y_offset;

     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+        const uint ib0 = a_offset + (first_row+n)*num_blocks_per_row;
         csel ^= 1;

         if (!all_threads) { // when we don't have enough blocks to use all threads

@@ -79,7 +79,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     const uint tid = gl_LocalInvocationID.x;

     get_offsets(a_offset, b_offset, d_offset);
-    a_offset /= QUANT_K_Q8_1;
+    a_offset *= QUANT_K / QUANT_K_Q8_1;
     b_offset /= QUANT_K_Q8_1;

     FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
@@ -234,13 +234,13 @@ void main() {
     const uint end_k = min(p.K, (ik + 1) * p.k_split);
 #endif

-    uint pos_a = (
+    uint pos_a =
 #ifdef MUL_MAT_ID
-        expert_idx * p.batch_stride_a +
+        expert_idx * (p.batch_stride_a / LOAD_VEC_A) +
 #else
-        batch_idx_a * p.batch_stride_a +
+        batch_idx_a * (p.batch_stride_a / LOAD_VEC_A) +
 #endif
-        ir * BM * p.stride_a + start_k) / LOAD_VEC_A;
+        (ir * BM * p.stride_a + start_k) / LOAD_VEC_A;
 #ifdef MUL_MAT_ID
     uint pos_b = 0;
 #else
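The same unit normalization applies in the tiled matmul shader: the batch/expert term is pre-divided by `LOAD_VEC_A` and only the row/k term keeps the division, so `pos_a` stays in vector-load units while intermediate products stay smaller for the huge-tensor cases this diff targets. A quick check that the regrouping preserves the value when `batch_stride_a` is a multiple of `LOAD_VEC_A` (assumed numbers):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t LOAD_VEC_A = 8, batch_stride_a = 4096;
    const uint32_t batch_idx_a = 3, ir = 2, BM = 64, stride_a = 128, start_k = 32;
    // old grouping: divide the whole sum at the end
    const uint32_t old_pos = (batch_idx_a * batch_stride_a + ir * BM * stride_a + start_k) / LOAD_VEC_A;
    // new grouping: batch term pre-divided, row/k term divided separately
    const uint32_t new_pos = batch_idx_a * (batch_stride_a / LOAD_VEC_A)
                           + (ir * BM * stride_a + start_k) / LOAD_VEC_A;
    std::printf("%u == %u\n", old_pos, new_pos); // 3588 == 3588
}
```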