Merge branch 'ggml-org:master' into sf/deepseek-ocr

commit 7e47aa8813
@@ -1532,7 +1532,7 @@ jobs:
      - name: Test
        id: ggml-ci
        run: |
-          LLAMA_ARG_THREADS=$(nproc) bash ./ci/run.sh ./tmp/results ./tmp/mnt
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

  ggml-ci-arm64-cpu-high-perf:
    runs-on: ubuntu-22.04-arm
@@ -1558,7 +1558,7 @@ jobs:
      - name: Test
        id: ggml-ci
        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

  ggml-ci-arm64-cpu-high-perf-sve:
    runs-on: ubuntu-22.04-arm
ci/run.sh (27 changed lines)
@@ -635,6 +635,29 @@ function gg_check_build_requirements {
    fi
}

+function gg_run_test_backend_ops_cpu {
+    cd ${SRC}
+
+    cd build-ci-release
+
+    set -e
+
+    (time ./bin/test-backend-ops -b CPU ) 2>&1 | tee -a $OUT/${ci}-test-backend-ops-cpu.log
+
+    set +e
+}
+
+function gg_sum_test_backend_ops_cpu {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs test-backend-ops for CPU backend\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-test-backend-ops-cpu.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
## main

export LLAMA_LOG_PREFIX=1
@@ -663,6 +686,10 @@ ret=0
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

+if [ ! -z ${GG_BUILD_HIGH_PERF} ]; then
+    test $ret -eq 0 && gg_run test_backend_ops_cpu
+fi
+
if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small
    test $ret -eq 0 && gg_run rerank_tiny
@@ -144,6 +144,13 @@ value binary_expression::execute_impl(context & ctx) {
        return false;
    };

+    auto test_is_in = [&]() -> bool {
+        func_args args(ctx);
+        args.push_back(left_val);
+        args.push_back(right_val);
+        return global_builtins().at("test_is_in")(args)->as_bool();
+    };
+
    // Handle undefined and null values
    if (is_val<value_undefined>(left_val) || is_val<value_undefined>(right_val)) {
        if (is_val<value_undefined>(right_val) && (op.value == "in" || op.value == "not in")) {
@@ -223,19 +230,11 @@ value binary_expression::execute_impl(context & ctx) {
                return result;
            }
        } else if (is_val<value_array>(right_val)) {
-           auto & arr = right_val->as_array();
-           bool member = false;
-           for (const auto & item : arr) {
-               if (*left_val == *item) {
-                   member = true;
-                   break;
-               }
-           }
+           // case: 1 in [0, 1, 2]
+           bool member = test_is_in();
            if (op.value == "in") {
                JJ_DEBUG("Checking membership: %s in Array is %d", left_val->type().c_str(), member);
                return mk_val<value_bool>(member);
            } else if (op.value == "not in") {
                JJ_DEBUG("Checking non-membership: %s not in Array is %d", left_val->type().c_str(), !member);
                return mk_val<value_bool>(!member);
            }
        }
@@ -252,22 +251,23 @@ value binary_expression::execute_impl(context & ctx) {

    // String membership
    if (is_val<value_string>(left_val) && is_val<value_string>(right_val)) {
        auto left_str = left_val->as_string().str();
        auto right_str = right_val->as_string().str();
+       // case: "a" in "abc"
+       bool member = test_is_in();
        if (op.value == "in") {
-           return mk_val<value_bool>(right_str.find(left_str) != std::string::npos);
+           return mk_val<value_bool>(member);
        } else if (op.value == "not in") {
-           return mk_val<value_bool>(right_str.find(left_str) == std::string::npos);
+           return mk_val<value_bool>(!member);
        }
    }

    // Value key in object
    if (is_val<value_object>(right_val)) {
-       bool has_key = right_val->has_key(left_val);
+       // case: key in {key: value}
+       bool member = test_is_in();
        if (op.value == "in") {
-           return mk_val<value_bool>(has_key);
+           return mk_val<value_bool>(member);
        } else if (op.value == "not in") {
-           return mk_val<value_bool>(!has_key);
+           return mk_val<value_bool>(!member);
        }
    }
@@ -393,6 +393,33 @@ const func_builtins & global_builtins() {
        {"test_is_lt", test_compare_fn<value_compare_op::lt>},
        {"test_is_lessthan", test_compare_fn<value_compare_op::lt>},
        {"test_is_ne", test_compare_fn<value_compare_op::ne>},
+       {"test_is_in", [](const func_args & args) -> value {
+           args.ensure_count(2);
+           auto needle = args.get_pos(0);
+           auto haystack = args.get_pos(1);
+           if (is_val<value_undefined>(haystack)) {
+               return mk_val<value_bool>(false);
+           }
+           if (is_val<value_array>(haystack)) {
+               for (const auto & item : haystack->as_array()) {
+                   if (*needle == *item) {
+                       return mk_val<value_bool>(true);
+                   }
+               }
+               return mk_val<value_bool>(false);
+           }
+           if (is_val<value_string>(haystack)) {
+               if (!is_val<value_string>(needle)) {
+                   throw raised_exception("'in' test expects args[1] as string when args[0] is string, got args[1] as " + needle->type());
+               }
+               return mk_val<value_bool>(
+                   haystack->as_string().str().find(needle->as_string().str()) != std::string::npos);
+           }
+           if (is_val<value_object>(haystack)) {
+               return mk_val<value_bool>(haystack->has_key(needle));
+           }
+           throw raised_exception("'in' test expects iterable as first argument, got " + haystack->type());
+       }},
        {"test_is_test", [](const func_args & args) -> value {
            args.ensure_vals<value_string>();
            auto & builtins = global_builtins();
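For readers not familiar with the engine's `value` / `func_args` types used above, here is a minimal, self-contained sketch of the dispatch order the new `test_is_in` builtin implements: element membership for arrays, then substring membership for strings, then key membership for objects. It uses standard C++ containers with values simplified to `std::string`; it is illustrative only and is not the project's actual API.

```cpp
// Illustrative sketch only: mirrors the array -> string -> object dispatch of
// the "test_is_in" builtin above, using standard containers instead of the
// engine's value classes.
#include <map>
#include <stdexcept>
#include <string>
#include <variant>
#include <vector>

using item       = std::string;
using haystack_t = std::variant<std::vector<item>, std::string, std::map<item, item>>;

static bool is_in(const item & needle, const haystack_t & haystack) {
    if (const auto * arr = std::get_if<std::vector<item>>(&haystack)) {
        for (const auto & it : *arr) {
            if (needle == it) {
                return true;                               // element membership: 1 in [0, 1, 2]
            }
        }
        return false;
    }
    if (const auto * str = std::get_if<std::string>(&haystack)) {
        return str->find(needle) != std::string::npos;     // substring membership: "a" in "abc"
    }
    if (const auto * obj = std::get_if<std::map<item, item>>(&haystack)) {
        return obj->count(needle) > 0;                     // key membership: key in {key: value}
    }
    throw std::runtime_error("'in' test expects an iterable haystack");
}
```

Centralizing this logic in one builtin keeps the array, string, and object branches of `binary_expression::execute_impl` consistent instead of maintaining three separate copies of the membership check.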
@@ -951,12 +951,8 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr
    }

    for (auto & impl : spec->impls) {
-       const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
-
+       common_time_meas tm(impl->t_begin_us, !impl->gen_perf);
        impl->begin(prompt);
-
-       const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
-       impl->t_begin_us += t_now_us - t_start_us; // accumulate duration for this refresh
    }
}
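The `common_time_meas` helper that replaces the manual timing above is not shown in this diff; judging from the call sites, it is an RAII timer that adds the elapsed time to the referenced accumulator when it goes out of scope, with the second argument disabling the measurement. A rough, hypothetical sketch of such a helper:

```cpp
// Hypothetical sketch of an RAII accumulator timer in the spirit of
// common_time_meas (the real class is defined elsewhere in common/ and its
// exact interface is not visible in this diff).
#include <chrono>
#include <cstdint>

struct time_meas_sketch {
    int64_t & t_acc_us;                                   // accumulated duration, in microseconds
    const bool disabled;                                  // skip the measurement entirely when set
    const std::chrono::steady_clock::time_point t_start;

    time_meas_sketch(int64_t & acc_us, bool disabled)
        : t_acc_us(acc_us), disabled(disabled), t_start(std::chrono::steady_clock::now()) {}

    ~time_meas_sketch() {
        if (!disabled) {
            t_acc_us += std::chrono::duration_cast<std::chrono::microseconds>(
                            std::chrono::steady_clock::now() - t_start).count();
        }
    }
};
```

With a helper like this, `common_time_meas tm(impl->t_begin_us, !impl->gen_perf);` accumulates the duration of `impl->begin(prompt)` into `t_begin_us` only when per-implementation performance tracking (`gen_perf`) is enabled, which replaces the manual `ggml_time_us()` bookkeeping removed in these hunks.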
@@ -971,14 +967,9 @@ llama_tokens common_speculative_draft(

    for (auto & impl : spec->impls) {
        {
-           const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
-
+           common_time_meas tm(impl->t_draft_us, !impl->gen_perf);
            impl->draft(params, prompt_tgt, id_last, result);
-
-           const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
-
            impl->drafts_call_count++;
-           impl->t_draft_us += t_now_us - t_start_us; // accumulate duration for this implementation
        }

        if (!result.empty()) {
@@ -1006,15 +997,15 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {

    GGML_ASSERT(impl);

-   const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
-   if (n_accepted > 0) {
-       impl->drafts_accepted_count++;
-       impl->drafts_accepted_tokens += n_accepted;
-   }
-
-   impl->accept(n_accepted);
-   const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
-   impl->t_accept_us += t_now_us - t_start_us; // accumulate duration for this acculumulation
+   {
+       common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
+       if (n_accepted > 0) {
+           impl->drafts_accepted_count++;
+           impl->drafts_accepted_tokens += n_accepted;
+       }
+
+       impl->accept(n_accepted);
+   }
}

void common_speculative_print_stats(const common_speculative * spec) {
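The counters touched in these hunks (`drafts_call_count`, `drafts_accepted_count`, `drafts_accepted_tokens`) feed `common_speculative_print_stats`, which is not shown in this diff. A hypothetical sketch of how such counters can be reduced to summary figures (the real reporting code may differ):

```cpp
// Hypothetical summary of per-implementation draft statistics; the field names
// follow the counters used in the hunks above, everything else is illustrative.
#include <cstdint>

struct draft_stats_sketch {
    int64_t drafts_call_count      = 0; // how many times draft() was called
    int64_t drafts_accepted_count  = 0; // calls where at least one token was accepted
    int64_t drafts_accepted_tokens = 0; // total number of accepted draft tokens
};

// Fraction of draft calls that produced at least one accepted token.
static double draft_hit_rate(const draft_stats_sketch & s) {
    return s.drafts_call_count > 0
        ? static_cast<double>(s.drafts_accepted_count) / static_cast<double>(s.drafts_call_count)
        : 0.0;
}

// Average number of accepted tokens per accepted draft.
static double tokens_per_accepted_draft(const draft_stats_sketch & s) {
    return s.drafts_accepted_count > 0
        ? static_cast<double>(s.drafts_accepted_tokens) / static_cast<double>(s.drafts_accepted_count)
        : 0.0;
}
```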
@ -22,12 +22,11 @@
|
|||
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
|
||||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over Intel iGPUs and dGPUs.
|
||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||
|
||||
### Llama.cpp + SYCL
|
||||
|
||||
The llama.cpp SYCL backend is primarily designed for **Intel GPUs**.
|
||||
SYCL cross-platform capabilities enable support for Nvidia GPUs as well, with limited support for AMD.
|
||||
SYCL cross-platform capabilities enable support for other vendor GPUs as well.
|
||||
|
||||
## Recommended Release
|
||||
|
||||
|
|
@ -42,6 +41,9 @@ The following releases are verified and recommended:
|
|||
|
||||
## News
|
||||
|
||||
- 2026.02
  - Removed support for Nvidia & AMD GPUs: the oneAPI plugins for Nvidia & AMD GPUs are no longer available (their download/installation channels are out of service), so users can no longer build llama.cpp for these targets.

- 2025.11
  - Support for allocating more than 4 GB of memory on a device.
|
||||
|
||||
|
|
@ -111,8 +113,8 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
|
|||
|-------------------------------|---------|---------------------------------------|
|
||||
| Intel Data Center Max Series | Support | Max 1550, 1100 |
|
||||
| Intel Data Center Flex Series | Support | Flex 170 |
|
||||
| Intel Arc A-Series | Support | Arc A770, Arc A730M, Arc A750 |
|
||||
| Intel Arc B-Series | Support | Arc B580 |
|
||||
| Intel Arc A-Series | Support | Arc A770, Arc A730M, Arc A750 |
|
||||
| Intel Arc B-Series | Support | Arc B580 |
|
||||
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
|
||||
| Intel iGPU | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7 |
|
||||
|
||||
|
|
@ -127,20 +129,7 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
|
|||
|
||||
### Other Vendor GPU
|
||||
|
||||
**Verified devices**
|
||||
|
||||
| Nvidia GPU | Status | Verified Model |
|
||||
|--------------------------|-----------|----------------|
|
||||
| Ampere Series | Supported | A100, A4000 |
|
||||
| Ampere Series *(Mobile)* | Supported | RTX 40 Series |
|
||||
|
||||
| AMD GPU | Status | Verified Model |
|
||||
|--------------------------|--------------|----------------|
|
||||
| Radeon Pro | Experimental | W6800 |
|
||||
| Radeon RX | Experimental | 6700 XT |
|
||||
|
||||
Note: AMD GPU support is highly experimental and is incompatible with F16.
|
||||
Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
|
||||
NA
|
||||
|
||||
## Docker
|
||||
|
||||
|
|
@ -149,11 +138,11 @@ The docker build option is currently limited to *Intel GPU* targets.
|
|||
### Build image
|
||||
|
||||
```sh
|
||||
# Using FP16
|
||||
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
|
||||
|
||||
# Using FP32
|
||||
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=OFF" --target light -f .devops/intel.Dockerfile .
|
||||
|
||||
# Using FP16
|
||||
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
|
||||
```
|
||||
|
||||
*Notes*:
|
||||
|
|
@ -212,14 +201,6 @@ Platform #0: Intel(R) OpenCL HD Graphics
|
|||
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||
```
|
||||
|
||||
- **Nvidia GPU**
|
||||
|
||||
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
|
||||
|
||||
- **AMD GPU**
|
||||
|
||||
To target AMD GPUs with SYCL, the ROCm stack must be installed first.
|
||||
|
||||
2. **Install Intel® oneAPI Base toolkit**
|
||||
|
||||
SYCL backend depends on:
|
||||
|
|
@ -248,23 +229,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|
|||
|2025.1|
|
||||
|2024.1|
|
||||
|
||||
- **Adding support to Nvidia GPUs**
|
||||
|
||||
**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
|
||||
|
||||
**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
|
||||
|
||||
```sh
|
||||
git clone https://github.com/oneapi-src/oneDNN.git
|
||||
cd oneDNN
|
||||
cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake --build build-nvidia --config Release
|
||||
```
|
||||
|
||||
- **Adding support to AMD GPUs**
|
||||
|
||||
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
|
||||
|
||||
3. **Verify installation and environment**
|
||||
|
||||
In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
|
||||
|
|
@ -285,25 +249,6 @@ When targeting an intel GPU, the user should expect one or more devices among th
|
|||
[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) UHD Graphics 730 OpenCL 3.0 NEO [24.39.31294]
|
||||
```
|
||||
|
||||
- **Nvidia GPU**
|
||||
|
||||
Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below:
|
||||
|
||||
```
|
||||
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
|
||||
[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
|
||||
[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5]
|
||||
```
|
||||
|
||||
- **AMD GPU**
|
||||
|
||||
For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
|
||||
|
||||
```
|
||||
[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000]
|
||||
[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
|
||||
```
|
||||
|
||||
### II. Build llama.cpp
|
||||
|
||||
#### Intel GPU
|
||||
|
|
@ -332,47 +277,6 @@ It is possible to come across some precision issues when running tests that stem
|
|||
instructions, which can be circumvented by setting the environment variable `SYCL_PROGRAM_COMPILE_OPTIONS`
|
||||
as `-cl-fp32-correctly-rounded-divide-sqrt`
|
||||
|
||||
#### Nvidia GPU
|
||||
|
||||
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
|
||||
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
|
||||
|
||||
```sh
|
||||
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
||||
# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
|
||||
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
|
||||
|
||||
# Option 2: Use FP16
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
|
||||
|
||||
# build all binary
|
||||
cmake --build build --config Release -j -v
|
||||
```
|
||||
|
||||
It is possible to come across some precision issues when running tests that stem from using faster
|
||||
instructions, which can be circumvented by passing the `-fno-fast-math` flag to the compiler.
|
||||
|
||||
#### AMD GPU
|
||||
|
||||
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
|
||||
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
|
||||
|
||||
```sh
|
||||
# Build LLAMA with rocBLAS acceleration through SYCL
|
||||
|
||||
## AMD
|
||||
# Use FP32, FP16 is not supported
|
||||
# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:'
|
||||
GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# build all binary
|
||||
cmake --build build --config Release -j -v
|
||||
```
|
||||
|
||||
### III. Run the inference
|
||||
|
||||
#### Retrieve and prepare model
|
||||
|
|
@ -766,15 +670,15 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||
| Name | Value | Function |
|
||||
|--------------------|---------------------------------------|---------------------------------------------|
|
||||
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
|
||||
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
|
||||
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
|
||||
| GGML_SYCL_TARGET | INTEL *(default)* | Set the SYCL target device type. |
|
||||
| GGML_SYCL_DEVICE_ARCH | Optional | Set the SYCL device architecture. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
|
||||
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. (1.) |
|
||||
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
|
||||
| GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
|
||||
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
|
||||
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
|
||||
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
||||
|
||||
1. FP16 is recommended for better prompt processing performance on quantized models. Performance is equivalent in text generation but set `GGML_SYCL_F16=OFF` if you are experiencing issues with FP16 builds.
|
||||
1. FP32 and FP16 can have a different performance impact depending on the LLM. It is recommended to test both to find which gives better prompt processing performance for your models. You need to rebuild after changing `GGML_SYCL_F16=OFF/ON`.
|
||||
|
||||
#### Runtime
|
||||
|
||||
|
|
@ -782,7 +686,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
|
||||
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
|
||||
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
|
||||
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
|
||||
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through the SYCL Graph feature. Disabled by default because SYCL Graph support is still in development and does not yet outperform the non-graph path. |
|
||||
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
|
||||
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
||||
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
|
||||
|
|
@@ -252,9 +252,7 @@ CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.ggu

The environment variable [`CUDA_SCALE_LAUNCH_QUEUES`](https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/environment-variables.html#cuda-scale-launch-queues) controls the size of CUDA's command buffer, which determines how many GPU operations can be queued before the CPU must wait for the GPU to catch up. A larger buffer reduces CPU-side stalls and allows more work to be queued on a GPU.

-**Default behavior:** llama.cpp automatically sets `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.
-
-See PR [#19042](https://github.com/ggml-org/llama.cpp/pull/19042) for performance benchmarks and technical details.
+Consider setting `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.

### Unified Memory
|
|||
|
|
@ -113,7 +113,7 @@ Legend:
|
|||
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
|
||||
|
|
|
|||
|
|
@ -9677,168 +9677,168 @@
|
|||
"SYCL0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","SYCL"
|
||||
"SYCL0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","SYCL"
|
||||
"SYCL0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","SYCL"
|
||||
|
|
@ -9847,16 +9847,16 @@
|
|||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","SYCL"
|
||||
|
|
@ -9865,16 +9865,16 @@
|
|||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","1","yes","SYCL"
|
||||
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","1","yes","SYCL"
(remaining rows of the generated test-backend-ops support table; the viewer reports "Can't render this file because it is too large", so only selected hunks are shown)

In the visible hunks, the "SYCL0","TOP_K" rows for the large row sizes (ne=[65536,1,1,1]/[65547,1,2,1] through ne=[524288,1,1,1]/[524299,1,2,1]) and for the smaller shapes ([16,10,10,10], [60,10,10,10], [1023..2049,2,1,3], [16384,1,1,1]) change from "support","0","no" to "support","1","yes" for k=1,2,3,7,15, while the rows with k=100,500,1023,9999 stay at "support","0","no". The surrounding context rows (for example the "SYCL0","UPSCALE" entries) are unchanged.
@@ -33,11 +33,14 @@ DEVICE ?= auto
causal-convert-model-bf16: OUTTYPE=bf16
causal-convert-model-bf16: causal-convert-model

causal-convert-model-debug: DEBUG=--debug
causal-convert-model-debug: causal-convert-model

causal-convert-model:
	$(call validate_model_path,causal-convert-model)
	@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
	./scripts/causal/convert-model.sh
	./scripts/causal/convert-model.sh $(DEBUG)

causal-convert-mm-model-bf16: OUTTYPE=bf16
causal-convert-mm-model-bf16: MM_OUTTYPE=f16
@@ -4,12 +4,17 @@ set -e

# Parse command line arguments
MMPROJ=""
DEBUG=""
while [[ $# -gt 0 ]]; do
    case $1 in
        --mmproj)
            MMPROJ="--mmproj"
            shift
            ;;
        --debug)
            DEBUG="1"
            shift
            ;;
        *)
            shift
            ;;

@@ -28,7 +33,12 @@ echo "Data type: ${TYPE}"
echo "Converted model path:: ${CONVERTED_MODEL}"
echo "Metadata override: ${METADATA_OVERRIDE}"

CMD_ARGS=("python" "../../convert_hf_to_gguf.py" "--verbose")
if [[ -n "$DEBUG" ]]; then
    CMD_ARGS=("python" "-m" "pdb")
else
    CMD_ARGS=("python")
fi
CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose")
CMD_ARGS+=("${MODEL_PATH}")
CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
CMD_ARGS+=("--outtype" "${TYPE}")
@@ -19,6 +19,9 @@ extern "C" {
        // abort ggml_graph_compute when true
        ggml_abort_callback abort_callback;
        void * abort_callback_data;

        // use only reference implementations
        bool use_ref;
    };

    // numa strategies

@@ -132,6 +135,8 @@ extern "C" {
    GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
    GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

    GGML_BACKEND_API void ggml_backend_cpu_set_use_ref(ggml_backend_t backend_cpu, bool use_ref);

    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

    GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
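A minimal sketch of how a caller could use the new setter declared above, for example to cross-check an optimized kernel against the reference path. Only the API shown in this diff plus the standard ggml-backend entry points are assumed; graph and tensor construction are omitted, and the helper name is illustrative.

```cpp
// Sketch: run a graph on the CPU backend with or without the reference kernels.
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

static enum ggml_status run_graph(struct ggml_cgraph * gf, bool use_ref) {
    ggml_backend_t cpu = ggml_backend_cpu_init();

    ggml_backend_cpu_set_n_threads(cpu, 4);
    ggml_backend_cpu_set_use_ref(cpu, use_ref);   // new API from this change

    const enum ggml_status st = ggml_backend_graph_compute(cpu, gf);

    ggml_backend_free(cpu);
    return st;
}
```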
@@ -258,6 +258,7 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

    if (backend->iface.set_tensor_async == NULL) {
        ggml_backend_synchronize(backend);
        ggml_backend_tensor_set(tensor, data, offset, size);
    } else {
        backend->iface.set_tensor_async(backend, tensor, data, offset, size);

@@ -271,6 +272,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

    if (backend->iface.get_tensor_async == NULL) {
        ggml_backend_synchronize(backend);
        ggml_backend_tensor_get(tensor, data, offset, size);
    } else {
        backend->iface.get_tensor_async(backend, tensor, data, offset, size);
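Because of the fallback above, callers can use the async setters unconditionally. A hedged sketch of the calling pattern (buffer allocation omitted, names illustrative):

```cpp
// Sketch: upload several tensors asynchronously, then synchronize once.
// On a backend without set_tensor_async, each call degrades to the synchronous
// path shown in the hunk above, so the calling code stays the same.
static void upload_all(ggml_backend_t backend, struct ggml_tensor ** tensors, void ** host_data, int n) {
    for (int i = 0; i < n; ++i) {
        ggml_backend_tensor_set_async(backend, tensors[i], host_data[i], 0, ggml_nbytes(tensors[i]));
    }
    ggml_backend_synchronize(backend); // make sure all uploads have landed before compute
}
```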
@@ -24,6 +24,9 @@ struct ggml_compute_params {
    void * wdata;

    struct ggml_threadpool * threadpool;

    // use reference implementation
    bool use_ref;
};
@@ -5,7 +5,6 @@
#include "ggml-backend.h"
#include "traits.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include "quants.h"
#include "ggml-threading.h"

@@ -2867,12 +2866,20 @@ struct ggml_cplan ggml_graph_plan(
            } break;
        case GGML_OP_FLASH_ATTN_EXT:
            {
                const int64_t neq2 = node->src[0]->ne[2]; // number of query heads
                const int64_t DK = node->src[1]->ne[0];
                const int64_t DV = node->src[2]->ne[0];

                // Tiled flash attention scratch (tile sizes defined in common.h)
                // Per-thread: Q_q + KQ + mask + VKQ32 + V32 + padding
                cur = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks;
                size_t prefill = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks;

                // Decode path: n_kv_chunks = n_tasks (one chunk per thread)
                // Per-thread: VKQ accumulator (DV), partial M, partial S + intra-thread scratch for V, Q and VKQ
                size_t n_chunks = n_tasks;
                size_t decode = sizeof(float)*(neq2*n_chunks*(2+DV) + n_tasks*(DK + 2*DV));

                cur += MAX(prefill, decode);
            } break;
        case GGML_OP_FLASH_ATTN_BACK:
            {
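A hedged restatement of the scratch-size math above as a standalone helper. The tile constants are placeholders for the values defined in ggml-cpu's common.h, and the function name is illustrative, not part of the patch.

```cpp
// Sketch: per-op work buffer size for FLASH_ATTN_EXT, mirroring the hunk above.
// prefill: tiled path, one Q/K/V/VKQ tile set per thread.
// decode : one KV chunk per thread, plus (M, S, VKQ[DV]) partials per query head per chunk.
#include <algorithm>
#include <cstddef>
#include <cstdint>

// illustrative tile sizes; the real values come from ggml-cpu's common.h
constexpr int64_t GGML_FA_TILE_Q  = 8;
constexpr int64_t GGML_FA_TILE_KV = 64;

static size_t fa_work_size(int64_t DK, int64_t DV, int64_t n_q_heads, int64_t n_tasks) {
    const size_t prefill = sizeof(float) *
        (GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV) * n_tasks;

    const size_t n_chunks = n_tasks; // decode: one KV chunk per thread
    const size_t decode   = sizeof(float) * (n_q_heads*n_chunks*(2 + DV) + n_tasks*(DK + 2*DV));

    // the plan reserves enough room for whichever path is taken at run time
    return std::max(prefill, decode);
}
```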
@@ -2929,11 +2936,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    set_numa_thread_affinity(state->ith);

    struct ggml_compute_params params = {
        /*.ith       =*/ state->ith,
        /*.nth       =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
        /*.wsize     =*/ cplan->work_size,
        /*.wdata     =*/ cplan->work_data,
        /*.threadpool=*/ tp,
        /*.ith        =*/ state->ith,
        /*.nth        =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
        /*.wsize      =*/ cplan->work_size,
        /*.wdata      =*/ cplan->work_data,
        /*.threadpool =*/ tp,
        /*.use_ref    =*/ cplan->use_ref,
    };

    GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
@ -105,6 +105,8 @@ struct ggml_backend_cpu_context {
|
|||
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
|
||||
bool use_ref; // use reference implementation
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
|
||||
|
|
@ -143,6 +145,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
|
|||
|
||||
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
|
||||
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||
cpu_plan->cplan.use_ref = cpu_ctx->use_ref;
|
||||
|
||||
return cpu_plan;
|
||||
}
|
||||
|
|
@ -182,6 +185,7 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
|
|||
|
||||
cplan.abort_callback = cpu_ctx->abort_callback;
|
||||
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||
cplan.use_ref = cpu_ctx->use_ref;
|
||||
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
|
@ -223,6 +227,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|||
ctx->work_size = 0;
|
||||
ctx->abort_callback = NULL;
|
||||
ctx->abort_callback_data = NULL;
|
||||
ctx->use_ref = false;
|
||||
|
||||
ggml_backend_t cpu_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cpu_guid(),
|
||||
|
|
@@ -270,6 +275,13 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
    ctx->abort_callback_data = abort_callback_data;
}

void ggml_backend_cpu_set_use_ref(ggml_backend_t backend_cpu, bool use_ref) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
    ctx->use_ref = use_ref;
}

// CPU backend - device

struct ggml_backend_cpu_device_context {

@@ -646,6 +658,9 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
    if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
        return (void *)ggml_is_numa;
    }
    if (strcmp(name, "ggml_backend_cpu_set_use_ref") == 0) {
        return (void *)ggml_backend_cpu_set_use_ref;
    }

    // threadpool - TODO: move to ggml-base
    if (strcmp(name, "ggml_threadpool_new") == 0) {
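Because the setter is also exported through the proc-address table above, code that does not include ggml-cpu.h can reach it via the backend registry. A sketch using the standard ggml_backend_reg_get_proc_address helper (the typedef and function name are illustrative):

```cpp
// Sketch: resolve ggml_backend_cpu_set_use_ref dynamically through the registry.
typedef void (*ggml_backend_set_use_ref_t)(ggml_backend_t, bool);

static void enable_ref_kernels(ggml_backend_t cpu_backend) {
    ggml_backend_reg_t reg = ggml_backend_cpu_reg();

    auto set_use_ref = (ggml_backend_set_use_ref_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_use_ref");

    if (set_use_ref) {
        set_use_ref(cpu_backend, /*use_ref=*/true);
    }
}
```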
@ -8042,12 +8042,14 @@ void ggml_compute_forward_top_k(
|
|||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_flash_attn_ext
|
||||
|
||||
static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
int ir0, int ir1) {
|
||||
int ir0, int ir1,
|
||||
int64_t ic_start, int64_t ic_end,
|
||||
float * partials, int64_t partial_stride) {
|
||||
|
||||
const bool write_partials = (partials != nullptr);
|
||||
const ggml_tensor * q = dst->src[0];
|
||||
const ggml_tensor * k = dst->src[1];
|
||||
const ggml_tensor * v = dst->src[2];
|
||||
|
|
@ -8124,7 +8126,6 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
|||
|
||||
int ith = params->ith;
|
||||
|
||||
// loop over n_batch and n_head
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// q indices
|
||||
const int iq3 = ir/(neq2*neq1);
|
||||
|
|
@ -8165,7 +8166,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
|||
// loop over n_kv and n_head_kv
|
||||
// ref: https://arxiv.org/pdf/2112.05682.pdf
|
||||
|
||||
for (int64_t ic = 0; ic < nek1; ++ic) {
|
||||
for (int64_t ic = ic_start; ic < ic_end; ++ic) {
|
||||
const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f;
|
||||
if (mv == -INFINITY) {
|
||||
continue;
|
||||
|
|
@ -8238,8 +8239,8 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
|||
}
|
||||
}
|
||||
|
||||
// sinks
|
||||
if (sinks) {
|
||||
// sinks - apply only on the first kv-chunk
|
||||
if (sinks && ic_start == 0) {
|
||||
const float s = ((float *)((char *) sinks->data))[h];
|
||||
|
||||
float ms = 1.0f;
|
||||
|
|
@ -8247,6 +8248,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
|||
|
||||
if (s > M) {
|
||||
ms = expf(M - s);
|
||||
M = s;
|
||||
ggml_vec_scale_f32(DV, VKQ32, ms);
|
||||
} else {
|
||||
vs = expf(s - M);
|
||||
|
|
@ -8255,20 +8257,26 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
|||
S = S*ms + vs;
|
||||
}
|
||||
|
||||
// V /= S
|
||||
const float S_inv = S == 0.0f ? 0.0f : 1.0f/S;
|
||||
ggml_vec_scale_f32(DV, VKQ32, S_inv);
|
||||
if (write_partials) {
|
||||
// Write M, S, VKQ to partials for later reduction
|
||||
// partials layout: [M, S, VKQ[DV]] per query head
|
||||
float * partial = partials + ir * partial_stride;
|
||||
partial[0] = M;
|
||||
partial[1] = S;
|
||||
memcpy(partial + 2, VKQ32, DV * sizeof(float));
|
||||
} else {
|
||||
// V /= S
|
||||
const float S_inv = S == 0.0f ? 0.0f : 1.0f/S;
|
||||
ggml_vec_scale_f32(DV, VKQ32, S_inv);
|
||||
|
||||
// dst indices
|
||||
const int i1 = iq1;
|
||||
const int i2 = iq2;
|
||||
const int i3 = iq3;
|
||||
// dst indices
|
||||
const int i1 = iq1;
|
||||
const int i2 = iq2;
|
||||
const int i3 = iq3;
|
||||
|
||||
// original
|
||||
//memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
|
||||
|
||||
// permute(0, 2, 1, 3)
|
||||
memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
|
||||
// permute(0, 2, 1, 3)
|
||||
memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
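The (M, S, VKQ) triple written to the partials buffer is the usual streaming-softmax state. Two partial results over disjoint KV ranges merge as (a restatement of the math implied above, not code from the patch):

$$M = \max(M_1, M_2), \qquad S = S_1 e^{M_1 - M} + S_2 e^{M_2 - M}, \qquad \mathrm{VKQ} = \mathrm{VKQ}_1\, e^{M_1 - M} + \mathrm{VKQ}_2\, e^{M_2 - M},$$

with the final output $\mathrm{VKQ}/S$ taken only after the last chunk has been merged, which is exactly why the per-chunk path skips the `V /= S` normalization.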
@ -8546,6 +8554,78 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
|
|||
}
|
||||
}
|
||||
|
||||
// Reduction function: combines partial results across KV chunks
|
||||
// Partials layout in wdata: [n_q_heads][n_chunks][2 + DV]
|
||||
static void ggml_flash_attn_ext_reduce_partials(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
const int64_t n_chunks,
|
||||
const int64_t chunk_size) {
|
||||
|
||||
const ggml_tensor * q = dst->src[0];
|
||||
const ggml_tensor * k = dst->src[1];
|
||||
const ggml_tensor * v = dst->src[2];
|
||||
|
||||
const int64_t DK = k->ne[0];
|
||||
const int64_t DV = v->ne[0];
|
||||
const int64_t nek1 = k->ne[1];
|
||||
const int64_t n_q_heads = q->ne[2];
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t wdata_per_thread = DK + 2*DV + CACHE_LINE_SIZE_F32;
|
||||
float * thread_wdata = (float *) params->wdata + ith * wdata_per_thread;
|
||||
|
||||
const int64_t partials_offset = nth * (DK + 2*DV + CACHE_LINE_SIZE_F32);
|
||||
const int64_t partial_size = 2 + DV;
|
||||
const float * partials_base = (const float *) params->wdata + partials_offset;
|
||||
|
||||
// Output layout
|
||||
const int64_t ne1 = dst->ne[1];
|
||||
const int64_t ne2 = dst->ne[2];
|
||||
const size_t nb1 = dst->nb[1];
|
||||
|
||||
// Each thread reduces a subset of query heads
|
||||
for (int64_t q_head = ith; q_head < n_q_heads; q_head += nth) {
|
||||
float M_final = -INFINITY;
|
||||
float S_final = 0.0f;
|
||||
float * VKQ_final = thread_wdata;
|
||||
memset(VKQ_final, 0, DV * sizeof(float));
|
||||
|
||||
// Combine partials from all chunks
|
||||
for (int64_t chunk_idx = 0; chunk_idx < n_chunks; ++chunk_idx) {
|
||||
const int64_t ic_start = chunk_idx * chunk_size;
|
||||
if (ic_start >= nek1) continue;
|
||||
|
||||
const float * partial = partials_base + (q_head * n_chunks + chunk_idx) * partial_size;
|
||||
const float M_chunk = partial[0];
|
||||
const float S_chunk = partial[1];
|
||||
const float * VKQ_chunk = partial + 2;
|
||||
|
||||
if (S_chunk == 0.0f) continue;
|
||||
|
||||
const float M_new = fmaxf(M_final, M_chunk);
|
||||
const float scale_old = expf(M_final - M_new);
|
||||
const float scale_new = expf(M_chunk - M_new);
|
||||
|
||||
for (int64_t d = 0; d < DV; ++d) {
|
||||
VKQ_final[d] = VKQ_final[d] * scale_old + VKQ_chunk[d] * scale_new;
|
||||
}
|
||||
S_final = S_final * scale_old + S_chunk * scale_new;
|
||||
M_final = M_new;
|
||||
}
|
||||
|
||||
// Normalize and write to output
|
||||
if (S_final != 0.0f) {
|
||||
const float S_inv = 1.0f / S_final;
|
||||
ggml_vec_scale_f32(DV, VKQ_final, S_inv);
|
||||
}
|
||||
// iq1=0, iq3=0 for decode
|
||||
memcpy((char *) dst->data + (0*ne2*ne1 + q_head + 0*ne1)*nb1, VKQ_final, nb1);
|
||||
}
|
||||
}
|
||||
|
||||
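A standalone sanity sketch (not part of the patch) showing that merging per-chunk (M, S, V) partials with the rule used in the reduction above reproduces a single-pass softmax-weighted sum; the scores and values are arbitrary illustrative numbers.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> s = {0.3f, 2.5f, -1.0f, 4.2f, 0.0f, 1.7f}; // attention scores
    const std::vector<float> v = {1.0f, 2.0f,  3.0f, 4.0f, 5.0f, 6.0f}; // values (DV = 1)

    // single pass over all scores
    float M = -INFINITY, S = 0.0f, V = 0.0f;
    for (size_t i = 0; i < s.size(); ++i) {
        const float Mn = std::max(M, s[i]);
        const float c  = std::exp(M - Mn), w = std::exp(s[i] - Mn);
        V = V*c + v[i]*w;
        S = S*c + w;
        M = Mn;
    }

    // two chunks of three, merged afterwards with the same update rule
    float Mp[2] = {-INFINITY, -INFINITY}, Sp[2] = {0, 0}, Vp[2] = {0, 0};
    for (int k = 0; k < 2; ++k) {
        for (int i = 3*k; i < 3*(k + 1); ++i) {
            const float Mn = std::max(Mp[k], s[i]);
            const float c  = std::exp(Mp[k] - Mn), w = std::exp(s[i] - Mn);
            Vp[k] = Vp[k]*c + v[i]*w;
            Sp[k] = Sp[k]*c + w;
            Mp[k] = Mn;
        }
    }
    const float Mm = std::max(Mp[0], Mp[1]);
    const float c0 = std::exp(Mp[0] - Mm), c1 = std::exp(Mp[1] - Mm);
    const float Vm = Vp[0]*c0 + Vp[1]*c1;
    const float Sm = Sp[0]*c0 + Sp[1]*c1;

    printf("single-pass: %.6f  merged: %.6f\n", V/S, Vm/Sm); // the two should agree
    return 0;
}
```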
static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
|
@ -8567,6 +8647,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
|||
const int64_t DV = nev0;
|
||||
const int64_t N = neq1;
|
||||
|
||||
|
||||
GGML_ASSERT(ne0 == DV);
|
||||
GGML_ASSERT(ne2 == N);
|
||||
|
||||
|
|
@ -8587,60 +8668,92 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
|||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
// parallelize by q rows using ggml_vec_dot_f32
|
||||
|
||||
// total rows in q
|
||||
const int64_t nr = neq1*neq2*neq3;
|
||||
|
||||
// rows per thread
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
// disable for NUMA
|
||||
const bool disable_chunking = ggml_is_numa();
|
||||
// When use_ref is set, force the vec-only reference implementation (no tiling, no KV-chunking)
|
||||
const bool use_ref = params->use_ref;
|
||||
|
||||
// 4x chunks per thread
|
||||
int nth_scaled = nth * 4;
|
||||
int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
|
||||
int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
|
||||
|
||||
if (nth == 1 || nchunk < nth || disable_chunking) {
|
||||
nchunk = nth;
|
||||
}
|
||||
|
||||
if (ith == 0) {
|
||||
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
|
||||
ggml_threadpool_chunk_set(params->threadpool, nth);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
// The number of elements in each chunk
|
||||
const int64_t dr = (nr + nchunk - 1) / nchunk;
|
||||
|
||||
static constexpr int64_t KV_TILE_SZ = ggml_fa_tile_config::KV;
|
||||
static constexpr int64_t Q_TILE_SZ = ggml_fa_tile_config::Q;
|
||||
const bool kv_is_f32_or_f16 = (k->type == GGML_TYPE_F32 || k->type == GGML_TYPE_F16);
|
||||
const bool use_tiled = (q->type == GGML_TYPE_F32 &&
|
||||
kv_is_f32_or_f16 &&
|
||||
k->type == v->type &&
|
||||
nek1 % KV_TILE_SZ == 0 &&
|
||||
neq1 >= Q_TILE_SZ); // Only use tiled for batch >= tile size
|
||||
const bool use_split_kv_path = !use_ref && (neq1 == 1 && neq3 == 1) && kv_is_f32_or_f16 && (k->type == v->type) && q->type == GGML_TYPE_F32 && nek1 >= 512;
|
||||
|
||||
// The first chunk comes from our thread_id, the rest will get auto-assigned.
|
||||
int current_chunk = ith;
|
||||
if (use_split_kv_path) {
|
||||
const int64_t chunk_size = (nek1 + nth - 1) / nth;
|
||||
|
||||
while (current_chunk < nchunk) {
|
||||
const int64_t ir0 = dr * current_chunk;
|
||||
const int64_t ir1 = MIN(ir0 + dr, nr);
|
||||
// Partials buffer layout: [q_head][kv_chunk][M, S, VKQ]
|
||||
const int64_t partial_size = 2 + DV;
|
||||
float * partials_base = (float *) params->wdata + nth * (DK + 2*DV + CACHE_LINE_SIZE_F32);
|
||||
|
||||
if (use_tiled) {
|
||||
ggml_compute_forward_flash_attn_ext_tiled(params, dst, ir0, ir1);
|
||||
const int64_t ic_start = ith * chunk_size;
|
||||
const int64_t ic_end = std::min(ic_start + chunk_size, nek1);
|
||||
|
||||
const int64_t partial_stride = nth * partial_size;
|
||||
float * chunk_partials = partials_base + ith * partial_size;
|
||||
|
||||
if (ic_start < nek1) {
|
||||
for (int64_t q_head = 0; q_head < neq2; q_head++) {
|
||||
ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
||||
params, dst, q_head, q_head + 1, ic_start, ic_end,
|
||||
chunk_partials, partial_stride);
|
||||
}
|
||||
} else {
|
||||
ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1);
|
||||
for (int64_t q_head = 0; q_head < neq2; q_head++) {
|
||||
float * q_partials = chunk_partials + q_head * partial_stride;
|
||||
q_partials[0] = -INFINITY; // M
|
||||
q_partials[1] = 0.0f; // S
|
||||
}
|
||||
}
|
||||
|
||||
current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
|
||||
ggml_barrier(params->threadpool);
|
||||
ggml_flash_attn_ext_reduce_partials(params, dst, nth, chunk_size);
|
||||
} else {
|
||||
|
||||
// total rows in q
|
||||
const int64_t nr = neq1*neq2*neq3;
|
||||
|
||||
// disable for NUMA
|
||||
const bool disable_chunking = ggml_is_numa();
|
||||
|
||||
// 4x chunks per thread
|
||||
int nth_scaled = nth * 4;
|
||||
int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
|
||||
int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
|
||||
|
||||
if (nth == 1 || nchunk < nth || disable_chunking) {
|
||||
nchunk = nth;
|
||||
}
|
||||
|
||||
if (ith == 0) {
|
||||
ggml_threadpool_chunk_set(params->threadpool, nth);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
const int64_t dr = (nr + nchunk - 1) / nchunk;
|
||||
|
||||
static constexpr int64_t KV_TILE_SZ = ggml_fa_tile_config::KV;
|
||||
static constexpr int64_t Q_TILE_SZ = ggml_fa_tile_config::Q;
|
||||
const bool use_tiled = !use_ref &&
|
||||
(q->type == GGML_TYPE_F32 &&
|
||||
kv_is_f32_or_f16 &&
|
||||
k->type == v->type &&
|
||||
nek1 % KV_TILE_SZ == 0 &&
|
||||
neq1 >= Q_TILE_SZ);
|
||||
|
||||
int current_chunk = ith;
|
||||
|
||||
while (current_chunk < nchunk) {
|
||||
const int64_t ir0 = dr * current_chunk;
|
||||
const int64_t ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
if (use_tiled) {
|
||||
ggml_compute_forward_flash_attn_ext_tiled(params, dst, ir0, ir1);
|
||||
} else {
|
||||
ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1, 0, nek1, nullptr, 0);
|
||||
}
|
||||
|
||||
current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
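For the split-KV decode path above, the KV axis is divided into nth equal chunks (ceiling division), one per thread, and a thread whose start falls past nek1 just records an empty partial. A small worked sketch of that arithmetic (values and helper name illustrative):

```cpp
// Sketch: how the decode path carves nek1 KV positions across nth threads.
// Example: nek1 = 4096, nth = 6 -> chunk_size = 683, threads cover
// [0,683), [683,1366), ..., [3415,4096); the last chunk is clamped to nek1.
#include <algorithm>
#include <cstdint>

struct kv_chunk { int64_t ic_start, ic_end; };

static kv_chunk kv_chunk_for_thread(int64_t nek1, int nth, int ith) {
    const int64_t chunk_size = (nek1 + nth - 1) / nth;       // ceil(nek1 / nth)
    const int64_t ic_start   = (int64_t) ith * chunk_size;
    const int64_t ic_end     = std::min(ic_start + chunk_size, nek1);
    return { ic_start, std::max(ic_start, ic_end) };          // empty chunk if ic_start >= nek1
}
```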
@@ -5049,16 +5049,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
    static std::mutex mutex;
    std::lock_guard<std::mutex> lock(mutex);
    if (!initialized) {
        // Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance
        // PR: https://github.com/ggml-org/llama.cpp/pull/19042
        if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) {
#ifdef _WIN32
            _putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x");
#else
            setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set
#endif // _WIN32
        }

        ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
        const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
@ -3697,13 +3697,20 @@ static __global__ void mul_mat_q(
|
|||
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
|
||||
}
|
||||
|
||||
|
||||
template <ggml_type type, int mmq_x, bool need_check>
|
||||
static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
|
||||
const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst,
|
||||
const int ncols_max) {
|
||||
static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
const int32_t * expert_bounds,
|
||||
float * __restrict__ dst,
|
||||
const float * __restrict__ tmp_last_tile,
|
||||
const int ncols_x,
|
||||
const int nrows_x,
|
||||
const int ncols_dst,
|
||||
const size_t stride_col_dst,
|
||||
const int nchannels_y,
|
||||
const size_t stride_channel_dst,
|
||||
const int nsamples_y,
|
||||
const size_t stride_sample_dst,
|
||||
const int ncols_max) {
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
|
|
|
|||
|
|
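The stream-k fixup kernel's stride parameters are widened from int to size_t in the hunk above, presumably so that very large destination tensors cannot overflow a 32-bit offset. A minimal plain-C++ illustration of the limit involved (numbers illustrative, not taken from the patch):

```cpp
// Sketch: a per-channel stride of 70000 x 40000 elements no longer fits in int.
#include <climits>
#include <cstdio>

int main() {
    const size_t ncols = 70000, nrows = 40000;
    const size_t stride_channel_dst = ncols * nrows; // 2'800'000'000 elements

    printf("stride_channel_dst = %zu (INT_MAX = %d)\n", stride_channel_dst, INT_MAX);
    printf("fits in int? %s\n", stride_channel_dst <= (size_t) INT_MAX ? "yes" : "no");
    return 0;
}
```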
@ -15,14 +15,22 @@ typedef struct ggml_metal * ggml_metal_t;
|
|||
ggml_metal_t ggml_metal_init(ggml_metal_device_t dev);
|
||||
void ggml_metal_free(ggml_metal_t ctx);
|
||||
|
||||
const char * ggml_metal_get_name(ggml_metal_t ctx);
|
||||
|
||||
void ggml_metal_synchronize(ggml_metal_t ctx);
|
||||
|
||||
void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
enum ggml_status ggml_metal_graph_compute (ggml_metal_t ctx, struct ggml_cgraph * gf);
|
||||
void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf);
|
||||
|
||||
void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev);
|
||||
void ggml_metal_event_wait (ggml_metal_t ctx, ggml_metal_event_t ev);
|
||||
|
||||
ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx);
|
||||
|
||||
void ggml_metal_set_n_cb (ggml_metal_t ctx, int n_cb);
|
||||
void ggml_metal_set_abort_callback (ggml_metal_t ctx, ggml_abort_callback abort_callback, void * user_data);
|
||||
bool ggml_metal_supports_family (ggml_metal_t ctx, int family);
|
||||
|
|
|
|||
|
|
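A hedged sketch of how a caller might combine the async-copy and synchronize entry points declared above; context creation and tensor setup are omitted, the helper name is illustrative, and the described ordering mirrors the implementation later in this diff.

```cpp
// Sketch: copy a tensor between two Metal contexts without blocking the CPU.
// ggml_metal_cpy_tensor_async records the blit on the source queue, signals the
// source context's copy event and makes the destination wait on it, so later
// work on ctx_dst is ordered after the copy.
static void copy_between_contexts(ggml_metal_t ctx_src, ggml_metal_t ctx_dst,
                                  const struct ggml_tensor * src, struct ggml_tensor * dst) {
    if (!ggml_metal_cpy_tensor_async(ctx_src, ctx_dst, src, dst)) {
        // not copyable directly (e.g. missing Metal buffers): the caller would
        // fall back to a synchronous round-trip through host memory (not shown)
    }

    // ... enqueue more work on ctx_dst here; the event keeps it after the copy ...

    ggml_metal_synchronize(ctx_dst); // block only when the result is actually needed
}
```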
@ -24,9 +24,13 @@ struct ggml_metal_command_buffer {
|
|||
};
|
||||
|
||||
struct ggml_metal {
|
||||
char name[128];
|
||||
|
||||
ggml_metal_device_t dev;
|
||||
ggml_metal_library_t lib;
|
||||
|
||||
ggml_metal_event_t ev_cpy; // for async copies
|
||||
|
||||
dispatch_queue_t d_queue;
|
||||
|
||||
// additional, inference-time compiled pipelines
|
||||
|
|
@ -117,7 +121,11 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
|||
}
|
||||
}
|
||||
|
||||
//const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
|
||||
res->ev_cpy = ggml_metal_device_event_init(dev);
|
||||
|
||||
const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
|
||||
|
||||
snprintf(res->name, sizeof(res->name), "%s", props_dev->name);
|
||||
|
||||
res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
|
|
@ -206,9 +214,15 @@ void ggml_metal_free(ggml_metal_t ctx) {
|
|||
|
||||
dispatch_release(ctx->d_queue);
|
||||
|
||||
ggml_metal_device_event_free(ctx->dev, ctx->ev_cpy);
|
||||
|
||||
free(ctx);
|
||||
}
|
||||
|
||||
const char * ggml_metal_get_name(ggml_metal_t ctx) {
|
||||
return ctx->name;
|
||||
}
|
||||
|
||||
void ggml_metal_synchronize(ggml_metal_t ctx) {
|
||||
// wait for any backend operations to finish
|
||||
if (ctx->cmd_buf_last) {
|
||||
|
|
@ -273,8 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
|
|||
// wrap the source data into a Metal buffer
|
||||
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
|
||||
id<MTLBuffer> buf_src = [device newBufferWithBytes:data
|
||||
length:size
|
||||
options:MTLResourceStorageModeShared];
|
||||
length:size
|
||||
options:MTLResourceStorageModeShared];
|
||||
|
||||
GGML_ASSERT(buf_src);
|
||||
|
||||
|
|
@ -316,9 +330,9 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
|
|||
@autoreleasepool {
|
||||
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
|
||||
id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
|
||||
length:size
|
||||
options:MTLResourceStorageModeShared
|
||||
deallocator:nil];
|
||||
length:size
|
||||
options:MTLResourceStorageModeShared
|
||||
deallocator:nil];
|
||||
|
||||
GGML_ASSERT(buf_dst);
|
||||
|
||||
|
|
@ -356,6 +370,49 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
|
|||
}
|
||||
}
|
||||
|
||||
bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
@autoreleasepool {
|
||||
struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(src);
|
||||
struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(dst);
|
||||
|
||||
if (bid_src.metal == nil || bid_dst.metal == nil) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// queue the copy operation into the Metal context
|
||||
// this will be queued at the end, after any currently ongoing GPU operations
|
||||
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx_src->dev);
|
||||
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
|
||||
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
|
||||
|
||||
[encoder copyFromBuffer:bid_src.metal
|
||||
sourceOffset:bid_src.offs
|
||||
toBuffer:bid_dst.metal
|
||||
destinationOffset:bid_dst.offs
|
||||
size:ggml_nbytes(src)];
|
||||
|
||||
[encoder endEncoding];
|
||||
|
||||
ggml_metal_event_t ev_cpy = ggml_metal_get_ev_cpy(ctx_src);
|
||||
ggml_metal_event_record(ctx_src, ev_cpy);
|
||||
|
||||
[cmd_buf commit];
|
||||
|
||||
// do not wait here for completion
|
||||
//[cmd_buf waitUntilCompleted];
|
||||
|
||||
// instead, remember a reference to the command buffer and wait for it later if needed
|
||||
[ctx_src->cmd_bufs_ext addObject:cmd_buf];
|
||||
ctx_src->cmd_buf_last = cmd_buf;
|
||||
|
||||
[cmd_buf retain];
|
||||
|
||||
ggml_metal_event_wait(ctx_dst, ev_cpy);
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
|
||||
// number of nodes encoded by the main thread (empirically determined)
|
||||
const int n_main = 64;
|
||||
|
|
@ -530,6 +587,42 @@ void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf) {
|
|||
//printf("%s: graph optimize took %.3f ms\n", __func__, (ggml_time_us() - t_start) / 1000.0);
|
||||
}
|
||||
|
||||
void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev) {
|
||||
@autoreleasepool {
|
||||
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
|
||||
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
|
||||
|
||||
ggml_metal_event_encode_signal(ev, cmd_buf);
|
||||
|
||||
[cmd_buf commit];
|
||||
|
||||
[ctx->cmd_bufs_ext addObject:cmd_buf];
|
||||
ctx->cmd_buf_last = cmd_buf;
|
||||
|
||||
[cmd_buf retain];
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_metal_event_wait(ggml_metal_t ctx, ggml_metal_event_t ev) {
|
||||
@autoreleasepool {
|
||||
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
|
||||
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
|
||||
|
||||
ggml_metal_event_encode_wait(ev, cmd_buf);
|
||||
|
||||
[cmd_buf commit];
|
||||
|
||||
[ctx->cmd_bufs_ext addObject:cmd_buf];
|
||||
ctx->cmd_buf_last = cmd_buf;
|
||||
|
||||
[cmd_buf retain];
|
||||
}
|
||||
}
|
||||
|
||||
ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx) {
|
||||
return ctx->ev_cpy;
|
||||
}
|
||||
|
||||
void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
|
||||
if (ctx->n_cb != n_cb) {
|
||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
|
||||
|
|
|
|||
|
|
@ -17,10 +17,12 @@ struct ggml_metal_device_deleter {
|
|||
|
||||
typedef std::unique_ptr<ggml_metal_device, ggml_metal_device_deleter> ggml_metal_device_ptr;
|
||||
|
||||
ggml_metal_device_t ggml_metal_device_get(void) {
|
||||
static ggml_metal_device_ptr ctx { ggml_metal_device_init() };
|
||||
ggml_metal_device_t ggml_metal_device_get(int device) {
|
||||
static std::vector<ggml_metal_device_ptr> devs;
|
||||
|
||||
return ctx.get();
|
||||
devs.emplace_back(ggml_metal_device_init(device));
|
||||
|
||||
return devs.back().get();
|
||||
}
|
||||
|
||||
struct ggml_metal_pipelines {
|
||||
|
|
|
|||
|
|
@ -205,7 +205,9 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
|
|||
//
|
||||
|
||||
struct ggml_metal_device_props {
|
||||
int device;
|
||||
char name[128];
|
||||
char desc[128];
|
||||
|
||||
size_t max_buffer_size;
|
||||
size_t max_working_set_size;
|
||||
|
|
@ -224,11 +226,15 @@ struct ggml_metal_device_props {
|
|||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
ggml_metal_device_t ggml_metal_device_init(void);
|
||||
typedef struct ggml_metal_event * ggml_metal_event_t;
|
||||
|
||||
void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf);
|
||||
void ggml_metal_event_encode_wait (ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf);
|
||||
|
||||
ggml_metal_device_t ggml_metal_device_init(int device);
|
||||
void ggml_metal_device_free(ggml_metal_device_t dev);
|
||||
|
||||
// return a singleton that is automatically destroyed when the program exits
|
||||
ggml_metal_device_t ggml_metal_device_get(void);
|
||||
ggml_metal_device_t ggml_metal_device_get(int device);
|
||||
|
||||
void * ggml_metal_device_get_obj (ggml_metal_device_t dev); // id<MTLDevice>
|
||||
void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id<MTLCommandQueue>
|
||||
|
|
@ -240,6 +246,10 @@ void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset
|
|||
|
||||
void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev);
|
||||
|
||||
ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev);
|
||||
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev);
|
||||
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev);
|
||||
|
||||
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
|
||||
bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);
|
||||
|
||||
|
|
|
|||
|
|
@ -24,9 +24,6 @@
|
|||
static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
|
||||
static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;
|
||||
|
||||
// virtual address for GPU memory allocations
|
||||
static atomic_uintptr_t g_addr_device = 0x000000400ULL;
|
||||
|
||||
#if !GGML_METAL_EMBED_LIBRARY
|
||||
// Here to assist with NSBundle Path Hack
|
||||
@interface GGMLMetalClass : NSObject
|
||||
|
|
@ -523,6 +520,9 @@ struct ggml_metal_device {
|
|||
ggml_metal_library_t library;
|
||||
|
||||
struct ggml_metal_device_props props;
|
||||
|
||||
// virtual address for GPU memory allocations
|
||||
atomic_uintptr_t addr_virt;
|
||||
};
|
||||
|
||||
//
|
||||
|
|
@ -618,7 +618,7 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
|
|||
free(rsets);
|
||||
}
|
||||
|
||||
ggml_metal_device_t ggml_metal_device_init(void) {
|
||||
ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
|
||||
|
||||
assert(dev != NULL);
|
||||
|
|
@ -632,6 +632,9 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
|||
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
|
||||
}
|
||||
|
||||
dev->addr_virt = 0x000000400ULL;
|
||||
|
||||
dev->props.device = device;
|
||||
dev->props.has_simdgroup_reduction = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||
dev->props.has_simdgroup_reduction |= [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
||||
|
||||
|
|
@ -792,7 +795,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
|||
dev->props.max_working_set_size = dev->mtl_device.maxBufferLength;
|
||||
}
|
||||
|
||||
strncpy(dev->props.name, [[dev->mtl_device name] UTF8String], sizeof(dev->props.name) - 1);
|
||||
snprintf(dev->props.name, sizeof(dev->props.name), "%s%d", "MTL", device);
|
||||
snprintf(dev->props.desc, sizeof(dev->props.desc), "%s", [[dev->mtl_device name] UTF8String]);
|
||||
|
||||
dev->library = ggml_metal_library_init(dev);
|
||||
if (!dev->library) {
|
||||
|
|
@ -922,6 +926,59 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
|
|||
atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
|
||||
}
|
||||
|
||||
struct ggml_metal_event {
|
||||
void * obj; // id<MTLEvent>
|
||||
|
||||
atomic_int value;
|
||||
};
|
||||
|
||||
void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
|
||||
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
|
||||
|
||||
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
|
||||
|
||||
[cmd_buf encodeSignalEvent:event value:atomic_fetch_add_explicit(&ev->value, 1, memory_order_relaxed) + 1];
|
||||
}
|
||||
|
||||
void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
|
||||
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
|
||||
|
||||
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
|
||||
|
||||
[cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
|
||||
}
|
||||
|
||||
ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
|
||||
id<MTLEvent> event = [dev->mtl_device newEvent];
|
||||
|
||||
ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
|
||||
|
||||
ev->obj = (__bridge void *)event;
|
||||
ev->value = 0;
|
||||
|
||||
return ev;
|
||||
}
|
||||
|
||||
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
|
||||
id<MTLEvent> event = ev->obj;
|
||||
[event release];
|
||||
|
||||
free(ev);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
|
||||
@autoreleasepool {
|
||||
id<MTLEvent> event = ev->obj;
|
||||
|
||||
id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
|
||||
[cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
|
||||
[cmd_buf commit];
|
||||
[cmd_buf waitUntilCompleted];
|
||||
}
|
||||
}
|
||||
|
||||
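The event implementation above is a monotonically increasing counter: each record signals value+1 on a command buffer, and each wait targets the latest signalled value. A standalone sketch of the same pattern with plain C++ atomics standing in for MTLEvent (purely illustrative, not part of the patch):

```cpp
// Sketch: counter-based event, analogous to ggml_metal_event above.
#include <atomic>
#include <cstdint>

struct counter_event {
    std::atomic<uint64_t> target{0};    // last value that was recorded (signalled)
    std::atomic<uint64_t> reached{0};   // value the executing queue has completed

    uint64_t record()              { return target.fetch_add(1) + 1; } // like encodeSignalEvent:value:
    void     complete(uint64_t v)  { reached.store(v); }               // done by the signalling side
    bool     is_done() const       { return reached.load() >= target.load(); } // like encodeWaitForEvent
};
```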
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
|
||||
if (@available(macOS 10.12, iOS 16.0, *)) {
|
||||
*total = dev->mtl_device.recommendedMaxWorkingSetSize;
|
||||
|
|
@ -1344,8 +1401,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
|
|||
res->all_data = ggml_metal_host_malloc(size_aligned);
|
||||
res->is_shared = true;
|
||||
} else {
|
||||
// use virtual address from g_addr_device counter
|
||||
res->all_data = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
|
||||
// use virtual address
|
||||
res->all_data = (void *) atomic_fetch_add_explicit(&dev->addr_virt, size_aligned, memory_order_relaxed);
|
||||
res->is_shared = false;
|
||||
}
|
||||
res->all_size = size_aligned;
|
||||
|
|
|
|||
|
|
@ -81,10 +81,10 @@
|
|||
#define FC_COUNT_EQUAL 1000
|
||||
|
||||
// op-specific constants
|
||||
#define OP_FLASH_ATTN_EXT_NQPTG 8
|
||||
#define OP_FLASH_ATTN_EXT_NQPSG 8
|
||||
#define OP_FLASH_ATTN_EXT_NCPSG 64
|
||||
|
||||
#define OP_FLASH_ATTN_EXT_VEC_NQPTG 1
|
||||
#define OP_FLASH_ATTN_EXT_VEC_NQPSG 1
|
||||
#define OP_FLASH_ATTN_EXT_VEC_NCPSG 32
|
||||
|
||||
// kernel argument structs
|
||||
|
|
|
|||
|
|
@ -2295,7 +2295,7 @@ size_t ggml_metal_op_flash_attn_ext_extra_blk(const ggml_tensor * op) {
|
|||
// return res;
|
||||
//}
|
||||
|
||||
const int nqptg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NQPTG : OP_FLASH_ATTN_EXT_NQPTG;
|
||||
const int nqptg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NQPSG : OP_FLASH_ATTN_EXT_NQPSG;
|
||||
const int ncpsg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NCPSG : OP_FLASH_ATTN_EXT_NCPSG;
|
||||
|
||||
const int64_t ne1 = (ne01 + nqptg - 1)/nqptg;
|
||||
|
|
@ -2411,7 +2411,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
|||
|
||||
if (!ggml_metal_op_flash_attn_ext_use_vec(op)) {
|
||||
// half8x8 kernel
|
||||
const int nqptg = OP_FLASH_ATTN_EXT_NQPTG; // queries per threadgroup
|
||||
const int nqptg = OP_FLASH_ATTN_EXT_NQPSG; // queries per threadgroup
|
||||
const int ncpsg = OP_FLASH_ATTN_EXT_NCPSG; // cache values per simdgroup
|
||||
|
||||
GGML_ASSERT(nqptg <= 32);
|
||||
|
|
@ -2578,9 +2578,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
|||
#undef FATTN_SMEM
|
||||
} else {
|
||||
// half4x4 kernel
|
||||
const int nqptg = OP_FLASH_ATTN_EXT_VEC_NQPTG; // queries per threadgroup
|
||||
const int nqptg = OP_FLASH_ATTN_EXT_VEC_NQPSG; // queries per threadgroup
|
||||
const int ncpsg = OP_FLASH_ATTN_EXT_VEC_NCPSG; // cache values per simdgroup !! sync with kernel template arguments !!
|
||||
const int nkpsg = 1*ncpsg;
|
||||
const int nhptg = 1; // heads per threadgroup
|
||||
|
||||
GGML_ASSERT(nqptg <= 32);
|
||||
GGML_ASSERT(nqptg % 1 == 0);
|
||||
|
|
@ -2632,6 +2632,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
|||
ggml_metal_op_concurrency_reset(ctx);
|
||||
}
|
||||
|
||||
// note: for simplicity, assume that K is larger than or equal to V
|
||||
GGML_ASSERT(ne10 >= ne20);
|
||||
|
||||
// ne00 + 2*ncpsg*(nsg)
|
||||
// for each query, we load it as f16 in shared memory (ne00)
|
||||
// and store the soft_max values and the mask
|
||||
|
|
@ -2639,28 +2642,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
|||
// ne20*(nsg)
|
||||
// each simdgroup has a full f32 head vector in shared mem to accumulate results
|
||||
//
|
||||
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*GGML_PAD(ne20, 128)*(nsg))*(sizeof(float)/2), 16))
|
||||
|
||||
int64_t nsgmax = 2;
|
||||
while (true) {
|
||||
const size_t smem = FATTN_SMEM(nsgmax);
|
||||
// avoid using more than half of the threadgroup memory - can cause slow downs especially for large head sizes
|
||||
if (smem > props_dev->max_theadgroup_memory_size/2) {
|
||||
break;
|
||||
}
|
||||
nsgmax *= 2;
|
||||
}
|
||||
nsgmax /= 2;
|
||||
|
||||
// simdgroups per threadgroup (a.k.a. warps)
|
||||
//const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)));
|
||||
const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) 1024/32)));
|
||||
#define FATTN_SMEM(nsg) (GGML_PAD(((GGML_PAD(ne00, 128) + 4*ncpsg + 2*GGML_PAD(ne20, 128))*(nsg))*(sizeof(float)/2), 16))
|
||||
|
||||
int64_t nsg = 1;
|
||||
while (nsg <= nsgt) {
|
||||
nsg *= 2;
|
||||
}
|
||||
nsg /= 2;
|
||||
|
||||
// workgroups
|
||||
// each workgroup handles nsg*nkpsg cache values
|
||||
|
|
@ -2673,7 +2657,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
|||
} else {
|
||||
nwg = 32;
|
||||
nsg = 1;
|
||||
while (2*nwg*nsg*nkpsg < ne11 && nsg < 4) {
|
||||
while (2*nwg*nsg*ncpsg < ne11 && nsg < 4) {
|
||||
nsg *= 2;
|
||||
}
|
||||
}
|
||||
|
|
@ -2739,7 +2723,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
|||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, (ne02 + nhptg - 1)/nhptg, ne03*nwg, 32, nsg, 1);
|
||||
} else {
|
||||
// sanity checks
|
||||
assert(ggml_metal_op_flash_attn_ext_extra_tmp(op) != 0);
|
||||
|
|
@ -2752,7 +2736,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
|||
ggml_metal_encoder_set_buffer(enc, bid_tmp, 7);
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, (ne02 + nhptg - 1)/nhptg, ne03*nwg, 32, nsg, 1);
|
||||
|
||||
// sync the 2 kernels
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
|
|
|||
|
|
@ -7,11 +7,12 @@
|
|||
#include "ggml-metal-context.h"
|
||||
#include "ggml-metal-ops.h"
|
||||
|
||||
// globals
|
||||
#define GGML_METAL_NAME "MTL"
|
||||
#define GGML_METAL_MAX_DEVICES 16
|
||||
|
||||
// initialized in ggml_backend_metal_reg
|
||||
static ggml_backend_reg g_ggml_metal_reg;
|
||||
static ggml_backend_device g_ggml_metal_device;
|
||||
// number of Metal devices
|
||||
// note: can be overridden with the GGML_METAL_DEVICES env var to simulate virtual devices
|
||||
static int g_devices = 1;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// backend interface
|
||||
|
|
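g_devices defaults to 1 and, per the comment above, can be overridden with the GGML_METAL_DEVICES environment variable to simulate virtual devices. A hedged sketch of how that override would typically be read during registration; the clamp and the exact call site are assumptions, not shown in this hunk.

```cpp
// Sketch: derive the virtual Metal device count from the environment.
#include <cstdlib>

static int ggml_metal_device_count_from_env(void) {
    int n = 1;
    if (const char * env = std::getenv("GGML_METAL_DEVICES")) {
        n = std::atoi(env);
    }
    if (n < 1) {
        n = 1;
    }
    if (n > GGML_METAL_MAX_DEVICES) {
        n = GGML_METAL_MAX_DEVICES; // defined as 16 earlier in this file
    }
    return n;
}
```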
@ -165,10 +166,28 @@ static ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = {
|
|||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.free_buffer == ggml_backend_metal_buffer_shared_free_buffer ||
|
||||
buffer->iface.free_buffer == ggml_backend_metal_buffer_private_free_buffer;
|
||||
}
|
||||
|
||||
//
|
||||
// buffer types
|
||||
//
|
||||
|
||||
struct ggml_backend_metal_buffer_type {
|
||||
int device;
|
||||
std::string name;
|
||||
};
|
||||
|
||||
struct ggml_backend_metal_buffer_type_deleter {
|
||||
void operator()(ggml_backend_metal_buffer_type * ctx) const {
|
||||
delete ctx;
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::unique_ptr<ggml_backend_metal_buffer_type, ggml_backend_metal_buffer_type_deleter> ggml_backend_metal_buffer_type_ptr;
|
||||
|
||||
// common method for allocating shared or private Metal buffers
|
||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size, bool shared) {
|
||||
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)buft->device->context;
|
||||
|
|
@ -218,9 +237,9 @@ static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_
|
|||
// default (shared) buffer type
|
||||
|
||||
static const char * ggml_backend_metal_buffer_type_shared_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "Metal";
|
||||
ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_shared_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
|
|
@ -249,29 +268,54 @@ static bool ggml_backend_metal_buffer_type_shared_is_host(ggml_backend_buffer_ty
GGML_UNUSED(buft);
}

static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(void) {
static ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_shared_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_shared_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host,
},
/* .device = */ &g_ggml_metal_device,
/* .context = */ NULL,
};
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(int device) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);

return &ggml_backend_buffer_type_metal;
static std::vector<ggml_backend_buffer_type> bufts;
static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs;

static bool initialized = false;
if (!initialized) {
bufts.reserve(g_devices);
ctxs.reserve(g_devices);

for (int i = 0; i < g_devices; ++i) {
ggml_backend_metal_buffer_type * raw_ctx =
new ggml_backend_metal_buffer_type {
/* .device = */ i,
/* .name = */ GGML_METAL_NAME + std::to_string(i),
};
ctxs.emplace_back(raw_ctx);

ggml_backend_buffer_type buft = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_shared_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_shared_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
/* .context = */ raw_ctx,
};

bufts.emplace_back(buft);
}

initialized = true;
}

return &bufts[device];
}

// default (private) buffer type

static const char * ggml_backend_metal_buffer_type_private_get_name(ggml_backend_buffer_type_t buft) {
return "Metal_Private";
ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;

GGML_UNUSED(buft);
return ctx->name.c_str();
}

static ggml_backend_buffer_t ggml_backend_metal_buffer_type_private_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
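// note: the per-device buffer types above are created lazily under a mutex and kept in
// static vectors, so the pointer returned for a given device index stays valid for the
// lifetime of the process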
@ -300,29 +344,53 @@ static bool ggml_backend_metal_buffer_type_private_is_host(ggml_backend_buffer_t
GGML_UNUSED(buft);
}

static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(void) {
static ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_private_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_private_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_private_is_host,
},
/* .device = */ &g_ggml_metal_device,
/* .context = */ NULL,
};
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(int device) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);

return &ggml_backend_buffer_type_metal;
static std::vector<ggml_backend_buffer_type> bufts;
static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs;

static bool initialized = false;
if (!initialized) {
bufts.reserve(g_devices);
ctxs.reserve(g_devices);

for (int i = 0; i < g_devices; ++i) {
ggml_backend_metal_buffer_type * raw_ctx = new ggml_backend_metal_buffer_type{
/* .device = */ i,
/* .name = */ GGML_METAL_NAME + std::to_string(i) + "_Private"
};
ctxs.emplace_back(raw_ctx);

ggml_backend_buffer_type buft = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_private_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_private_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_private_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
/* .context = */ raw_ctx,
};

bufts.emplace_back(buft);
}

initialized = true;
}

return &bufts[device];
}

// mapped buffer type

static const char * ggml_backend_metal_buffer_type_mapped_get_name(ggml_backend_buffer_type_t buft) {
return "Metal_Mapped";
ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;

GGML_UNUSED(buft);
return ctx->name.c_str();
}

static ggml_backend_buffer_t ggml_backend_metal_buffer_type_mapped_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@ -352,31 +420,55 @@ static bool ggml_backend_metal_buffer_type_mapped_is_host(ggml_backend_buffer_ty
GGML_UNUSED(buft);
}

static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) {
// note: not obvious, but this buffer type still needs to implement .alloc_buffer:
// https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099
static ggml_backend_buffer_type ggml_backend_buffer_type_mapped_metal = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_mapped_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host,
},
/* .device = */ &g_ggml_metal_device,
/* .context = */ NULL,
};
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(int device) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);

return &ggml_backend_buffer_type_mapped_metal;
static std::vector<ggml_backend_buffer_type> bufts;
static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs;

static bool initialized = false;
if (!initialized) {
bufts.reserve(g_devices);
ctxs.reserve(g_devices);

for (int i = 0; i < g_devices; ++i) {
ggml_backend_metal_buffer_type * raw_ctx = new ggml_backend_metal_buffer_type{
/* .device = */ i,
/* .name = */ GGML_METAL_NAME + std::to_string(i) + "_Mapped"
};
ctxs.emplace_back(raw_ctx);

// note: not obvious, but this buffer type still needs to implement .alloc_buffer:
// https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099
ggml_backend_buffer_type buft = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_mapped_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
/* .context = */ raw_ctx,
};

bufts.emplace_back(buft);
}

initialized = true;
}

return &bufts[device];
}

// backend

static const char * ggml_backend_metal_name(ggml_backend_t backend) {
return "Metal";
ggml_metal_t ctx = (ggml_metal_t)backend->context;

GGML_UNUSED(backend);
return ggml_metal_get_name(ctx);
}

static void ggml_backend_metal_free(ggml_backend_t backend) {
@ -409,12 +501,24 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const gg
}

static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
return false;
if (!ggml_backend_is_metal(backend_src) || !ggml_backend_is_metal(backend_dst)) {
return false;
}

GGML_UNUSED(backend_src);
GGML_UNUSED(backend_dst);
GGML_UNUSED(src);
GGML_UNUSED(dst);
if (!ggml_backend_buffer_is_metal(src->buffer) || !ggml_backend_buffer_is_metal(dst->buffer)) {
return false;
}

ggml_metal_t ctx_src = (ggml_metal_t)backend_src->context;
ggml_metal_t ctx_dst = (ggml_metal_t)backend_dst->context;

//ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
//ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;

//ggml_metal_buffer_t buf_ctx_src = (ggml_metal_buffer_t)buf_src->context;
//ggml_metal_buffer_t buf_ctx_dst = (ggml_metal_buffer_t)buf_dst->context;

return ggml_metal_cpy_tensor_async(ctx_src, ctx_dst, src, dst);
}

static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
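// note: the async copy path is attempted only when both backends and both tensor buffers
// are Metal; returning false signals the caller to fall back to the regular synchronous
// copy path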
@ -423,6 +527,20 @@ static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend,
return ggml_metal_graph_compute(ctx, cgraph);
}

static void ggml_backend_metal_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;
ggml_metal_event_t ev = (ggml_metal_event_t)event->context;

ggml_metal_event_record(ctx, ev);
}

static void ggml_backend_metal_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;
ggml_metal_event_t ev = (ggml_metal_event_t)event->context;

ggml_metal_event_wait(ctx, ev);
}

static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;
@ -435,7 +553,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;

ggml_metal_set_n_cb(ctx, n_cb);

}

static ggml_backend_i ggml_backend_metal_i = {
@ -450,12 +567,8 @@ static ggml_backend_i ggml_backend_metal_i = {
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute,

// the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal
// in any case, these docs seem relevant if we ever decide to implement it:
// https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .event_record = */ ggml_backend_metal_event_record,
/* .event_wait = */ ggml_backend_metal_event_wait,
/* .graph_optimize = */ ggml_backend_metal_graph_optimize,
};
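// note: wiring event_record/event_wait here replaces the previous NULL stubs; events are
// what the scheduler uses to synchronize work across devices in multi-GPU (or simulated
// multi-device) setups, as hinted by the Apple docs link in the removed comment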
@ -519,15 +632,17 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
// backend device

static const char * ggml_backend_metal_device_get_name(ggml_backend_dev_t dev) {
return "Metal";
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;

GGML_UNUSED(dev);
const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);

return props_dev->name;
}

static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) {
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;

return ggml_metal_device_get_props(ctx_dev)->name;
return ggml_metal_device_get_props(ctx_dev)->desc;
}

static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@ -550,14 +665,14 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);

props->caps = {
/* .async = */ true,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ true,
/* .events = */ false,
/* .async = */ true,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ true,
/* .events = */ true,
};
}

static ggml_backend_t ggml_backend_metal_device_init(ggml_backend_dev_t dev, const char * params) {
static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t dev, const char * params) {
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;

ggml_metal_t ctx = ggml_metal_init(ctx_dev);
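// note: flipping .events to true advertises the newly implemented event_new/event_record/
// event_wait hooks, so callers can rely on device events for cross-backend synchronization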
@ -587,7 +702,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml

const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);

return props_dev->use_shared_buffers ? ggml_backend_metal_buffer_type_shared() : ggml_backend_metal_buffer_type_private();
return props_dev->use_shared_buffers ? ggml_backend_metal_buffer_type_shared(props_dev->device) : ggml_backend_metal_buffer_type_private(props_dev->device);
}

static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
@ -595,7 +710,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen

ggml_metal_buffer_t res = ggml_metal_buffer_map(ctx_dev, ptr, size, max_tensor_size);

return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(), ggml_backend_metal_buffer_shared_i, res, size);
const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);

return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(props_dev->device), ggml_backend_metal_buffer_shared_i, res, size);
}

static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
@ -606,9 +723,10 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const

static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return
buft->device == dev && (
buft->iface.get_name == ggml_backend_metal_buffer_type_shared_get_name ||
buft->iface.get_name == ggml_backend_metal_buffer_type_private_get_name ||
buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name;
buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name);

GGML_UNUSED(dev);
}
@ -632,45 +750,97 @@ static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const g
get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
}

static ggml_backend_event_t ggml_backend_metal_device_event_new(ggml_backend_dev_t dev) {
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;

ggml_metal_event_t event = ggml_metal_device_event_init(ctx_dev);
GGML_ASSERT(event);

ggml_backend_event_t ev = new ggml_backend_event {
/* .device = */ dev,
/* .context = */ event,
};

return ev;
}

static void ggml_backend_metal_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;

ggml_metal_event_t ev = (ggml_metal_event_t)event->context;

ggml_metal_device_event_free(ctx_dev, ev);

delete event;
}

static void ggml_backend_metal_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;

ggml_metal_event_t evt = (ggml_metal_event_t)event->context;

ggml_metal_device_event_synchronize(ctx_dev, evt);
}

static ggml_backend_device_i ggml_backend_metal_device_i = {
/* .get_name = */ ggml_backend_metal_device_get_name,
/* .get_description = */ ggml_backend_metal_device_get_description,
/* .get_memory = */ ggml_backend_metal_device_get_memory,
/* .get_type = */ ggml_backend_metal_device_get_type,
/* .get_props = */ ggml_backend_metal_device_get_props,
/* .init_backend = */ ggml_backend_metal_device_init,
/* .init_backend = */ ggml_backend_metal_device_init_backend,
/* .get_buffer_type = */ ggml_backend_metal_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_mapped,
/* .supports_op = */ ggml_backend_metal_device_supports_op,
/* .supports_buft = */ ggml_backend_metal_device_supports_buft,
/* .offload_op = */ ggml_backend_metal_device_offload_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
/* .event_new = */ ggml_backend_metal_device_event_new,
/* .event_free = */ ggml_backend_metal_device_event_free,
/* .event_synchronize = */ ggml_backend_metal_device_event_synchronize,
};

// backend registry

struct ggml_backend_metal_reg {
std::vector<ggml_backend_dev_t> devices;
};

typedef struct ggml_backend_metal_reg * ggml_backend_metal_reg_t;

static ggml_backend_metal_reg_t ggml_backend_metal_reg_init(void) {
ggml_backend_metal_reg_t ctx = new struct ggml_backend_metal_reg;

return ctx;
}

static void ggml_backend_metal_reg_free(ggml_backend_metal_reg_t ctx) {
delete ctx;
}

struct ggml_backend_metal_reg_deleter {
void operator()(ggml_backend_metal_reg_t ctx) {
ggml_backend_metal_reg_free(ctx);
}
};

typedef std::unique_ptr<struct ggml_backend_metal_reg, ggml_backend_metal_reg_deleter> ggml_backend_metal_reg_ptr;

static const char * ggml_backend_metal_reg_get_name(ggml_backend_reg_t reg) {
return "Metal";
return GGML_METAL_NAME;

GGML_UNUSED(reg);
}

static size_t ggml_backend_metal_reg_device_count(ggml_backend_reg_t reg) {
return 1;

GGML_UNUSED(reg);
ggml_backend_metal_reg_t ctx = (ggml_backend_metal_reg_t)reg->context;
return ctx->devices.size();
}

static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);

return &g_ggml_metal_device;

GGML_UNUSED(reg);
GGML_UNUSED(index);
ggml_backend_metal_reg_t ctx = (ggml_backend_metal_reg_t)reg->context;
GGML_ASSERT(index < ctx->devices.size());
return ctx->devices[index];
}

static ggml_backend_feature g_ggml_backend_metal_features[] = {
@ -698,27 +868,67 @@ static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const

static ggml_backend_reg_i ggml_backend_metal_reg_i = {
/* .get_name = */ ggml_backend_metal_reg_get_name,
/* .device_count = */ ggml_backend_metal_reg_device_count,
/* .device_get = */ ggml_backend_metal_reg_device_get,
/* .get_device_count = */ ggml_backend_metal_reg_device_count,
/* .get_device = */ ggml_backend_metal_reg_device_get,
/* .get_proc_address = */ ggml_backend_metal_get_proc_address,
};

ggml_backend_reg_t ggml_backend_metal_reg(void) {
{
g_ggml_metal_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_metal_reg_i,
/* .context = */ NULL,
};
static ggml_backend_dev_t ggml_backend_metal_device_init(ggml_backend_reg_t reg, int device) {
return new ggml_backend_device {
/* .iface = */ ggml_backend_metal_device_i,
/* .reg = */ reg,
/* .context = */ ggml_metal_device_get(device),
};
}

g_ggml_metal_device = {
/* .iface = */ ggml_backend_metal_device_i,
/* .reg = */ &g_ggml_metal_reg,
/* .context = */ ggml_metal_device_get(),
};
static void ggml_backend_metal_device_free(ggml_backend_dev_t dev) {
delete dev;
}

struct ggml_backend_device_deleter {
void operator()(ggml_backend_dev_t ctx) {
ggml_backend_metal_device_free(ctx);
}
};

typedef std::unique_ptr<ggml_backend_device, ggml_backend_device_deleter> ggml_backend_device_ptr;

ggml_backend_reg_t ggml_backend_metal_reg(void) {
static ggml_backend_reg reg;
static bool initialized = false;

{
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);

const char * env = getenv("GGML_METAL_DEVICES");
if (env) {
g_devices = atoi(env);
}

static std::vector<ggml_backend_device_ptr> devs;

if (!initialized) {
static ggml_backend_metal_reg_ptr reg_ctx(ggml_backend_metal_reg_init());

for (int i = 0; i < g_devices; ++i) {
auto * dev = ggml_backend_metal_device_init(&reg, i);
devs.emplace_back(dev);

reg_ctx->devices.push_back(dev);
}

reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_metal_reg_i,
/* .context = */ reg_ctx.get(),
};
}

initialized = true;
}

return &g_ggml_metal_reg;
return &reg;
}

GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
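// note: per the "virtual devices" comment near the top of the file, setting the
// GGML_METAL_DEVICES environment variable (e.g. GGML_METAL_DEVICES=2) makes this registry
// report that many logical devices -- presumably backed by the same physical GPU -- which
// is useful for exercising multi-device code paths on a single Mac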
@ -5931,7 +5931,7 @@ template<
void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
short DK, // K head size
short DV, // V head size
short Q = OP_FLASH_ATTN_EXT_NQPTG, // queries per threadgroup
short Q = OP_FLASH_ATTN_EXT_NQPSG, // queries per threadgroup
short C = OP_FLASH_ATTN_EXT_NCPSG> // cache items per threadgroup
kernel void kernel_flash_attn_ext(
constant ggml_metal_kargs_flash_attn_ext & args,
@ -6141,11 +6141,10 @@ template<
void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
short DK, // K head size
short DV, // V head size
short NE, // head elements per thread
short Q, // queries per threadgroup
short C, // cache items per threadgroup
short NSG> // number of simd groups
void kernel_flash_attn_ext_vec_impl(
short NE = 4, // head elements per thread
short Q = OP_FLASH_ATTN_EXT_VEC_NQPSG, // queries per threadgroup
short C = OP_FLASH_ATTN_EXT_VEC_NCPSG> // cache items per threadgroup
kernel void kernel_flash_attn_ext_vec(
constant ggml_metal_kargs_flash_attn_ext_vec & args,
device const char * q,
device const char * k,
@ -6162,6 +6161,7 @@ void kernel_flash_attn_ext_vec_impl(
static_assert(DV % 32 == 0, "DV must be divisible by 32");

#define NWG (FC_flash_attn_ext_vec_nwg)
#define NSG (FC_flash_attn_ext_vec_nsg)

#define NS10 (FC_flash_attn_ext_vec_ns10)
#define NS20 (FC_flash_attn_ext_vec_ns20)
@ -6190,12 +6190,12 @@ void kernel_flash_attn_ext_vec_impl(

const short T = PK + NSG*SH; // shared memory size per query in (half)

//threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*PK); // holds the query data
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*PK); // same as above but in q4_t
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*PK); // scratch buffer for attention
threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*PK); // same as above but in s4_t
threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + 2*C + Q*PK); // scratch buffer for mask
threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*PV + Q*T); // scratch buffer for the results
//threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*PK); // holds the query data
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*PK); // same as above but in q4_t
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + NSG*PK); // scratch buffer for attention
threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + NSG*PK); // same as above but in s4_t
threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + 2*C + NSG*PK); // scratch buffer for mask
threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*PV + NSG*PK + NSG*SH); // scratch buffer for the results

// store the result for all queries in shared memory (the O matrix from the paper)
so4 += tiisg;
@ -6213,11 +6213,13 @@ void kernel_flash_attn_ext_vec_impl(
// load heads from Q to shared memory
device const float4 * q4 = (device const float4 *) ((device const char *) q);

for (short i = tiisg; i < PK4; i += NW) {
if (iq1 < args.ne01 && i < DK4) {
sq4[i] = (q4_t) q4[i];
} else {
sq4[i] = (q4_t) 0.0f;
if (iq1 < args.ne01) {
for (short i = tiisg; i < PK4; i += NW) {
if (i < DK4) {
sq4[i] = (q4_t) q4[i];
} else {
sq4[i] = (q4_t) 0.0f;
}
}
}
@ -6295,7 +6297,7 @@ void kernel_flash_attn_ext_vec_impl(
}

// skip -INF blocks
if (simd_max(sm[tiisg]) == -INFINITY) {
if (simd_max(sm[tiisg]) <= -MAXHALF) {
continue;
}
@ -6569,57 +6571,11 @@ void kernel_flash_attn_ext_vec_impl(
}

#undef NWG
#undef NSG
#undef NS10
#undef NS20
}

template<
typename q4_t, // query types in shared memory
typename k4_t, // key types in shared memory
typename v4_t, // value types in shared memory
typename qk_t, // Q*K types
typename s_t, // soft-max types
typename s4_t,
typename o4_t, // attention accumulation types
typename kd4_t, // key type in device memory
short nl_k,
void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &),
typename vd4_t, // value type in device memory
short nl_v,
void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
short DK, // K head size
short DV, // V head size
short NE = 4, // head elements per thread
short Q = OP_FLASH_ATTN_EXT_VEC_NQPTG, // queries per threadgroup
short C = OP_FLASH_ATTN_EXT_VEC_NCPSG> // cache items per threadgroup
kernel void kernel_flash_attn_ext_vec(
constant ggml_metal_kargs_flash_attn_ext_vec & args,
device const char * q,
device const char * k,
device const char * v,
device const char * mask,
device const char * sinks,
device const char * pad,
device char * dst,
threadgroup half * shmem_f16 [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
#define FWD_TMPL q4_t, k4_t, v4_t, qk_t, s_t, s4_t, o4_t, kd4_t, nl_k, deq_k_t4, vd4_t, nl_v, deq_v_t4, DK, DV, NE, Q, C
#define FWD_ARGS args, q, k, v, mask, sinks, pad, dst, shmem_f16, tgpig, tiisg, sgitg
switch (FC_flash_attn_ext_vec_nsg) {
// note: disabled cases to reduce library load time
case 1: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 1>(FWD_ARGS); break;
case 2: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 2>(FWD_ARGS); break;
case 4: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 4>(FWD_ARGS); break;
//case 8: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 8>(FWD_ARGS); break;
//case 16: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 16>(FWD_ARGS); break;
//case 32: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 32>(FWD_ARGS); break;
}
#undef FWD_TMPL
#undef FWD_ARGS
}
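// note: FC_flash_attn_ext_vec_nsg is a function constant, so the number of simd groups is
// chosen when the pipeline is specialized at runtime; the switch above instantiates only
// the 1/2/4 template cases (the larger ones stay commented out) to keep the shader library
// load time down, as the comment inside the switch notes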
// note: I think the s_t can be half instead of float, because the Q*K scaling is done before storing to shared mem
// in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max
//
@ -453,7 +453,6 @@ struct ggml_backend_opencl_context {
cl_program program_rms_norm;
cl_program program_group_norm;
cl_program program_rope;
cl_program program_scale;
cl_program program_silu;
cl_program program_sigmoid;
cl_program program_softmax_f32;
@ -462,11 +461,8 @@ struct ggml_backend_opencl_context {
cl_program program_softmax_4_f16;
cl_program program_argsort_f32_i32;
cl_program program_sum_rows_f32;
cl_program program_repeat;
cl_program program_pad;
cl_program program_tanh;
cl_program program_upscale;
cl_program program_concat;
cl_program program_conv_2d_f16;
cl_program program_conv_2d_f32;
cl_program program_conv_2d_f16_f32;
@ -485,7 +481,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
cl_kernel kernel_add_id;
cl_kernel kernel_scale;
cl_kernel kernel_scale_f32, kernel_scale_f32_4;
cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
cl_kernel kernel_mean_f32;
@ -544,18 +540,17 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
cl_kernel kernel_argsort_f32_i32;
cl_kernel kernel_sum_rows_f32;
cl_kernel kernel_repeat;
cl_kernel kernel_repeat_f32;
cl_kernel kernel_pad;
cl_kernel kernel_tanh_f32_nd;
cl_kernel kernel_tanh_f16_nd;
cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
cl_kernel kernel_tanh_f16, kernel_tanh_f16_4, kernel_tanh_f16_nc;
cl_kernel kernel_expm1_f32_nd;
cl_kernel kernel_expm1_f16_nd;
cl_kernel kernel_softplus_f32_nd;
cl_kernel kernel_softplus_f16_nd;
cl_kernel kernel_upscale;
cl_kernel kernel_upscale_bilinear;
cl_kernel kernel_concat_f32_contiguous;
cl_kernel kernel_concat_f32_non_contiguous;
cl_kernel kernel_concat_f32;
cl_kernel kernel_conv_2d_f16;
cl_kernel kernel_conv_2d_f32;
cl_kernel kernel_conv_2d_f16_f32;
@ -1483,10 +1478,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
#else
const std::string kernel_src = read_file("scale.cl");
#endif
backend_ctx->program_scale =
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program_scale, "kernel_scale", &err), err));
CL_CHECK((backend_ctx->kernel_scale_f32 = clCreateKernel(prog, "kernel_scale_f32", &err), err));
CL_CHECK((backend_ctx->kernel_scale_f32_4 = clCreateKernel(prog, "kernel_scale_f32_4", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
@ -1814,16 +1811,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
#else
const std::string kernel_src = read_file("repeat.cl");
#endif
if (!kernel_src.empty()) {
backend_ctx->program_repeat =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err));
GGML_LOG_CONT(".");
} else {
GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
backend_ctx->program_repeat = nullptr;
backend_ctx->kernel_repeat = nullptr;
}
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_repeat_f32 = clCreateKernel(prog, "kernel_repeat_f32", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}

// pad
@ -1856,18 +1848,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
#else
const std::string kernel_src = read_file("tanh.cl");
#endif
if (!kernel_src.empty()) {
backend_ctx->program_tanh =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err));
CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err));
GGML_LOG_CONT(".");
} else {
GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n");
backend_ctx->program_tanh = nullptr;
backend_ctx->kernel_tanh_f32_nd = nullptr;
backend_ctx->kernel_tanh_f16_nd = nullptr;
}
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_tanh_f32 = clCreateKernel(prog, "kernel_tanh_f32", &err), err));
CL_CHECK((backend_ctx->kernel_tanh_f32_4 = clCreateKernel(prog, "kernel_tanh_f32_4", &err), err));
CL_CHECK((backend_ctx->kernel_tanh_f32_nc = clCreateKernel(prog, "kernel_tanh_f32_nc", &err), err));
CL_CHECK((backend_ctx->kernel_tanh_f16 = clCreateKernel(prog, "kernel_tanh_f16", &err), err));
CL_CHECK((backend_ctx->kernel_tanh_f16_4 = clCreateKernel(prog, "kernel_tanh_f16_4", &err), err));
CL_CHECK((backend_ctx->kernel_tanh_f16_nc = clCreateKernel(prog, "kernel_tanh_f16_nc", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}

// expm1
@ -1959,22 +1949,13 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
#include "concat.cl.h"
};
#else

const std::string kernel_src = read_file("concat.cl");
#endif
if (!kernel_src.empty()) {
backend_ctx->program_concat =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err));
CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
GGML_LOG_CONT(".");
} else {
GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
backend_ctx->program_concat = nullptr;
backend_ctx->kernel_concat_f32_contiguous = nullptr;
backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
}
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}

// timestep_embedding
@ -3318,8 +3299,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_UNARY_OP_SIGMOID:
return ggml_is_contiguous(op->src[0]);
case GGML_UNARY_OP_TANH:
return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
(op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
case GGML_UNARY_OP_EXPM1:
return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
(op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
@ -7029,79 +7009,87 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

cl_ulong offset0_abs = extra0->offset + src0->view_offs;
cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;

const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];

const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];

const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];

cl_kernel kernel;
if (dst->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_tanh_f32_nd;
} else if (dst->type == GGML_TYPE_F16) {
kernel = backend_ctx->kernel_tanh_f16_nd;
} else {
GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
}
GGML_ASSERT(kernel != nullptr);

const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3];
const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3];

const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3];
const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));

CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));

CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));

size_t global_work_size[3];
if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
return;
}
global_work_size[0] = (size_t)ne10;
global_work_size[1] = (size_t)ne11;
global_work_size[2] = (size_t)ne12;

size_t lws0 = 16, lws1 = 4, lws2 = 1;
if (ne10 < 16) lws0 = ne10;
if (ne11 < 4) lws1 = ne11;
if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;

while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;

size_t local_work_size[] = {lws0, lws1, lws2};

size_t* local_work_size_ptr = local_work_size;
if (!backend_ctx->non_uniform_workgroups) {
if (global_work_size[0] % local_work_size[0] != 0 ||
global_work_size[1] % local_work_size[1] != 0 ||
global_work_size[2] % local_work_size[2] != 0) {
local_work_size_ptr = NULL;
if (ggml_is_contiguous(src0)) {
// Handle contiguous input
int n = ggml_nelements(dst);
if (n % 4 == 0) {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_tanh_f32_4;
} else {
kernel = backend_ctx->kernel_tanh_f16_4;
}
n /= 4;
} else {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_tanh_f32;
} else {
kernel = backend_ctx->kernel_tanh_f16;
}
}
}
}
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;

backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};

size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr;
}

backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
} else {
// Handle non-contiguous input
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_tanh_f32_nc;
} else {
kernel = backend_ctx->kernel_tanh_f16_nc;
}

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));

int nth = 64;

size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)nth, 1, 1};

backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
}

static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
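// note: in the rewritten tanh path above, contiguous tensors take a flat 1-D launch and,
// when the element count is divisible by 4, the *_4 kernels that process four values per
// work item; non-contiguous tensors fall back to the *_nc kernels, which receive the full
// stride information instead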
@ -7319,53 +7307,58 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con

ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

if (backend_ctx->kernel_repeat == nullptr) {
GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
return;
}
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;

cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
cl_ulong off_dst = extra_dst->offset + dst->view_offs;
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];

const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3];
const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3];
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];

const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3];
const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3];
const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];

cl_kernel kernel = backend_ctx->kernel_repeat;
const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
cl_kernel kernel = backend_ctx->kernel_repeat_f32;

size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));

size_t global_work_size[] = { gws0, gws1, gws2 };
int nth = 64;

backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
size_t local_work_size[] = {(size_t)nth, 1, 1};

backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
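// note: the new kernel_repeat_f32 launch uses 64 work items per destination row
// (global size ne1*nth x ne2 x ne3), so each work group appears to walk one dst row
// along ne0 instead of launching one work item per output row as before -- this is an
// interpretation of the work-size computation above, not a statement from the kernel source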
@ -7589,121 +7582,76 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
GGML_ASSERT(dst->type == GGML_TYPE_F32);

ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
cl_command_queue queue = backend_ctx->queue;

if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
return;
}
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;

cl_ulong off_src0 = extra0_cl->offset + src0->view_offs;
cl_ulong off_src1 = extra1_cl->offset + src1->view_offs;
cl_ulong off_dst = extrad_cl->offset + dst->view_offs;
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];

const int32_t dim = ((const int32_t *) dst->op_params)[0];
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];

const cl_ulong nb10 = src1->nb[0];
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const cl_ulong nb13 = src1->nb[3];

const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];

const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];

const cl_int dim = ((const int32_t *) dst->op_params)[0];
GGML_ASSERT(dim >= 0 && dim <= 3);

if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
if (dim == 3) {
int nth = MIN(64, ne0);

size_t nbytes_src0 = ggml_nbytes(src0);
size_t nbytes_src1 = ggml_nbytes(src1);
cl_kernel kernel = backend_ctx->kernel_concat_f32;

CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
} else {
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));

cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
size_t global_work_size[3];
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
size_t local_work_size[] = {(size_t)nth, 1, 1};

for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]);

int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &current_off_src0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &current_off_src1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &current_off_dst));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim));

global_work_size[0] = d_ne0;
global_work_size[1] = d_ne1;
global_work_size[2] = d_ne2;

backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
}
}
} else {
cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;

cl_long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];

cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];

cl_long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));

CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_long), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_long), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_long), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_long), &ne03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));

CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));

CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_long), &d_ne0));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_long), &d_ne1));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_long), &d_ne2));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_long), &d_ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));
|
||||
|
||||
size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
|
||||
d_ne2 > 0 ? (size_t)d_ne2 : 1,
|
||||
d_ne3 > 0 ? (size_t)d_ne3 : 1 };
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
|
||||
}
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
|
|
@ -8394,6 +8342,7 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
    CL_CHECK(clReleaseMemObject(D_sub_buffer));
    CL_CHECK(clReleaseMemObject(D_image1d));
#else
    GGML_UNUSED(backend);
    GGML_UNUSED(src0);
    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
@ -9913,7 +9862,16 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel = backend_ctx->kernel_scale;
    cl_kernel kernel;

    int n = ggml_nelements(dst);

    if (n % 4 == 0) {
        kernel = backend_ctx->kernel_scale_f32_4;
        n /= 4;
    } else {
        kernel = backend_ctx->kernel_scale_f32;
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@ -9922,8 +9880,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias));

    int n = ggml_nelements(dst)/4;

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};
@ -1,109 +1,51 @@
|
|||
kernel void kernel_concat_f32_contiguous(
|
||||
global const char * p_src0, ulong off_src0,
|
||||
global const char * p_src1, ulong off_src1,
|
||||
global char * p_dst, ulong off_dst,
|
||||
int d_ne00, int d_ne01, int d_ne02, // src0->ne[0..2] for the slice
|
||||
int d_ne10, int d_ne11, int d_ne12, // src1->ne[0..2] for the slice (d_ne1X must match d_ne0X on non-concat axes)
|
||||
int d_ne0, int d_ne1, int d_ne2, // dst->ne[0..2] for the slice
|
||||
int dim
|
||||
kernel void kernel_concat_f32(
|
||||
global const char * src0,
|
||||
ulong offset0,
|
||||
global const char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
int dim
|
||||
) {
|
||||
global const float * src0 = (global const float*)((global char*)p_src0 + off_src0);
|
||||
global const float * src1 = (global const float*)((global char*)p_src1 + off_src1);
|
||||
global float * dst = (global float*)((global char*)p_dst + off_dst);
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i0 = get_global_id(0); // Index along dst's 0th dimension
|
||||
int i1 = get_global_id(1); // Index along dst's 1st dimension
|
||||
int i2 = get_global_id(2); // Index along dst's 2nd dimension
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
if (i0 >= d_ne0 || i1 >= d_ne1 || i2 >= d_ne2) {
|
||||
return;
|
||||
}
|
||||
int o[4] = {0, 0, 0, 0};
|
||||
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
|
||||
|
||||
ulong dst_idx = (ulong)i2 * d_ne0 * d_ne1 + (ulong)i1 * d_ne0 + i0;
|
||||
ulong src_idx;
|
||||
global const float * x;
|
||||
|
||||
if (dim == 0) {
|
||||
if (i0 < d_ne00) { // Data from src0
|
||||
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
|
||||
dst[dst_idx] = src0[src_idx];
|
||||
} else { // Data from src1
|
||||
src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + (i0 - d_ne00);
|
||||
dst[dst_idx] = src1[src_idx];
|
||||
}
|
||||
} else if (dim == 1) {
|
||||
if (i1 < d_ne01) { // Data from src0
|
||||
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
|
||||
dst[dst_idx] = src0[src_idx];
|
||||
} else { // Data from src1
|
||||
src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)(i1 - d_ne01) * d_ne10 + i0;
|
||||
dst[dst_idx] = src1[src_idx];
|
||||
}
|
||||
} else if (dim == 2) {
|
||||
if (i2 < d_ne02) { // Data from src0
|
||||
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
|
||||
dst[dst_idx] = src0[src_idx];
|
||||
} else { // Data from src1
|
||||
|
||||
src_idx = (ulong)(i2 - d_ne02) * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + i0;
|
||||
dst[dst_idx] = src1[src_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_concat_f32_non_contiguous(
|
||||
global const char * p_src0, ulong off_src0,
|
||||
global const char * p_src1, ulong off_src1,
|
||||
global char * p_dst, ulong off_dst,
|
||||
|
||||
long ne00, long ne01, long ne02, long ne03,
|
||||
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
|
||||
|
||||
ulong nb10, ulong nb11, ulong nb12, ulong nb13, // Strides for src1
|
||||
|
||||
long d_ne0, long d_ne1, long d_ne2, long d_ne3,
|
||||
ulong d_nb0, ulong d_nb1, ulong d_nb2, ulong d_nb3,
|
||||
int dim
|
||||
) {
|
||||
global const char * src0_base = p_src0 + off_src0;
|
||||
global const char * src1_base = p_src1 + off_src1;
|
||||
global char * dst_base = p_dst + off_dst;
|
||||
|
||||
long current_i1 = get_global_id(0); // Index for dst_dim_1
|
||||
long current_i2 = get_global_id(1); // Index for dst_dim_2
|
||||
long current_i3 = get_global_id(2); // Index for dst_dim_3
|
||||
|
||||
if (current_i1 >= d_ne1 || current_i2 >= d_ne2 || current_i3 >= d_ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
global const float * x_val_ptr;
|
||||
global float * y_val_ptr;
|
||||
|
||||
for (long current_i0 = 0; current_i0 < d_ne0; ++current_i0) {
|
||||
bool use_src0;
|
||||
long s_i0 = current_i0, s_i1 = current_i1, s_i2 = current_i2, s_i3 = current_i3;
|
||||
|
||||
if (dim == 0) {
|
||||
use_src0 = (current_i0 < ne00);
|
||||
if (!use_src0) { s_i0 = current_i0 - ne00; }
|
||||
} else if (dim == 1) {
|
||||
use_src0 = (current_i1 < ne01);
|
||||
if (!use_src0) { s_i1 = current_i1 - ne01; }
|
||||
} else if (dim == 2) {
|
||||
use_src0 = (current_i2 < ne02);
|
||||
if (!use_src0) { s_i2 = current_i2 - ne02; }
|
||||
} else { // dim == 3
|
||||
use_src0 = (current_i3 < ne03);
|
||||
if (!use_src0) { s_i3 = current_i3 - ne03; }
|
||||
}
|
||||
|
||||
if (use_src0) {
|
||||
x_val_ptr = (global const float *)(src0_base + (ulong)s_i3*nb03 + (ulong)s_i2*nb02 + (ulong)s_i1*nb01 + (ulong)s_i0*nb00);
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (global const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
|
||||
} else {
|
||||
x_val_ptr = (global const float *)(src1_base + (ulong)s_i3*nb13 + (ulong)s_i2*nb12 + (ulong)s_i1*nb11 + (ulong)s_i0*nb10);
|
||||
x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
|
||||
}
|
||||
|
||||
y_val_ptr = (global float *)(dst_base + (ulong)current_i3*d_nb3 + (ulong)current_i2*d_nb2 + (ulong)current_i1*d_nb1 + (ulong)current_i0*d_nb0);
|
||||
*y_val_ptr = *x_val_ptr;
|
||||
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,39 +1,38 @@
|
|||
kernel void kernel_repeat(
|
||||
global const char * src0_data_in,
|
||||
global char * dst_data_in,
|
||||
ulong src0_offset,
|
||||
ulong dst_offset,
|
||||
int src0_ne0, int src0_ne1, int src0_ne2, int src0_ne3,
|
||||
ulong src0_nb0, ulong src0_nb1, ulong src0_nb2, ulong src0_nb3,
|
||||
int dst_ne0, int dst_ne1, int dst_ne2, int dst_ne3,
|
||||
ulong dst_nb0, ulong dst_nb1, ulong dst_nb2, ulong dst_nb3
|
||||
kernel void kernel_repeat_f32(
|
||||
global const char * src0,
|
||||
ulong offset0,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne0,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
global const char * src0_data = src0_data_in + src0_offset;
|
||||
global char * dst_data = dst_data_in + dst_offset;
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
const int d3 = get_global_id(2);
|
||||
const int d2 = get_global_id(1);
|
||||
const int d1 = get_global_id(0);
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
if (d3 >= dst_ne3 || d2 >= dst_ne2 || d1 >= dst_ne1) {
|
||||
return;
|
||||
}
|
||||
const int i03 = i3%ne03;
|
||||
const int i02 = i2%ne02;
|
||||
const int i01 = i1%ne01;
|
||||
|
||||
const int s3 = d3 % src0_ne3;
|
||||
const int s2 = d2 % src0_ne2;
|
||||
const int s1 = d1 % src0_ne1;
|
||||
global const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
|
||||
global char * dst_ptr = dst + i3*nb3 + i2*nb2 + i1*nb1;
|
||||
|
||||
const global char * p_src0_slice = src0_data + (ulong)s3*src0_nb3 + (ulong)s2*src0_nb2 + (ulong)s1*src0_nb1;
|
||||
global char * p_dst_slice = dst_data + (ulong)d3*dst_nb3 + (ulong)d2*dst_nb2 + (ulong)d1*dst_nb1;
|
||||
|
||||
for (int d0 = 0; d0 < dst_ne0; ++d0) {
|
||||
// Determine source index for dimension 0 based on tiling/broadcasting.
|
||||
const int s0 = d0 % src0_ne0;
|
||||
|
||||
const global char * restrict current_src_el_ptr = p_src0_slice + (ulong)s0*src0_nb0;
|
||||
global char * restrict current_dst_el_ptr = p_dst_slice + (ulong)d0*dst_nb0;
|
||||
for (int k = 0; k < src0_nb0; ++k) {
|
||||
current_dst_el_ptr[k] = current_src_el_ptr[k];
|
||||
}
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const int i00 = i0%ne00;
|
||||
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i00*nb00));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,9 +1,19 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

//------------------------------------------------------------------------------
// scale
//------------------------------------------------------------------------------
kernel void kernel_scale(
kernel void kernel_scale_f32(
        global float * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd,
        float scale,
        float bias
) {
    src0 = (global float*)((global char*)src0 + offset0);
    dst = (global float*)((global char*)dst + offsetd);
    dst[get_global_id(0)] = src0[get_global_id(0)] * scale + bias;
}

kernel void kernel_scale_f32_4(
        global float4 * src0,
        ulong offset0,
        global float4 * dst,
@ -1,63 +1,109 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
kernel void kernel_tanh_f32_nd(
|
||||
global void * p_src0_base, ulong off_src0_abs,
|
||||
global void * p_dst_base, ulong off_dst_abs,
|
||||
int ne00, int ne01, int ne02, int ne03,
|
||||
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
ulong nb10, ulong nb11, ulong nb12, ulong nb13
|
||||
kernel void kernel_tanh_f32(
|
||||
global const float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
int i0 = get_global_id(0);
|
||||
int i1 = get_global_id(1);
|
||||
int i2 = get_global_id(2);
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
|
||||
for (int i3 = 0; i3 < ne13; ++i3) {
|
||||
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
|
||||
global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
|
||||
dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
|
||||
}
|
||||
|
||||
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
|
||||
global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
|
||||
kernel void kernel_tanh_f32_4(
|
||||
global const float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
*dst_val_ptr = tanh(*src_val_ptr);
|
||||
}
|
||||
dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
|
||||
}
|
||||
|
||||
kernel void kernel_tanh_f16(
|
||||
global const half * src0,
|
||||
ulong offset0,
|
||||
global half * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global half*)((global char*)src0 + offset0);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
|
||||
}
|
||||
|
||||
kernel void kernel_tanh_f16_4(
|
||||
global const half4 * src0,
|
||||
ulong offset0,
|
||||
global half4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global half4*)((global char*)src0 + offset0);
|
||||
dst = (global half4*)((global char*)dst + offsetd);
|
||||
|
||||
dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
|
||||
}
|
||||
|
||||
kernel void kernel_tanh_f32_nc(
|
||||
global const char * src0,
|
||||
ulong offset0,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
|
||||
global const float * x = (global const float *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*y = tanh(*x);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_tanh_f16_nd(
|
||||
global void * p_src0_base, ulong off_src0_abs,
|
||||
global void * p_dst_base, ulong off_dst_abs,
|
||||
int ne00, int ne01, int ne02, int ne03,
|
||||
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
ulong nb10, ulong nb11, ulong nb12, ulong nb13
|
||||
kernel void kernel_tanh_f16_nc(
|
||||
global const char * src0,
|
||||
ulong offset0,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
int i0 = get_global_id(0);
|
||||
int i1 = get_global_id(1);
|
||||
int i2 = get_global_id(2);
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
|
||||
for (int i3 = 0; i3 < ne13; ++i3) {
|
||||
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
|
||||
global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
|
||||
global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
|
||||
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
|
||||
global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
global half * y = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*dst_val_ptr = tanh(*src_val_ptr);
|
||||
}
|
||||
*y = tanh(*x);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
|
||||
|
||||
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
|
||||
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
|
||||
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL)$")
|
||||
message(FATAL_ERROR "GGML_SYCL_TARGET: Invalid target, the supported options are [INTEL]")
|
||||
endif()
|
||||
|
||||
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
|
||||
|
|
@ -125,25 +125,22 @@ endif()
|
|||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
|
||||
|
||||
if (GGML_SYCL_F16)
|
||||
if (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
|
||||
endif()
|
||||
add_compile_definitions(GGML_SYCL_F16)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
|
||||
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
# INFO: Allowed Sub_group_sizes are not consistent through all
|
||||
# hip targets. For example, 64 is used for certain models, but the backend
|
||||
# does not support it.
|
||||
# Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||
|
||||
# Link against Intel oneMKL
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
set(SYCL_COMPILER ON)
|
||||
endif()
|
||||
find_package(MKL REQUIRED)
|
||||
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
|
||||
else()
|
||||
# default for other target
|
||||
message(FATAL_ERROR "GGML_SYCL_TARGET is not supported")
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||
endif()
|
||||
|
||||
|
|
@ -151,82 +148,6 @@ if (GGML_SYCL_GRAPH)
|
|||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
|
||||
endif()
|
||||
|
||||
# Link against Intel oneMKL or oneMath
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
# Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
|
||||
# See https://github.com/uxlfoundation/oneMath/issues/654
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
set(SYCL_COMPILER ON)
|
||||
endif()
|
||||
find_package(MKL REQUIRED)
|
||||
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
|
||||
else()
|
||||
find_package(oneMath QUIET)
|
||||
if (NOT oneMath_FOUND)
|
||||
message(STATUS "oneMath not found: oneMath will be automatically downloaded")
|
||||
# Use FetchContent to automatically pull and build oneMath
|
||||
include(FetchContent)
|
||||
set(BUILD_FUNCTIONAL_TESTS False)
|
||||
set(BUILD_EXAMPLES False)
|
||||
set(TARGET_DOMAINS blas)
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(ENABLE_MKLCPU_BACKEND False)
|
||||
set(ENABLE_MKLGPU_BACKEND False)
|
||||
set(ENABLE_CUBLAS_BACKEND True)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
set(ENABLE_MKLCPU_BACKEND False)
|
||||
set(ENABLE_MKLGPU_BACKEND False)
|
||||
set(ENABLE_ROCBLAS_BACKEND True)
|
||||
# Ensure setting a string variable here is not overriden by oneMath CACHE variables
|
||||
cmake_policy(SET CMP0126 NEW)
|
||||
# Setting the device architecture is only needed and useful for AMD devices in oneMath
|
||||
set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE)
|
||||
endif()
|
||||
FetchContent_Declare(
|
||||
ONEMATH
|
||||
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
|
||||
GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
|
||||
)
|
||||
FetchContent_MakeAvailable(ONEMATH)
|
||||
# Create alias to match with find_package targets name
|
||||
function(onemath_alias target)
|
||||
if (TARGET ${target}_obj)
|
||||
# Silence verbose warnings from external libraries
|
||||
target_compile_options(${target}_obj PRIVATE -w)
|
||||
endif()
|
||||
if (TARGET ${target})
|
||||
add_library(ONEMATH::${target} ALIAS ${target})
|
||||
endif()
|
||||
endfunction()
|
||||
onemath_alias(onemath)
|
||||
onemath_alias(onemath_blas_mklcpu)
|
||||
onemath_alias(onemath_blas_mklgpu)
|
||||
onemath_alias(onemath_blas_cublas)
|
||||
onemath_alias(onemath_blas_rocblas)
|
||||
endif()
|
||||
|
||||
# Below oneMath compile-time dispatching is used for better performance
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas)
|
||||
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
|
||||
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
if (NOT GGML_SYCL_DEVICE_ARCH)
|
||||
message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
|
||||
endif()
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
|
||||
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
|
||||
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
|
||||
else()
|
||||
# Fallback to oneMath runtime dispatcher
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL_DEVICE_ARCH)
|
||||
target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
|
||||
target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
|
||||
|
|
|
|||
|
|
@ -15,17 +15,9 @@
|
|||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <sycl/half_type.hpp>
|
||||
#include <map>
|
||||
|
||||
#ifdef GGML_SYCL_USE_INTEL_ONEMKL
|
||||
#include <oneapi/mkl.hpp>
|
||||
// Allow to use the same namespace for Intel oneMKL and oneMath
|
||||
namespace oneapi {
|
||||
namespace math = mkl;
|
||||
}
|
||||
#else
|
||||
#include <oneapi/math.hpp>
|
||||
#endif
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
|
|
@ -91,32 +83,13 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
|
|||
}
|
||||
|
||||
template <typename Ts> struct matrix_info_t {
|
||||
oneapi::math::transpose transpose_info[2];
|
||||
oneapi::mkl::transpose transpose_info[2];
|
||||
Ts value_info[2];
|
||||
std::int64_t size_info[3];
|
||||
std::int64_t ld_info[3];
|
||||
std::int64_t groupsize_info;
|
||||
};
|
||||
|
||||
inline auto get_onemath_backend(sycl::queue& queue)
|
||||
#if defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
|
||||
-> sycl::queue&
|
||||
#endif
|
||||
{
|
||||
// If the backend is known at compile-time, use oneMath backend_selector to use
|
||||
// compile-time dispatching and avoid the need to dlopen libraries. Otherwise
|
||||
// fallback to runtime dispatching.
|
||||
#if defined(GGML_SYCL_NVIDIA)
|
||||
return oneapi::math::backend_selector<oneapi::math::backend::cublas>{ queue };
|
||||
#elif defined(GGML_SYCL_AMD)
|
||||
return oneapi::math::backend_selector<oneapi::math::backend::rocblas>{ queue };
|
||||
#elif defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
|
||||
return queue;
|
||||
#else
|
||||
static_assert(false, "Unsupported backend");
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace dpct
|
||||
{
|
||||
typedef sycl::queue *queue_ptr;
|
||||
|
|
@ -1734,7 +1707,7 @@ namespace dpct
|
|||
namespace detail
|
||||
{
|
||||
template <class Ta, class Tb, class Tc, class Ts>
|
||||
inline void gemm_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
|
||||
inline void gemm_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
|
||||
int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb,
|
||||
const void * beta, void * c, int ldc) {
|
||||
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
|
||||
|
|
@ -1742,7 +1715,7 @@ namespace dpct
|
|||
auto data_a = get_memory<const Ta>(a);
|
||||
auto data_b = get_memory<const Tb>(b);
|
||||
auto data_c = get_memory<Tc>(c);
|
||||
oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a,
|
||||
oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a,
|
||||
lda, data_b, ldb, beta_value, data_c, ldc);
|
||||
}
|
||||
|
||||
|
|
@ -1774,7 +1747,7 @@ namespace dpct
|
|||
};
|
||||
|
||||
template <class Ta, class Tb, class Tc, class Ts>
|
||||
inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
|
||||
inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
|
||||
int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
|
||||
int ldb, const void * beta, void ** c, int ldc, int batch_size,
|
||||
matrix_info_t<float> * matrix_info) {
|
||||
|
|
@ -1793,8 +1766,8 @@ namespace dpct
|
|||
matrix_info->ld_info[2] = ldc;
|
||||
matrix_info->groupsize_info = batch_size;
|
||||
|
||||
sycl::event e = oneapi::math::blas::column_major::gemm_batch(
|
||||
get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1,
|
||||
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
|
||||
q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
|
||||
matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2,
|
||||
reinterpret_cast<Ts *>(matrix_info->value_info), reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
|
||||
reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
|
||||
|
|
@ -1803,7 +1776,7 @@ namespace dpct
|
|||
}
|
||||
|
||||
template <class Ta, class Tb, class Tc, class Ts>
|
||||
inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
|
||||
inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
|
||||
int m, int n, int k, const void * alpha, const void * a, int lda,
|
||||
long long int stride_a, const void * b, int ldb, long long int stride_b,
|
||||
const void * beta, void * c, int ldc, long long int stride_c, int batch_size) {
|
||||
|
|
@ -1812,7 +1785,7 @@ namespace dpct
|
|||
auto data_a = get_memory<const Ta>(a);
|
||||
auto data_b = get_memory<const Tb>(b);
|
||||
auto data_c = get_memory<Tc>(c);
|
||||
oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value,
|
||||
oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value,
|
||||
data_a, lda, stride_a, data_b, ldb, stride_b, beta_value,
|
||||
data_c, ldc, stride_c, batch_size);
|
||||
}
|
||||
|
|
@ -2299,7 +2272,7 @@ namespace dpct
|
|||
sycl::range<3>(x, y, 1), direction);
|
||||
}
|
||||
|
||||
inline void gemm(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n,
|
||||
inline void gemm(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, int n,
|
||||
int k, const void * alpha, const void * a, library_data_t a_type, int lda, const void * b,
|
||||
library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc,
|
||||
library_data_t scaling_type) {
|
||||
|
|
@ -2366,7 +2339,7 @@ namespace dpct
|
|||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_float, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
|
||||
detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
|
||||
break;
|
||||
}
|
||||
|
|
@ -2405,7 +2378,7 @@ namespace dpct
|
|||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_bfloat16, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
|
||||
detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
|
||||
break;
|
||||
}
|
||||
|
|
@ -2447,7 +2420,7 @@ namespace dpct
|
|||
/// \param [in] ldc Leading dimension of C.
|
||||
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
|
||||
/// \param [in] scaling_type Data type of the scaling factors.
|
||||
inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
|
||||
inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
|
||||
int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
|
||||
const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
|
||||
library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
|
||||
|
|
@ -2485,7 +2458,7 @@ namespace dpct
|
|||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_bfloat16, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
|
||||
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
|
||||
break;
|
||||
}
|
||||
|
|
@ -2493,7 +2466,7 @@ namespace dpct
|
|||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_float, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
|
||||
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
|
||||
break;
|
||||
}
|
||||
|
|
@ -2569,7 +2542,7 @@ namespace dpct
|
|||
/// \param [in] stride_c Stride between the different C matrices.
|
||||
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
|
||||
/// \param [in] scaling_type Data type of the scaling factors.
|
||||
inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
|
||||
inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
|
||||
int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda,
|
||||
long long int stride_a, const void * b, library_data_t b_type, int ldb,
|
||||
long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc,
|
||||
|
|
@ -2642,7 +2615,7 @@ namespace dpct
|
|||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_bfloat16, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
|
||||
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
|
||||
batch_size);
|
||||
break;
|
||||
|
|
@ -2651,7 +2624,7 @@ namespace dpct
|
|||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_float, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
|
||||
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
|
||||
batch_size);
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -1840,6 +1840,110 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
    }
}

static void top_k_f32_sycl(
        const float * src,
        int32_t * dst_indices,
        const int64_t ncols,
        const int64_t nrows,
        const int k,
        dpct::queue_ptr main_stream
) {
    const int block_size = 128;

    const sycl::range<1> block_dims(block_size);
    const sycl::range<1> grid_dims(nrows);

    main_stream->submit([&](sycl::handler &cgh) {
        sycl::local_accessor<float, 1> shared_vals(sycl::range<1>(block_size * k), cgh);
        sycl::local_accessor<int, 1> shared_idx(sycl::range<1>(block_size * k), cgh);

        cgh.parallel_for(
            sycl::nd_range<1>(grid_dims * block_dims, block_dims),
            [=](sycl::nd_item<1> item_ct1) {
                const int row = item_ct1.get_group(0);
                const int tid = item_ct1.get_local_id(0);

                if (row >= nrows) return;

                const float * src_row = src + row * ncols;
                int32_t * dst_idx_row = dst_indices + row * k;

                float local_vals[32];
                int local_idx[32];

                for (int i = 0; i < k; i++) {
                    local_vals[i] = -FLT_MAX;
                    local_idx[i] = -1;
                }

                for (int col = tid; col < ncols; col += block_size) {
                    float val = src_row[col];

                    if (val > local_vals[k-1]) {
                        int pos = k - 1;
                        while (pos > 0 && val > local_vals[pos - 1]) {
                            pos--;
                        }

                        for (int i = k - 1; i > pos; i--) {
                            local_vals[i] = local_vals[i - 1];
                            local_idx[i] = local_idx[i - 1];
                        }
                        local_vals[pos] = val;
                        local_idx[pos] = col;
                    }
                }

                for (int i = 0; i < k; i++) {
                    shared_vals[tid * k + i] = local_vals[i];
                    shared_idx[tid * k + i] = local_idx[i];
                }
                item_ct1.barrier(sycl::access::fence_space::local_space);

                if (tid == 0) {
                    float final_vals[32];
                    int final_idx[32];

                    for (int i = 0; i < k; i++) {
                        final_vals[i] = -FLT_MAX;
                        final_idx[i] = -1;
                    }

                    for (int t = 0; t < block_size; t++) {
                        for (int i = 0; i < k; i++) {
                            float val = shared_vals[t * k + i];
                            int idx = shared_idx[t * k + i];

                            if (val > final_vals[k-1]) {
                                int pos = k - 1;
                                while (pos > 0 && val > final_vals[pos - 1]) {
                                    pos--;
                                }

                                for (int j = k - 1; j > pos; j--) {
                                    final_vals[j] = final_vals[j - 1];
                                    final_idx[j] = final_idx[j - 1];
                                }
                                final_vals[pos] = val;
                                final_idx[pos] = idx;
                            }
                        }
                    }

                    for (int i = 0; i < k; i++) {
                        dst_idx_row[i] = final_idx[i];
                    }

                    if (k > 1) {
                        int32_t temp = dst_idx_row[0];
                        dst_idx_row[0] = dst_idx_row[1];
                        dst_idx_row[1] = temp;
                    }
                }
            });
    });
}

static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
                                const int nrows, queue_ptr stream) {
    const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE);
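Note: the per-work-item selection in top_k_f32_sycl above keeps a small descending-sorted buffer of the k best (value, index) pairs and shifts entries down whenever a better candidate is seen. For reference, a minimal standalone sketch of that insertion step (plain C++, hypothetical helper name, not part of this change):

    #include <cfloat>
    #include <cstdint>

    // Maintain the k best (value, index) pairs in descending order of value.
    // Mirrors the insertion used per work-item in top_k_f32_sycl.
    static void insert_top_k(float val, int32_t idx, float * vals, int32_t * indices, int k) {
        if (val <= vals[k - 1]) {
            return; // not better than the current k-th best
        }
        int pos = k - 1;
        while (pos > 0 && val > vals[pos - 1]) {
            pos--; // find the slot this value belongs in
        }
        for (int i = k - 1; i > pos; i--) {
            vals[i]    = vals[i - 1]; // shift smaller entries down
            indices[i] = indices[i - 1];
        }
        vals[pos]    = val;
        indices[pos] = idx;
    }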
@ -2063,8 +2167,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|||
const sycl::half alpha_f16 = 1.0f;
|
||||
const sycl::half beta_f16 = 0.0f;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
|
||||
*stream, oneapi::math::transpose::trans,
|
||||
oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
|
||||
*stream, oneapi::mkl::transpose::trans,
|
||||
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
|
||||
&alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
|
||||
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
|
||||
dst_f16.get(), dpct::library_data_t::real_half, ldc,
|
||||
|
|
@ -2107,8 +2211,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|||
{
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
|
||||
get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
|
||||
*stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
|
||||
src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
|
||||
dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
|
||||
}
|
||||
|
|
@ -2231,6 +2335,30 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor *
                         main_stream, ctx.device);
}

static void ggml_sycl_op_top_k(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    GGML_ASSERT(src0);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));

    const float * src0_dd = static_cast<const float *>(src0->data);
    int32_t * dst_dd = static_cast<int32_t *>(dst->data);

    const int k = dst->ne[0];
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    GGML_ASSERT(k > 0 && k <= 32);
    GGML_ASSERT(k <= ncols);

    top_k_f32_sycl(src0_dd, dst_dd, ncols, nrows, k, main_stream);
}

inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
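Note: on the host side this op is reached through the generic GGML_OP_TOP_K dispatch shown further below. A rough usage sketch through the public ggml API (assuming ggml_top_k() lowers to GGML_OP_TOP_K as wired up in this change; sizes and values here are illustrative, not taken from the diff):

    #include "ggml.h"

    int main() {
        ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
        ggml_context * ctx = ggml_init(params);

        // 2 rows of 8 floats; fill with some values
        ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 2);
        for (int i = 0; i < 16; ++i) {
            ((float *) x->data)[i] = (float)(i % 8);
        }

        // indices of the 3 largest values per row (I32 result)
        ggml_tensor * idx = ggml_top_k(ctx, x, 3);

        ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, idx);
        ggml_graph_compute_with_ctx(ctx, gf, 1);

        ggml_free(ctx);
        return 0;
    }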
@ -3037,8 +3165,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|||
const int64_t smb = ne12 == 1 ? s13 : s12;
|
||||
|
||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
|
||||
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::mkl::transpose::trans,
|
||||
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
|
||||
src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
|
||||
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
|
||||
|
|
@ -3062,7 +3190,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|||
});
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
||||
*queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
*queue, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
|
||||
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
|
||||
(void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
|
||||
|
|
@ -3396,12 +3524,11 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|||
use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
|
||||
#endif // SYCL_USE_XMX
|
||||
|
||||
// mmvq path is faster in the CUDA backend.
|
||||
if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
|
||||
// Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
|
||||
// is enabled takes precedence over DMMV, the current if-else implementation
|
||||
// requires disabling DMMV if both conditions are met
|
||||
|| (should_reorder_tensor(ctx, dst) && ggml_sycl_supports_reorder_mmvq(src0->type)))) {
|
||||
// Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
|
||||
// is enabled takes precedence over DMMV, the current if-else implementation
|
||||
// requires disabling DMMV if both conditions are met
|
||||
if (!g_ggml_sycl_prioritize_dmmv && ((should_reorder_tensor(ctx, dst) &&
|
||||
ggml_sycl_supports_reorder_mmvq(src0->type)))) {
|
||||
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
||||
}
|
||||
|
||||
|
|
@ -4007,6 +4134,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
        case GGML_OP_ARGSORT:
            ggml_sycl_argsort(ctx, dst);
            break;
        case GGML_OP_TOP_K:
            ggml_sycl_op_top_k(ctx, dst);
            break;
        case GGML_OP_TIMESTEP_EMBEDDING:
            ggml_sycl_op_timestep_embedding(ctx, dst);
            break;
@ -4058,16 +4188,6 @@ void ggml_backend_sycl_get_device_memory(int device, size_t *free,
|
|||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
|
||||
ggml_sycl_set_device(device);
|
||||
|
||||
/*
|
||||
DPCT1009:218: SYCL uses exceptions to report errors and does not use the
|
||||
error codes. The original code was commented out and a warning string was
|
||||
inserted. You need to rewrite this code.
|
||||
*/
|
||||
/*
|
||||
DPCT1106:217: 'cudaMemGetInfo' was migrated with the Intel extensions for
|
||||
device information which may not be supported by all compilers or runtimes.
|
||||
You may need to adjust the code.
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total)));
|
||||
}
|
||||
|
|
@ -4710,6 +4830,15 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_ARGSORT:
            return op->src[0]->ne[0] * sizeof(int) <=
                   ggml_sycl_info().devices[device].smpbo;
        case GGML_OP_TOP_K: {
            const ggml_tensor * src0 = op->src[0];
            const int k = op->ne[0];
            return src0 &&
                   op->type == GGML_TYPE_I32 &&
                   src0->type == GGML_TYPE_F32 &&
                   ggml_is_contiguous(src0) &&
                   k > 0 && k <= 32;
        }
        case GGML_OP_POOL_2D:
        case GGML_OP_ACC:
            return true;
@ -32,12 +32,12 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
|||
|
||||
// Handle transposition of src1
|
||||
const bool src1_T = ggml_is_transposed(src1);
|
||||
const oneapi::math::transpose src1_op = src1_T ? oneapi::math::transpose::nontrans : oneapi::math::transpose::trans;
|
||||
const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
|
||||
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
|
||||
|
||||
try {
|
||||
// Perform matrix multiplication using oneMath GEMM
|
||||
oneapi::math::blas::column_major::gemm(get_onemath_backend(*stream), oneapi::math::transpose::nontrans, src1_op,
|
||||
// Perform matrix multiplication using oneMKL GEMM
|
||||
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
|
||||
ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
|
||||
}
|
||||
catch (sycl::exception const& exc) {
|
||||
|
|
|
|||
|
|
@ -207,7 +207,6 @@ static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, cons
|
|||
const int p = sector;
|
||||
theta_base = pos[channel_x] * sycl::pow(theta_scale, (float) p);
|
||||
} else {
|
||||
// Simplified from CUDA backend code: if (sector >= sections.v[0] && sector < sec_w) which is just sector >= sections.v[0]
|
||||
const int p = sector - sections.v[0];
|
||||
theta_base = pos[channel_x + ne2] * sycl::pow(theta_scale, (float) p);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
#include <sycl/sycl.hpp>
|
||||
#include "wkv.hpp"
|
||||
|
||||
constexpr int WKV_BLOCK_SIZE = 64; // Matching CUDA_WKV_BLOCK_SIZE
|
||||
constexpr int WKV_BLOCK_SIZE = 64;
|
||||
|
||||
// Helper function for the main kernel
|
||||
template <int block_size>
|
||||
|
|
|
|||
|
|
@ -7519,8 +7519,11 @@ void ggml_quantize_free(void) {

    iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
    iq2xs_free_impl(GGML_TYPE_IQ2_XS);
    iq2xs_free_impl(GGML_TYPE_IQ2_S);
    iq2xs_free_impl(GGML_TYPE_IQ1_S);
    iq2xs_free_impl(GGML_TYPE_IQ1_M);
    iq3xs_free_impl(256);
    iq3xs_free_impl(512);

    ggml_critical_section_end();
}
@ -284,6 +284,8 @@ class Keys:
    class ClipVision:
        PROJECTOR_TYPE     = "clip.vision.projector_type" # for mixed modality models
        IMAGE_SIZE         = "clip.vision.image_size"
        IMAGE_MIN_PIXELS   = "clip.vision.image_min_pixels"
        IMAGE_MAX_PIXELS   = "clip.vision.image_max_pixels"
        PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
        PATCH_SIZE         = "clip.vision.patch_size"
        EMBEDDING_LENGTH   = "clip.vision.embedding_length"
@ -1113,6 +1113,12 @@ class GGUFWriter:
    def add_vision_image_size(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)

    def add_vision_max_pixels(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.IMAGE_MAX_PIXELS, value)

    def add_vision_min_pixels(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.IMAGE_MIN_PIXELS, value)

    def add_vision_preproc_image_size(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)

@ -317,6 +317,7 @@ llama_context::llama_context(
        auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
        if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
            // ignore CPU backend
            // TODO: should we ignore ACCEL types too?
            continue;
        }
        auto * dev = ggml_backend_get_device(backend.get());
@ -2262,6 +2262,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<PRE>"
                    || t.first == "▁<PRE>"          // CodeLlama
                    || t.first == "<|code_prefix|>" // GLM-4.5
                    || t.first == "<|prefix|>"      // Falcon-H1-Tiny-Coder
                    ) {
                special_fim_pre_id = t.second;
                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@ -2282,6 +2283,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<SUF>"
                    || t.first == "▁<SUF>"          // CodeLlama
                    || t.first == "<|code_suffix|>" // GLM-4.5
                    || t.first == "<|suffix|>"      // Falcon-H1-Tiny-Coder
                    ) {
                special_fim_suf_id = t.second;
                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@ -2302,6 +2304,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<MID>"
                    || t.first == "▁<MID>"          // CodeLlama
                    || t.first == "<|code_middle|>" // GLM-4.5
                    || t.first == "<|middle|>"      // Falcon-H1-Tiny-Coder
                    ) {
                special_fim_mid_id = t.second;
                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@ -8591,6 +8591,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
            output_printer->print_operation(info);
            return false;
        }
        // Use reference implementation on the CPU backend for comparison
        using ggml_backend_cpu_set_use_ref_t = void (*)(ggml_backend_t, bool);
        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
        auto * set_use_ref = (ggml_backend_cpu_set_use_ref_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_use_ref");
        if (set_use_ref) {
            set_use_ref(backend_cpu, true);
        }

        size_t n_ok = 0;
        size_t tests_run = 0;
@ -189,12 +189,24 @@ static void test_conditionals(testing & t) {
        "negated"
    );

    test_template(t, "in operator",
    test_template(t, "in operator (element in array)",
        "{% if 'x' in items %}found{% endif %}",
        {{"items", json::array({"x", "y"})}},
        "found"
    );

    test_template(t, "in operator (substring)",
        "{% if 'bc' in 'abcd' %}found{% endif %}",
        json::object(),
        "found"
    );

    test_template(t, "in operator (object key)",
        "{% if 'key' in obj %}found{% endif %}",
        {{"obj", {{"key", 1}, {"other", 2}}}},
        "found"
    );

    test_template(t, "is defined",
        "{% if x is defined %}yes{% else %}no{% endif %}",
        {{"x", 1}},
@ -1036,6 +1048,42 @@ static void test_tests(testing & t) {
        json::object(),
        "yes"
    );

    test_template(t, "is in (array, true)",
        "{{ 'yes' if 2 is in([1, 2, 3]) }}",
        json::object(),
        "yes"
    );

    test_template(t, "is in (array, false)",
        "{{ 'yes' if 5 is in([1, 2, 3]) else 'no' }}",
        json::object(),
        "no"
    );

    test_template(t, "is in (string)",
        "{{ 'yes' if 'bc' is in('abcde') }}",
        json::object(),
        "yes"
    );

    test_template(t, "is in (object keys)",
        "{{ 'yes' if 'a' is in(obj) }}",
        {{"obj", {{"a", 1}, {"b", 2}}}},
        "yes"
    );

    test_template(t, "reject with in test",
        "{{ items | reject('in', skip) | join(', ') }}",
        {{"items", json::array({"a", "b", "c", "d"})}, {"skip", json::array({"b", "d"})}},
        "a, c"
    );

    test_template(t, "select with in test",
        "{{ items | select('in', keep) | join(', ') }}",
        {{"items", json::array({"a", "b", "c", "d"})}, {"keep", json::array({"b", "c"})}},
        "b, c"
    );
}

static void test_string_methods(testing & t) {
@ -37,6 +37,8 @@
// vision-specific
#define KEY_VISION_PROJ_TYPE   "clip.vision.projector_type" // for models with mixed modalities
#define KEY_IMAGE_SIZE         "clip.vision.image_size"
#define KEY_IMAGE_MIN_PIXELS   "clip.vision.image_min_pixels"
#define KEY_IMAGE_MAX_PIXELS   "clip.vision.image_max_pixels"
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
#define KEY_PATCH_SIZE         "clip.vision.patch_size"
#define KEY_IMAGE_MEAN         "clip.vision.image_mean"
@ -767,7 +767,7 @@ static bool router_validate_model(const std::string & name, server_models & mode
    }
    auto meta = models.get_meta(name);
    if (!meta.has_value()) {
        res_err(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
        res_err(res, format_error_response(string_format("model '%s' not found", name.c_str()), ERROR_TYPE_INVALID_REQUEST));
        return false;
    }
    if (models_autoload) {