vulkan: add RTE variants of exp shader (#16165)
This fixes some failures on Turing, where "round to zero" rounds an overflowing result down to the maximum finite f16 value while the CPU reference value is infinite.
This commit is contained in:
parent
4d0a7cbc61
commit
a20d810d79
|
|
@ -3391,7 +3391,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
||||||
CREATE_UNARY(exp)
|
|
||||||
CREATE_UNARY(gelu)
|
CREATE_UNARY(gelu)
|
||||||
CREATE_UNARY(gelu_erf)
|
CREATE_UNARY(gelu_erf)
|
||||||
CREATE_UNARY(gelu_quick)
|
CREATE_UNARY(gelu_quick)
|
||||||
|
|
@ -3403,6 +3402,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
CREATE_UNARY(hardswish)
|
CREATE_UNARY(hardswish)
|
||||||
#undef CREATE_UNARY
|
#undef CREATE_UNARY
|
||||||
|
|
||||||
|
#define CREATE_UNARY_RTE(name) \
|
||||||
|
if (device->float_controls_rte_fp16) { \
|
||||||
|
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32_rte", name ## _f32_rte_len, name ## _f32_rte_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||||
|
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16_rte", name ## _f16_rte_len, name ## _f16_rte_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||||
|
} else { \
|
||||||
|
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||||
|
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||||
|
}
|
||||||
|
CREATE_UNARY_RTE(exp)
|
||||||
|
#undef CREATE_UNARY_RTE
|
||||||
|
|
||||||
#define CREATE_GLU(name) \
|
#define CREATE_GLU(name) \
|
||||||
if (device->float_controls_rte_fp16) { \
|
if (device->float_controls_rte_fp16) { \
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32_rte", name ## _f32_rte_len, name ## _f32_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32_rte", name ## _f32_rte_len, name ## _f32_rte_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
#version 450
|
#version 450
|
||||||
|
|
||||||
|
#include "rte.comp"
|
||||||
#include "generic_head.comp"
|
#include "generic_head.comp"
|
||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -704,8 +704,11 @@ void process_shaders() {
|
||||||
|
|
||||||
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
|
|
||||||
string_to_spv("exp_f16", "exp.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
for (auto rte : {false, true}) {
|
||||||
string_to_spv("exp_f32", "exp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
std::string suffix = rte ? "_rte" : "";
|
||||||
|
string_to_spv("exp_f16" + suffix, "exp.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", rte ? "1" : "0"}});
|
||||||
|
string_to_spv("exp_f32" + suffix, "exp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"} , {"RTE16", rte ? "1" : "0"}});
|
||||||
|
}
|
||||||
string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||||
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
string_to_spv("gelu_erf_f16", "gelu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
string_to_spv("gelu_erf_f16", "gelu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue