Add inplace softmax

This commit is contained in:
Reese Levine 2025-09-24 14:56:33 -07:00
parent 8d78cd2613
commit 483615da76
5 changed files with 483 additions and 13 deletions

View File

@ -1617,6 +1617,13 @@ extern "C" {
float scale,
float max_bias);
// In-place variant of ggml_soft_max_ext(): the softmax result is written
// back into `a` instead of a newly allocated destination tensor.
GGML_API struct ggml_tensor * ggml_soft_max_ext_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
float scale,
float max_bias);
GGML_API void ggml_soft_max_add_sinks(
struct ggml_tensor * a,
struct ggml_tensor * sinks);

View File

@ -130,15 +130,16 @@ struct webgpu_context_struct {
wgpu::ComputePipeline set_rows_pipeline;
wgpu::ComputePipeline get_rows_pipeline[30];
wgpu::ComputePipeline get_rows_f32_no_vec_pipeline;
wgpu::ComputePipeline cpy_pipeline[2][2]; // src type, dst type
wgpu::ComputePipeline add_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline sub_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline mul_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline div_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline rms_norm_pipeline[2]; // inplace
wgpu::ComputePipeline rope_pipeline[2][2][2]; // type, ff, inplace
wgpu::ComputePipeline glu_pipeline[7][2][2]; // glu-op, type, split
wgpu::ComputePipeline scale_pipeline[2]; // inplace
wgpu::ComputePipeline cpy_pipeline[2][2]; // src type, dst type
wgpu::ComputePipeline add_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline sub_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline mul_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline div_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline rms_norm_pipeline[2]; // inplace
wgpu::ComputePipeline rope_pipeline[2][2][2]; // type, ff, inplace
wgpu::ComputePipeline glu_pipeline[7][2][2]; // glu-op, type, split
wgpu::ComputePipeline scale_pipeline[2]; // inplace
wgpu::ComputePipeline soft_max_pipeline[3][2][2]; // (no_mask, f32_mask, f16_mask), has_sink, inplace
size_t memset_bytes_per_thread;
@ -912,6 +913,79 @@ static void ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tens
ggml_op_name(dst->op));
}
// Encodes a GGML_OP_SOFT_MAX node for the WebGPU backend.
// src0 = input logits, src1 = optional mask (f32 or f16, nullptr when absent),
// src2 = optional attention sinks (nullptr when absent), dst = output
// (aliases src0 for the in-place variant).
static void ggml_webgpu_soft_max(webgpu_context & ctx,
ggml_tensor * src0,
ggml_tensor * src1,
ggml_tensor * src2,
ggml_tensor * dst) {
// In-place: dst occupies the same buffer region as src0, so no separate
// dst binding is pushed below and the *_inplace pipeline variant is used.
const int inplace = ggml_webgpu_tensor_equal(src0, dst);
// Pipeline index for the mask dimension: 0/1 are the ggml type ids of an
// f32/f16 mask; 2 is a sentinel meaning "no mask".
const int mask_type = (src1 != nullptr) ? src1->type : 2; // use 2 for no mask here
const int has_sink = (src2 != nullptr);
// op_params layout: [0] = scale (f32), [1] = max_bias (f32).
float max_bias;
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
// ALiBi slope inputs derived from the head count (src0->ne[2]); the shader
// picks m0 or m1 per head based on n_head_log2.
float n_head_log2 = float(1u << (uint32_t) floor(log2(src0->ne[2])));
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
// Uniform parameter block consumed by the shader's Params struct.
// Offsets and strides are converted from bytes to elements; float values
// are passed as their raw u32 bit patterns.
std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
mask_type < 2 ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)) : 0,
has_sink ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src2) / ggml_type_size(src2->type)) : 0,
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
mask_type < 2 ? (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)) : 0,
mask_type < 2 ? (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)) : 0,
mask_type < 2 ? (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)) : 0,
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
(uint32_t) ggml_nelements(dst),
(uint32_t) src0->ne[0],
(uint32_t) src0->ne[1],
(uint32_t) src0->ne[2],
mask_type < 2 ? (uint32_t) src1->ne[2] : 0,
mask_type < 2 ? (uint32_t) src1->ne[3] : 0,
*(uint32_t *) dst->op_params, // scale
*(uint32_t *) &max_bias,
*(uint32_t *) &n_head_log2,
*(uint32_t *) &m0,
*(uint32_t *) &m1
};
// Bind group: src0 is always binding 0; mask, sinks, and dst follow in that
// order, each only when present, so binding indices shift per variant
// (this must match the shader's *_BINDINGS decl for the chosen pipeline).
std::vector<wgpu::BindGroupEntry> entries = {
{ .binding = 0,
.buffer = ggml_webgpu_tensor_buf(src0),
.offset = ggml_webgpu_tensor_align_offset(ctx, src0),
.size = ggml_webgpu_tensor_binding_size(ctx, src0) }
};
uint32_t binding_num = 1;
if (mask_type < 2) {
entries.push_back({ .binding = binding_num,
.buffer = ggml_webgpu_tensor_buf(src1),
.offset = ggml_webgpu_tensor_align_offset(ctx, src1),
.size = ggml_webgpu_tensor_binding_size(ctx, src1) });
binding_num++;
}
if (has_sink) {
entries.push_back({ .binding = binding_num,
.buffer = ggml_webgpu_tensor_buf(src2),
.offset = ggml_webgpu_tensor_align_offset(ctx, src2),
.size = ggml_webgpu_tensor_binding_size(ctx, src2) });
binding_num++;
}
if (!inplace) {
entries.push_back({ .binding = binding_num,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
}
// One workgroup is dispatched per row of dst.
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->soft_max_pipeline[mask_type][has_sink][inplace], params, entries,
ggml_nrows(dst), ggml_op_name(dst->op));
}
// Returns true if node has enqueued work into the queue, false otherwise
static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
if (ggml_is_empty(node)) {
@ -1512,6 +1586,38 @@ static void ggml_webgpu_init_scale_pipeline(webgpu_context & webgpu_ctx) {
"scale_f32_inplace", constants);
}
// Builds every soft_max pipeline permutation.
// Index order is [mask kind: 0 = f32 mask, 1 = f16 mask, 2 = no mask][has_sink][inplace].
static void ggml_webgpu_init_soft_max_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> constants(1);
    constants[0].key   = "wg_size";
    constants[0].value = 64;
    // Local helper so each variant registration is a single self-describing line.
    auto build = [&](int mask_kind, int sink, int inplace, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->soft_max_pipeline[mask_kind][sink][inplace],
                                    shader, label, constants);
    };
    build(2, 0, 0, wgsl_soft_max_f32,                       "soft_max_f32");
    build(2, 0, 1, wgsl_soft_max_f32_inplace,               "soft_max_f32_inplace");
    build(2, 1, 0, wgsl_soft_max_f32_sink,                  "soft_max_f32_sink");
    build(2, 1, 1, wgsl_soft_max_f32_sink_inplace,          "soft_max_f32_sink_inplace");
    build(0, 0, 0, wgsl_soft_max_f32_mask_f32,              "soft_max_f32_mask_f32");
    build(0, 0, 1, wgsl_soft_max_f32_mask_f32_inplace,      "soft_max_f32_mask_f32_inplace");
    build(1, 0, 0, wgsl_soft_max_f32_mask_f16,              "soft_max_f32_mask_f16");
    build(1, 0, 1, wgsl_soft_max_f32_mask_f16_inplace,      "soft_max_f32_mask_f16_inplace");
    build(0, 1, 0, wgsl_soft_max_f32_mask_f32_sink,         "soft_max_f32_mask_f32_sink");
    build(0, 1, 1, wgsl_soft_max_f32_mask_f32_sink_inplace, "soft_max_f32_mask_f32_sink_inplace");
    build(1, 1, 0, wgsl_soft_max_f32_mask_f16_sink,         "soft_max_f32_mask_f16_sink");
    build(1, 1, 1, wgsl_soft_max_f32_mask_f16_sink_inplace, "soft_max_f32_mask_f16_sink_inplace");
}
static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
GGML_UNUSED(params);

View File

@ -0,0 +1,338 @@
#define(VARIANTS)
[
{
"SHADER_NAME": "soft_max_f32",
"DECLS": ["BASE_BINDINGS", "NOT_INPLACE", "NO_MASK", "NO_SINK"]
},
{
"SHADER_NAME": "soft_max_f32_inplace",
"DECLS": ["BASE_BINDINGS_INPLACE", "INPLACE", "NO_MASK", "NO_SINK"]
},
{
"SHADER_NAME": "soft_max_f32_sink",
"DECLS": ["SINK_BINDINGS", "NOT_INPLACE", "NO_MASK", "SINK"]
},
{
"SHADER_NAME": "soft_max_f32_sink_inplace",
"DECLS": ["SINK_BINDINGS_INPLACE", "INPLACE", "NO_MASK", "SINK"]
},
{
"SHADER_NAME": "soft_max_f32_mask_f32",
"REPLS": {
"MASK_TYPE" : "f32",
},
"DECLS": ["MASK_BINDINGS", "NOT_INPLACE", "MASK", "NO_SINK"]
},
{
"SHADER_NAME": "soft_max_f32_mask_f32_inplace",
"REPLS": {
"MASK_TYPE" : "f32",
},
"DECLS": ["MASK_BINDINGS_INPLACE", "INPLACE", "MASK", "NO_SINK"]
},
{
"SHADER_NAME": "soft_max_f32_mask_f16",
"REPLS": {
"MASK_TYPE" : "f16",
},
"DECLS": ["MASK_BINDINGS", "NOT_INPLACE", "MASK", "NO_SINK"]
},
{
"SHADER_NAME": "soft_max_f32_mask_f16_inplace",
"REPLS": {
"MASK_TYPE" : "f16",
},
"DECLS": ["MASK_BINDINGS_INPLACE", "INPLACE", "MASK", "NO_SINK"]
},
{
"SHADER_NAME": "soft_max_f32_mask_f32_sink",
"REPLS": {
"MASK_TYPE" : "f32",
},
"DECLS": ["MASK_SINK_BINDINGS", "NOT_INPLACE", "MASK", "SINK"]
},
{
"SHADER_NAME": "soft_max_f32_mask_f32_sink_inplace",
"REPLS": {
"MASK_TYPE" : "f32",
},
"DECLS": ["MASK_SINK_BINDINGS_INPLACE", "INPLACE", "MASK", "SINK"]
},
{
"SHADER_NAME": "soft_max_f32_mask_f16_sink",
"REPLS": {
"MASK_TYPE" : "f16",
},
"DECLS": ["MASK_SINK_BINDINGS", "NOT_INPLACE", "MASK", "SINK"]
},
{
"SHADER_NAME": "soft_max_f32_mask_f16_sink_inplace",
"REPLS": {
"MASK_TYPE" : "f16",
},
"DECLS": ["MASK_SINK_BINDINGS_INPLACE", "INPLACE", "MASK", "SINK"]
}
]
#end(VARIANTS)
#define(DECLS)
#decl(BASE_BINDINGS)
// No mask, no sinks, out-of-place: src (binding 0, declared in SHADER), dst, params.
@group(0) @binding(1)
var<storage, read_write> dst: array<f32>;
@group(0) @binding(2)
var<uniform> params: Params;
#enddecl(BASE_BINDINGS)
#decl(BASE_BINDINGS_INPLACE)
// In-place: dst buffer is omitted; results are written back into src.
@group(0) @binding(1)
var<uniform> params: Params;
#enddecl(BASE_BINDINGS_INPLACE)
#decl(SINK_BINDINGS)
// Sinks present, no mask: sinks take binding 1, dst shifts to binding 2.
@group(0) @binding(1)
var<storage, read_write> sinks: array<f32>;
@group(0) @binding(2)
var<storage, read_write> dst: array<f32>;
@group(0) @binding(3)
var<uniform> params: Params;
#enddecl(SINK_BINDINGS)
#decl(SINK_BINDINGS_INPLACE)
// Sinks present, in-place: no dst buffer.
@group(0) @binding(1)
var<storage, read_write> sinks: array<f32>;
@group(0) @binding(2)
var<uniform> params: Params;
#enddecl(SINK_BINDINGS_INPLACE)
#decl(MASK_BINDINGS)
// Mask present (f32 or f16 via MASK_TYPE), no sinks, out-of-place.
@group(0) @binding(1)
var<storage, read_write> mask: array<{{MASK_TYPE}}>;
@group(0) @binding(2)
var<storage, read_write> dst: array<f32>;
@group(0) @binding(3)
var<uniform> params: Params;
#enddecl(MASK_BINDINGS)
#decl(MASK_BINDINGS_INPLACE)
// Mask present, in-place: no dst buffer.
@group(0) @binding(1)
var<storage, read_write> mask: array<{{MASK_TYPE}}>;
@group(0) @binding(2)
var<uniform> params: Params;
#enddecl(MASK_BINDINGS_INPLACE)
#decl(MASK_SINK_BINDINGS)
// Mask and sinks present, out-of-place: mask, sinks, dst, params in order.
@group(0) @binding(1)
var<storage, read_write> mask: array<{{MASK_TYPE}}>;
@group(0) @binding(2)
var<storage, read_write> sinks: array<f32>;
@group(0) @binding(3)
var<storage, read_write> dst: array<f32>;
@group(0) @binding(4)
var<uniform> params: Params;
#enddecl(MASK_SINK_BINDINGS)
#decl(MASK_SINK_BINDINGS_INPLACE)
// Mask and sinks present, in-place: no dst buffer.
@group(0) @binding(1)
var<storage, read_write> mask: array<{{MASK_TYPE}}>;
@group(0) @binding(2)
var<storage, read_write> sinks: array<f32>;
@group(0) @binding(3)
var<uniform> params: Params;
#enddecl(MASK_SINK_BINDINGS_INPLACE)
#decl(NOT_INPLACE)
// Out-of-place: intermediate exp values and final results live in dst.
fn inter_value(i: u32) -> f32 {
return dst[i];
}
fn update(i: u32, val: f32) {
dst[i] = val;
}
#enddecl(NOT_INPLACE)
#decl(INPLACE)
// In-place: src doubles as the output buffer.
fn inter_value(i: u32) -> f32 {
return src[i];
}
fn update(i: u32, val: f32) {
src[i] = val;
}
#enddecl(INPLACE)
#decl(NO_MASK)
// No mask: contributes nothing to the scaled logit.
fn mask_val(i: u32) -> f32 {
return 0.0;
}
#enddecl(NO_MASK)
#decl(MASK)
// Mask value widened to f32 (MASK_TYPE may be f16).
fn mask_val(i: u32) -> f32 {
return f32(mask[i]);
}
#enddecl(MASK)
#decl(NO_SINK)
// No sinks: max starts from a large negative sentinel and the sum is unchanged.
fn lower_max_bound(i2: u32) -> f32 {
return -1e30;
}
fn add_sinks(val: f32, i2: u32, max_val: f32) -> f32 {
return val;
}
#enddecl(NO_SINK)
#decl(SINK)
// Attention sinks: the per-head sink logit participates in both the running
// max and the exp-sum (i2 indexes the head dimension).
fn lower_max_bound(i2: u32) -> f32 {
return sinks[params.offset_sinks + i2];
}
fn add_sinks(val: f32, i2: u32, max_val: f32) -> f32 {
return val + exp(sinks[params.offset_sinks + i2] - max_val);
}
#enddecl(SINK)
#end(DECLS)
#define(SHADER)
enable f16;
struct Params {
// Element offsets compensating for buffer binding alignment.
offset_src0: u32,
offset_src1: u32,
offset_sinks: u32,
offset_dst: u32,
// Strides (in elements)
stride_src01: u32,
stride_src02: u32,
stride_src03: u32,
stride_src11: u32,
stride_src12: u32,
stride_src13: u32,
stride_dst1: u32,
stride_dst2: u32,
stride_dst3: u32,
// shape of src0/dst
ne: u32,
ne0: u32,
ne1: u32,
ne2: u32,
// shape of src1
ne12: u32,
ne13: u32,
scale: f32,
// ALiBi parameters: max_bias > 0 enables per-head slopes built from m0/m1.
max_bias: f32,
n_head_log2: f32,
m0: f32,
m1: f32,
};
@group(0) @binding(0)
var<storage, read_write> src: array<f32>;
DECLS
// Per-thread cache of the first CACHE_SIZE columns so short rows avoid
// re-reading src between the three passes.
const CACHE_SIZE: u32 = 16;
override wg_size: u32;
var<workgroup> scratch: array<f32, wg_size>;
// One workgroup processes one row (ne0 elements) of src0/dst.
@compute @workgroup_size(wg_size)
fn main(@builtin(workgroup_id) wid: vec3<u32>,
@builtin(local_invocation_id) lid: vec3<u32>) {
// Decompose the flat workgroup index into (i3, i2, i1) row coordinates.
var i = wid.x;
let i3 = i / (params.ne2 * params.ne1);
i = i % (params.ne2 * params.ne1);
let i2 = i / params.ne1;
let i1 = i % params.ne1;
let i_src0_row = params.offset_src0 + i3 * params.stride_src03 + i2 * params.stride_src02 + i1 * params.stride_src01;
// Mask rows broadcast over dims 2/3 via the modulo with ne12/ne13.
let i_src1_row = params.offset_src1 + (i3 % params.ne13) * params.stride_src13 + (i2 % params.ne12) * params.stride_src12 + i1 * params.stride_src11;
let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;
// Columns handled per thread (strided by wg_size).
let elems = (params.ne0 + wg_size - 1) / wg_size;
// ALiBi slope for this head; 1.0 when max_bias is disabled.
let head = f32(i2);
let slope = select(1, select(pow(params.m1, 2 * (head - params.n_head_log2) + 1), pow(params.m0, head + 1), head < params.n_head_log2), params.max_bias > 0);
var cache: array<f32, CACHE_SIZE>;
// Pass 1: per-thread max of scaled+masked logits (seeded with the sink
// logit when sinks are enabled), caching the first CACHE_SIZE columns.
var max_val = lower_max_bound(i2);
for (var j: u32 = 0; j < elems; j++) {
let col = j * wg_size + lid.x;
if (col < params.ne0) {
let val = src[i_src0_row + col] * params.scale + slope * mask_val(i_src1_row + col);
max_val = max(max_val, val);
if (col < CACHE_SIZE) {
cache[col] = val;
}
}
}
// Tree-reduce the per-thread maxima across the workgroup.
scratch[lid.x] = max_val;
workgroupBarrier();
var offset = wg_size / 2;
while (offset > 0) {
if (lid.x < offset) {
scratch[lid.x] = max(scratch[lid.x], scratch[lid.x + offset]);
}
offset = offset / 2;
workgroupBarrier();
}
let row_max = scratch[0];
// Pass 2: exp(val - row_max), accumulating the per-thread sum. Cached
// columns keep their exp in registers; others are spilled via update().
// NOTE(review): select() evaluates both operands, so cache[col] is indexed
// with col >= CACHE_SIZE here; WGSL clamps out-of-bounds fixed-array reads
// and the value is discarded, but confirm this is intentional.
var sum = 0.0f;
for (var j: u32 = 0; j < elems; j++) {
let col = j * wg_size + lid.x;
if (col < params.ne0) {
let val = select(src[i_src0_row + col] * params.scale + slope * mask_val(i_src1_row + col),
cache[col], col < CACHE_SIZE);
let ex = exp(val - row_max);
sum += ex;
if (col < CACHE_SIZE) {
cache[col] = ex;
} else {
update(i_dst_row + col, ex);
}
}
}
// Tree-reduce the per-thread sums, then fold in the sink term if enabled.
scratch[lid.x] = sum;
workgroupBarrier();
offset = wg_size / 2;
while (offset > 0) {
if (lid.x < offset) {
scratch[lid.x] += scratch[lid.x + offset];
}
offset = offset / 2;
workgroupBarrier();
}
let row_sum = add_sinks(scratch[0], i2, row_max);
// Pass 3: normalize each exp value by the row sum.
let sum_recip = 1.0 / row_sum;
for (var j: u32 = 0; j < elems; j++) {
let col = j * wg_size + lid.x;
if (col < params.ne0) {
update(i_dst_row + col, select(inter_value(i_dst_row + col), cache[col], col < CACHE_SIZE) * sum_recip);
}
}
}
#end(SHADER)

View File

@ -3829,6 +3829,15 @@ struct ggml_tensor * ggml_soft_max_ext(
return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
}
// In-place variant of ggml_soft_max_ext(): forwards to ggml_soft_max_impl
// with inplace = true so the result is written back into `a`.
struct ggml_tensor * ggml_soft_max_ext_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
float scale,
float max_bias) {
return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, true);
}
void ggml_soft_max_add_sinks(
struct ggml_tensor * a,
struct ggml_tensor * sinks) {

View File

@ -3752,9 +3752,10 @@ struct test_soft_max : public test_case {
const std::array<int64_t, 2> nr23; // broadcast only dims 2 and 3
const float scale;
const float max_bias;
const bool inplace;
std::string vars() override {
return VARS_TO_STR8(type, ne, mask, sinks, m_prec, nr23, scale, max_bias);
return VARS_TO_STR9(type, ne, mask, sinks, m_prec, nr23, scale, max_bias, inplace);
}
// the 1024 test with bias occasionally fails:
@ -3770,8 +3771,9 @@ struct test_soft_max : public test_case {
ggml_type m_prec = GGML_TYPE_F32,
std::array<int64_t, 2> nr23 = {1, 1},
float scale = 1.0f,
float max_bias = 0.0f)
: type(type), ne(ne), mask(mask), sinks(sinks), m_prec(m_prec), nr23(nr23), scale(scale), max_bias(max_bias) {}
float max_bias = 0.0f,
bool inplace = false)
: type(type), ne(ne), mask(mask), sinks(sinks), m_prec(m_prec), nr23(nr23), scale(scale), max_bias(max_bias), inplace(inplace) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2]*nr23[0], ne[3]*nr23[1]);
@ -3790,7 +3792,12 @@ struct test_soft_max : public test_case {
ggml_set_name(sinks, "sinks");
}
ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
ggml_tensor * out;
if (inplace) {
out = ggml_soft_max_ext_inplace(ctx, a, mask, scale, max_bias);
} else {
out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
}
ggml_soft_max_add_sinks(out, sinks);
ggml_set_name(out, "out");
@ -6562,6 +6569,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
}
}
}
// inplace tests
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, mask, sinks, GGML_TYPE_F32, {1, 1}, 0.1f, 0.0f, true));
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, mask, sinks, GGML_TYPE_F16, {1, 1}, 0.1f, 0.0f, true));
}
}
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 0.0f));