45 lines
1.4 KiB
Plaintext
45 lines
1.4 KiB
Plaintext
#version 450
|
|
|
|
#include "generic_unary_head.glsl"
|
|
#include "types.glsl"
|
|
|
|
#extension GL_EXT_control_flow_attributes : enable
|
|
// Number of invocations per workgroup and the length of the shared reduction
// array below. The reduction in main() starts at a stride of BLOCK_SIZE / 2 and
// halves it each step, so this value needs to be a power of two for every
// partial sum to be folded into sum[0].
#define BLOCK_SIZE 512
|
|
|
|
// One-dimensional workgroup: BLOCK_SIZE invocations along x, 1 along y and z.
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
|
|
|
// Workgroup-shared scratch space: one partial sum of squares per invocation,
// indexed by gl_LocalInvocationID.x in main().
// FLOAT_TYPE is not defined in this file (presumably supplied by types.glsl or
// the build - confirm).
shared FLOAT_TYPE sum[BLOCK_SIZE];
|
|
|
|
// L2-normalizes one row per workgroup: every element of the row is multiplied
// by 1 / max(sqrt(sum of squares of the row), p.param1).
//
// The flat row index is rebuilt from the 3D workgroup ID and split into
// (i1, i2, i3) using p.ne11 and p.ne12. Source elements are addressed with the
// p.nb01..p.nb03 strides and destination elements with p.nb11..p.nb13; within a
// row, elements are indexed directly by i0 (no stride is applied to dimension 0).
//
// p, data_a, data_d, FLOAT_TYPE and D_TYPE are declared outside this block
// (presumably by generic_unary_head.glsl / types.glsl - confirm).
void main() {
    // Flatten the 3D workgroup ID into a row index. 262144 == 512 * 512.
    // NOTE(review): presumably mirrors how the host splits the row count over
    // the x/y/z dispatch dimensions - confirm against the dispatch code.
    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
    const uint tid = gl_LocalInvocationID.x;

    // Decompose the flat row index into (i1, i2, i3).
    const uint rows_per_i3 = p.ne11 * p.ne12;
    const uint i3 = row / rows_per_i3;
    const uint row_in_i3 = row - i3 * rows_per_i3;
    const uint i2 = row_in_i3 / p.ne11;
    const uint i1 = row_in_i3 - i2 * p.ne11;

    // Offsets of the first element of this row; loop-invariant, so computed once.
    const uint src_base = i3*p.nb03 + i2*p.nb02 + i1*p.nb01;
    const uint dst_base = i3*p.nb13 + i2*p.nb12 + i1*p.nb11;

    // Each invocation sums the squares of elements tid, tid + BLOCK_SIZE, ...
    // in a local variable and publishes the result to shared memory once.
    FLOAT_TYPE partial = FLOAT_TYPE(0.0f);
    [[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) {
        const FLOAT_TYPE xi = FLOAT_TYPE(data_a[src_base + i0]);
        partial += xi * xi;
    }
    sum[tid] = partial; // partial sum for this invocation of the workgroup

    // Tree reduction over the workgroup: after the loop sum[0] holds the total.
    // The barrier before the loop makes all partial sums visible; the barrier at
    // the end of each step orders that step's writes before the next step's reads
    // (and before the read of sum[0] below).
    barrier();
    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sum[tid] += sum[tid + s];
        }
        barrier();
    }

    // p.param1 is a lower bound on the norm, which keeps the division finite for
    // an all-zero row as long as p.param1 itself is non-zero.
    const FLOAT_TYPE scale = 1.0f / max(sqrt(sum[0]), FLOAT_TYPE(p.param1));

    // Write the scaled row using the same per-invocation element assignment.
    [[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) {
        data_d[dst_base + i0] = D_TYPE(scale * FLOAT_TYPE(data_a[src_base + i0]));
    }
}
|