opencl: allow resizing transpose buffers (#18384)
* opencl: allow resizing transpose buffers instead of using fixed sizes * opencl: remove commented code
This commit is contained in:
parent
a4bf35889e
commit
08566977a7
|
|
@ -263,6 +263,32 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
|
|||
return { type, major, minor, patch };
|
||||
}
|
||||
|
||||
// cl buffer wrapper
|
||||
struct ggml_cl_buffer {
|
||||
cl_mem buffer;
|
||||
size_t size;
|
||||
|
||||
ggml_cl_buffer()
|
||||
: buffer(nullptr), size(0) {}
|
||||
|
||||
~ggml_cl_buffer() {
|
||||
if (buffer) {
|
||||
CL_CHECK(clReleaseMemObject(buffer));
|
||||
}
|
||||
}
|
||||
|
||||
void allocate(cl_context context, size_t new_size) {
|
||||
if (new_size > size) {
|
||||
size = new_size;
|
||||
if (buffer) {
|
||||
CL_CHECK(clReleaseMemObject(buffer));
|
||||
}
|
||||
cl_int err;
|
||||
CL_CHECK((buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Profiling
|
||||
struct ProfilingInfo {
|
||||
std::string op_name;
|
||||
|
|
@ -376,6 +402,11 @@ struct ggml_backend_opencl_context {
|
|||
cl_context context;
|
||||
cl_command_queue queue;
|
||||
|
||||
// prealloc buffers for transposing weights and activations
|
||||
ggml_cl_buffer prealloc_quant_trans;
|
||||
ggml_cl_buffer prealloc_scales_trans;
|
||||
ggml_cl_buffer prealloc_act_trans;
|
||||
|
||||
cl_program program_add;
|
||||
cl_program program_add_id;
|
||||
cl_program program_clamp;
|
||||
|
|
@ -638,10 +669,6 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_transpose_16_buf;
|
||||
cl_kernel kernel_transpose_16_4x1;
|
||||
|
||||
cl_mem A_s_d_max; // max scale buffer size for transpose
|
||||
cl_mem A_q_d_max; // max weight buffer size for transpose
|
||||
cl_mem B_d_max; // max activation buffer size for transpose
|
||||
|
||||
// Gemm and Gemv related programs, kernels, etc
|
||||
cl_program program_CL_gemm;
|
||||
cl_program program_CL_gemv_general;
|
||||
|
|
@ -2600,9 +2627,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|||
required_B_d_bytes, max_B_d_bytes);
|
||||
}
|
||||
|
||||
CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
|
||||
CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
|
||||
CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
|
||||
backend_ctx->prealloc_quant_trans.allocate(context, max_A_q_d_bytes);
|
||||
backend_ctx->prealloc_scales_trans.allocate(context, max_A_s_d_bytes);
|
||||
backend_ctx->prealloc_act_trans.allocate(context, max_B_d_bytes);
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr;
|
||||
|
|
@ -3607,32 +3634,35 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
// use sub_buffer of max buffer size instead
|
||||
|
||||
size_t q_size_bytes = K * M / 8 * sizeof(float);
|
||||
backend_ctx->prealloc_quant_trans.allocate(context, q_size_bytes);
|
||||
|
||||
cl_buffer_region region;
|
||||
region.origin = 0;
|
||||
region.size = q_size_bytes;
|
||||
cl_mem qT_d = clCreateSubBuffer(
|
||||
backend_ctx->A_q_d_max,
|
||||
backend_ctx->prealloc_quant_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
&region,
|
||||
&err);
|
||||
// cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
bool K_tile_trans = true;
|
||||
if ((K / 32) % 4 != 0){
|
||||
K_tile_trans =false;
|
||||
}
|
||||
|
||||
size_t d_size_bytes = M * (K / 32) * 2;
|
||||
backend_ctx->prealloc_scales_trans.allocate(context, d_size_bytes);
|
||||
|
||||
region.origin = 0;
|
||||
region.size = d_size_bytes;
|
||||
cl_mem dT_d = clCreateSubBuffer(
|
||||
backend_ctx->A_s_d_max,
|
||||
backend_ctx->prealloc_scales_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
&region,
|
||||
&err);
|
||||
// cl_mem dT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, d_size_bytes, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
// <----------------------------------------------------------------------------------> //
|
||||
|
|
@ -7395,8 +7425,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
region.origin = 0;
|
||||
// Specify the size of the sub-buffer (divide by 2 for FP16)
|
||||
region.size = K * (N + padding) * sizeof(float)/2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
|
||||
B_d = clCreateSubBuffer(
|
||||
backend_ctx->B_d_max,
|
||||
backend_ctx->prealloc_act_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
&region,
|
||||
|
|
|
|||
Loading…
Reference in New Issue