diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index 37144970d3..722eb10e73 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -964,8 +964,7 @@ static void launch_conv2d_implicit_split_kernel(ggml_backend_cuda_context & ctx, int id = ggml_cuda_get_device(); ggml_cuda_pool_alloc Y_H(ctx.pool(id), ksplit * P.k * P.Oh * P.Ow * P.n); - cudaFuncSetAttribute(conv2d_implicit_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75 + CUDA_SET_SHARED_MEMORY_LIMIT((conv2d_implicit_kernel), 65536);// set shared memory limit to 64KB which is maximum for sm_75 dim3 gridDim(BlocksN, BlocksM, ksplit); dim3 blockDim(ThreadsN, ThreadsM); @@ -1162,15 +1161,15 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa } } - cudaFuncSetAttribute(conv2d_implicit_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75 + CUDA_SET_SHARED_MEMORY_LIMIT((conv2d_implicit_kernel), 65536);// set shared memory limit to 64KB which is maximum for sm_75 + dim3 gridDim(BlocksN, BlocksM); dim3 blockDim(ThreadsN, ThreadsM); conv2d_implicit_kernel <<>>(X_H, K_H, Y_D, P); - } else{ + } else { conv2d_implicit_cuda(X_D, K_D, Y_D, P, st); }