diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index 3bb0df8bdc..521329d085 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -13,18 +13,18 @@ constexpr uint WARPSIZE = 32; //currently not use; in future for split-k kernels -static __global__ void reduce_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols, const int nrows) { - const int row = blockIdx.x; - const int col = threadIdx.x; +// static __global__ void reduce_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols, const int nrows) { +// const int row = blockIdx.x; +// const int col = threadIdx.x; - float sum = 0.0f; - if (row * blockDim.x + col < ncols) { - for (int i = 0; i < nrows; ++i){ - sum += x[i * ncols + row * blockDim.x + col]; - } - dst[row * blockDim.x + col] = sum; - } -} +// float sum = 0.0f; +// if (row * blockDim.x + col < ncols) { +// for (int i = 0; i < nrows; ++i){ +// sum += x[i * ncols + row * blockDim.x + col]; +// } +// dst[row * blockDim.x + col] = sum; +// } +// } template static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, const int ne00, const int ne01){ @@ -1033,8 +1033,6 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor * const uint OC = kernel->ne[3]; // ouptut_chanles const uint B = input->ne[3]; // n_batches - const int64_t total = B * OC * OH * OW; - param_t params = { B, IC, IH, IW, OC, KH, KW, ST_Y, ST_X, PD_Y, PD_X, DL_Y, DL_X, OH, OW }; params.SC_fastdiv = init_fastdiv_values(KW*IC); params.OW_fastdiv = init_fastdiv_values(OW);