From 3ea524e9c4bf19514d08eb490e6746255ae11a39 Mon Sep 17 00:00:00 2001
From: bssrdf
Date: Mon, 27 Oct 2025 23:10:19 -0400
Subject: [PATCH] WIP: almost working

---
 ggml/src/ggml-cuda/cpy.cu      |  56 +++---
 tests/CMakeLists.txt           |   1 +
 tests/test-backend-ops.cpp     |   3 +-
 tests/test-conv2d-implicit.cpp |  22 +-
 tests/test-transpose.cpp       | 281 +++++++++++++++++++++++++++
 5 files changed, 326 insertions(+), 37 deletions(-)
 create mode 100644 tests/test-transpose.cpp

diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 660c021a43..4405f9e378 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -49,10 +49,11 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, co
     const T* src = reinterpret_cast<const T*>(cx);
     T* dst = reinterpret_cast<T*>(cdst);
 
-    const int64_t nmat = ne /(ne00 * ne01);
+    const int64_t nmat = ne / (ne00 * ne01);
     const int64_t n = ne00 * ne01;
     // const int64_t n = ne01 * ne02;
     int width = ne01;
+    int height = ne00;
     int x = blockIdx.x * TILE_DIM + threadIdx.x;
     int y = blockIdx.y * TILE_DIM + threadIdx.y;
     int tx = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
@@ -62,29 +63,34 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, co
 
     __shared__ T tile[TILE_DIM][TILE_DIM];
 
     for(int i = 0; i < BLOCK_NM; ++i){
-        const unsigned int imat = blockIdx.z * BLOCK_NM + i;
-        if(imat < nmat){
-            for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
-                const unsigned int idx = (y+j)*width + x;
-                if(idx < n){
-                    const int row = threadIdx.y+j;
-                    const int col = threadIdx.x ^ row;
-                    // tile[threadIdx.y+j][threadIdx.x] = src[imat*n + idx];
-                    tile[row][col] = src[imat*n + idx];
-                }
-            }
-            __syncthreads();
+        __syncthreads();
 
-            for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
-                const unsigned int idx = (ty+j)*width + tx;
-                if(idx < n){
-                    // const int row = threadIdx.x;
-                    const int col = (threadIdx.y+j) ^ threadIdx.x;
-                    // dst[imat*n + idx] = tile[threadIdx.x][threadIdx.y + j];
-                    dst[imat*n + idx] = tile[threadIdx.x][col];
-                }
-            }
-        }
+        const unsigned int imat = blockIdx.z * BLOCK_NM + i;
+        if(imat >= nmat)
+            break;
+
+        // load one TILE_DIM x TILE_DIM tile into shared memory; the XOR
+        // swizzle tile[row][col ^ row] spreads each column over all banks,
+        // so the transposed reads below stay free of bank conflicts
+        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
+            if(x < width && y + j < height){
+                const unsigned int idx = (y+j)*width + x;
+                const int row = threadIdx.y+j;
+                const int col = threadIdx.x ^ row;
+                tile[row][col] = src[imat*n + idx];
+            }
+        }
+        __syncthreads();
+
+        // store the tile transposed: dst rows have length ne00 (height),
+        // and the same XOR recovers the swizzled element
+        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
+            if(ty + j < width && tx < height){
+                const unsigned int idx = (ty+j)*height + tx;
+                const int col = (threadIdx.y+j) ^ threadIdx.x;
+                dst[imat*n + idx] = tile[threadIdx.x][col];
+            }
+        }
     }
 }
@@ -195,11 +201,11 @@ static void ggml_cpy_flt_cuda(
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
     const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     if constexpr ((std::is_same_v<src_t, half> && std::is_same_v<dst_t, half> ||
                    std::is_same_v<src_t, float> && std::is_same_v<dst_t, float>) && transpose){
-        // printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);
-        // printf("cuda cpy transpose nb00=%d nb01=%d nb10=%d nb11=%d\n", nb00, nb01, nb10, nb11);
+        printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);
+        printf("cuda cpy transpose nb00=%d nb01=%d nb10=%d nb11=%d\n", nb00, nb01, nb10, nb11);
         // if (ne00 == ne11 && ne01 == ne10 && nb00 == nb11 && nb10 == nb01){ //transpose
         // if (transpose) { //transpose
         //     printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);
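Note on the kernel: cpy_flt_transpose follows the classic shared-memory tiled transpose, with an XOR swizzle standing in for the usual +1 padding column as the bank-conflict fix. A minimal standalone sketch of the same pattern, with illustrative names only, assuming TILE_DIM = 32, BLOCK_ROWS = 8, and a launch with blockDim = (TILE_DIM, BLOCK_ROWS):

    // Sketch only: same tiling/swizzle idea as cpy_flt_transpose above,
    // minus the batch (blockIdx.z) loop. src is height x width, row-major.
    #define TILE_DIM   32
    #define BLOCK_ROWS 8

    template <typename T>
    static __global__ void transpose_tiled(const T * src, T * dst, int width, int height) {
        __shared__ T tile[TILE_DIM][TILE_DIM];

        const int x = blockIdx.x * TILE_DIM + threadIdx.x; // column in src
        const int y = blockIdx.y * TILE_DIM + threadIdx.y; // row in src

        // cooperative load: logical tile[r][c] is stored at tile[r][c ^ r]
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
            if (x < width && y + j < height) {
                const int row = threadIdx.y + j;
                tile[row][threadIdx.x ^ row] = src[(y + j) * width + x];
            }
        }
        __syncthreads();

        const int tx = blockIdx.y * TILE_DIM + threadIdx.x; // column in dst
        const int ty = blockIdx.x * TILE_DIM + threadIdx.y; // row in dst

        // transposed store: the same XOR recovers logical tile[c][r]
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
            if (tx < height && ty + j < width) {
                dst[(ty + j) * height + tx] = tile[threadIdx.x][(threadIdx.y + j) ^ threadIdx.x];
            }
        }
    }

Because TILE_DIM is 32, the bank of tile[a][b ^ a] is (b ^ a) mod 32, which is distinct across a warp both when a warp fills one row during the load and when it reads one column's worth during the store, so no padding column is needed.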
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7ce76f0105..1787e53eb5 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -199,6 +199,7 @@ endif()
 llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
 llama_build_and_test(test-conv2d-implicit.cpp)
+llama_build_and_test(test-transpose.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp LABEL "model")
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 6c0b5d17a6..2016c3f74c 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2458,7 +2458,7 @@ struct test_cpy : public test_case {
         ggml_tensor * out = ggml_cpy(ctx, src, dst);
 
         if(is_transpose)
-            dst->op_params[10] = 999;
+            src->op_params[10] = 999;
 
         ggml_set_name(out, "out");
         return out;
@@ -6136,6 +6136,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3}));
     test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}));
     test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {48, 48, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}, true));
 
     test_cases.emplace_back(new test_cont());
     test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
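The op_params[10] = 999 assignment is a temporary debugging hook: it tags the copy so the CUDA backend can recognize the transpose case while proper layout detection is still being worked out (the WIP moves the marker from dst to src, presumably matching where the CUDA side now reads it). For orientation, a sketch of the graph the new {48, 48, 1, 1} / {1, 0, 2, 3} case boils down to, using the public ggml API (the helper name is hypothetical):

    // Sketch: a copy into a dst view with dims 0 and 1 swapped, which is
    // the layout the new cpy_flt_transpose kernel targets.
    #include "ggml.h"

    static struct ggml_tensor * build_transposed_cpy(struct ggml_context * ctx) {
        struct ggml_tensor * src = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 48, 48, 1, 1);
        struct ggml_tensor * dst = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 48, 48, 1, 1);
        dst = ggml_permute(ctx, dst, 1, 0, 2, 3); // swap strides of dims 0 and 1
        return ggml_cpy(ctx, src, dst);           // cpy sees a transposed dst layout
    }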
diff --git a/tests/test-conv2d-implicit.cpp b/tests/test-conv2d-implicit.cpp
index f3f4f91700..2790b3c235 100644
--- a/tests/test-conv2d-implicit.cpp
+++ b/tests/test-conv2d-implicit.cpp
@@ -451,17 +451,17 @@ int main(void)
 
     // for(int i = 0; i < ggml_nelements(wino_res); i++) {
     // for(int i = 0; i < 26*38; i++) {
-    // for(int i = 0; i < conv2d_data.size(); i++) {
-    //     // float diff = fabs(conv2d_data[i] - wino_data[i]);
-    //     float diff = fabs(im2col_data[i] - wino_data[i]);
-    //     float diff1 = fabs(im2col_data[i] - conv2d_data[i]);
-    //     if(diff > 0.5) {
-    //         printf("(%7.3f, %7.3f, %7.3f, %.2f, %.2f, %d) \n",
-    //                im2col_data[i], conv2d_data[i],
-    //                wino_data[i], diff, diff1, i);
-    //         // break;
-    //     }
-    // }
+    for(size_t i = 0; i < conv2d_data.size(); i++) {
+        // float diff = fabs(conv2d_data[i] - wino_data[i]);
+        float diff = fabs(im2col_data[i] - wino_data[i]);
+        float diff1 = fabs(im2col_data[i] - conv2d_data[i]);
+        if(diff > 0.5) {
+            printf("(%7.3f, %7.3f, %7.3f, %.2f, %.2f, %zu) \n",
+                   im2col_data[i], conv2d_data[i],
+                   wino_data[i], diff, diff1, i);
+            // break;
+        }
+    }
 
     ggml_free(model.ctx);
     ggml_backend_buffer_free(model.buffer);
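The new test below verifies the transpose by printing the matrix before and after for manual comparison. If it later needs an automated pass/fail, a CPU reference in the style of the element-wise check above would do; a possible helper (hypothetical, not part of the patch), using ggml's dim-0-fastest layout:

    // Sketch: reference transpose of nmat matrices, each ne01 rows of ne00
    // contiguous elements (ggml layout: dim 0 is the fastest-varying).
    #include <cstdint>
    #include <vector>

    static std::vector<float> transpose_ref(const std::vector<float> & src,
                                            int64_t ne00, int64_t ne01, int64_t nmat) {
        std::vector<float> dst(src.size());
        for (int64_t m = 0; m < nmat; ++m)
            for (int64_t r = 0; r < ne01; ++r)
                for (int64_t c = 0; c < ne00; ++c)
                    dst[m*ne00*ne01 + c*ne01 + r] = src[m*ne00*ne01 + r*ne00 + c];
        return dst;
    }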
diff --git a/tests/test-transpose.cpp b/tests/test-transpose.cpp
new file mode 100644
index 0000000000..73263f3438
--- /dev/null
+++ b/tests/test-transpose.cpp
@@ -0,0 +1,281 @@
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-cpu.h"
+#include "ggml-backend.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <tuple>
+#include <vector>
+
+static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
+struct test_model {
+    struct ggml_tensor * a;
+    struct ggml_tensor * b;
+    ggml_backend_t backend = NULL;
+    ggml_backend_buffer_t buffer;
+    struct ggml_context * ctx;
+};
+
+void load_model(test_model & model, int ic, int oc, int iw, int ih, int kw = 3, int kh = 3, bool use_gpu = false) {
+    int KW = kw, KH = kh, IC = ic, OC = oc;
+    int IW = iw, IH = ih, N = 1;
+    srand(time(NULL));
+
+    // initialize adata with a ramp so the transposed result is easy to eyeball
+    std::vector<float> adata(KW * KH * IC * OC);
+    for (int i = 0; i < KW * KH * IC * OC; i++) {
+        adata[i] = (float)i;
+        // adata[i] = -1.f + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX/(1.f-(-1.f))));
+    }
+
+    // convert adata to fp16 format
+    std::vector<ggml_fp16_t> hadata(KW * KH * IC * OC);
+    ggml_fp32_to_fp16_row(adata.data(), hadata.data(), KW * KH * IC * OC);
+
+    // initialize bdata
+    std::vector<float> bdata(IW * IH * IC * N);
+    for (int i = 0; i < IW * IH * IC * N; i++) {
+        bdata[i] = (float)(i+1);
+    }
+
+    // dump the first plane of tensor a (KH rows of KW values)
+    for(int i = 0; i < KH; i++) {
+        for(int j = 0; j < KW; j++) {
+            printf("%.0f, ", adata[i*KW+j]);
+        }
+        printf("\n");
+    }
+    printf(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n");
+
+    size_t buffer_size = 0;
+    {
+        buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
+        buffer_size += IW * IH * IC * N * ggml_type_size(GGML_TYPE_F32);  // tensor b
+        buffer_size += 1024;                                              // overhead
+    }
+
+    int num_tensors = 2;
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+
+    // initialize the backend
+#ifdef GGML_USE_CUDA
+    if (use_gpu) {
+        model.backend = ggml_backend_cuda_init(0);
+        if (!model.backend) {
+            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+        }
+    }
+#endif
+
+#ifdef GGML_USE_METAL
+    if (use_gpu) {
+        fprintf(stderr, "%s: using Metal backend\n", __func__);
+        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
+        model.backend = ggml_backend_metal_init();
+        if (!model.backend) {
+            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        }
+    }
+#endif
+
+    if(!model.backend) {
+        // fallback to CPU backend
+        model.backend = ggml_backend_cpu_init();
+    }
+
+    model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size);
+
+    // create context
+    model.ctx = ggml_init(params);
+
+    // create tensors
+    model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, IC, OC);
+    model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);
+
+    int64_t * ne = model.a->ne;
+    printf("before trans: (%ld, %ld, %ld, %ld) \n", (long) ne[0], (long) ne[1], (long) ne[2], (long) ne[3]);
+
+    // create an allocator
+    struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer);
+
+    // alloc memory
+    ggml_tallocr_alloc(&alloc, model.a);
+
+    // load data to buffer
+    if(ggml_backend_is_cpu(model.backend)) {
+        memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
+    } else {
+        ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
+    }
+
+    // alloc memory
+    ggml_tallocr_alloc(&alloc, model.b);
+
+    if(ggml_backend_is_cpu(model.backend)
+#ifdef GGML_USE_METAL
+                || ggml_backend_is_metal(model.backend)
+#endif
+    ) {
+        memcpy(model.b->data, bdata.data(), ggml_nbytes(model.b));
+    } else {
+        ggml_backend_tensor_set(model.b, bdata.data(), 0, ggml_nbytes(model.b));
+    }
+}
+
+typedef struct ggml_cgraph * (*build_graph_t)(const test_model & model);
+
+struct ggml_cgraph * build_graph_0(const test_model & model) {
+    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+    };
+
+    // create a temporary context to build the graph
+    struct ggml_context * ctx0 = ggml_init(params0);
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    // transpose tensor a and force a contiguous copy; ggml_cont() of a
+    // transposed view is the op that exercises the new cpy_flt_transpose kernel
+    // struct ggml_tensor * transpose_res = ggml_cont(ctx0, ggml_transpose(ctx0, model.b));
+    struct ggml_tensor * transpose_res = ggml_cont(ctx0, ggml_transpose(ctx0, model.a));
+    ggml_set_name(transpose_res, "transpose_res");
+    ggml_build_forward_expand(gf, transpose_res);
+    int64_t * ne = transpose_res->ne;
+    printf("after trans: (%ld, %ld, %ld, %ld) \n", (long) ne[0], (long) ne[1], (long) ne[2], (long) ne[3]);
+
+    ggml_free(ctx0);
+    return gf;
+}
+
+std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr,
+                                 build_graph_t build_graph, int iters, double * t) {
+    struct ggml_cgraph * gf = build_graph(model);
+
+    // allocate tensors
+    ggml_gallocr_alloc_graph(allocr, gf);
+    int n_threads = 1;
+
+    if (ggml_backend_is_cpu(model.backend)) {
+        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+    }
+
+#ifdef GGML_USE_METAL
+    if (ggml_backend_is_metal(model.backend)) {
+        ggml_backend_metal_set_n_cb(model.backend, n_threads);
+    }
+#endif
+
+    ggml_backend_synchronize(model.backend);
+
+    // warm-up run, not timed
+    ggml_backend_graph_compute(model.backend, gf);
+    ggml_backend_synchronize(model.backend);
+
+    int64_t start_time = ggml_time_us();
+
+    for(int iter = 0; iter < iters; iter++) {
+        ggml_backend_graph_compute(model.backend, gf);
+    }
+
+    ggml_backend_synchronize(model.backend);
+    int64_t end_time = ggml_time_us();
+    double time_us = end_time - start_time;
+
+    // fetch the fp16 result and convert it to fp32 for printing
+    struct ggml_tensor * res = ggml_graph_get_tensor(gf, "transpose_res");
+    std::vector<ggml_fp16_t> fdata(ggml_nelements(res));
+    std::vector<float> data(ggml_nelements(res));
+    ggml_backend_tensor_get(res, fdata.data(), 0, ggml_nbytes(res));
+    ggml_fp16_to_fp32_row(fdata.data(), data.data(), ggml_nelements(res));
+    *t = time_us / 1000;
+    return data;
+}
+
+int main(void)
+{
+    ggml_time_init();
+    std::vector<std::tuple<int, int, int, int, int, int>> configs = {
+        // std::make_tuple(64,64,48,64,3,3),
+        // std::make_tuple(320,320,104,152,3,3),
+        // std::make_tuple(640,640,52,76,3,3),
+        // std::make_tuple(640,640,104,152,3,3),
+        // std::make_tuple(960,320,104,152,3,3),
+        // std::make_tuple(1,128,38,49,3,3),
+        std::make_tuple(1,1,38,49,38,49),
+        // std::make_tuple(1280,1280,26,38,1,1),
+        // std::make_tuple(256,128,768,1024,3,3),
+        // std::make_tuple(256,128,768,1024,1,1),
+        // std::make_tuple(1280,640,52,76,3,3),
+        // std::make_tuple(1920,1280,26,38,3,3),
+        // std::make_tuple(2560,1280,26,38,3,3),
+        // std::make_tuple(512,512,104,152,3,3),
+        // std::make_tuple(512,512,208,304,3,3),
+        // std::make_tuple(512,256,416,608,3,3),
+        // std::make_tuple(256,128,832,1216,3,3),
+        // std::make_tuple(256,256,832,1216,3,3),
+        // std::make_tuple(320,256,1024,1920)
+    };
+
+    for (auto c : configs){
+        test_model model;
+        load_model(model, std::get<0>(c), std::get<1>(c), std::get<2>(c),
+                   std::get<3>(c), std::get<4>(c), std::get<5>(c), true);
+
+        ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+
+        // build the graph once to reserve the compute buffer
+        struct ggml_cgraph * gf = build_graph_0(model);
+        ggml_gallocr_reserve(allocr, gf);
+        size_t mem_size0 = ggml_gallocr_get_buffer_size(allocr, 0);
+        (void) mem_size0; // reported only by the commented-out fprintf below
+        // fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size0/1024.0f/1024.0f);
+
+        int iterations = 0;
+
+        double run_time0;
+        std::vector<float> transpose_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
+
+        // dump the transposed plane (KW rows of KH values); compare with the
+        // dump printed by load_model()
+        for(int i = 0; i < std::get<4>(c); i++) {
+            for(int j = 0; j < std::get<5>(c); j++) {
+                printf("%4.1f, ", transpose_data[i*std::get<5>(c)+j]);
+            }
+            printf("\n");
+        }
+
+        ggml_free(model.ctx);
+        ggml_backend_buffer_free(model.buffer);
+        ggml_backend_free(model.backend);
+        ggml_gallocr_free(allocr);
+    }
+
+    return 0;
+}
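Longer term, the commented-out condition in ggml_cpy_flt_cuda() points at a cleaner trigger than the op_params[10] = 999 marker: detect a transpose directly from the copy's layout, since a {1, 0, 2, 3} permuted destination simply swaps the first two extents and strides. As a sketch (hypothetical helper; parameter names as passed to ggml_cpy_flt_cuda):

    // A copy is a 2-D transpose when dst swaps the first two extents and
    // strides of src; mirrors the commented-out check in the hunk above.
    static bool is_2d_transpose(int ne00, int ne01, int nb00, int nb01,
                                int ne10, int ne11, int nb10, int nb11) {
        return ne00 == ne11 && ne01 == ne10 &&  // extents swapped
               nb00 == nb11 && nb01 == nb10;    // strides swapped
    }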