WIP: almost working

This commit is contained in:
bssrdf 2025-10-27 23:10:19 -04:00
parent 6d12288037
commit 3ea524e9c4
5 changed files with 450 additions and 36 deletions

View File

@ -49,10 +49,11 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, co
const T* src = reinterpret_cast<const T*>(cx);
T* dst = reinterpret_cast<T*>(cdst);
const int64_t nmat = ne /(ne00 * ne01);
const int64_t nmat = ne / (ne00 * ne01);
const int64_t n = ne00 * ne01;
// const int64_t n = ne01 * ne02;
int width = ne01;
int height = ne00;
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int tx = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
@ -62,29 +63,65 @@ static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, co
__shared__ T tile[TILE_DIM][TILE_DIM];
for(int i = 0; i < BLOCK_NM; ++i){
const unsigned int imat = blockIdx.z * BLOCK_NM + i;
if(imat < nmat){
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
const unsigned int idx = (y+j)*width + x;
if(idx < n){
const int row = threadIdx.y+j;
const int col = threadIdx.x ^ row;
// tile[threadIdx.y+j][threadIdx.x] = src[imat*n + idx];
tile[row][col] = src[imat*n + idx];
}
}
__syncthreads();
__syncthreads();
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
const unsigned int idx = (ty+j)*width + tx;
if(idx < n){
// const int row = threadIdx.x;
const int col = (threadIdx.y+j) ^ threadIdx.x;
// dst[imat*n + idx] = tile[threadIdx.x][threadIdx.y + j];
dst[imat*n + idx] = tile[threadIdx.x][col];
}
const unsigned int imat = blockIdx.z * BLOCK_NM + i;
if(imat >= nmat)
break;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
if(imat < nmat && x < width && y + j < height){
const unsigned int idx = (y+j)*width + x;
const int row = threadIdx.y+j;
const int col = threadIdx.x ^ row;
// tile[threadIdx.y+j][threadIdx.x] = src[imat*n + idx];
tile[row][col] = src[imat*n + idx];
}
}
__syncthreads();
// if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
// printf("BEGIN %d\n", i);
// for(int jj = 0; jj < TILE_DIM; ++jj){
// for(int ii = 0; ii < TILE_DIM; ++ii)
// printf("%.f, ", tile[jj][ii]);
// printf("]\n");
// }
// }
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS){
if(imat < nmat && ty + j < width && tx < height){
const unsigned int idx = (ty+j)*height + tx;
// const int row = threadIdx.x;
const int col = (threadIdx.y+j) ^ threadIdx.x;
// dst[imat*n + idx] = tile[threadIdx.x][threadIdx.y + j];
dst[imat*n + idx] = tile[threadIdx.x][col];
// if(imat*n + idx == 4*ne00){
// printf("DEBUG: (%u, %u, %u, %u, %u), j=%d, tx=%d, ty=%d, imat=%u idx=%u dst[%u]=%.2f, %f\n",
// threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, blockIdx.z, j, tx, ty,
// imat, idx, imat*n + idx, dst[imat*n + idx], tile[threadIdx.x][threadIdx.y + j]);
// }
}
}
// }
}
if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
// for(int j = 0; j < 32; ++j){
// j = 0;
for(int i = 0; i < 32; ++i)
// printf("%.2f, ", src[j*48+i]);
// printf("%.2f, ", src[j*48+i]);
printf("%.2f, ", __half2float(src[i]));
printf("]\n");
// }
printf("==============================\n");
// for(int j = 0; j < 32; ++j){
for(int i = 0; i < 32; ++i)
printf("%.2f, ", __half2float(dst[i]));
printf("]\n");
// }
}
}
@ -195,11 +232,11 @@ static void ggml_cpy_flt_cuda(
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
if constexpr ((std::is_same_v<src_t, half> && std::is_same_v<dst_t, half> ||
if constexpr ((std::is_same_v<src_t, half> && std::is_same_v<dst_t, half> ||
std::is_same_v<src_t, float> && std::is_same_v<dst_t, float>)
&& transpose){
// printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);
// printf("cuda cpy transpose nb00=%d nb01=%d nb10=%d nb11=%d\n", nb00, nb01, nb10, nb11);
printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);
printf("cuda cpy transpose nb00=%d nb01=%d nb10=%d nb11=%d\n", nb00, nb01, nb10, nb11);
// if (ne00 == ne11 && ne01 == ne10 && nb00 == nb11 && nb10 == nb01){ //transpose
// if (transpose) { //transpose
// printf("cuda cpy transpose ne=%d ne00=%d ne01=%d ne10=%d ne11=%d\n", ne, ne00, ne01, ne10, ne11);

View File

@ -199,6 +199,7 @@ endif()
llama_build_and_test(test-gguf.cpp)
llama_build_and_test(test-backend-ops.cpp)
llama_build_and_test(test-conv2d-implicit.cpp)
llama_build_and_test(test-transpose.cpp)
llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
llama_build_and_test(test-autorelease.cpp LABEL "model")

View File

@ -2458,7 +2458,7 @@ struct test_cpy : public test_case {
ggml_tensor * out = ggml_cpy(ctx, src, dst);
if(is_transpose)
dst->op_params[10] = 999;
src->op_params[10] = 999;
ggml_set_name(out, "out");
return out;
@ -6136,6 +6136,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3}));
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}));
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3}));
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {48, 48, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}, true));
test_cases.emplace_back(new test_cont());
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));

View File

@ -451,17 +451,17 @@ int main(void)
// for(int i = 0; i < ggml_nelements(wino_res); i++) {
// for(int i = 0; i < 26*38; i++) {
// for(int i = 0; i < conv2d_data.size(); i++) {
// // float diff = fabs(conv2d_data[i] - wino_data[i]);
// float diff = fabs(im2col_data[i] - wino_data[i]);
// float diff1 = fabs(im2col_data[i] - conv2d_data[i]);
// if(diff > 0.5) {
// printf("(%7.3f, %7.3f, %7.3f, %.2f, %.2f, %d) \n",
// im2col_data[i], conv2d_data[i],
// wino_data[i], diff, diff1, i);
// // break;
// }
// }
for(int i = 0; i < conv2d_data.size(); i++) {
// float diff = fabs(conv2d_data[i] - wino_data[i]);
float diff = fabs(im2col_data[i] - wino_data[i]);
float diff1 = fabs(im2col_data[i] - conv2d_data[i]);
if(diff > 0.5) {
printf("(%7.3f, %7.3f, %7.3f, %.2f, %.2f, %d) \n",
im2col_data[i], conv2d_data[i],
wino_data[i], diff, diff1, i);
// break;
}
}
ggml_free(model.ctx);
ggml_backend_buffer_free(model.buffer);

375
tests/test-transpose.cpp Normal file
View File

@ -0,0 +1,375 @@
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
//#include <cuda_runtime.h>
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
fputs(text, stderr);
fflush(stderr);
}
struct test_model {
struct ggml_tensor * a;
struct ggml_tensor * b;
ggml_backend_t backend = NULL;
ggml_backend_buffer_t buffer;
struct ggml_context * ctx;
};
void load_model(test_model & model, int ic, int oc, int iw, int ih, int kw = 3, int kh = 3, bool use_gpu = false ) {
// create data
int KW = kw, KH = kh, IC = ic, OC = oc;
int IW = iw, IH = ih, N = 1;
srand(time(NULL));
// printf(" input: IC = %d, OC = %d, IW = %d, IH = %d \n ", IC, OC, IW, IH);
// Initialize adata
std::vector<float> adata(KW * KH * IC * OC);
for (int i = 0; i < KW * KH * IC * OC; i++) {
// adata[i] = 2.f;
adata[i] = (float)i;
// adata[i] = (rand() % 255) / 255.0;
// float r = -1.f + static_cast <float> (rand()) /( static_cast <float> (RAND_MAX/(1.f-(-1.f))));
// adata[i] = r;
}
// Convert adata to fp16 format
std::vector<ggml_fp16_t> hadata(KW * KH * IC * OC);
ggml_fp32_to_fp16_row(adata.data(), hadata.data(), KW * KH * IC * OC);
// Initialize bdata
std::vector<float> bdata(IW * IH * IC * N);
for (int i = 0; i < IW * IH * IC * N; i++) {
// bdata[i] = (float)(i%IW)/10.f;
// bdata[i] = 1.5f;
bdata[i] = (float)(i+1);
// bdata[i] = (rand() % 255) / 255.0;
// float r = -1.f + static_cast <float> (rand()) /( static_cast <float> (RAND_MAX/(1.f-(-1.f))));
// bdata[i] = r;
}
// for(int i = 0; i < IH; i++) {
// // float diff = fabs(conv2d_data[i] - wino_data[i]);
// for(int j = 0; j < IW; j++) {
// printf("%.0f, ", bdata[i*IW+j]);
// }
// printf("\n");
// }
for(int i = 0; i < KH; i++) {
// float diff = fabs(conv2d_data[i] - wino_data[i]);
for(int j = 0; j < KW; j++) {
printf("%.0f, ", adata[i*KW+j]);
}
printf("\n");
}
printf(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n");
size_t buffer_size = 0;
{
// buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
buffer_size += IW * IH * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b
buffer_size += 1024; // overhead
}
// printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
// printf("%s: backend buffer size = %0.2f MB\n", __func__, (buffer_size/ 1024.f/ 1024.f));
int num_tensors = 2;
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
// initialize the backend
#ifdef GGML_USE_CUDA
if (use_gpu) {
// fprintf(stderr, "%s: using CUDA backend\n", __func__);
model.backend = ggml_backend_cuda_init(0);
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
}
#endif
#ifdef GGML_USE_METAL
if (use_gpu) {
fprintf(stderr, "%s: using Metal backend\n", __func__);
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
model.backend = ggml_backend_metal_init();
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
}
}
#endif
if(!model.backend) {
// fallback to CPU backend
model.backend = ggml_backend_cpu_init();
}
model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size);
// create context
model.ctx = ggml_init(params);
// create tensors
model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, IC, OC);
// model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, KW, KH, IC, OC);
model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);
int64_t *ne = model.a->ne;
printf("before trans: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
// create a allocator
struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer);
// alloc memory
ggml_tallocr_alloc(&alloc, model.a);
// load data to buffer
if(ggml_backend_is_cpu(model.backend)) {
memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
// memcpy(model.a->data, adata.data(), ggml_nbytes(model.a));
} else {
ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
// ggml_backend_tensor_set(model.a, adata.data(), 0, ggml_nbytes(model.a));
}
// alloc memory
ggml_tallocr_alloc(&alloc, model.b);
if(ggml_backend_is_cpu(model.backend)
#ifdef GGML_USE_METAL
|| ggml_backend_is_metal(model.backend)
#endif
) {
memcpy(model.b->data, bdata.data(), ggml_nbytes(model.b));
} else {
ggml_backend_tensor_set(model.b, bdata.data(), 0, ggml_nbytes(model.b));
}
}
typedef struct ggml_cgraph* (*build_graph_t)(const test_model& model);
struct ggml_cgraph * build_graph_0(const test_model& model) {
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
// create a temporally context to build the graph
struct ggml_context * ctx0 = ggml_init(params0);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
int s0 = 1;
int s1 = 1;
int p0 = 1;
int p1 = 1;
int d0 = 1;
int d1 = 1;
// recalculate for avoid fragmentation
// struct ggml_tensor* conv2d_res = ggml_cont(ctx0, ggml_transpose(ctx0, model.b));
struct ggml_tensor* conv2d_res = ggml_cont(ctx0, ggml_transpose(ctx0, model.a));
ggml_set_name(conv2d_res, "transpose_res");
ggml_build_forward_expand(gf, conv2d_res);
int64_t *ne = conv2d_res->ne;
printf("conv2d: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
// struct ggml_tensor* wino_res = ggml_conv_2d_3x3(ctx0, model.a, model.b);
// ggml_set_name(wino_res, "wino_res");
// ggml_build_forward_expand(gf, wino_res);
// ne = wino_res->ne;
// printf("wino: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
ggml_free(ctx0);
return gf;
}
std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr,
build_graph_t build_graph, int iters, double *t) {
struct ggml_cgraph * gf = build_graph(model);
// allocate tensors
ggml_gallocr_alloc_graph(allocr, gf);
int n_threads = 1;
if (ggml_backend_is_cpu(model.backend)) {
ggml_backend_cpu_set_n_threads(model.backend, n_threads);
}
#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(model.backend)) {
ggml_backend_metal_set_n_cb(model.backend, n_threads);
}
#endif
ggml_backend_synchronize(model.backend);
ggml_backend_graph_compute(model.backend, gf);
ggml_backend_synchronize(model.backend);
int64_t start_time = ggml_time_us();
for(int iter=0; iter<iters; iter++){
ggml_backend_graph_compute(model.backend, gf);
ggml_backend_synchronize(model.backend);
}
// ggml_backend_synchronize(model.backend);
int64_t end_time = ggml_time_us();
double time_us = end_time - start_time;
time_us = time_us/iters;
// printf(" Taking %f ms\n ", time_us/1000);
//ggml_graph_print(gf);
struct ggml_tensor *res = NULL;
for(int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
if(strcmp(ggml_get_name(ggml_graph_node(gf, i)), "transpose_res") == 0) {
res = ggml_graph_node(gf, i);
} else if(strcmp(ggml_get_name(ggml_graph_node(gf, i)), "conv2d_res") == 0) {
res = ggml_graph_node(gf, i);
}
}
// std::vector<float> data(ggml_nelements(res));
std::vector<ggml_fp16_t> fdata(ggml_nelements(res));
std::vector<float> data(ggml_nelements(res));
ggml_backend_tensor_get(res, fdata.data(), 0, ggml_nbytes(res));
ggml_fp16_to_fp32_row(fdata.data(), data.data(), ggml_nelements(res));
*t = time_us/1000;
return data;
}
int main(void)
{
ggml_time_init();
std::vector<std::tuple<int, int, int, int, int, int>> configs = {
// std::make_tuple(64,64,48,64,3,3),
// std::make_tuple(320,320,104,152,3,3),
// std::make_tuple(640,640,52,76,3,3),
// std::make_tuple(640,640,104,152,3,3),
// std::make_tuple(960,320,104,152,3,3),
// std::make_tuple(1,128,38,49,3,3),
std::make_tuple(1,1,38,49,38,49),
// std::make_tuple(1280,1280,26,38,1,1),
// std::make_tuple(256,128,768,1024,3,3),
// std::make_tuple(256,128,768,1024,1,1),
// std::make_tuple(1280,640,52,76,3,3),
// std::make_tuple(1920,1280,26,38,3,3),
// std::make_tuple(2560,1280,26,38,3,3),
// std::make_tuple(512,512,104,152,3,3),
// std::make_tuple(512,512,208,304,3,3),
// std::make_tuple(512,256,416,608,3,3),
// std::make_tuple(256,128,832,1216,3,3),
// std::make_tuple(256,256,832,1216,3,3),
// std::make_tuple(320,256,1024,1920)
};
int k = 0;
for (auto c : configs){
test_model model;
load_model(model, std::get<0>(c), std::get<1>(c), std::get<2>(c),
std::get<3>(c), std::get<4>(c), std::get<5>(c), true);
ggml_gallocr_t allocr = NULL;
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
//create the worst case graph for memory usage estimation
struct ggml_cgraph * gf = build_graph_0(model);
// compute the required memory
ggml_gallocr_reserve(allocr, gf);
size_t mem_size0 = ggml_gallocr_get_buffer_size(allocr, 0);
// fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
struct ggml_cgraph * gf_res_0 = NULL;
int iterations = 0;
double run_time0;
std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
//create the worst case graph for memory usage estimation
// for(int i = 0; i < ggml_nelements(wino_res); i++) {
// for(int i = 0; i < 26*38; i++) {
// for(int i = 0; i < std::get<2>(c); i++) {
// // float diff = fabs(conv2d_data[i] - wino_data[i]);
// for(int j = 0; j < std::get<3>(c); j++) {
// printf("%4.1f, ", im2col_data[i*std::get<3>(c)+j]);
// }
// printf("\n");
// }
for(int i = 0; i < std::get<4>(c); i++) {
// float diff = fabs(conv2d_data[i] - wino_data[i]);
for(int j = 0; j < std::get<5>(c); j++) {
printf("%4.1f, ", im2col_data[i*std::get<5>(c)+j]);
}
printf("\n");
}
ggml_free(model.ctx);
ggml_backend_buffer_free(model.buffer);
ggml_backend_free(model.backend);
ggml_gallocr_free(allocr);
}
// printf("\nPerforming test:\n");
return 0;
}