tools: add quant-bench for profiling raw kernel performance
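
quant-bench times a single MUL_MAT over a quantized weight matrix in two
regimes: a prefill-sized batch (n = 512, compute bound) and a decode-sized
batch (n = 1, bandwidth bound). For each supported quant type it reports
per-graph latency, effective TOPS, and, for decode, effective memory
bandwidth. A typical invocation (binary path depends on your build setup)
might be `quant-bench -d 0 -v`.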

This commit is contained in:
chethanreddy1 2026-02-08 22:37:37 +05:30
parent 22cae83218
commit 05dfc18d55
2 changed files with 268 additions and 1 deletion

@@ -3,6 +3,11 @@ add_executable(${TARGET} llama-bench.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
+set(TARGET_QUANT quant-bench)
+add_executable(${TARGET_QUANT} quant-bench.cpp)
+target_link_libraries(${TARGET_QUANT} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET_QUANT} PRIVATE cxx_std_17)
+
 if(LLAMA_TOOLS_INSTALL)
-    install(TARGETS ${TARGET} RUNTIME)
+    install(TARGETS ${TARGET} ${TARGET_QUANT} RUNTIME)
 endif()

@@ -0,0 +1,262 @@
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-alloc.h"
#include "common.h"
#include <vector>
#include <string>
#include <cstdio>
#include <cinttypes>
#include <chrono>
#include <memory>
// Smart pointers for RAII cleanup
struct ggml_context_deleter {
void operator()(ggml_context * ctx) { ggml_free(ctx); }
};
using ggml_context_ptr = std::unique_ptr<ggml_context, ggml_context_deleter>;
struct ggml_backend_buffer_deleter {
void operator()(ggml_backend_buffer_t buf) { ggml_backend_buffer_free(buf); }
};
using ggml_backend_buffer_ptr = std::unique_ptr<struct ggml_backend_buffer, ggml_backend_buffer_deleter>;
struct ggml_backend_deleter {
void operator()(ggml_backend_t backend) { ggml_backend_free(backend); }
};
using ggml_backend_ptr = std::unique_ptr<struct ggml_backend, ggml_backend_deleter>;
// Utils
static uint64_t get_time_ns() {
using clock = std::chrono::high_resolution_clock;
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
}
struct BenchmarkParams {
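// default m/k approximate a large FFN projection (4096 x 14336, e.g. the Llama-3-8B FFN shape)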
int64_t m = 4096;
int64_t k = 14336;
int64_t n_prefill = 512;
int64_t n_decode = 1;
int reps = 5;
bool verbose = false;
std::string device_arg = "auto";
};
static void print_usage(const char * argv0) {
printf("usage: %s [options]\n", argv0);
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -v, --verbose verbose output\n");
printf(" -d, --device <dev> device ID (int) or name (str) to use (default: auto)\n");
printf("\n");
}
static void run_benchmark(ggml_backend_t backend, const BenchmarkParams & params, ggml_type type_a, const std::string & phase_name, int64_t n) {
if (params.verbose) {
printf("Benchmarking %s %s: m=%ld n=%ld k=%ld\n", phase_name.c_str(), ggml_type_name(type_a), params.m, n, params.k);
}
// Init context
size_t ctx_size = ggml_tensor_overhead() * 16 + ggml_graph_overhead();
struct ggml_init_params init_params = {
/*.mem_size =*/ ctx_size,
/*.mem_base =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx(ggml_init(init_params));
// Create tensors
// A: Weight matrix (Quantized) [k, m]
// B: Input matrix [k, n]
struct ggml_tensor * a = ggml_new_tensor_2d(ctx.get(), type_a, params.k, params.m);
struct ggml_tensor * b = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_F32, params.k, n);
// Check support
if (!ggml_backend_supports_op(backend, a) || !ggml_backend_supports_op(backend, b)) {
if (params.verbose) printf("Backend does not support input tensors for %s\n", ggml_type_name(type_a));
return;
}
// Build graph: C = A * B
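// ggml_mul_mat contracts over the shared first dimension (k), producing c with shape [m, n]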
struct ggml_tensor * c = ggml_mul_mat(ctx.get(), a, b);
if (!ggml_backend_supports_op(backend, c)) {
if (params.verbose) printf("Backend does not support MUL_MAT for %s\n", ggml_type_name(type_a));
return;
}
struct ggml_cgraph * gf = ggml_new_graph(ctx.get());
ggml_build_forward_expand(gf, c);
// Allocate memory
ggml_backend_buffer_ptr buffer(ggml_backend_alloc_ctx_tensors(ctx.get(), backend));
if (!buffer) {
printf("Failed to allocate memory\n");
return;
}
// Warmup
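// (the first compute may include one-off setup costs in some backends, so it is excluded from timing)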
ggml_backend_graph_compute(backend, gf);
// Run benchmark
uint64_t t_start = get_time_ns();
for (int i = 0; i < params.reps; i++) {
ggml_backend_graph_compute(backend, gf);
}
uint64_t t_end = get_time_ns();
double t_ns = (double)(t_end - t_start) / params.reps;
double t_us = t_ns / 1000.0;
// Stats
// TOPS: 2*m*n*k
double ops = 2.0 * params.m * n * params.k;
double tops = (ops / t_ns) * 1e9 / 1e12; // TOPS
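// e.g. at the defaults (m=4096, k=14336, n=512): ops ~ 6.0e10, so 1 ms per graph ~ 60 TOPS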
// Print Row
if (n > 1) {
// Prompt Processing: Bandwidth is less relevant, compute bound
printf("| %-10s | %10.2f | %10.2f |\n",
ggml_type_name(type_a), t_us, tops);
} else {
// Token Generation: Bandwidth is critical
// Bandwidth: Size(A) + Size(B) + Size(C)
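// at n=1 the traffic is dominated by the quantized weights A; e.g. Q4_0 (18 bytes per 32 elements) at 4096x14336 is ~33 MB per graph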
size_t size_a = ggml_nbytes(a);
size_t size_b = ggml_nbytes(b);
size_t size_c = ggml_nbytes(c);
size_t total_bytes = size_a + size_b + size_c;
double gb_s = (double)total_bytes / t_ns; // GB/s
printf("| %-10s | %10.2f | %10.2f | %10.2f |\n",
ggml_type_name(type_a), t_us, tops, gb_s);
}
}
int main(int argc, char ** argv) {
BenchmarkParams params;
// Parse args
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
print_usage(argv[0]);
return 0;
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else if (arg == "-d" || arg == "--device") {
if (++i >= argc) {
fprintf(stderr, "error: missing argument for %s\n", arg.c_str());
return 1;
}
params.device_arg = argv[i];
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argv[0]);
return 1;
}
}
ggml_backend_load_all();
// Pick backend
ggml_backend_ptr backend_ptr;
if (params.device_arg != "auto") {
// Try to parse as integer index
try {
int id = std::stoi(params.device_arg);
if (id >= 0 && id < (int)ggml_backend_dev_count()) {
ggml_backend_dev_t dev = ggml_backend_dev_get(id);
printf("Using device %d: %s\n", id, ggml_backend_dev_name(dev));
backend_ptr.reset(ggml_backend_dev_init(dev, NULL));
}
} catch (...) {
// Not a number, try name lookup
}
if (!backend_ptr) {
// Try by name
ggml_backend_dev_t dev = ggml_backend_dev_by_name(params.device_arg.c_str());
if (dev) {
printf("Using device: %s\n", ggml_backend_dev_name(dev));
backend_ptr.reset(ggml_backend_dev_init(dev, NULL));
} else {
fprintf(stderr, "error: device '%s' not found\n", params.device_arg.c_str());
fprintf(stderr, "Available devices:\n");
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t d = ggml_backend_dev_get(i);
fprintf(stderr, " %zu: %s\n", i, ggml_backend_dev_name(d));
}
return 1;
}
}
} else {
// Auto-detect: Prioritize GPU
if (ggml_backend_dev_count() > 0) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
printf("Using auto-detected device %zu: %s\n", i, ggml_backend_dev_name(dev));
backend_ptr.reset(ggml_backend_dev_init(dev, NULL));
break;
}
}
}
}
// Fallback to CPU
if (!backend_ptr) {
    backend_ptr.reset(ggml_backend_init_by_name("CPU", NULL));
    if (!backend_ptr) {
        // If the name lookup fails, scan the devices for one of CPU type
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                backend_ptr.reset(ggml_backend_dev_init(dev, NULL));
                break;
            }
        }
    }
    if (backend_ptr) {
        printf("Using backend: CPU\n");
    }
}
if (!backend_ptr) {
fprintf(stderr, "error: failed to initialize backend\n");
return 1;
}
// Quant types to test
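// covers the legacy, K-quant, I-quant, and MXFP4 families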
std::vector<ggml_type> quants = {
GGML_TYPE_Q4_0, GGML_TYPE_Q4_K,
GGML_TYPE_Q5_0, GGML_TYPE_Q5_K,
GGML_TYPE_Q6_K,
GGML_TYPE_Q8_0,
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S,
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS,
GGML_TYPE_MXFP4
};
printf("\n=== Prompt Processing (Prefill) Phase (Batch Size = %ld) ===\n", params.n_prefill);
printf("| %-10s | %-10s | %-10s |\n", "Quant", "Time (us)", "TOPS");
printf("|-%-10s-|-%-10s-|-%-10s-|\n", "----------", "----------", "----------");
for (auto type : quants) {
run_benchmark(backend_ptr.get(), params, type, "Prefill", params.n_prefill);
}
printf("\n=== Token Generation (Decoding) Phase (Batch Size = %ld) ===\n", params.n_decode);
printf("| %-10s | %-10s | %-10s | %-10s |\n", "Quant", "Time (us)", "TOPS", "Eff. BW (GB/s)");
printf("|-%-10s-|-%-10s-|-%-10s-|-%-14s-|\n", "----------", "----------", "----------", "--------------");
for (auto type : quants) {
run_benchmark(backend_ptr.get(), params, type, "Decoding", params.n_decode);
}
return 0;
}