tools: add quant-bench for profiling raw kernel performance
commit 05dfc18d55
parent 22cae83218
CMakeLists.txt
@@ -3,6 +3,11 @@ add_executable(${TARGET} llama-bench.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
+set(TARGET_QUANT quant-bench)
+add_executable(${TARGET_QUANT} quant-bench.cpp)
+target_link_libraries(${TARGET_QUANT} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET_QUANT} PRIVATE cxx_std_17)
+
 if(LLAMA_TOOLS_INSTALL)
-    install(TARGETS ${TARGET} RUNTIME)
+    install(TARGETS ${TARGET} ${TARGET_QUANT} RUNTIME)
 endif()
quant-bench.cpp
@@ -0,0 +1,262 @@
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-alloc.h"
#include "common.h"

#include <vector>
#include <string>
#include <cstdio>
#include <cinttypes> // PRId64 for portable int64_t printf formats
#include <chrono>
#include <map>
#include <cmath>
#include <algorithm>
#include <thread>
#include <memory>
#include <cstring>

// Smart pointers for RAII cleanup
struct ggml_context_deleter {
    void operator()(ggml_context * ctx) { ggml_free(ctx); }
};
using ggml_context_ptr = std::unique_ptr<ggml_context, ggml_context_deleter>;

struct ggml_backend_buffer_deleter {
    void operator()(ggml_backend_buffer_t buf) { ggml_backend_buffer_free(buf); }
};
using ggml_backend_buffer_ptr = std::unique_ptr<struct ggml_backend_buffer, ggml_backend_buffer_deleter>;

struct ggml_backend_deleter {
    void operator()(ggml_backend_t backend) { ggml_backend_free(backend); }
};
using ggml_backend_ptr = std::unique_ptr<struct ggml_backend, ggml_backend_deleter>;

// Utils
static uint64_t get_time_ns() {
    using clock = std::chrono::high_resolution_clock;
    return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
}
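
// Note: the default shape below (m=4096, k=14336) matches the FFN projection of
// Llama-3-8B-class models; n_prefill simulates a prompt batch, n_decode a single
// generated token.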
struct BenchmarkParams {
    int64_t m = 4096;
    int64_t k = 14336;
    int64_t n_prefill = 512;
    int64_t n_decode = 1;
    int reps = 5;
    bool verbose = false;
    std::string device_arg = "auto";
};

static void print_usage(const char * argv0) {
    printf("usage: %s [options]\n", argv0);
    printf("\n");
    printf("options:\n");
    printf("  -h, --help            show this help message and exit\n");
    printf("  -v, --verbose         verbose output\n");
    printf("  -d, --device <dev>    device ID (int) or name (str) to use (default: auto)\n");
    printf("\n");
}

static void run_benchmark(ggml_backend_t backend, const BenchmarkParams & params, ggml_type type_a, const std::string & phase_name, int64_t n) {
    if (params.verbose) {
        printf("Benchmarking %s %s: m=%" PRId64 " n=%" PRId64 " k=%" PRId64 "\n", phase_name.c_str(), ggml_type_name(type_a), params.m, n, params.k);
    }

    // Init context
    size_t ctx_size = ggml_tensor_overhead() * 16 + ggml_graph_overhead();
    struct ggml_init_params init_params = {
        /*.mem_size   =*/ ctx_size,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    ggml_context_ptr ctx(ggml_init(init_params));

    // Create tensors
    // A: weight matrix (quantized), shape [k, m]
    // B: input matrix (F32), shape [k, n]
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx.get(), type_a, params.k, params.m);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_F32, params.k, n);
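    // In ggml the first dimension (ne[0]) is the contiguous row length, so both
    // operands share k as their leading dimension; ggml_mul_mat yields C of shape [m, n].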

    // Check support
    if (!ggml_backend_supports_op(backend, a) || !ggml_backend_supports_op(backend, b)) {
        if (params.verbose) printf("Backend does not support input tensors for %s\n", ggml_type_name(type_a));
        return;
    }

    // Build graph: C = A * B
    struct ggml_tensor * c = ggml_mul_mat(ctx.get(), a, b);

    if (!ggml_backend_supports_op(backend, c)) {
        if (params.verbose) printf("Backend does not support MUL_MAT for %s\n", ggml_type_name(type_a));
        return;
    }

    struct ggml_cgraph * gf = ggml_new_graph(ctx.get());
    ggml_build_forward_expand(gf, c);

    // Allocate memory
    ggml_backend_buffer_ptr buffer(ggml_backend_alloc_ctx_tensors(ctx.get(), backend));
    if (!buffer) {
        printf("Failed to allocate memory\n");
        return;
    }
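
    // One untimed warmup pass lets the backend initialize its kernels and warm
    // caches before timing (this assumes a single pass reaches steady state).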
    // Warmup
    ggml_backend_graph_compute(backend, gf);

    // Run benchmark
    uint64_t t_start = get_time_ns();
    for (int i = 0; i < params.reps; i++) {
        ggml_backend_graph_compute(backend, gf);
    }
    uint64_t t_end = get_time_ns();

    double t_ns = (double)(t_end - t_start) / params.reps;
    double t_us = t_ns / 1000.0;

    // Stats
    // TOPS: 2*m*n*k
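    // (each of the m*n output elements takes k multiply-add pairs, i.e. 2 ops per element)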
    double ops = 2.0 * params.m * n * params.k;
    double tops = (ops / t_ns) * 1e9 / 1e12; // TOPS

    // Print row
    if (n > 1) {
        // Prompt processing: compute-bound, so bandwidth is less relevant here
        printf("| %-10s | %10.2f | %10.2f |\n",
               ggml_type_name(type_a), t_us, tops);
    } else {
        // Token generation: bandwidth is critical
        // Bandwidth: Size(A) + Size(B) + Size(C)
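        // bytes per nanosecond is numerically equal to GB/s (both scale by 1e9)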
        size_t size_a = ggml_nbytes(a);
        size_t size_b = ggml_nbytes(b);
        size_t size_c = ggml_nbytes(c);
        size_t total_bytes = size_a + size_b + size_c;
        double gb_s = (double)total_bytes / t_ns; // GB/s

        printf("| %-10s | %10.2f | %10.2f | %14.2f |\n",
               ggml_type_name(type_a), t_us, tops, gb_s);
    }
}

int main(int argc, char ** argv) {
    BenchmarkParams params;

    // Parse args
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "-h" || arg == "--help") {
            print_usage(argv[0]);
            return 0;
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else if (arg == "-d" || arg == "--device") {
            if (++i >= argc) {
                fprintf(stderr, "error: missing argument for %s\n", arg.c_str());
                return 1;
            }
            params.device_arg = argv[i];
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            print_usage(argv[0]);
            return 1;
        }
    }

    ggml_backend_load_all();

    // Pick backend
    ggml_backend_ptr backend_ptr;

    if (params.device_arg != "auto") {
        // Try to parse as an integer index
        try {
            int id = std::stoi(params.device_arg);
            if (id >= 0 && id < (int)ggml_backend_dev_count()) {
                ggml_backend_dev_t dev = ggml_backend_dev_get(id);
                printf("Using device %d: %s\n", id, ggml_backend_dev_name(dev));
                backend_ptr.reset(ggml_backend_dev_init(dev, NULL));
            }
        } catch (...) {
            // Not a number, fall through to name lookup
        }

        if (!backend_ptr) {
            // Try by name
            ggml_backend_dev_t dev = ggml_backend_dev_by_name(params.device_arg.c_str());
            if (dev) {
                printf("Using device: %s\n", ggml_backend_dev_name(dev));
                backend_ptr.reset(ggml_backend_dev_init(dev, NULL));
            } else {
                fprintf(stderr, "error: device '%s' not found\n", params.device_arg.c_str());
                fprintf(stderr, "Available devices:\n");
                for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
                    ggml_backend_dev_t d = ggml_backend_dev_get(i);
                    fprintf(stderr, "  %zu: %s\n", i, ggml_backend_dev_name(d));
                }
                return 1;
            }
        }
    } else {
        // Auto-detect: prioritize GPU devices
        if (ggml_backend_dev_count() > 0) {
            for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
                    printf("Using auto-detected device %zu: %s\n", i, ggml_backend_dev_name(dev));
                    backend_ptr.reset(ggml_backend_dev_init(dev, NULL));
                    break;
                }
            }
        }
    }

    // Fallback to CPU
    if (!backend_ptr) {
        backend_ptr.reset(ggml_backend_init_by_name("CPU", NULL));
        if (!backend_ptr) {
            // If name lookup fails, fall back to scanning for a CPU device by type
            for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                    backend_ptr.reset(ggml_backend_dev_init(dev, NULL));
                    break;
                }
            }
        }
        if (backend_ptr) {
            printf("Using backend: CPU\n");
        }
    }

    if (!backend_ptr) {
        fprintf(stderr, "error: failed to initialize backend\n");
        return 1;
    }

    // Quant types to test
    std::vector<ggml_type> quants = {
        GGML_TYPE_Q4_0, GGML_TYPE_Q4_K,
        GGML_TYPE_Q5_0, GGML_TYPE_Q5_K,
        GGML_TYPE_Q6_K,
        GGML_TYPE_Q8_0,
        GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
        GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S,
        GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS,
        GGML_TYPE_MXFP4
    };
printf("\n=== Prompt Processing (Prefill) Phase (Batch Size = %ld) ===\n", params.n_prefill);
|
||||||
|
printf("| %-10s | %-10s | %-10s |\n", "Quant", "Time (us)", "TOPS");
|
||||||
|
printf("|-%-10s-|-%-10s-|-%-10s-|\n", "----------", "----------", "----------");
|
||||||
|
|
||||||
|

    for (auto type : quants) {
        run_benchmark(backend_ptr.get(), params, type, "Prefill", params.n_prefill);
    }
printf("\n=== Token Generation (Decoding) Phase (Batch Size = %ld) ===\n", params.n_decode);
|
||||||
|
printf("| %-10s | %-10s | %-10s | %-10s |\n", "Quant", "Time (us)", "TOPS", "Eff. BW (GB/s)");
|
||||||
|
printf("|-%-10s-|-%-10s-|-%-10s-|-%-14s-|\n", "----------", "----------", "----------", "--------------");
|
||||||
|
|
||||||
|

    for (auto type : quants) {
        run_benchmark(backend_ptr.get(), params, type, "Decoding", params.n_decode);
    }

    return 0;
}
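
Example invocation (hypothetical paths; assumes a standard CMake build directory):

    ./build/bin/quant-bench -d 0 -v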