Update llama-bench.cpp
llama-bench: Continue On Failure option
parent 706e3f93a6
commit b4c0a88c0e
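The new -nf / --no-fail flag makes llama-bench continue with the remaining parameter permutations when one of them fails to load, initialize, or run, instead of aborting the whole sweep. Each skipped permutation logs the settings that caused the failure, and the exit code becomes 0 if at least one test succeeded and 1 if all of them failed. As an illustrative invocation (the model path and flag values below are placeholders, not part of this commit), a sweep that previously died on its first out-of-memory permutation can now run to completion:

    llama-bench -m ./model.gguf -ngl 0,35,99 -b 512,4096 --no-fail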
@@ -344,6 +344,7 @@ struct cmd_params {
     bool verbose;
     bool progress;
     bool no_warmup;
+    bool no_fail;
     output_formats output_format;
     output_formats output_format_stderr;
 };
@@ -382,6 +383,7 @@ static const cmd_params cmd_params_defaults = {
     /* verbose */ false,
     /* progress */ false,
     /* no_warmup */ false,
+    /* no_fail */ false,
     /* output_format */ MARKDOWN,
     /* output_format_stderr */ NONE,
 };
@@ -406,6 +408,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -v, --verbose                      verbose output\n");
     printf("  --progress                         print test progress indicators\n");
     printf("  --no-warmup                        skip warmup runs before benchmarking\n");
+    printf("  -nf, --no-fail                     continue on failure (default: disabled)\n");
     if (llama_supports_rpc()) {
         printf("  -rpc, --rpc <rpc_servers>          register RPC devices (comma separated)\n");
     }
@@ -509,6 +512,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.delay = cmd_params_defaults.delay;
     params.progress = cmd_params_defaults.progress;
     params.no_warmup = cmd_params_defaults.no_warmup;
+    params.no_fail = cmd_params_defaults.no_fail;

     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -933,6 +937,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             params.progress = true;
         } else if (arg == "--no-warmup") {
             params.no_warmup = true;
+        } else if (arg == "-nf" || arg == "--no-fail") {
+            params.no_fail = true;
         } else {
             invalid_param = true;
             break;
@@ -2067,6 +2073,7 @@ int main(int argc, char ** argv) {

     int params_idx = 0;
     auto params_count = params_instances.size();
+    bool any_success = false;
     for (const auto & inst : params_instances) {
         params_idx++;
         if (params.progress) {
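The any_success flag introduced above is set once a permutation completes all of its runs (see the hunk near the end of main below) and drives the new exit-code logic.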
@@ -2080,17 +2087,35 @@ int main(int argc, char ** argv) {

             lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
             if (lmodel == NULL) {
-                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
-                return 1;
+                if (params.no_fail) {
+                    fprintf(stderr, "%s: error: failed to load model '%s' - skipping this permutation\n", __func__, inst.model.c_str());
+                    fprintf(stderr, "  Settings: n_gpu_layers=%d, n_cpu_moe=%d, split_mode=%s, main_gpu=%d, use_mmap=%d, no_host=%d, devices=%s\n",
+                            inst.n_gpu_layers, inst.n_cpu_moe, split_mode_str(inst.split_mode), inst.main_gpu,
+                            inst.use_mmap, inst.no_host, devices_to_string(inst.devices).c_str());
+                    continue;
+                } else {
+                    fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
+                    return 1;
+                }
             }
             prev_inst = &inst;
         }

         llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
         if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
-            llama_model_free(lmodel);
-            return 1;
+            if (params.no_fail) {
+                fprintf(stderr, "%s: error: failed to create context with model '%s' - skipping this permutation\n", __func__, inst.model.c_str());
+                fprintf(stderr, "  Settings: n_batch=%d, n_ubatch=%d, type_k=%s, type_v=%s, flash_attn=%d, no_kv_offload=%d, embeddings=%d, no_op_offload=%d\n",
+                        inst.n_batch, inst.n_ubatch, ggml_type_name(inst.type_k), ggml_type_name(inst.type_v),
+                        inst.flash_attn, inst.no_kv_offload, inst.embeddings, inst.no_op_offload);
+                fprintf(stderr, "  n_prompt=%d, n_gen=%d, n_depth=%d\n", inst.n_prompt, inst.n_gen, inst.n_depth);
+                // Note: Keep lmodel loaded for potential next permutation with same model params
+                continue;
+            } else {
+                fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
+                llama_model_free(lmodel);
+                return 1;
+            }
         }

         test t(inst, lmodel, ctx);
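Note the asymmetry between the two failure paths above: after a failed model load the permutation is simply skipped, while after a failed context creation lmodel is deliberately kept alive, because llama-bench reuses the loaded model across consecutive permutations with equal model parameters (the prev_inst check).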
@@ -2104,10 +2129,17 @@ int main(int argc, char ** argv) {

         struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
         if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
-            fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
-            llama_free(ctx);
-            llama_model_free(lmodel);
-            exit(1);
+            if (params.no_fail) {
+                fprintf(stderr, "%s: failed to parse cpu-mask: %s - skipping this permutation\n", __func__, t.cpu_mask.c_str());
+                fprintf(stderr, "  Settings: n_threads=%d, cpu_strict=%d, poll=%d\n", t.n_threads, t.cpu_strict, t.poll);
+                llama_free(ctx);
+                continue;
+            } else {
+                fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+                llama_free(ctx);
+                llama_model_free(lmodel);
+                exit(1);
+            }
         }
         tpp.strict_cpu = t.cpu_strict;
         tpp.poll = t.poll;
@@ -2115,10 +2147,17 @@ int main(int argc, char ** argv) {

         struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
         if (!threadpool) {
-            fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
-            llama_free(ctx);
-            llama_model_free(lmodel);
-            exit(1);
+            if (params.no_fail) {
+                fprintf(stderr, "%s: threadpool create failed : n_threads %d - skipping this permutation\n", __func__, tpp.n_threads);
+                fprintf(stderr, "  Settings: cpu_mask=%s, cpu_strict=%d, poll=%d\n", t.cpu_mask.c_str(), t.cpu_strict, t.poll);
+                llama_free(ctx);
+                continue;
+            } else {
+                fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+                llama_free(ctx);
+                llama_model_free(lmodel);
+                exit(1);
+            }
         }

         llama_attach_threadpool(ctx, threadpool, NULL);
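A consistent cleanup pattern runs through all of the skip paths that follow: the context is always freed, the threadpool is freed once it exists, and the loaded model is retained so that later permutations can reuse it.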
@@ -2132,10 +2171,21 @@ int main(int argc, char ** argv) {
                 //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
                 bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                 if (!res) {
-                    fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
-                    llama_free(ctx);
-                    llama_model_free(lmodel);
-                    exit(1);
+                    if (params.no_fail) {
+                        fprintf(stderr, "%s: error: failed to run prompt warmup - skipping this permutation\n", __func__);
+                        fprintf(stderr, "  Settings: model=%s, n_prompt=%d, n_batch=%d, n_ubatch=%d, n_threads=%d\n",
+                                t.model_type.c_str(), t.n_prompt, t.n_batch, t.n_ubatch, t.n_threads);
+                        fprintf(stderr, "  type_k=%s, type_v=%s, n_gpu_layers=%d, flash_attn=%d\n",
+                                ggml_type_name(t.type_k), ggml_type_name(t.type_v), t.n_gpu_layers, t.flash_attn);
+                        llama_free(ctx);
+                        ggml_threadpool_free_fn(threadpool);
+                        continue;
+                    } else {
+                        fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
+                        llama_free(ctx);
+                        llama_model_free(lmodel);
+                        exit(1);
+                    }
                 }
             }
             if (t.n_gen > 0) {
@@ -2144,10 +2194,21 @@ int main(int argc, char ** argv) {
                 }
                 bool res = test_gen(ctx, 1, t.n_threads);
                 if (!res) {
-                    fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
-                    llama_free(ctx);
-                    llama_model_free(lmodel);
-                    exit(1);
+                    if (params.no_fail) {
+                        fprintf(stderr, "%s: error: failed to run gen warmup - skipping this permutation\n", __func__);
+                        fprintf(stderr, "  Settings: model=%s, n_gen=1, n_batch=%d, n_ubatch=%d, n_threads=%d\n",
+                                t.model_type.c_str(), t.n_batch, t.n_ubatch, t.n_threads);
+                        fprintf(stderr, "  type_k=%s, type_v=%s, n_gpu_layers=%d, flash_attn=%d\n",
+                                ggml_type_name(t.type_k), ggml_type_name(t.type_v), t.n_gpu_layers, t.flash_attn);
+                        llama_free(ctx);
+                        ggml_threadpool_free_fn(threadpool);
+                        continue;
+                    } else {
+                        fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
+                        llama_free(ctx);
+                        llama_model_free(lmodel);
+                        exit(1);
+                    }
                 }
             }
         }
@@ -2174,10 +2235,21 @@ int main(int argc, char ** argv) {
                 }
                 bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
                 if (!res) {
-                    fprintf(stderr, "%s: error: failed to run depth\n", __func__);
-                    llama_free(ctx);
-                    llama_model_free(lmodel);
-                    exit(1);
+                    if (params.no_fail) {
+                        fprintf(stderr, "%s: error: failed to run depth - skipping this permutation\n", __func__);
+                        fprintf(stderr, "  Settings: model=%s, n_depth=%d, n_batch=%d, n_ubatch=%d, n_threads=%d\n",
+                                t.model_type.c_str(), t.n_depth, t.n_batch, t.n_ubatch, t.n_threads);
+                        fprintf(stderr, "  type_k=%s, type_v=%s, n_gpu_layers=%d, flash_attn=%d\n",
+                                ggml_type_name(t.type_k), ggml_type_name(t.type_v), t.n_gpu_layers, t.flash_attn);
+                        llama_free(ctx);
+                        ggml_threadpool_free_fn(threadpool);
+                        continue;
+                    } else {
+                        fprintf(stderr, "%s: error: failed to run depth\n", __func__);
+                        llama_free(ctx);
+                        llama_model_free(lmodel);
+                        exit(1);
+                    }
                 }

                 // store the context state for reuse in later runs
@@ -2201,10 +2273,22 @@ int main(int argc, char ** argv) {
                 }
                 bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                 if (!res) {
-                    fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
-                    llama_free(ctx);
-                    llama_model_free(lmodel);
-                    exit(1);
+                    if (params.no_fail) {
+                        fprintf(stderr, "%s: error: failed to run prompt - skipping this permutation\n", __func__);
+                        fprintf(stderr, "  Settings: model=%s, n_prompt=%d, n_gen=%d, n_depth=%d\n",
+                                t.model_type.c_str(), t.n_prompt, t.n_gen, t.n_depth);
+                        fprintf(stderr, "  n_batch=%d, n_ubatch=%d, n_threads=%d, type_k=%s, type_v=%s\n",
+                                t.n_batch, t.n_ubatch, t.n_threads, ggml_type_name(t.type_k), ggml_type_name(t.type_v));
+                        fprintf(stderr, "  n_gpu_layers=%d, flash_attn=%d\n", t.n_gpu_layers, t.flash_attn);
+                        llama_free(ctx);
+                        ggml_threadpool_free_fn(threadpool);
+                        continue;
+                    } else {
+                        fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
+                        llama_free(ctx);
+                        llama_model_free(lmodel);
+                        exit(1);
+                    }
                 }
             }
             if (t.n_gen > 0) {
@@ -2214,10 +2298,22 @@ int main(int argc, char ** argv) {
                 }
                 bool res = test_gen(ctx, t.n_gen, t.n_threads);
                 if (!res) {
-                    fprintf(stderr, "%s: error: failed to run gen\n", __func__);
-                    llama_free(ctx);
-                    llama_model_free(lmodel);
-                    exit(1);
+                    if (params.no_fail) {
+                        fprintf(stderr, "%s: error: failed to run gen - skipping this permutation\n", __func__);
+                        fprintf(stderr, "  Settings: model=%s, n_prompt=%d, n_gen=%d, n_depth=%d\n",
+                                t.model_type.c_str(), t.n_prompt, t.n_gen, t.n_depth);
+                        fprintf(stderr, "  n_batch=%d, n_ubatch=%d, n_threads=%d, type_k=%s, type_v=%s\n",
+                                t.n_batch, t.n_ubatch, t.n_threads, ggml_type_name(t.type_k), ggml_type_name(t.type_v));
+                        fprintf(stderr, "  n_gpu_layers=%d, flash_attn=%d\n", t.n_gpu_layers, t.flash_attn);
+                        llama_free(ctx);
+                        ggml_threadpool_free_fn(threadpool);
+                        continue;
+                    } else {
+                        fprintf(stderr, "%s: error: failed to run gen\n", __func__);
+                        llama_free(ctx);
+                        llama_model_free(lmodel);
+                        exit(1);
+                    }
                 }
             }

@@ -2225,6 +2321,9 @@ int main(int argc, char ** argv) {
             t.samples_ns.push_back(t_ns);
         }

+        // Mark this test as successful
+        any_success = true;
+
         if (p) {
             p->print_test(t);
             fflush(p->fout);
@@ -2254,5 +2353,6 @@ int main(int argc, char ** argv) {

     llama_backend_free();

-    return 0;
+    // Exit code logic: 0 if any test succeeded, 1 if all tests failed
+    return any_success ? 0 : 1;
 }
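Because the exit status now distinguishes "at least one permutation succeeded" (0) from "everything failed" (1), scripted sweeps can branch on it. A minimal sketch, with placeholder paths:

    llama-bench -m ./model.gguf -ngl 0,99 --no-fail -o csv > results.csv || echo "all permutations failed"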