From b4c0a88c0e6b8606dff88cfabc06f8fcc9e67b25 Mon Sep 17 00:00:00 2001
From: SamAcctX <87765660+SamAcctX@users.noreply.github.com>
Date: Fri, 2 Jan 2026 13:59:23 -0600
Subject: [PATCH] Update llama-bench.cpp

llama-bench: Continue On Failure option
---
 tools/llama-bench/llama-bench.cpp | 168 ++++++++++++++++++++++++------
 1 file changed, 134 insertions(+), 34 deletions(-)

diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index a98ede0a57..f5afe47eb5 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -344,6 +344,7 @@ struct cmd_params {
     bool            verbose;
     bool            progress;
     bool            no_warmup;
+    bool            no_fail;
     output_formats  output_format;
     output_formats  output_format_stderr;
 };
@@ -382,6 +383,7 @@ static const cmd_params cmd_params_defaults = {
     /* verbose              */ false,
     /* progress             */ false,
     /* no_warmup            */ false,
+    /* no_fail              */ false,
     /* output_format        */ MARKDOWN,
     /* output_format_stderr */ NONE,
 };
@@ -406,6 +408,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -v, --verbose                             verbose output\n");
     printf("  --progress                                print test progress indicators\n");
     printf("  --no-warmup                               skip warmup runs before benchmarking\n");
+    printf("  -nf, --no-fail                            continue on failure (default: disabled)\n");
     if (llama_supports_rpc()) {
         printf("  -rpc, --rpc                               register RPC devices (comma separated)\n");
     }
@@ -509,6 +512,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.delay                = cmd_params_defaults.delay;
     params.progress             = cmd_params_defaults.progress;
     params.no_warmup            = cmd_params_defaults.no_warmup;
+    params.no_fail              = cmd_params_defaults.no_fail;

     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -933,6 +937,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             params.progress = true;
         } else if (arg == "--no-warmup") {
             params.no_warmup = true;
+        } else if (arg == "-nf" || arg == "--no-fail") {
+            params.no_fail = true;
         } else {
             invalid_param = true;
             break;
@@ -2067,6 +2073,7 @@ int main(int argc, char ** argv) {

     int  params_idx   = 0;
     auto params_count = params_instances.size();
+    bool any_success  = false;
     for (const auto & inst : params_instances) {
         params_idx++;
         if (params.progress) {
@@ -2080,17 +2087,35 @@ int main(int argc, char ** argv) {

             lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
             if (lmodel == NULL) {
-                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
-                return 1;
+                if (params.no_fail) {
+                    fprintf(stderr, "%s: error: failed to load model '%s' - skipping this permutation\n", __func__, inst.model.c_str());
+                    fprintf(stderr, "  Settings: n_gpu_layers=%d, n_cpu_moe=%d, split_mode=%s, main_gpu=%d, use_mmap=%d, no_host=%d, devices=%s\n",
+                            inst.n_gpu_layers, inst.n_cpu_moe, split_mode_str(inst.split_mode), inst.main_gpu,
+                            inst.use_mmap, inst.no_host, devices_to_string(inst.devices).c_str());
+                    continue;
+                } else {
+                    fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
+                    return 1;
+                }
             }
             prev_inst = &inst;
         }

         llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
         if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
-            llama_model_free(lmodel);
-            return 1;
+            if (params.no_fail) {
+                fprintf(stderr, "%s: error: failed to create context with model '%s' - skipping this permutation\n", __func__, inst.model.c_str());
+                fprintf(stderr, "  Settings: n_batch=%d, n_ubatch=%d, type_k=%s, type_v=%s, flash_attn=%d, no_kv_offload=%d, embeddings=%d, no_op_offload=%d\n",
+                        inst.n_batch, inst.n_ubatch, ggml_type_name(inst.type_k), ggml_type_name(inst.type_v),
+                        inst.flash_attn, inst.no_kv_offload, inst.embeddings, inst.no_op_offload);
+                fprintf(stderr, "  n_prompt=%d, n_gen=%d, n_depth=%d\n", inst.n_prompt, inst.n_gen, inst.n_depth);
+                // Note: Keep lmodel loaded for potential next permutation with same model params
+                continue;
+            } else {
+                fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
+                llama_model_free(lmodel);
+                return 1;
+            }
         }

         test t(inst, lmodel, ctx);
@@ -2104,10 +2129,17 @@ int main(int argc, char ** argv) {

         struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
         if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
-            fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
-            llama_free(ctx);
-            llama_model_free(lmodel);
-            exit(1);
+            if (params.no_fail) {
+                fprintf(stderr, "%s: failed to parse cpu-mask: %s - skipping this permutation\n", __func__, t.cpu_mask.c_str());
+                fprintf(stderr, "  Settings: n_threads=%d, cpu_strict=%d, poll=%d\n", t.n_threads, t.cpu_strict, t.poll);
+                llama_free(ctx);
+                continue;
+            } else {
+                fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+                llama_free(ctx);
+                llama_model_free(lmodel);
+                exit(1);
+            }
         }
         tpp.strict_cpu = t.cpu_strict;
         tpp.poll       = t.poll;
@@ -2115,10 +2147,17 @@ int main(int argc, char ** argv) {

         struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
         if (!threadpool) {
-            fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
-            llama_free(ctx);
-            llama_model_free(lmodel);
-            exit(1);
+            if (params.no_fail) {
+                fprintf(stderr, "%s: threadpool create failed : n_threads %d - skipping this permutation\n", __func__, tpp.n_threads);
+                fprintf(stderr, "  Settings: cpu_mask=%s, cpu_strict=%d, poll=%d\n", t.cpu_mask.c_str(), t.cpu_strict, t.poll);
+                llama_free(ctx);
+                continue;
+            } else {
+                fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+                llama_free(ctx);
+                llama_model_free(lmodel);
+                exit(1);
+            }
         }

         llama_attach_threadpool(ctx, threadpool, NULL);
@@ -2132,10 +2171,21 @@ int main(int argc, char ** argv) {
                 //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
                 bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                 if (!res) {
-                    fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
-                    llama_free(ctx);
-                    llama_model_free(lmodel);
-                    exit(1);
+                    if (params.no_fail) {
+                        fprintf(stderr, "%s: error: failed to run prompt warmup - skipping this permutation\n", __func__);
+                        fprintf(stderr, "  Settings: model=%s, n_prompt=%d, n_batch=%d, n_ubatch=%d, n_threads=%d\n",
+                                t.model_type.c_str(), t.n_prompt, t.n_batch, t.n_ubatch, t.n_threads);
+                        fprintf(stderr, "  type_k=%s, type_v=%s, n_gpu_layers=%d, flash_attn=%d\n",
+                                ggml_type_name(t.type_k), ggml_type_name(t.type_v), t.n_gpu_layers, t.flash_attn);
+                        llama_free(ctx);
+                        ggml_threadpool_free_fn(threadpool);
+                        continue;
+                    } else {
+                        fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
+                        llama_free(ctx);
+                        llama_model_free(lmodel);
+                        exit(1);
+                    }
                 }
             }
             if (t.n_gen > 0) {
@@ -2144,10 +2194,21 @@ int main(int argc, char ** argv) {
                 }
                 bool res = test_gen(ctx, 1, t.n_threads);
                 if (!res) {
-                    fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
-                    llama_free(ctx);
-                    llama_model_free(lmodel);
-                    exit(1);
+                    if (params.no_fail) {
+                        fprintf(stderr, "%s: error: failed to run gen warmup - skipping this permutation\n", __func__);
+                        fprintf(stderr, "  Settings: model=%s, n_gen=1, n_batch=%d, n_ubatch=%d, n_threads=%d\n",
+                                t.model_type.c_str(), t.n_batch, t.n_ubatch, t.n_threads);
+                        fprintf(stderr, "  type_k=%s, type_v=%s, n_gpu_layers=%d, flash_attn=%d\n",
+                                ggml_type_name(t.type_k), ggml_type_name(t.type_v), t.n_gpu_layers, t.flash_attn);
+                        llama_free(ctx);
+                        ggml_threadpool_free_fn(threadpool);
+                        continue;
+                    } else {
+                        fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
+                        llama_free(ctx);
+                        llama_model_free(lmodel);
+                        exit(1);
+                    }
                 }
             }
         }
@@ -2174,10 +2235,21 @@ int main(int argc, char ** argv) {
                 }
                 bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
                 if (!res) {
-                    fprintf(stderr, "%s: error: failed to run depth\n", __func__);
-                    llama_free(ctx);
-                    llama_model_free(lmodel);
-                    exit(1);
+                    if (params.no_fail) {
+                        fprintf(stderr, "%s: error: failed to run depth - skipping this permutation\n", __func__);
+                        fprintf(stderr, "  Settings: model=%s, n_depth=%d, n_batch=%d, n_ubatch=%d, n_threads=%d\n",
+                                t.model_type.c_str(), t.n_depth, t.n_batch, t.n_ubatch, t.n_threads);
+                        fprintf(stderr, "  type_k=%s, type_v=%s, n_gpu_layers=%d, flash_attn=%d\n",
+                                ggml_type_name(t.type_k), ggml_type_name(t.type_v), t.n_gpu_layers, t.flash_attn);
+                        llama_free(ctx);
+                        ggml_threadpool_free_fn(threadpool);
+                        continue;
+                    } else {
+                        fprintf(stderr, "%s: error: failed to run depth\n", __func__);
+                        llama_free(ctx);
+                        llama_model_free(lmodel);
+                        exit(1);
+                    }
                 }

                 // store the context state for reuse in later runs
@@ -2201,10 +2273,22 @@ int main(int argc, char ** argv) {
                 }
                 bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                 if (!res) {
-                    fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
-                    llama_free(ctx);
-                    llama_model_free(lmodel);
-                    exit(1);
+                    if (params.no_fail) {
+                        fprintf(stderr, "%s: error: failed to run prompt - skipping this permutation\n", __func__);
+                        fprintf(stderr, "  Settings: model=%s, n_prompt=%d, n_gen=%d, n_depth=%d\n",
+                                t.model_type.c_str(), t.n_prompt, t.n_gen, t.n_depth);
+                        fprintf(stderr, "  n_batch=%d, n_ubatch=%d, n_threads=%d, type_k=%s, type_v=%s\n",
+                                t.n_batch, t.n_ubatch, t.n_threads, ggml_type_name(t.type_k), ggml_type_name(t.type_v));
+                        fprintf(stderr, "  n_gpu_layers=%d, flash_attn=%d\n", t.n_gpu_layers, t.flash_attn);
+                        llama_free(ctx);
+                        ggml_threadpool_free_fn(threadpool);
+                        continue;
+                    } else {
+                        fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
+                        llama_free(ctx);
+                        llama_model_free(lmodel);
+                        exit(1);
+                    }
                 }
             }
             if (t.n_gen > 0) {
@@ -2214,10 +2298,22 @@ int main(int argc, char ** argv) {
                 }
                 bool res = test_gen(ctx, t.n_gen, t.n_threads);
                 if (!res) {
-                    fprintf(stderr, "%s: error: failed to run gen\n", __func__);
-                    llama_free(ctx);
-                    llama_model_free(lmodel);
-                    exit(1);
+                    if (params.no_fail) {
+                        fprintf(stderr, "%s: error: failed to run gen - skipping this permutation\n", __func__);
+                        fprintf(stderr, "  Settings: model=%s, n_prompt=%d, n_gen=%d, n_depth=%d\n",
+                                t.model_type.c_str(), t.n_prompt, t.n_gen, t.n_depth);
+                        fprintf(stderr, "  n_batch=%d, n_ubatch=%d, n_threads=%d, type_k=%s, type_v=%s\n",
+                                t.n_batch, t.n_ubatch, t.n_threads, ggml_type_name(t.type_k), ggml_type_name(t.type_v));
+                        fprintf(stderr, "  n_gpu_layers=%d, flash_attn=%d\n", t.n_gpu_layers, t.flash_attn);
+                        llama_free(ctx);
+                        ggml_threadpool_free_fn(threadpool);
+                        continue;
+                    } else {
+                        fprintf(stderr, "%s: error: failed to run gen\n", __func__);
+                        llama_free(ctx);
+                        llama_model_free(lmodel);
+                        exit(1);
+                    }
                 }
             }

@@ -2225,6 +2321,9 @@ int main(int argc, char ** argv) {
             t.samples_ns.push_back(t_ns);
         }

+        // Mark this test as successful
+        any_success = true;
+
         if (p) {
             p->print_test(t);
             fflush(p->fout);
@@ -2254,5 +2353,6 @@ int main(int argc, char ** argv) {

     llama_backend_free();

-    return 0;
+    // Exit code logic: 0 if any test succeeded, 1 if all tests failed
+    return any_success ? 0 : 1;
 }
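
Example usage (illustrative only, not part of the patch; the model path and the permutation values are placeholders):

    # run several permutations; report and skip any that fail instead of aborting the whole run
    ./llama-bench -m model.gguf -ngl 0,99 -b 512,4096 --no-fail

With -nf/--no-fail, a permutation that fails to load the model, create the context, or complete a run is logged to stderr together with its settings and then skipped; the process exits with 0 if at least one test succeeded and with 1 only if every test failed.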