commit 810b445208
Author: SamAcctX
Date: 2026-01-02 22:37:32 +01:00 (committed by GitHub)
2 changed files with 139 additions and 34 deletions

README.md

@@ -32,6 +32,7 @@ options:
-oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
--list-devices list available devices and exit
-v, --verbose verbose output
+-nf, --no-fail continue on failure (default: disabled)
--progress print test progress indicators
-rpc, --rpc <rpc_servers> register RPC devices (comma separated)
@@ -80,6 +81,10 @@ Each test is repeated the number of times given by `-r`, and the results are ave
Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens.
+With the `-nf` option, a test parameter combination that fails does NOT terminate the remaining set of permuted test scenarios. Instead, the failing combination is logged to stderr and execution moves on to the next scenario, repeating until all calculated scenarios have been attempted. Scenarios that execute successfully log their results to stdout as usual (see the invocation sketch after this diff).
+If at least one permutation succeeds, the `llama-bench` process exits with return code 0 (success); if all permutations fail to execute, it exits with return code 1 (error).
+To keep existing invocations backward-compatible with previous `llama-bench` behavior, this mode is disabled by default and must be enabled explicitly.
For a description of the other options, see the [completion example](../completion/README.md).
> [!NOTE]
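
To make the new mode concrete, here is a minimal invocation sketch. The model path and the swept values are hypothetical, chosen only to produce a sweep in which some permutations may fail; only `-nf` is introduced by this commit, the other flags are pre-existing `llama-bench` options.

```sh
# Sweep batch size and flash-attention; with -nf, a permutation that
# fails (e.g. an allocation failure at the larger batch size) is logged
# to stderr and skipped instead of aborting the whole run.
llama-bench -m ./model.gguf -b 512,4096 -fa 0,1 -nf

# Exit status: 0 if at least one permutation produced results,
# 1 if every permutation failed.
echo $?
```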

llama-bench.cpp

@@ -344,6 +344,7 @@ struct cmd_params {
bool verbose;
bool progress;
bool no_warmup;
+bool no_fail;
output_formats output_format;
output_formats output_format_stderr;
};
@@ -382,6 +383,7 @@ static const cmd_params cmd_params_defaults = {
/* verbose */ false,
/* progress */ false,
/* no_warmup */ false,
+/* no_fail */ false,
/* output_format */ MARKDOWN,
/* output_format_stderr */ NONE,
};
@@ -406,6 +408,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -v, --verbose verbose output\n");
printf(" --progress print test progress indicators\n");
printf(" --no-warmup skip warmup runs before benchmarking\n");
printf(" -nf, --no-fail continue on failure (default: disabled)\n");
if (llama_supports_rpc()) {
printf(" -rpc, --rpc <rpc_servers> register RPC devices (comma separated)\n");
}
@@ -509,6 +512,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.delay = cmd_params_defaults.delay;
params.progress = cmd_params_defaults.progress;
params.no_warmup = cmd_params_defaults.no_warmup;
+params.no_fail = cmd_params_defaults.no_fail;
for (int i = 1; i < argc; i++) {
arg = argv[i];
@@ -933,6 +937,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.progress = true;
} else if (arg == "--no-warmup") {
params.no_warmup = true;
} else if (arg == "-nf" || arg == "--no-fail") {
params.no_fail = true;
} else {
invalid_param = true;
break;
@@ -2067,6 +2073,7 @@ int main(int argc, char ** argv) {
int params_idx = 0;
auto params_count = params_instances.size();
+bool any_success = false;
for (const auto & inst : params_instances) {
params_idx++;
if (params.progress) {
@@ -2080,17 +2087,35 @@ int main(int argc, char ** argv) {
lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
if (lmodel == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
return 1;
if (params.no_fail) {
fprintf(stderr, "%s: error: failed to load model '%s' - skipping this permutation\n", __func__, inst.model.c_str());
fprintf(stderr, " Settings: n_gpu_layers=%d, n_cpu_moe=%d, split_mode=%s, main_gpu=%d, use_mmap=%d, no_host=%d, devices=%s\n",
inst.n_gpu_layers, inst.n_cpu_moe, split_mode_str(inst.split_mode), inst.main_gpu,
inst.use_mmap, inst.no_host, devices_to_string(inst.devices).c_str());
continue;
} else {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
return 1;
}
}
prev_inst = &inst;
}
llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
llama_model_free(lmodel);
return 1;
if (params.no_fail) {
fprintf(stderr, "%s: error: failed to create context with model '%s' - skipping this permutation\n", __func__, inst.model.c_str());
fprintf(stderr, " Settings: n_batch=%d, n_ubatch=%d, type_k=%s, type_v=%s, flash_attn=%d, no_kv_offload=%d, embeddings=%d, no_op_offload=%d\n",
inst.n_batch, inst.n_ubatch, ggml_type_name(inst.type_k), ggml_type_name(inst.type_v),
inst.flash_attn, inst.no_kv_offload, inst.embeddings, inst.no_op_offload);
fprintf(stderr, " n_prompt=%d, n_gen=%d, n_depth=%d\n", inst.n_prompt, inst.n_gen, inst.n_depth);
// Note: Keep lmodel loaded for potential next permutation with same model params
continue;
} else {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
llama_model_free(lmodel);
return 1;
}
}
test t(inst, lmodel, ctx);
@@ -2104,10 +2129,17 @@ int main(int argc, char ** argv) {
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
if (params.no_fail) {
fprintf(stderr, "%s: failed to parse cpu-mask: %s - skipping this permutation\n", __func__, t.cpu_mask.c_str());
fprintf(stderr, " Settings: n_threads=%d, cpu_strict=%d, poll=%d\n", t.n_threads, t.cpu_strict, t.poll);
llama_free(ctx);
continue;
} else {
fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
}
tpp.strict_cpu = t.cpu_strict;
tpp.poll = t.poll;
@@ -2115,10 +2147,17 @@ int main(int argc, char ** argv) {
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
if (!threadpool) {
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
if (params.no_fail) {
fprintf(stderr, "%s: threadpool create failed : n_threads %d - skipping this permutation\n", __func__, tpp.n_threads);
fprintf(stderr, " Settings: cpu_mask=%s, cpu_strict=%d, poll=%d\n", t.cpu_mask.c_str(), t.cpu_strict, t.poll);
llama_free(ctx);
continue;
} else {
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
}
llama_attach_threadpool(ctx, threadpool, NULL);
@@ -2132,10 +2171,21 @@ int main(int argc, char ** argv) {
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
if (!res) {
fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
if (params.no_fail) {
fprintf(stderr, "%s: error: failed to run prompt warmup - skipping this permutation\n", __func__);
fprintf(stderr, " Settings: model=%s, n_prompt=%d, n_batch=%d, n_ubatch=%d, n_threads=%d\n",
t.model_type.c_str(), t.n_prompt, t.n_batch, t.n_ubatch, t.n_threads);
fprintf(stderr, " type_k=%s, type_v=%s, n_gpu_layers=%d, flash_attn=%d\n",
ggml_type_name(t.type_k), ggml_type_name(t.type_v), t.n_gpu_layers, t.flash_attn);
llama_free(ctx);
ggml_threadpool_free_fn(threadpool);
continue;
} else {
fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
}
}
if (t.n_gen > 0) {
@@ -2144,10 +2194,21 @@ int main(int argc, char ** argv) {
}
bool res = test_gen(ctx, 1, t.n_threads);
if (!res) {
fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
if (params.no_fail) {
fprintf(stderr, "%s: error: failed to run gen warmup - skipping this permutation\n", __func__);
fprintf(stderr, " Settings: model=%s, n_gen=1, n_batch=%d, n_ubatch=%d, n_threads=%d\n",
t.model_type.c_str(), t.n_batch, t.n_ubatch, t.n_threads);
fprintf(stderr, " type_k=%s, type_v=%s, n_gpu_layers=%d, flash_attn=%d\n",
ggml_type_name(t.type_k), ggml_type_name(t.type_v), t.n_gpu_layers, t.flash_attn);
llama_free(ctx);
ggml_threadpool_free_fn(threadpool);
continue;
} else {
fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
}
}
}
@@ -2174,10 +2235,21 @@ int main(int argc, char ** argv) {
}
bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
if (!res) {
fprintf(stderr, "%s: error: failed to run depth\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
if (params.no_fail) {
fprintf(stderr, "%s: error: failed to run depth - skipping this permutation\n", __func__);
fprintf(stderr, " Settings: model=%s, n_depth=%d, n_batch=%d, n_ubatch=%d, n_threads=%d\n",
t.model_type.c_str(), t.n_depth, t.n_batch, t.n_ubatch, t.n_threads);
fprintf(stderr, " type_k=%s, type_v=%s, n_gpu_layers=%d, flash_attn=%d\n",
ggml_type_name(t.type_k), ggml_type_name(t.type_v), t.n_gpu_layers, t.flash_attn);
llama_free(ctx);
ggml_threadpool_free_fn(threadpool);
continue;
} else {
fprintf(stderr, "%s: error: failed to run depth\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
}
// store the context state for reuse in later runs
@@ -2201,10 +2273,22 @@ int main(int argc, char ** argv) {
}
bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
if (!res) {
fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
if (params.no_fail) {
fprintf(stderr, "%s: error: failed to run prompt - skipping this permutation\n", __func__);
fprintf(stderr, " Settings: model=%s, n_prompt=%d, n_gen=%d, n_depth=%d\n",
t.model_type.c_str(), t.n_prompt, t.n_gen, t.n_depth);
fprintf(stderr, " n_batch=%d, n_ubatch=%d, n_threads=%d, type_k=%s, type_v=%s\n",
t.n_batch, t.n_ubatch, t.n_threads, ggml_type_name(t.type_k), ggml_type_name(t.type_v));
fprintf(stderr, " n_gpu_layers=%d, flash_attn=%d\n", t.n_gpu_layers, t.flash_attn);
llama_free(ctx);
ggml_threadpool_free_fn(threadpool);
continue;
} else {
fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
}
}
if (t.n_gen > 0) {
@@ -2214,10 +2298,22 @@ int main(int argc, char ** argv) {
}
bool res = test_gen(ctx, t.n_gen, t.n_threads);
if (!res) {
fprintf(stderr, "%s: error: failed to run gen\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
if (params.no_fail) {
fprintf(stderr, "%s: error: failed to run gen - skipping this permutation\n", __func__);
fprintf(stderr, " Settings: model=%s, n_prompt=%d, n_gen=%d, n_depth=%d\n",
t.model_type.c_str(), t.n_prompt, t.n_gen, t.n_depth);
fprintf(stderr, " n_batch=%d, n_ubatch=%d, n_threads=%d, type_k=%s, type_v=%s\n",
t.n_batch, t.n_ubatch, t.n_threads, ggml_type_name(t.type_k), ggml_type_name(t.type_v));
fprintf(stderr, " n_gpu_layers=%d, flash_attn=%d\n", t.n_gpu_layers, t.flash_attn);
llama_free(ctx);
ggml_threadpool_free_fn(threadpool);
continue;
} else {
fprintf(stderr, "%s: error: failed to run gen\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
}
}
@@ -2225,6 +2321,9 @@ int main(int argc, char ** argv) {
t.samples_ns.push_back(t_ns);
}
+// Mark this test as successful
+any_success = true;
if (p) {
p->print_test(t);
fflush(p->fout);
@@ -2254,5 +2353,6 @@ int main(int argc, char ** argv) {
llama_backend_free();
-return 0;
+// Exit code logic: 0 if any test succeeded, 1 if all tests failed
+return any_success ? 0 : 1;
}