From 50d21aa4a4a4b63dc1023d34066d89f1331f6b8f Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Mon, 24 Nov 2025 07:18:39 +0100
Subject: [PATCH] tests : cleanup test-backend-sampler.cpp

---
 tests/test-backend-sampler.cpp | 150 +++++++++++++++++++--------------
 1 file changed, 85 insertions(+), 65 deletions(-)

diff --git a/tests/test-backend-sampler.cpp b/tests/test-backend-sampler.cpp
index ebbb6e039e..19e9855aed 100644
--- a/tests/test-backend-sampler.cpp
+++ b/tests/test-backend-sampler.cpp
@@ -16,38 +16,40 @@
 #include

 struct test_model_context {
- llama_model * model = nullptr;
- llama_context * ctx = nullptr;
- const llama_vocab * vocab = nullptr;
- int n_vocab = 0;
+ llama_model * model = nullptr;
+ llama_context * ctx = nullptr;
+ const llama_vocab * vocab = nullptr;
+ int n_vocab = 0;
+ std::unordered_map seq_positions;
 std::unordered_map last_batch_info;

- bool setup_model(const char * model_path) {
+ bool load_model(const char * model_path) {
 if (model != nullptr) {
 return true;
 }

 llama_backend_init();

- llama_model_params mparams = llama_model_default_params();
- model = llama_model_load_from_file(model_path, mparams);
+ model = llama_model_load_from_file(model_path, llama_model_default_params());
 if (model == nullptr) {
 fprintf(stderr, "Warning: failed to load model '%s', skipping test\n", model_path);
 cleanup();
 return false;
 }

- vocab = llama_model_get_vocab(model);
+ vocab = llama_model_get_vocab(model);
+ n_vocab = llama_vocab_n_tokens(vocab);
+ fprintf(stderr, "Vocabulary size: %d\n", n_vocab);

 return true;
 }

 bool setup(const char * model_path, std::vector & configs) {
 if (model == nullptr) {
- setup_model(model_path);
+ load_model(model_path);
 }

- if (model != nullptr && ctx != nullptr) {
+ if (ctx != nullptr) {
 return true;
 }

@@ -73,10 +75,6 @@ struct test_model_context {
 }
 llama_set_warmup(ctx, false);

- vocab = llama_model_get_vocab(model);
- n_vocab = llama_vocab_n_tokens(vocab);
- fprintf(stderr, "Vocabulary size: %d\n", n_vocab);
-
 return true;
 }

@@ -130,15 +128,15 @@ struct test_model_context {

 printf("Batch contents:\n");
- printf(" n_tokens: %d\n", batch.n_tokens);
+ printf("n_tokens: %d\n", batch.n_tokens);
 for (int i = 0; i < batch.n_tokens; i++) {
- printf(" token[%d]: tok=%-5d, pos=%d, n_seq_id=%d, seq_ids=[", i, batch.token[i], batch.pos[i], batch.n_seq_id[i]);
+ printf("token[%d]: tok=%-5d, pos=%d, n_seq_id=%d, seq_ids=[", i, batch.token[i], batch.pos[i], batch.n_seq_id[i]);
- for (int j = 0; j < batch.n_seq_id[i]; j++) {
- printf("%d%s", batch.seq_id[i][j], j < batch.n_seq_id[i]-1 ? ", " : "");
+ for (int j = 0; j < batch.n_seq_id[i]; j++) {
+ printf("%d%s", batch.seq_id[i][j], j < batch.n_seq_id[i]-1 ? ", " : "");
+ }
+ printf("], logits=%d\n", batch.logits[i]);
 }
- printf("], logits=%d\n", batch.logits[i]);
-}

 if (llama_decode(ctx, batch) != 0) {
 fprintf(stderr, "Warning: llama_decode failed\n");
@@ -151,7 +149,6 @@ struct test_model_context {
 if (batch.logits[i]) {
 llama_seq_id seq_id = batch.seq_id[i][0];
 last_batch_info[seq_id] = i;
- printf("seq %d : batch idx %d\n", seq_id, i);
 }
 }

@@ -249,10 +246,15 @@ struct test_model_context {
 }

 void cleanup() {
- if (ctx) llama_free(ctx);
- if (model) llama_model_free(model);
+ if (ctx) {
+ llama_free(ctx);
+ }
+ if (model) {
+ llama_model_free(model);
+ }
 llama_backend_free();
- ctx = nullptr;
+
+ ctx = nullptr;
 model = nullptr;
 vocab = nullptr;
 }
@@ -374,36 +376,44 @@ static void test_backend_temp_sampling(const char * model_path) {
 return;
 }

- int32_t batch_idx_0 = test_ctx.idx_for_seq(0);
- int32_t batch_idx_1 = test_ctx.idx_for_seq(1);
+ // Verify sequence 0
+ {
+ int32_t batch_idx = test_ctx.idx_for_seq(0);
+ int n_logits = llama_get_backend_sampled_logits_count_ith(test_ctx.ctx, batch_idx);
+ GGML_ASSERT(n_logits == test_ctx.n_vocab);

- int n_logits = llama_get_backend_sampled_logits_count_ith(test_ctx.ctx, batch_idx_0);
- GGML_ASSERT(n_logits == test_ctx.n_vocab);
+ // Sample from sequence 0 using CPU sampler
+ struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+ struct llama_sampler * chain = llama_sampler_chain_init(chain_params);
+ llama_sampler_chain_add(chain, llama_sampler_init_dist(18));

- // Sample from sequence 0 using CPU sampler
- struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
- struct llama_sampler * chain_0 = llama_sampler_chain_init(chain_params_0);
- llama_sampler_chain_add(chain_0, llama_sampler_init_dist(18));
+ llama_token token = llama_sampler_sample(chain, test_ctx.ctx, batch_idx);
+ const std::string token_str = test_ctx.token_to_piece(token, false);
+ printf("Sequence 0 sampled token id:%d, string: '%s'\n", token, token_str.c_str());
+ GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);

- llama_token token_0 = llama_sampler_sample(chain_0, test_ctx.ctx, batch_idx_0);
- const std::string token_0_str = test_ctx.token_to_piece(token_0, false);
- printf("Sequence 0 sampled token id:%d, string: '%s'\n", token_0, token_0_str.c_str());
- GGML_ASSERT(token_0 >= 0 && token_0 < test_ctx.n_vocab);
+ llama_sampler_free(chain);
+ }

- // Sample from sequence 1 using CPU sampler
- struct llama_sampler_chain_params chain_params_1 = llama_sampler_chain_default_params();
- struct llama_sampler * chain_1 = llama_sampler_chain_init(chain_params_1);
- llama_sampler_chain_add(chain_1, llama_sampler_init_dist(18));
+ // Verify sequence 1
+ {
+ int32_t batch_idx = test_ctx.idx_for_seq(1);

- llama_token token_1 = llama_sampler_sample(chain_1, test_ctx.ctx, batch_idx_1);
- const std::string token_1_str = test_ctx.token_to_piece(token_1, false);
- printf("Sequence 1 sampled token id:%d, string: '%s'\n", token_1, token_1_str.c_str());
- GGML_ASSERT(token_1 >= 0 && token_1 < test_ctx.n_vocab);
+ // Sample from sequence 1 using CPU sampler
+ struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+ struct llama_sampler * chain = llama_sampler_chain_init(chain_params);
+ llama_sampler_chain_add(chain, llama_sampler_init_dist(18));
+
+ llama_token token = llama_sampler_sample(chain, test_ctx.ctx, batch_idx);
+ const std::string token_str = test_ctx.token_to_piece(token, false);
+ printf("Sequence 1 sampled token id:%d, string: '%s'\n", token, token_str.c_str());
+ GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+
+ llama_sampler_free(chain);
+ }

 printf("backend temp sampling test PASSED\n");

- llama_sampler_free(chain_0);
- llama_sampler_free(chain_1);
 }

 static void test_backend_multi_sequence_sampling(const char * model_path) {
@@ -436,17 +446,23 @@ static void test_backend_multi_sequence_sampling(const char * model_path) {
 return;
 }

- int32_t batch_idx_0 = test_ctx.idx_for_seq(0);
- llama_token seq0_token = llama_get_backend_sampled_token_ith(test_ctx.ctx, batch_idx_0);
- const std::string seq0_token_str = test_ctx.token_to_piece(seq0_token, false);
- printf("Seq 0 sampled token id=%d, string='%s'\n", seq0_token, seq0_token_str.c_str());
- GGML_ASSERT(seq0_token >= 0 && seq0_token < test_ctx.n_vocab);
+ // Verify sequence 0
+ {
+ int32_t batch_idx = test_ctx.idx_for_seq(0);
+ llama_token token = llama_get_backend_sampled_token_ith(test_ctx.ctx, batch_idx);
+ const std::string token_str = test_ctx.token_to_piece(token, false);
+ printf("Seq 0 sampled token id=%d, string='%s'\n", token, token_str.c_str());
+ GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+ }

- int32_t batch_idx_1 = test_ctx.idx_for_seq(1);
- llama_token seq1_token = llama_get_backend_sampled_token_ith(test_ctx.ctx, batch_idx_1);
- const std::string seq1_token_str = test_ctx.token_to_piece(seq1_token, false);
- printf("Seq 1 sampled token id=%d, string='%s'\n", seq1_token, seq1_token_str.c_str());
- GGML_ASSERT(seq1_token >= 0 && seq1_token < test_ctx.n_vocab);
+ // Verify sequence 1
+ {
+ int32_t batch_idx = test_ctx.idx_for_seq(1);
+ llama_token token = llama_get_backend_sampled_token_ith(test_ctx.ctx, batch_idx);
+ const std::string token_str = test_ctx.token_to_piece(token, false);
+ printf("Seq 1 sampled token id=%d, string='%s'\n", token, token_str.c_str());
+ GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+ }

 // Generate tokens for each sequence
 printf("\nMulti-sequence generation:\n");
@@ -473,26 +489,29 @@ static void test_backend_dist_sampling(const char * model_path) {
 test_model_context test_ctx;

+ const int seq_id = 189;
 const int32_t seed = 88;

 struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
 struct llama_sampler * backend_sampler_chain = llama_sampler_chain_init(backend_chain_params);
 llama_sampler_chain_add(backend_sampler_chain, llama_sampler_backend_init_dist(seed));

- std::vector backend_sampler_configs = {{ 0, backend_sampler_chain }};
+ std::vector backend_sampler_configs = {{ seq_id, backend_sampler_chain }};

 if (!test_ctx.setup(model_path, backend_sampler_configs)) {
 return;
 }

- if (!test_ctx.decode({{0, "Hello"}})) {
+ if (!test_ctx.decode({{seq_id, "Some"}})) {
 return;
 }

- llama_token token = llama_get_backend_sampled_token_ith(test_ctx.ctx, test_ctx.idx_for_seq(0));
- printf("greedy sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
+ int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
+ llama_token token = llama_get_backend_sampled_token_ith(test_ctx.ctx, batch_idx);
+ printf("dist sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
 GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+ GGML_ASSERT(llama_get_backend_sampled_logits_ith(test_ctx.ctx, batch_idx) == nullptr);

 token = llama_get_backend_sampled_token_ith(test_ctx.ctx, -1);
- printf("greedy sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
+ printf("dist sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
 GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
 }
@@ -510,7 +529,7 @@ static void test_backend_dist_sampling_and_cpu(const char * model_path) {
 return;
 }

- if (!test_ctx.decode({{seq_id, "Hello"}})) {
+ if (!test_ctx.decode({{seq_id, "Some"}})) {
 return;
 }

@@ -523,14 +542,15 @@ static void test_backend_dist_sampling_and_cpu(const char * model_path) {

 llama_token backend_token = llama_get_backend_sampled_token_ith(test_ctx.ctx, batch_idx);
 llama_token cpu_token = llama_sampler_sample(chain, test_ctx.ctx, batch_idx);
+ printf("dist & cpu sampled id:%d, string:'%s'\n", cpu_token, test_ctx.token_to_piece(cpu_token, false).c_str());

 GGML_ASSERT(backend_token == cpu_token);
 }

 static void test_backend_logit_bias_sampling(const char * model_path) {
 test_model_context test_ctx;

- // Calling setup_model to ensure vocab is loaded and can be accessed
- if (!test_ctx.setup_model(model_path)) {
+ // Calling load_model to ensure vocab is loaded and can be accessed
+ if (!test_ctx.load_model(model_path)) {
 return;
 }

@@ -616,7 +636,7 @@ static void test_backend_mixed_sampling(const char * model_path) {
 GGML_ASSERT(llama_get_backend_sampled_logits_count_ith(test_ctx.ctx, batch_idx) == 0);
 }

- // Verfiy sequence 0 that used the top-k backend sampler.
+ // Verify sequence 1 that used the top-k backend sampler.
 {
 int32_t batch_idx = test_ctx.idx_for_seq(1);
 float * logits = llama_get_backend_sampled_logits_ith(test_ctx.ctx, batch_idx);
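
Editorial note (not part of the patch): the flow these tests exercise is to register a backend sampler chain for a sequence before setup(), decode a prompt, and then read the backend-sampled result back with llama_get_backend_sampled_token_ith() / llama_get_backend_sampled_logits_ith(). Below is a minimal sketch of the check that test_backend_dist_sampling_and_cpu performs inline, assuming a test_model_context that has already been set up and decoded as in the diff above; the helper name and its parameters are illustrative only, and only calls that appear in the diff are used.

    // Illustrative sketch: compare the token the backend sampler chose during
    // llama_decode against a CPU re-sample of the same batch position with the
    // same seed (mirrors test_backend_dist_sampling_and_cpu).
    static void check_backend_matches_cpu(test_model_context & test_ctx, llama_seq_id seq_id, int32_t seed) {
        const int32_t batch_idx = test_ctx.idx_for_seq(seq_id);

        // Token already chosen by the backend sampler chain during decode.
        const llama_token backend_token = llama_get_backend_sampled_token_ith(test_ctx.ctx, batch_idx);

        // Sample the same position again on the CPU with an equivalent chain.
        struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));
        const llama_token cpu_token = llama_sampler_sample(chain, test_ctx.ctx, batch_idx);

        GGML_ASSERT(backend_token == cpu_token);
        llama_sampler_free(chain);
    }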