From 38f408c2530871a24a96b318b13f89836e3b10a3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 19 Nov 2025 13:43:29 +0200
Subject: [PATCH] common : fix regression caused by extra memory allocations
 during sampling

---
 common/sampling.cpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index 9813762eca..bccec35dac 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -122,27 +122,24 @@ struct common_sampler {
         const int n_vocab = llama_vocab_n_tokens(vocab);
 
-        // Use the member variable instead of allocating locally
-        cur.clear();
-
         if (sampled_probs) {
             const uint32_t sampled_probs_count = llama_get_backend_sampled_probs_count_ith(ctx, idx);
-            cur.reserve(sampled_probs_count);
+            cur.resize(sampled_probs_count);
             for (uint32_t i = 0; i < sampled_probs_count; ++i) {
-                cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]});
+                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
             }
         } else if (sampled_logits) {
             const uint32_t sampled_logits_count = llama_get_backend_sampled_logits_count_ith(ctx, idx);
-            cur.reserve(sampled_logits_count);
+            cur.resize(sampled_logits_count);
             for (uint32_t i = 0; i < sampled_logits_count; i++) {
-                cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f});
+                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
             }
         } else {
             const auto * logits = llama_get_logits_ith(ctx, idx);
             GGML_ASSERT(logits != nullptr);
-            cur.reserve(n_vocab);
+            cur.resize(n_vocab);
             for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+                cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
             }
         }
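
Note (not part of the patch): below is a minimal, self-contained sketch of the pattern the fix adopts, namely reusing a member vector via resize() plus index assignment so that, once the vector has grown to its working size, repeated sampling calls perform no further heap allocations. The names token_data, sampler, and set_logits are hypothetical stand-ins for the types in common/sampling.cpp.

    // Hypothetical sketch of the resize-and-assign pattern from the patch.
    #include <cstdio>
    #include <vector>

    struct token_data {
        int   id;
        float logit;
        float p;
    };

    struct sampler {
        std::vector<token_data> cur; // member buffer, capacity retained across calls

        void set_logits(const float * logits, int n_vocab) {
            cur.resize(n_vocab); // allocates only if capacity < n_vocab
            for (int token_id = 0; token_id < n_vocab; token_id++) {
                cur[token_id] = token_data{token_id, logits[token_id], 0.0f};
            }
        }
    };

    int main() {
        const float logits[4] = {0.1f, 0.7f, 0.2f, 0.0f};

        sampler smpl;
        smpl.set_logits(logits, 4); // first call allocates
        smpl.set_logits(logits, 4); // subsequent calls reuse the buffer

        printf("top candidate logit: %f\n", smpl.cur[1].logit);
        return 0;
    }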