From cdd47abd83b748e0ed0de0d85deaa7aa6d102384 Mon Sep 17 00:00:00 2001 From: Nic Boet Date: Thu, 1 Jan 2026 15:00:26 -0600 Subject: [PATCH] llama: fix loop index types and remove unreachable loop Normalize loop index types across batch, graph, and KV code paths to match the width of their bounds (e.g. n_tokens, n_rs, n_seq_id, n_expert_used). This also removes an unreachable loop with identical start/end conditions GCC emited the following warning when building with optimizations: llama-graph.cpp:473:9: warning: iteration 2147483645 invokes undefined behavior [-Waggressive-loop-optimizations] Signed-off-by: Nic Boet --- src/llama-batch.cpp | 12 ++++----- src/llama-graph.cpp | 60 +++++++++++++++++++----------------------- src/llama-kv-cache.cpp | 4 +-- 3 files changed, 35 insertions(+), 41 deletions(-) diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 386fab04ac..8facd21e72 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -720,7 +720,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u udata->n_seq_id[i] = batch.n_seq_id[idxs[i]]; udata->output[i] = batch.logits[idxs[i]]; - for (int s = 0; s < udata->n_seq_id[i]; ++s) { + for (int32_t s = 0; s < udata->n_seq_id[i]; ++s) { const llama_seq_id seq_id = batch.seq_id[idxs[i]][s]; udata->seq_id_data.push_back(seq_id); @@ -815,8 +815,8 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) { if (debug > 1) { int seq_id_max = 0; for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { - for (int s = 0; s < ubatch.n_seq_id[i]; ++s) { - for (int s = 0; s < ubatch.n_seq_id[i]; ++s) { + for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) { + for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) { seq_id_max = std::max(seq_id_max, ubatch.seq_id[i][s]); } } @@ -827,7 +827,7 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) { for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { std::vector seq_id(seq_id_max); - for (int s = 0; s < ubatch.n_seq_id[i]; ++s) { + for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) { seq_id[ubatch.seq_id[i][s]] = 1; } @@ -892,7 +892,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc); batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc); batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1)); - for (int i = 0; i < n_tokens_alloc; ++i) { + for (int32_t i = 0; i < n_tokens_alloc; ++i) { batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max); } batch.seq_id[n_tokens_alloc] = nullptr; @@ -908,7 +908,7 @@ void llama_batch_free(struct llama_batch batch) { if (batch.pos) free(batch.pos); if (batch.n_seq_id) free(batch.n_seq_id); if (batch.seq_id) { - for (int i = 0; batch.seq_id[i] != nullptr; ++i) { + for (int32_t i = 0; batch.seq_id[i] != nullptr; ++i) { free(batch.seq_id[i]); } free(batch.seq_id); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 1d0d7197e1..6dd38e4a2a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -46,7 +46,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { // the 3 first dims are the same, and 4th dim is all 0 std::vector pos_data(n_tokens*n_pos_per_embd); // copy the first dimension - for (int i = 0; i < n_tokens; ++i) { + for (int64_t i = 0; i < n_tokens; ++i) { pos_data[ i] = ubatch->pos[i]; pos_data[ n_tokens + i] = ubatch->pos[i]; pos_data[2 * n_tokens + i] = ubatch->pos[i]; @@ -75,7 +75,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(n_attn_temp_floor_scale != 0); std::vector attn_scale_data(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { + for (int64_t i = 0; i < n_tokens; ++i) { const float pos = ubatch->pos[i]; attn_scale_data[i] = std::log( std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0 @@ -96,8 +96,8 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { int32_t * data = (int32_t *) pos_bucket->data; for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_tokens; ++i) { + for (int64_t j = 0; j < n_tokens; ++j) { + for (int64_t i = 0; i < n_tokens; ++i) { data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true); } } @@ -120,7 +120,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { int32_t * data = (int32_t *) out_ids->data; if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { + for (int64_t i = 0; i < n_tokens; ++i) { data[i] = i; } @@ -131,7 +131,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { int n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { + for (int64_t i = 0; i < n_tokens; ++i) { if (ubatch->output[i]) { data[n_outputs++] = i; } @@ -159,8 +159,8 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) { memset(mean->data, 0, n_tokens*n_seqs_unq*ggml_element_size(mean)); std::vector sums(n_seqs_unq, 0); - for (int i = 0; i < n_tokens; i += n_seq_tokens) { - for (int s = 0; s < ubatch->n_seq_id[i]; ++s) { + for (int64_t i = 0; i < n_tokens; i += n_seq_tokens) { + for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) { const llama_seq_id seq_id = ubatch->seq_id[i][s]; const int32_t seq_idx = ubatch->seq_idx[seq_id]; @@ -169,19 +169,19 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) { } std::vector div(n_seqs_unq, 0.0f); - for (int s = 0; s < n_seqs_unq; ++s) { + for (int64_t s = 0; s < n_seqs_unq; ++s) { const uint64_t sum = sums[s]; if (sum > 0) { div[s] = 1.0f/float(sum); } } - for (int i = 0; i < n_tokens; i += n_seq_tokens) { - for (int s = 0; s < ubatch->n_seq_id[i]; ++s) { + for (int64_t i = 0; i < n_tokens; i += n_seq_tokens) { + for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) { const llama_seq_id seq_id = ubatch->seq_id[i][s]; const int32_t seq_idx = ubatch->seq_idx[seq_id]; - for (int j = 0; j < n_seq_tokens; ++j) { + for (int64_t j = 0; j < n_seq_tokens; ++j) { data[seq_idx*n_tokens + i + j] = div[seq_idx]; } } @@ -212,10 +212,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token ); - for (int i = 0; i < n_tokens; ++i) { + for (int64_t i = 0; i < n_tokens; ++i) { const llama_pos pos = ubatch->pos[i]; - for (int s = 0; s < ubatch->n_seq_id[i]; ++s) { + for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) { const llama_seq_id seq_id = ubatch->seq_id[i][s]; const int32_t seq_idx = ubatch->seq_idx[seq_id]; @@ -230,7 +230,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { } } - for (int s = 0; s < n_seqs_unq; ++s) { + for (int64_t s = 0; s < n_seqs_unq; ++s) { if (target_row[s] >= 0) { data[s] = target_row[s]; } @@ -248,7 +248,7 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) { int32_t * data = (int32_t *) s_copy->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n - for (uint32_t i = 0; i < n_rs; ++i) { + for (int64_t i = 0; i < n_rs; ++i) { data[i] = mctx->s_copy(i); } } @@ -298,14 +298,14 @@ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64 LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__); LLAMA_LOG_DEBUG(" "); - for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) { + for (int64_t j = 0; j < std::min((int64_t)20, n_kv); ++j) { LLAMA_LOG_DEBUG("%2d", j); } LLAMA_LOG_DEBUG("\n"); - for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) { + for (int64_t i = 0; i < std::min((int64_t)20, n_tokens); ++i) { LLAMA_LOG_DEBUG(" %2d ", i); - for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) { + for (int64_t j = 0; j < std::min((int64_t)20, n_kv); ++j) { float val = data[i * n_kv + j]; if (val == -INFINITY) { LLAMA_LOG_DEBUG(" ∞"); @@ -323,13 +323,13 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) { for (int h = 0; h < 1; ++h) { - for (int i1 = 0; i1 < n_tokens; ++i1) { + for (int64_t i1 = 0; i1 < n_tokens; ++i1) { const llama_seq_id s1 = ubatch->seq_id[i1][0]; const llama_pos p1 = ubatch->pos[i1]; const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv; - for (int i0 = 0; i0 < n_tokens; ++i0) { + for (int64_t i0 = 0; i0 < n_tokens; ++i0) { const llama_seq_id s0 = ubatch->seq_id[i0][0]; const llama_pos p0 = ubatch->pos[i0]; @@ -454,11 +454,11 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { float * data = (float *) cross_kq_mask->data; for (int h = 0; h < 1; ++h) { - for (int i = 0; i < n_tokens; ++i) { - for (int j = 0; j < n_enc; ++j) { + for (int64_t i = 0; i < n_tokens; ++i) { + for (int64_t j = 0; j < n_enc; ++j) { float f = -INFINITY; - for (int s = 0; s < ubatch->n_seq_id[i]; ++s) { + for (int32_t s = 0; s < ubatch->n_seq_id[i]; ++s) { const llama_seq_id seq_id = ubatch->seq_id[i][s]; if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) { @@ -469,12 +469,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { data[h*(n_enc*n_tokens) + i*n_enc + j] = f; } } - - for (int i = n_tokens; i < n_tokens; ++i) { - for (int j = 0; j < n_enc; ++j) { - data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY; - } - } } } @@ -491,7 +485,7 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) { int32_t * data = (int32_t *) inp_rs->s_copy->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n - for (uint32_t i = 0; i < n_rs; ++i) { + for (int64_t i = 0; i < n_rs; ++i) { data[i] = mctx->get_recr()->s_copy(i); } } @@ -1176,7 +1170,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( assert(n_expert_used > 0); // order the views before the adds - for (uint32_t i = 0; i < hparams.n_expert_used; ++i) { + for (int64_t i = 0; i < hparams.n_expert_used; ++i) { cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]); ggml_build_forward_expand(gf, cur_experts[i]); @@ -1188,7 +1182,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( // ref: https://github.com/ggml-org/llama.cpp/pull/14753 ggml_tensor * moe_out = cur_experts[0]; - for (uint32_t i = 1; i < hparams.n_expert_used; ++i) { + for (int64_t i = 1; i < hparams.n_expert_used; ++i) { moe_out = ggml_add(ctx0, moe_out, cur_experts[i]); } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 3186242d60..403513ab0f 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1335,8 +1335,8 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch const int32_t n_kv = dst->ne[0]; for (int h = 0; h < 1; ++h) { - for (int i = 0; i < n_tokens; ++i) { - for (int j = 0; j < n_kv; ++j) { + for (int64_t i = 0; i < n_tokens; ++i) { + for (int32_t j = 0; j < n_kv; ++j) { // the position when the cells is empty is irrelevant - it will be masked out later in the attention const llama_pos p0 = cells.is_empty(j) ? -1 : cells.pos_get(j);