From cdd47abd83b748e0ed0de0d85deaa7aa6d102384 Mon Sep 17 00:00:00 2001
From: Nic Boet <nic@boet.cc>
Date: Thu, 1 Jan 2026 15:00:26 -0600
Subject: [PATCH] llama: fix loop index types and remove unreachable loop

Normalize loop index types across batch, graph, and KV code paths to
match the width of their bounds (e.g. n_tokens, n_rs,
n_seq_id, n_expert_used).

This also removes an unreachable loop with identical start/end
conditions
GCC emited the following warning when building with optimizations:

  llama-graph.cpp:473:9: warning: iteration 2147483645 invokes undefined
  behavior [-Waggressive-loop-optimizations]

Signed-off-by: Nic Boet <nic@boet.cc>
---
 src/llama-batch.cpp    | 12 ++++-----
 src/llama-graph.cpp    | 60 +++++++++++++++++++-----------------------
 src/llama-kv-cache.cpp |  4 +--
 3 files changed, 35 insertions(+), 41 deletions(-)

diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 386fab04ac..8facd21e72 100644
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -720,7 +720,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
         udata->output[i]   = batch.logits[idxs[i]];
 
-        for (int s = 0; s < udata->n_seq_id[i]; ++s) {
+        for (int32_t s = 0; s < udata->n_seq_id[i]; ++s) {
             const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
 
             udata->seq_id_data.push_back(seq_id);
@@ -815,8 +815,8 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
         if (debug > 1) {
             int seq_id_max = 0;
             for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
-                for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
-                    for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
+                for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) {
+                    for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) {
                         seq_id_max = std::max(seq_id_max, ubatch.seq_id[i][s]);
                     }
                 }
@@ -827,7 +827,7 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
             for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
                 std::vector<int8_t> seq_id(seq_id_max);
 
-                for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
+                for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) {
                     seq_id[ubatch.seq_id[i][s]] = 1;
                 }
 
@@ -892,7 +892,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
     batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens_alloc);
     batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens_alloc);
     batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
-    for (int i = 0; i < n_tokens_alloc; ++i) {
+    for (int32_t i = 0; i < n_tokens_alloc; ++i) {
         batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
     }
     batch.seq_id[n_tokens_alloc] = nullptr;
@@ -908,7 +908,7 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.pos)      free(batch.pos);
     if (batch.n_seq_id) free(batch.n_seq_id);
     if (batch.seq_id) {
-        for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
+        for (int32_t i = 0; batch.seq_id[i] != nullptr; ++i) {
             free(batch.seq_id[i]);
         }
         free(batch.seq_id);
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 1d0d7197e1..6dd38e4a2a 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -46,7 +46,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
             // the 3 first dims are the same, and 4th dim is all 0
             std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
             // copy the first dimension
-            for (int i = 0; i < n_tokens; ++i) {
+            for (int64_t i = 0; i < n_tokens; ++i) {
                 pos_data[               i] = ubatch->pos[i];
                 pos_data[    n_tokens + i] = ubatch->pos[i];
                 pos_data[2 * n_tokens + i] = ubatch->pos[i];
@@ -75,7 +75,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
         GGML_ASSERT(n_attn_temp_floor_scale != 0);
 
         std::vector<float> attn_scale_data(n_tokens, 0.0f);
-        for (int i = 0; i < n_tokens; ++i) {
+        for (int64_t i = 0; i < n_tokens; ++i) {
             const float pos = ubatch->pos[i];
             attn_scale_data[i] = std::log(
                 std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
@@ -96,8 +96,8 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
         int32_t * data = (int32_t *) pos_bucket->data;
 
         for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_tokens; ++i) {
+            for (int64_t j = 0; j < n_tokens; ++j) {
+                for (int64_t i = 0; i < n_tokens; ++i) {
                     data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
                 }
             }
@@ -120,7 +120,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
     int32_t * data = (int32_t *) out_ids->data;
 
     if (n_outputs == n_tokens) {
-        for (int i = 0; i < n_tokens; ++i) {
+        for (int64_t i = 0; i < n_tokens; ++i) {
             data[i] = i;
         }
 
@@ -131,7 +131,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
 
     int n_outputs = 0;
 
-    for (int i = 0; i < n_tokens; ++i) {
+    for (int64_t i = 0; i < n_tokens; ++i) {
         if (ubatch->output[i]) {
             data[n_outputs++] = i;
         }
@@ -159,8 +159,8 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
         memset(mean->data, 0, n_tokens*n_seqs_unq*ggml_element_size(mean));
 
         std::vector<uint64_t> sums(n_seqs_unq, 0);
-        for (int i = 0; i < n_tokens; i += n_seq_tokens) {
-            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+        for (int64_t i = 0; i < n_tokens; i += n_seq_tokens) {
+            for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
                 const llama_seq_id seq_id  = ubatch->seq_id[i][s];
                 const int32_t      seq_idx = ubatch->seq_idx[seq_id];
 
@@ -169,19 +169,19 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
         }
 
         std::vector<float> div(n_seqs_unq, 0.0f);
-        for (int s = 0; s < n_seqs_unq; ++s) {
+        for (int64_t s = 0; s < n_seqs_unq; ++s) {
             const uint64_t sum = sums[s];
             if (sum > 0) {
                 div[s] = 1.0f/float(sum);
             }
         }
 
-        for (int i = 0; i < n_tokens; i += n_seq_tokens) {
-            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+        for (int64_t i = 0; i < n_tokens; i += n_seq_tokens) {
+            for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
                 const llama_seq_id seq_id  = ubatch->seq_id[i][s];
                 const int32_t      seq_idx = ubatch->seq_idx[seq_id];
 
-                for (int j = 0; j < n_seq_tokens; ++j) {
+                for (int64_t j = 0; j < n_seq_tokens; ++j) {
                     data[seq_idx*n_tokens + i + j] = div[seq_idx];
                 }
             }
@@ -212,10 +212,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
             (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
         );
 
-        for (int i = 0; i < n_tokens; ++i) {
+        for (int64_t i = 0; i < n_tokens; ++i) {
             const llama_pos pos = ubatch->pos[i];
 
-            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+            for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
                 const llama_seq_id seq_id  = ubatch->seq_id[i][s];
                 const int32_t      seq_idx = ubatch->seq_idx[seq_id];
 
@@ -230,7 +230,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
             }
         }
 
-        for (int s = 0; s < n_seqs_unq; ++s) {
+        for (int64_t s = 0; s < n_seqs_unq; ++s) {
             if (target_row[s] >= 0) {
                 data[s] = target_row[s];
             }
@@ -248,7 +248,7 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
         int32_t * data = (int32_t *) s_copy->data;
 
         // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
-        for (uint32_t i = 0; i < n_rs; ++i) {
+        for (int64_t i = 0; i < n_rs; ++i) {
             data[i] = mctx->s_copy(i);
         }
     }
@@ -298,14 +298,14 @@ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64
     LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
 
     LLAMA_LOG_DEBUG("    ");
-    for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+    for (int64_t j = 0; j < std::min((int64_t)20, n_kv); ++j) {
         LLAMA_LOG_DEBUG("%2d", j);
     }
     LLAMA_LOG_DEBUG("\n");
 
-    for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
+    for (int64_t i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
         LLAMA_LOG_DEBUG(" %2d ", i);
-        for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+        for (int64_t j = 0; j < std::min((int64_t)20, n_kv); ++j) {
             float val = data[i * n_kv + j];
             if (val == -INFINITY) {
                 LLAMA_LOG_DEBUG(" ∞");
@@ -323,13 +323,13 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 
     const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
         for (int h = 0; h < 1; ++h) {
-            for (int i1 = 0; i1 < n_tokens; ++i1) {
+            for (int64_t i1 = 0; i1 < n_tokens; ++i1) {
                 const llama_seq_id s1 = ubatch->seq_id[i1][0];
                 const llama_pos    p1 = ubatch->pos[i1];
 
                 const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
 
-                for (int i0 = 0; i0 < n_tokens; ++i0) {
+                for (int64_t i0 = 0; i0 < n_tokens; ++i0) {
                     const llama_seq_id s0 = ubatch->seq_id[i0][0];
                     const llama_pos p0    = ubatch->pos[i0];
 
@@ -454,11 +454,11 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
     float * data = (float *) cross_kq_mask->data;
 
     for (int h = 0; h < 1; ++h) {
-        for (int i = 0; i < n_tokens; ++i) {
-            for (int j = 0; j < n_enc; ++j) {
+        for (int64_t i = 0; i < n_tokens; ++i) {
+            for (int64_t j = 0; j < n_enc; ++j) {
                 float f = -INFINITY;
 
-                for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+                for (int32_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
                     const llama_seq_id seq_id = ubatch->seq_id[i][s];
 
                     if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
@@ -469,12 +469,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
                 data[h*(n_enc*n_tokens) + i*n_enc + j] = f;
             }
         }
-
-        for (int i = n_tokens; i < n_tokens; ++i) {
-            for (int j = 0; j < n_enc; ++j) {
-                data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
-            }
-        }
     }
 }
 
@@ -491,7 +485,7 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
         int32_t * data = (int32_t *) inp_rs->s_copy->data;
 
         // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
-        for (uint32_t i = 0; i < n_rs; ++i) {
+        for (int64_t i = 0; i < n_rs; ++i) {
             data[i] = mctx->get_recr()->s_copy(i);
         }
     }
@@ -1176,7 +1170,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     assert(n_expert_used > 0);
 
     // order the views before the adds
-    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+    for (int64_t i = 0; i < hparams.n_expert_used; ++i) {
         cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
 
         ggml_build_forward_expand(gf, cur_experts[i]);
@@ -1188,7 +1182,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = cur_experts[0];
 
-    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+    for (int64_t i = 1; i < hparams.n_expert_used; ++i) {
         moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
     }
 
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 3186242d60..403513ab0f 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1335,8 +1335,8 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
     const int32_t n_kv = dst->ne[0];
 
     for (int h = 0; h < 1; ++h) {
-        for (int i = 0; i < n_tokens; ++i) {
-            for (int j = 0; j < n_kv; ++j) {
+        for (int64_t i = 0; i < n_tokens; ++i) {
+            for (int32_t j = 0; j < n_kv; ++j) {
                 // the position when the cells is empty is irrelevant - it will be masked out later in the attention
                 const llama_pos p0 = cells.is_empty(j) ? -1 : cells.pos_get(j);