llama: fix loop index types and remove unreachable loop

Normalize loop index types across batch, graph, and KV code paths to
match the width of their bounds (e.g. n_tokens, n_rs,
n_seq_id, n_expert_used).

This also removes an unreachable loop with identical start/end
conditions.
GCC emitted the following warning when building with optimizations:

  llama-graph.cpp:473:9: warning: iteration 2147483645 invokes undefined
  behavior [-Waggressive-loop-optimizations]

Signed-off-by: Nic Boet <nic@boet.cc>
This commit is contained in:
Nic Boet 2026-01-01 15:00:26 -06:00
parent ced765be44
commit cdd47abd83
3 changed files with 35 additions and 41 deletions

View File

@ -720,7 +720,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
udata->output[i] = batch.logits[idxs[i]];
for (int s = 0; s < udata->n_seq_id[i]; ++s) {
for (int32_t s = 0; s < udata->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
udata->seq_id_data.push_back(seq_id);
@ -815,8 +815,8 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
if (debug > 1) {
int seq_id_max = 0;
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) {
for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) {
seq_id_max = std::max(seq_id_max, ubatch.seq_id[i][s]);
}
}
@ -827,7 +827,7 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
std::vector<int8_t> seq_id(seq_id_max);
for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) {
seq_id[ubatch.seq_id[i][s]] = 1;
}
@ -892,7 +892,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
for (int i = 0; i < n_tokens_alloc; ++i) {
for (int32_t i = 0; i < n_tokens_alloc; ++i) {
batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
}
batch.seq_id[n_tokens_alloc] = nullptr;
@ -908,7 +908,7 @@ void llama_batch_free(struct llama_batch batch) {
if (batch.pos) free(batch.pos);
if (batch.n_seq_id) free(batch.n_seq_id);
if (batch.seq_id) {
for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
for (int32_t i = 0; batch.seq_id[i] != nullptr; ++i) {
free(batch.seq_id[i]);
}
free(batch.seq_id);

View File

@ -46,7 +46,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
// the 3 first dims are the same, and 4th dim is all 0
std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
// copy the first dimension
for (int i = 0; i < n_tokens; ++i) {
for (int64_t i = 0; i < n_tokens; ++i) {
pos_data[ i] = ubatch->pos[i];
pos_data[ n_tokens + i] = ubatch->pos[i];
pos_data[2 * n_tokens + i] = ubatch->pos[i];
@ -75,7 +75,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
GGML_ASSERT(n_attn_temp_floor_scale != 0);
std::vector<float> attn_scale_data(n_tokens, 0.0f);
for (int i = 0; i < n_tokens; ++i) {
for (int64_t i = 0; i < n_tokens; ++i) {
const float pos = ubatch->pos[i];
attn_scale_data[i] = std::log(
std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
@ -96,8 +96,8 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
int32_t * data = (int32_t *) pos_bucket->data;
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
for (int i = 0; i < n_tokens; ++i) {
for (int64_t j = 0; j < n_tokens; ++j) {
for (int64_t i = 0; i < n_tokens; ++i) {
data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
}
}
@ -120,7 +120,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
int32_t * data = (int32_t *) out_ids->data;
if (n_outputs == n_tokens) {
for (int i = 0; i < n_tokens; ++i) {
for (int64_t i = 0; i < n_tokens; ++i) {
data[i] = i;
}
@ -131,7 +131,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
int n_outputs = 0;
for (int i = 0; i < n_tokens; ++i) {
for (int64_t i = 0; i < n_tokens; ++i) {
if (ubatch->output[i]) {
data[n_outputs++] = i;
}
@ -159,8 +159,8 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
memset(mean->data, 0, n_tokens*n_seqs_unq*ggml_element_size(mean));
std::vector<uint64_t> sums(n_seqs_unq, 0);
for (int i = 0; i < n_tokens; i += n_seq_tokens) {
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
for (int64_t i = 0; i < n_tokens; i += n_seq_tokens) {
for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[i][s];
const int32_t seq_idx = ubatch->seq_idx[seq_id];
@ -169,19 +169,19 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
}
std::vector<float> div(n_seqs_unq, 0.0f);
for (int s = 0; s < n_seqs_unq; ++s) {
for (int64_t s = 0; s < n_seqs_unq; ++s) {
const uint64_t sum = sums[s];
if (sum > 0) {
div[s] = 1.0f/float(sum);
}
}
for (int i = 0; i < n_tokens; i += n_seq_tokens) {
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
for (int64_t i = 0; i < n_tokens; i += n_seq_tokens) {
for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[i][s];
const int32_t seq_idx = ubatch->seq_idx[seq_id];
for (int j = 0; j < n_seq_tokens; ++j) {
for (int64_t j = 0; j < n_seq_tokens; ++j) {
data[seq_idx*n_tokens + i + j] = div[seq_idx];
}
}
@ -212,10 +212,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
(cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
);
for (int i = 0; i < n_tokens; ++i) {
for (int64_t i = 0; i < n_tokens; ++i) {
const llama_pos pos = ubatch->pos[i];
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[i][s];
const int32_t seq_idx = ubatch->seq_idx[seq_id];
@ -230,7 +230,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
}
}
for (int s = 0; s < n_seqs_unq; ++s) {
for (int64_t s = 0; s < n_seqs_unq; ++s) {
if (target_row[s] >= 0) {
data[s] = target_row[s];
}
@ -248,7 +248,7 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
int32_t * data = (int32_t *) s_copy->data;
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for (uint32_t i = 0; i < n_rs; ++i) {
for (int64_t i = 0; i < n_rs; ++i) {
data[i] = mctx->s_copy(i);
}
}
@ -298,14 +298,14 @@ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64
LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
LLAMA_LOG_DEBUG(" ");
for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
for (int64_t j = 0; j < std::min((int64_t)20, n_kv); ++j) {
LLAMA_LOG_DEBUG("%2d", j);
}
LLAMA_LOG_DEBUG("\n");
for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
for (int64_t i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
LLAMA_LOG_DEBUG(" %2d ", i);
for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
for (int64_t j = 0; j < std::min((int64_t)20, n_kv); ++j) {
float val = data[i * n_kv + j];
if (val == -INFINITY) {
LLAMA_LOG_DEBUG("");
@ -323,13 +323,13 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
for (int h = 0; h < 1; ++h) {
for (int i1 = 0; i1 < n_tokens; ++i1) {
for (int64_t i1 = 0; i1 < n_tokens; ++i1) {
const llama_seq_id s1 = ubatch->seq_id[i1][0];
const llama_pos p1 = ubatch->pos[i1];
const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
for (int i0 = 0; i0 < n_tokens; ++i0) {
for (int64_t i0 = 0; i0 < n_tokens; ++i0) {
const llama_seq_id s0 = ubatch->seq_id[i0][0];
const llama_pos p0 = ubatch->pos[i0];
@ -454,11 +454,11 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
float * data = (float *) cross_kq_mask->data;
for (int h = 0; h < 1; ++h) {
for (int i = 0; i < n_tokens; ++i) {
for (int j = 0; j < n_enc; ++j) {
for (int64_t i = 0; i < n_tokens; ++i) {
for (int64_t j = 0; j < n_enc; ++j) {
float f = -INFINITY;
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
for (int32_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[i][s];
if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
@ -469,12 +469,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
data[h*(n_enc*n_tokens) + i*n_enc + j] = f;
}
}
for (int i = n_tokens; i < n_tokens; ++i) {
for (int j = 0; j < n_enc; ++j) {
data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
}
}
}
}
@ -491,7 +485,7 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
int32_t * data = (int32_t *) inp_rs->s_copy->data;
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for (uint32_t i = 0; i < n_rs; ++i) {
for (int64_t i = 0; i < n_rs; ++i) {
data[i] = mctx->get_recr()->s_copy(i);
}
}
@ -1176,7 +1170,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
assert(n_expert_used > 0);
// order the views before the adds
for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
for (int64_t i = 0; i < hparams.n_expert_used; ++i) {
cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
ggml_build_forward_expand(gf, cur_experts[i]);
@ -1188,7 +1182,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// ref: https://github.com/ggml-org/llama.cpp/pull/14753
ggml_tensor * moe_out = cur_experts[0];
for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
for (int64_t i = 1; i < hparams.n_expert_used; ++i) {
moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
}

View File

@ -1335,8 +1335,8 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
const int32_t n_kv = dst->ne[0];
for (int h = 0; h < 1; ++h) {
for (int i = 0; i < n_tokens; ++i) {
for (int j = 0; j < n_kv; ++j) {
for (int64_t i = 0; i < n_tokens; ++i) {
for (int32_t j = 0; j < n_kv; ++j) {
// the position when the cells is empty is irrelevant - it will be masked out later in the attention
const llama_pos p0 = cells.is_empty(j) ? -1 : cells.pos_get(j);