llama: fix loop index types and remove unreachable loop

Normalize loop index types across batch, graph, and KV code paths to
match the width of their bounds (e.g. n_tokens, n_rs,
n_seq_id, n_expert_used).

This also removes an unreachable loop with identical start/end
conditions.
GCC emitted the following warning when building with optimizations:

  llama-graph.cpp:473:9: warning: iteration 2147483645 invokes undefined
  behavior [-Waggressive-loop-optimizations]

Signed-off-by: Nic Boet <nic@boet.cc>
This commit is contained in:
Nic Boet 2026-01-01 15:00:26 -06:00
parent ced765be44
commit cdd47abd83
3 changed files with 35 additions and 41 deletions

View File

@ -720,7 +720,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
udata->output[i] = batch.logits[idxs[i]];
for (int s = 0; s < udata->n_seq_id[i]; ++s) {
for (int32_t s = 0; s < udata->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
udata->seq_id_data.push_back(seq_id);
@ -815,8 +815,8 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
if (debug > 1) {
int seq_id_max = 0;
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) {
for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) {
seq_id_max = std::max(seq_id_max, ubatch.seq_id[i][s]);
}
}
@ -827,7 +827,7 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
std::vector<int8_t> seq_id(seq_id_max);
for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
for (int32_t s = 0; s < ubatch.n_seq_id[i]; ++s) {
seq_id[ubatch.seq_id[i][s]] = 1;
}
@ -892,7 +892,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
for (int i = 0; i < n_tokens_alloc; ++i) {
for (int32_t i = 0; i < n_tokens_alloc; ++i) {
batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
}
batch.seq_id[n_tokens_alloc] = nullptr;
@ -908,7 +908,7 @@ void llama_batch_free(struct llama_batch batch) {
if (batch.pos) free(batch.pos);
if (batch.n_seq_id) free(batch.n_seq_id);
if (batch.seq_id) {
for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
for (int32_t i = 0; batch.seq_id[i] != nullptr; ++i) {
free(batch.seq_id[i]);
}
free(batch.seq_id);

View File

@ -46,7 +46,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
// the 3 first dims are the same, and 4th dim is all 0
std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
// copy the first dimension
for (int i = 0; i < n_tokens; ++i) {
for (int64_t i = 0; i < n_tokens; ++i) {
pos_data[ i] = ubatch->pos[i];
pos_data[ n_tokens + i] = ubatch->pos[i];
pos_data[2 * n_tokens + i] = ubatch->pos[i];
@ -75,7 +75,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
GGML_ASSERT(n_attn_temp_floor_scale != 0);
std::vector<float> attn_scale_data(n_tokens, 0.0f);
for (int i = 0; i < n_tokens; ++i) {
for (int64_t i = 0; i < n_tokens; ++i) {
const float pos = ubatch->pos[i];
attn_scale_data[i] = std::log(
std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
@ -96,8 +96,8 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
int32_t * data = (int32_t *) pos_bucket->data;
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
for (int i = 0; i < n_tokens; ++i) {
for (int64_t j = 0; j < n_tokens; ++j) {
for (int64_t i = 0; i < n_tokens; ++i) {
data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
}
}
@ -120,7 +120,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
int32_t * data = (int32_t *) out_ids->data;
if (n_outputs == n_tokens) {
for (int i = 0; i < n_tokens; ++i) {
for (int64_t i = 0; i < n_tokens; ++i) {
data[i] = i;
}
@ -131,7 +131,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
int n_outputs = 0;
for (int i = 0; i < n_tokens; ++i) {
for (int64_t i = 0; i < n_tokens; ++i) {
if (ubatch->output[i]) {
data[n_outputs++] = i;
}
@ -159,8 +159,8 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
memset(mean->data, 0, n_tokens*n_seqs_unq*ggml_element_size(mean));
std::vector<uint64_t> sums(n_seqs_unq, 0);
for (int i = 0; i < n_tokens; i += n_seq_tokens) {
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
for (int64_t i = 0; i < n_tokens; i += n_seq_tokens) {
for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[i][s];
const int32_t seq_idx = ubatch->seq_idx[seq_id];
@ -169,19 +169,19 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
}
std::vector<float> div(n_seqs_unq, 0.0f);
for (int s = 0; s < n_seqs_unq; ++s) {
for (int64_t s = 0; s < n_seqs_unq; ++s) {
const uint64_t sum = sums[s];
if (sum > 0) {
div[s] = 1.0f/float(sum);
}
}
for (int i = 0; i < n_tokens; i += n_seq_tokens) {
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
for (int64_t i = 0; i < n_tokens; i += n_seq_tokens) {
for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[i][s];
const int32_t seq_idx = ubatch->seq_idx[seq_id];
for (int j = 0; j < n_seq_tokens; ++j) {
for (int64_t j = 0; j < n_seq_tokens; ++j) {
data[seq_idx*n_tokens + i + j] = div[seq_idx];
}
}
@ -212,10 +212,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
(cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
);
for (int i = 0; i < n_tokens; ++i) {
for (int64_t i = 0; i < n_tokens; ++i) {
const llama_pos pos = ubatch->pos[i];
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
for (int64_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[i][s];
const int32_t seq_idx = ubatch->seq_idx[seq_id];
@ -230,7 +230,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
}
}
for (int s = 0; s < n_seqs_unq; ++s) {
for (int64_t s = 0; s < n_seqs_unq; ++s) {
if (target_row[s] >= 0) {
data[s] = target_row[s];
}
@ -248,7 +248,7 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
int32_t * data = (int32_t *) s_copy->data;
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for (uint32_t i = 0; i < n_rs; ++i) {
for (int64_t i = 0; i < n_rs; ++i) {
data[i] = mctx->s_copy(i);
}
}
@ -298,14 +298,14 @@ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64
LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
LLAMA_LOG_DEBUG(" ");
for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
for (int64_t j = 0; j < std::min((int64_t)20, n_kv); ++j) {
LLAMA_LOG_DEBUG("%2d", j);
}
LLAMA_LOG_DEBUG("\n");
for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
for (int64_t i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
LLAMA_LOG_DEBUG(" %2d ", i);
for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
for (int64_t j = 0; j < std::min((int64_t)20, n_kv); ++j) {
float val = data[i * n_kv + j];
if (val == -INFINITY) {
LLAMA_LOG_DEBUG("");
@ -323,13 +323,13 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
for (int h = 0; h < 1; ++h) {
for (int i1 = 0; i1 < n_tokens; ++i1) {
for (int64_t i1 = 0; i1 < n_tokens; ++i1) {
const llama_seq_id s1 = ubatch->seq_id[i1][0];
const llama_pos p1 = ubatch->pos[i1];
const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
for (int i0 = 0; i0 < n_tokens; ++i0) {
for (int64_t i0 = 0; i0 < n_tokens; ++i0) {
const llama_seq_id s0 = ubatch->seq_id[i0][0];
const llama_pos p0 = ubatch->pos[i0];
@ -454,11 +454,11 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
float * data = (float *) cross_kq_mask->data;
for (int h = 0; h < 1; ++h) {
for (int i = 0; i < n_tokens; ++i) {
for (int j = 0; j < n_enc; ++j) {
for (int64_t i = 0; i < n_tokens; ++i) {
for (int64_t j = 0; j < n_enc; ++j) {
float f = -INFINITY;
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
for (int32_t s = 0; s < ubatch->n_seq_id[i]; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[i][s];
if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
@ -469,12 +469,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
data[h*(n_enc*n_tokens) + i*n_enc + j] = f;
}
}
for (int i = n_tokens; i < n_tokens; ++i) {
for (int j = 0; j < n_enc; ++j) {
data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
}
}
}
}
@ -491,7 +485,7 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
int32_t * data = (int32_t *) inp_rs->s_copy->data;
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for (uint32_t i = 0; i < n_rs; ++i) {
for (int64_t i = 0; i < n_rs; ++i) {
data[i] = mctx->get_recr()->s_copy(i);
}
}
@ -1176,7 +1170,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
assert(n_expert_used > 0);
// order the views before the adds
for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
for (int64_t i = 0; i < hparams.n_expert_used; ++i) {
cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
ggml_build_forward_expand(gf, cur_experts[i]);
@ -1188,7 +1182,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// ref: https://github.com/ggml-org/llama.cpp/pull/14753
ggml_tensor * moe_out = cur_experts[0];
for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
for (int64_t i = 1; i < hparams.n_expert_used; ++i) {
moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
}

View File

@ -1335,8 +1335,8 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
const int32_t n_kv = dst->ne[0];
for (int h = 0; h < 1; ++h) {
for (int i = 0; i < n_tokens; ++i) {
for (int j = 0; j < n_kv; ++j) {
for (int64_t i = 0; i < n_tokens; ++i) {
for (int32_t j = 0; j < n_kv; ++j) {
// the position when the cells is empty is irrelevant - it will be masked out later in the attention
const llama_pos p0 = cells.is_empty(j) ? -1 : cells.pos_get(j);