server : fix off-by-1 in server_tokens::size_up_to_pos() (#20279)
* server : fix off-by-1 in server_tokens::size_up_to_pos()
* cont : fix typo [no ci]
This commit is contained in:
parent f76565db92
commit d6e1556499
@@ -276,7 +276,7 @@ llama_pos server_tokens::pos_next(int64_t n_tokens) const {
 
 size_t server_tokens::size_up_to_pos(llama_pos max_pos) const {
     if (!has_mtmd) {
-        return std::min((size_t)(max_pos + 1), tokens.size());
+        return std::min((size_t)max_pos, tokens.size());
     }
 
     size_t idx = 0;
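The off-by-1 on the text-only path: for tokens at contiguous positions 0..n-1, "number of tokens with position < max_pos" is min(max_pos, n), while the old expression also counted the token at position max_pos. A minimal standalone sketch of the two variants (hypothetical values, not code from the repository):

#include <algorithm>
#include <cstddef>
#include <cstdio>

// sketch of the two variants; tokens_size stands in for tokens.size()
static size_t size_up_to_pos_old(int max_pos, size_t tokens_size) {
    return std::min((size_t)(max_pos + 1), tokens_size); // tokens with position <= max_pos
}

static size_t size_up_to_pos_new(int max_pos, size_t tokens_size) {
    return std::min((size_t) max_pos, tokens_size);      // tokens with position < max_pos
}

int main() {
    const size_t n_tokens = 10; // positions 0..9
    // after keeping 4 tokens, the next free position is 4,
    // so "tokens with position < 4" must be exactly 4
    printf("old: %zu\n", size_up_to_pos_old(4, n_tokens)); // 5 -- one token too many
    printf("new: %zu\n", size_up_to_pos_new(4, n_tokens)); // 4
}

After keeping k tokens the next free position is k, so a caller that passes that position back in must get exactly k; only the new variant guarantees this.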
@@ -296,7 +296,7 @@ size_t server_tokens::size_up_to_pos(llama_pos max_pos) const {
             idx++;
         }
 
-        if (pos > max_pos) {
+        if (pos >= max_pos) {
             break;
         }
     }
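The same boundary fix on the multimodal path: the loop must stop before counting the token at position max_pos. A schematic sketch of the counting loop (the shape is assumed; the real code walks mtmd chunks):

#include <cstddef>
#include <cstdio>

// schematic counting loop over contiguous positions (assumed shape, for illustration)
static size_t count_up_to(int max_pos, int n_tokens, bool old_behavior) {
    size_t idx = 0;
    for (int pos = 0; pos < n_tokens; pos++) {
        if (old_behavior ? (pos > max_pos) : (pos >= max_pos)) {
            break;
        }
        idx++;
    }
    return idx;
}

int main() {
    // positions 0,1,2,3 with max_pos = 2: "position < 2" means 2 tokens
    printf("old: %zu\n", count_up_to(2, 4, true));  // 3 -- includes position 2
    printf("new: %zu\n", count_up_to(2, 4, false)); // 2 -- stops before position 2
}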
@@ -170,7 +170,7 @@ public:
     // the next position after n_tokens. if n_tokens < 0, return the next position after all tokens.
     llama_pos pos_next(int64_t n_tokens = -1) const;
 
-    // number of tokens with position <= max_pos
+    // number of tokens with position < max_pos
     size_t size_up_to_pos(llama_pos max_pos) const;
 
     const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
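The updated comment makes the contract half-open, which composes cleanly with pos_next(): if the cache holds n contiguous tokens at positions 0..n-1, pos_next() is n and size_up_to_pos(n) now returns n rather than n + 1. A small sketch of the intended invariant (hypothetical values, text-only case):

#include <algorithm>
#include <cassert>
#include <cstddef>

int main() {
    // hypothetical cache: 8 text tokens at contiguous positions 0..7
    const size_t n_tokens = 8;
    const int    next_pos = (int) n_tokens; // what pos_next() would return here

    // half-open contract: number of tokens with position < max_pos
    auto size_up_to_pos = [&](int max_pos) {
        return std::min((size_t) max_pos, n_tokens);
    };

    assert(size_up_to_pos(next_pos) == n_tokens); // keeps everything
    assert(size_up_to_pos(3) == 3);               // keeps the first 3 tokens
}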
@@ -570,7 +570,7 @@ private:
     std::vector<server_slot> slots;
 
     int slots_debug = 0;
-    int n_empty_consequtive = 0;
+    int n_empty_consecutive = 0;
 
     std::unique_ptr<server_prompt_cache> prompt_cache;
@@ -2372,7 +2372,7 @@ private:
             } else {
                 pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
                 n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens);
-                SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, (float) checkpoint_size / 1024 / 1024);
+                SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_past = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, n_past, (float) checkpoint_size / 1024 / 1024);
             }
         }
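At the checkpoint-restore call site, the half-open contract means size_up_to_pos(pos_next) directly yields the number of cached tokens strictly before the restore position, which is then clamped to what the checkpoint actually stores. A schematic sketch of that arithmetic (pos_min, pos_max, and n_checkpoint are made-up values, and the size_up_to_pos call is replaced by a stand-in):

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    // made-up checkpoint covering positions [pos_min, pos_max], storing n_checkpoint tokens
    const int pos_min      = 96;
    const int pos_max      = 127;
    const int n_checkpoint = 128;

    int pos_next = 256; // position the slot would otherwise continue from

    // clamp the restore point into the range the checkpoint can serve
    pos_next = std::min(pos_next, std::max(pos_min + 1, pos_max));

    // stand-in for slot.prompt.tokens.size_up_to_pos(pos_next):
    // with contiguous positions from 0, that is just pos_next itself
    const size_t cached = (size_t) pos_next;
    const size_t n_past = std::min(cached, (size_t) n_checkpoint);

    printf("pos_next = %d, n_past = %zu\n", pos_next, n_past); // pos_next = 127, n_past = 127
}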
|
@@ -2630,11 +2630,11 @@ private:
         if (batch.n_tokens == 0) {
             SRV_WRN("%s", "no tokens to decode\n");
 
-            if (++n_empty_consequtive > 3) {
+            if (++n_empty_consecutive > 3) {
                 GGML_ABORT("fatal error - please provide logs and repro in %s\n", "https://github.com/ggml-org/llama.cpp/pull/20277");
             }
         } else {
-            n_empty_consequtive = 0;
+            n_empty_consecutive = 0;
         }
 
         int32_t i_next = 0;
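The renamed counter is a simple consecutive-failure watchdog: any batch with tokens resets it, and more than three empty batches in a row abort the server rather than letting it spin forever. A minimal standalone sketch of the pattern (not the actual server loop; abort() stands in for GGML_ABORT):

#include <cstdio>
#include <cstdlib>

// minimal sketch of the consecutive-empty-batch watchdog
static int n_empty_consecutive = 0;

static void on_batch(int n_tokens) {
    if (n_tokens == 0) {
        fprintf(stderr, "no tokens to decode\n");
        if (++n_empty_consecutive > 3) {
            // the server calls GGML_ABORT here to fail loudly instead of spinning
            fprintf(stderr, "fatal: %d consecutive empty batches\n", n_empty_consecutive);
            abort();
        }
    } else {
        n_empty_consecutive = 0; // any progress resets the watchdog
    }
}

int main() {
    on_batch(8); // progress: counter stays at 0
    on_batch(0); // 1st empty batch
    on_batch(0); // 2nd
    on_batch(8); // progress again: counter resets to 0
}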