diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index 13ea8c690f..5b8895b341 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -276,7 +276,7 @@ llama_pos server_tokens::pos_next(int64_t n_tokens) const {
 
 size_t server_tokens::size_up_to_pos(llama_pos max_pos) const {
     if (!has_mtmd) {
-        return std::min((size_t)(max_pos + 1), tokens.size());
+        return std::min((size_t)max_pos, tokens.size());
     }
 
     size_t idx = 0;
@@ -296,7 +296,7 @@ size_t server_tokens::size_up_to_pos(llama_pos max_pos) const {
             idx++;
         }
 
-        if (pos > max_pos) {
+        if (pos >= max_pos) {
             break;
         }
     }
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 4fb9e488df..a234541e19 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -170,7 +170,7 @@ public:
     // the next position after n_tokens. if n_tokens < 0, return the next position after all tokens.
     llama_pos pos_next(int64_t n_tokens = -1) const;
 
-    // number of tokens with position <= max_pos
+    // number of tokens with position < max_pos
     size_t size_up_to_pos(llama_pos max_pos) const;
 
     const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index b67190a469..33bec85c23 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -570,7 +570,7 @@ private:
     std::vector<server_slot> slots;
     int slots_debug = 0;
 
-    int n_empty_consequtive = 0;
+    int n_empty_consecutive = 0;
 
     std::unique_ptr<server_prompt_cache> prompt_cache;
 
@@ -2372,7 +2372,7 @@ private:
             } else {
                 pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
                 n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens);
-                SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, (float) checkpoint_size / 1024 / 1024);
+                SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_past = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, n_past, (float) checkpoint_size / 1024 / 1024);
             }
         }
 
@@ -2630,11 +2630,11 @@ private:
         if (batch.n_tokens == 0) {
             SRV_WRN("%s", "no tokens to decode\n");
 
-            if (++n_empty_consequtive > 3) {
+            if (++n_empty_consecutive > 3) {
                 GGML_ABORT("fatal error - please provide logs and repro in %s\n", "https://github.com/ggml-org/llama.cpp/pull/20277");
             }
         } else {
-            n_empty_consecutive = 0;
+            n_empty_consecutive = 0;
         }
 
         int32_t i_next = 0;
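
For context on the semantic change: `size_up_to_pos()` now counts tokens with position strictly below `max_pos` rather than at-or-below it, so for plain text tokens occupying positions `0..N-1` the result is `min(max_pos, N)` instead of `min(max_pos + 1, N)`. A minimal standalone sketch of the new contract on the non-mtmd path follows; the free function and `main()` below are illustrative stand-ins for this note, not the actual `server_tokens` method.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

using llama_pos = int32_t;

// Sketch of the non-mtmd path: tokens sit at contiguous positions 0..N-1,
// so the number of tokens with position < max_pos is simply min(max_pos, N).
static size_t size_up_to_pos(const std::vector<int> & tokens, llama_pos max_pos) {
    return std::min((size_t) max_pos, tokens.size());
}

int main() {
    std::vector<int> tokens(10); // occupies positions 0..9

    assert(size_up_to_pos(tokens, 0)  == 0);  // no token strictly before pos 0
    assert(size_up_to_pos(tokens, 4)  == 4);  // positions 0..3
    assert(size_up_to_pos(tokens, 99) == 10); // clamped to the token count

    // Under the old "position <= max_pos" contract the second call would
    // have returned 5, over-counting by one when the result feeds n_past
    // (as in the checkpoint-restore call site above).
    return 0;
}
```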