From dbbc8a118911846a259e35c216baa6489c8f2101 Mon Sep 17 00:00:00 2001
From: Karlon <44919207+oopb@users.noreply.github.com>
Date: Tue, 30 Dec 2025 11:17:00 +0800
Subject: [PATCH 1/2] android: fix infinite generation and maintain attention
 in func shift_context()

Properly maintain stop_generation_position when the context is shifted.
Additionally, add an attention_sink variable so the first few tokens are
always kept, to maintain attention. Move the function to after the point
where stop_generation_position is defined.

Fixes #18409.
---
 .../lib/src/main/cpp/ai_chat.cpp | 41 +++++++++++--------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/examples/llama.android/lib/src/main/cpp/ai_chat.cpp b/examples/llama.android/lib/src/main/cpp/ai_chat.cpp
index 9e460ac198..8f1026769d 100644
--- a/examples/llama.android/lib/src/main/cpp/ai_chat.cpp
+++ b/examples/llama.android/lib/src/main/cpp/ai_chat.cpp
@@ -269,23 +269,6 @@ static void reset_long_term_states(const bool clear_kv_cache = true) {
     llama_memory_clear(llama_get_memory(g_context), false);
 }
 
-/**
- * TODO-hyin: implement sliding-window version as a better alternative
- *
- * Context shifting by discarding the older half of the tokens appended after system prompt:
- * - take the [system_prompt_position] first tokens from the original prompt
- * - take half of the last (system_prompt_position - system_prompt_position) tokens
- * - recompute the logits in batches
- */
-static void shift_context() {
-    const int n_discard = (current_position - system_prompt_position) / 2;
-    LOGi("%s: Discarding %d tokens", __func__, n_discard);
-    llama_memory_seq_rm(llama_get_memory(g_context), 0, system_prompt_position, system_prompt_position + n_discard);
-    llama_memory_seq_add(llama_get_memory(g_context), 0, system_prompt_position + n_discard, current_position, -n_discard);
-    current_position -= n_discard;
-    LOGi("%s: Context shifting done! Current position: %d", __func__, current_position);
-}
-
 static std::string chat_add_and_format(const std::string &role, const std::string &content) {
     common_chat_msg new_msg;
     new_msg.role = role;
@@ -313,6 +296,30 @@ static void reset_short_term_states() {
     assistant_ss.str("");
 }
 
+/**
+ * TODO-hyin: implement sliding-window version as a better alternative
+ *
+ * Context shifting by discarding the older half of the tokens appended after system prompt:
+ * - take the [system_prompt_position] first tokens from the original prompt
+ * - take half of the last (system_prompt_position - system_prompt_position) tokens
+ * - recompute the logits in batches
+ */
+static void shift_context() {
+    const int attention_sink = 4;
+    const int keep_first = std::max(system_prompt_position, attention_sink);
+    const int n_discard = (current_position - keep_first) / 2;
+    if (n_discard <= 0) {
+        LOGi("%s: n_discard <= 0", __func__);
+        return;
+    }
+    LOGi("%s: Discarding %d tokens", __func__, n_discard);
+    llama_memory_seq_rm(llama_get_memory(g_context), 0, keep_first, keep_first + n_discard);
+    llama_memory_seq_add(llama_get_memory(g_context), 0, keep_first + n_discard, -1, -n_discard);
+    current_position -= n_discard;
+    stop_generation_position -= n_discard;
+    LOGi("%s: Context shifting done! Current position: %d", __func__, current_position);
+}
+
 static int decode_tokens_in_batches(
         llama_context *context,
         llama_batch &batch,
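[Note] The heart of this patch is that shift_context() pulls current_position
back by n_discard but previously left stop_generation_position untouched, so
the gap between the two grew by n_discard on every shift and generation could
run forever. The stand-alone sketch below models only the index arithmetic (no
llama.cpp calls); the initial values and the main() driver are made up to make
the invariant visible:

    // sketch_shift.cpp -- position arithmetic of shift_context(), nothing else.
    // All state values here are hypothetical, chosen for illustration.
    #include <algorithm>
    #include <cassert>
    #include <cstdio>

    static int system_prompt_position   = 2;   // end of the system prompt
    static int current_position         = 100; // next cache position to write
    static int stop_generation_position = 120; // where generation must stop

    static void shift_context_sketch() {
        const int attention_sink = 4;  // always keep the first few tokens
        const int keep_first = std::max(system_prompt_position, attention_sink);
        const int n_discard  = (current_position - keep_first) / 2;
        if (n_discard <= 0) return;
        // The real function calls llama_memory_seq_rm()/seq_add() here;
        // this sketch only tracks the resulting positions.
        current_position         -= n_discard;
        stop_generation_position -= n_discard; // the fix: move the stop target too
    }

    int main() {
        const int budget_before = stop_generation_position - current_position;
        shift_context_sketch();
        // With both variables shifted, the remaining token budget is unchanged
        // (20 here); without the second shift it would grow by n_discard = 48
        // on every call, which is the infinite-generation bug.
        assert(stop_generation_position - current_position == budget_before);
        std::printf("current=%d stop=%d\n", current_position, stop_generation_position);
    }

Shifting both variables by the same amount keeps the distance between them
invariant, which is exactly the property the generation loop relies on to
terminate.
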
Current position: %d", __func__, current_position); +} + static int decode_tokens_in_batches( llama_context *context, llama_batch &batch, From 2d55904a1548152355ba73c2e3961da630b1520b Mon Sep 17 00:00:00 2001 From: Karlon <44919207+oopb@users.noreply.github.com> Date: Tue, 30 Dec 2025 16:21:53 +0800 Subject: [PATCH 2/2] Clarify token handling in shift_context comments Updated comments in shift_context function to clarify token handling. --- examples/llama.android/lib/src/main/cpp/ai_chat.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/llama.android/lib/src/main/cpp/ai_chat.cpp b/examples/llama.android/lib/src/main/cpp/ai_chat.cpp index 8f1026769d..a37b6b6343 100644 --- a/examples/llama.android/lib/src/main/cpp/ai_chat.cpp +++ b/examples/llama.android/lib/src/main/cpp/ai_chat.cpp @@ -300,9 +300,11 @@ static void reset_short_term_states() { * TODO-hyin: implement sliding-window version as a better alternative * * Context shifting by discarding the older half of the tokens appended after system prompt: - * - take the [system_prompt_position] first tokens from the original prompt - * - take half of the last (system_prompt_position - system_prompt_position) tokens + * - take the [keep_recent] first tokens from the original prompt + * - take half of the last (current_position - keep_first) tokens * - recompute the logits in batches + * + * attention_sink: keep the first 4 tokens to maintain attention. */ static void shift_context() { const int attention_sink = 4;