Add LLAMA_STATE_SEQ_FLAGS_APPEND for incremental KV state restore

Add a new flag LLAMA_STATE_SEQ_FLAGS_APPEND (value 2) that skips the seq_rm() call in state_read_meta, allowing incremental chunk-by-chunk restore to the same sequence via repeated state_seq_set_data_ext calls. This enables external KV cache systems to restore opaque state blobs one chunk at a time without each chunk clearing the previous one. - Add #define LLAMA_STATE_SEQ_FLAGS_APPEND 2 in llama.h - Thread flags parameter through state_read() to state_read_meta() - Gate seq_rm() on !(flags & LLAMA_STATE_SEQ_FLAGS_APPEND) - Default behavior (flags=0) is unchanged
2026-03-31 10:46:18 +00:00 · 2026-03-31 10:46:18 +00:00 · ce447e2745
parent 26dac845cc
commit ce447e2745
3 changed files with 9 additions and 4 deletions
--- a/include/llama.h
+++ b/include/llama.h
@ -872,6 +872,9 @@ extern "C" {
 // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
 #define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1

+// restore without clearing existing sequence data (append to existing KV cache entries)
+#define LLAMA_STATE_SEQ_FLAGS_APPEND 2
+
    typedef uint32_t llama_state_seq_flags;

    LLAMA_API size_t llama_state_seq_get_size_ext(
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@ -1722,7 +1722,7 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama
        slot_info sinfo;

        bool res = true;
-        res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
+        res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id, flags);
        res = res && state_read_data(io, strm, cell_count, sinfo);

        if (!res) {
@ -1868,13 +1868,15 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
    }
 }

-bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
+bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id, llama_state_seq_flags flags) {
    auto & cells = v_cells[strm];
    auto & head  = v_heads[strm];

    if (dest_seq_id != -1) {
        // single sequence
-        seq_rm(dest_seq_id, -1, -1);
+        if (!(flags & LLAMA_STATE_SEQ_FLAGS_APPEND)) {
+            seq_rm(dest_seq_id, -1, -1);
+        }

        llama_batch_allocr balloc(hparams.n_pos_per_embd());

--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@ -280,7 +280,7 @@ private:
    void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
    void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;

-    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count,       slot_info & sinfo, llama_seq_id dest_seq_id = -1);
+    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count,       slot_info & sinfo, llama_seq_id dest_seq_id = -1, llama_state_seq_flags flags = 0);
    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
 };