From d5c325051fded7b55d65059f74e6ceb1d0a81197 Mon Sep 17 00:00:00 2001 From: European Tech Date: Fri, 20 Mar 2026 21:16:11 +0100 Subject: [PATCH 1/2] server: persist context checkpoints across slot save/restore For hybrid/recurrent models (Qwen3.5, Jamba, Falcon-H1), the server creates context checkpoints during prompt processing that snapshot the full recurrent state at regular intervals. These checkpoints are essential to avoid full prompt re-processing when a slot is reused. The existing /slots save/restore API persists the raw KV+recurrent memory via llama_state_seq_{save,load}_file, and also restores the token list. However, it does not persist the checkpoint metadata stored in server_prompt::checkpoints. Without these, the hybrid model cache validation logic in update_slots() cannot find any checkpoint to restore from and falls back to full prompt re-processing. This patch adds two small helper functions (slot_checkpoints_save and slot_checkpoints_load) that write/read a companion file alongside the main slot save file. The format is a versioned binary file with a magic header. This is particularly useful in router mode with --models-max 1, where switching between models destroys the in-memory prompt cache. Users can now call /slots/0?action=save before a model swap and /slots/0?action=restore after, recovering the full cache including checkpoints. Tested with Qwen3.5-27B (64 layers, 16 attention + 48 recurrent): - Without patch: cache_n=0, 23s re-processing after swap - With patch: cache_n=26549, 75ms after swap --- tools/server/server-context.cpp | 127 ++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 9de554e900..b48373df82 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -455,6 +455,127 @@ struct server_slot { } }; +// +// checkpoint persistence helpers for hybrid/recurrent models +// +// Hybrid models (e.g. 
Qwen3.5, Jamba, Falcon-H1) use recurrent layers whose +// state cannot be partially restored from the KV cache alone. The server +// creates "context checkpoints" during prompt processing that snapshot the +// full recurrent state at regular intervals. These checkpoints live in +// server_prompt::checkpoints and are essential to avoid a full prompt +// re-processing when the slot is reused. +// +// The built-in /slots save/restore API persists the raw KV+recurrent memory +// via llama_state_seq_{save,load}_file, but does NOT persist the checkpoint +// metadata. The two helpers below fill that gap: they write/read a small +// companion file (.checkpoints) next to the main slot save file. +// +// File format (binary, little-endian): +// uint32 magic = 0x4C4C4350 ("LLCP") +// uint32 version = 1 +// uint32 n_checkpoints +// For each checkpoint: +// int32 pos_min +// int32 pos_max +// int64 n_tokens +// uint64 data_size +// uint8 data[data_size] +// + +static bool slot_checkpoints_save(const std::string & filepath, + const std::list<server_prompt_checkpoint> & checkpoints) { + if (checkpoints.empty()) { + return true; + } + + const std::string cp_path = filepath + ".checkpoints"; + FILE * fp = fopen(cp_path.c_str(), "wb"); + if (!fp) { + SRV_WRN("failed to open checkpoint file for writing: %s\n", cp_path.c_str()); + return false; + } + + const uint32_t magic = 0x4C4C4350; + const uint32_t version = 1; + const uint32_t n_cp = (uint32_t) checkpoints.size(); + + bool ok = true; + ok = ok && fwrite(&magic, sizeof(magic), 1, fp) == 1; + ok = ok && fwrite(&version, sizeof(version), 1, fp) == 1; + ok = ok && fwrite(&n_cp, sizeof(n_cp), 1, fp) == 1; + + for (const auto & cp : checkpoints) { + const uint64_t data_size = cp.data.size(); + ok = ok && fwrite(&cp.pos_min, sizeof(cp.pos_min), 1, fp) == 1; + ok = ok && fwrite(&cp.pos_max, sizeof(cp.pos_max), 1, fp) == 1; + ok = ok && fwrite(&cp.n_tokens, sizeof(cp.n_tokens), 1, fp) == 1; + ok = ok && fwrite(&data_size, sizeof(data_size), 1, fp) == 1; + if 
(data_size > 0) { + ok = ok && fwrite(cp.data.data(), 1, data_size, fp) == data_size; + } + } + + fclose(fp); + + if (!ok) { + SRV_WRN("failed to write checkpoint data to %s\n", cp_path.c_str()); + std::remove(cp_path.c_str()); + return false; + } + + SRV_INF("saved %u context checkpoints to %s\n", n_cp, cp_path.c_str()); + return true; +} + +static bool slot_checkpoints_load(const std::string & filepath, + std::list<server_prompt_checkpoint> & checkpoints) { + const std::string cp_path = filepath + ".checkpoints"; + FILE * fp = fopen(cp_path.c_str(), "rb"); + if (!fp) { + return true; // no checkpoint file is not an error + } + + uint32_t magic = 0, version = 0, n_cp = 0; + bool ok = true; + ok = ok && fread(&magic, sizeof(magic), 1, fp) == 1; + ok = ok && fread(&version, sizeof(version), 1, fp) == 1; + ok = ok && fread(&n_cp, sizeof(n_cp), 1, fp) == 1; + + if (!ok || magic != 0x4C4C4350 || version != 1) { + SRV_WRN("invalid checkpoint file header: %s\n", cp_path.c_str()); + fclose(fp); + return false; + } + + checkpoints.clear(); + + for (uint32_t i = 0; i < n_cp && ok; i++) { + server_prompt_checkpoint cp; + uint64_t data_size = 0; + ok = ok && fread(&cp.pos_min, sizeof(cp.pos_min), 1, fp) == 1; + ok = ok && fread(&cp.pos_max, sizeof(cp.pos_max), 1, fp) == 1; + ok = ok && fread(&cp.n_tokens, sizeof(cp.n_tokens), 1, fp) == 1; + ok = ok && fread(&data_size, sizeof(data_size), 1, fp) == 1; + if (ok && data_size > 0) { + cp.data.resize(data_size); + ok = ok && fread(cp.data.data(), 1, data_size, fp) == data_size; + } + if (ok) { + checkpoints.push_back(std::move(cp)); + } + } + + fclose(fp); + + if (!ok) { + SRV_WRN("failed to read checkpoint data from %s\n", cp_path.c_str()); + checkpoints.clear(); + return false; + } + + SRV_INF("restored %u context checkpoints from %s\n", n_cp, cp_path.c_str()); + return true; +} // @@ -1822,6 +1943,9 @@ private: const llama_tokens & tokens = slot->prompt.tokens.get_text_tokens(); const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), 
slot->id, tokens.data(), token_count); + // persist context checkpoints alongside the slot state + slot_checkpoints_save(filepath, slot->prompt.checkpoints); + const int64_t t_end = ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; @@ -1869,6 +1993,9 @@ private: slot->prompt.tokens.clear(); slot->prompt.tokens.insert(tokens); + // restore context checkpoints if a companion file exists + slot_checkpoints_load(filepath, slot->prompt.checkpoints); + const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; From 6d7dc316f279dfc63e46c462dd3c9ed56d414aff Mon Sep 17 00:00:00 2001 From: European Tech Date: Fri, 20 Mar 2026 22:35:58 +0100 Subject: [PATCH 2/2] server: auto-save/restore slot state on child exit/start in router mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In router mode with --models-max 1, switching models kills the child process, destroying all in-memory state including the prompt cache and context checkpoints. This forces a full prompt re-processing on every model swap return, which can take tens of seconds for long prompts. This patch adds two methods (auto_save_slots, auto_restore_slots) that are called automatically during the child process lifecycle: - auto_save_slots: called after start_loop() returns (before clean_up), saves each slot's state + checkpoints to --slot-save-path using the model filename stem as the save name. - auto_restore_slots: called after load_model() (before start_loop), checks if a save file exists for this model and restores it. Combined with the checkpoint persistence from the previous commit, this makes model hot-swapping fully transparent: the conversation context is preserved across swaps with no client-side changes. 
Tested with Qwen3.5-27B + Qwen3.5-35B-A3B MoE in router mode: - Swap 27B→MoE: ~7s (incl auto-save 826 MiB state + 749 MiB checkpoints) - Swap MoE→27B: ~6s (incl auto-restore) - cache_n after restore: 26549 (91ms vs 23s without) --- tools/server/server-context.cpp | 60 +++++++++++++++++++++++++++++++++ tools/server/server-context.h | 5 +++ tools/server/server.cpp | 6 ++++ 3 files changed, 71 insertions(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b48373df82..3fa9253c3c 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3092,6 +3092,66 @@ llama_context * server_context::get_llama_context() const { return impl->ctx; } +void server_context::auto_save_slots() { + const auto & params = impl->params_base; + if (params.slot_save_path.empty()) { + return; + } + + for (auto & slot : impl->slots) { + if (slot.prompt.tokens.size() == 0) { + continue; + } + + const std::string model_stem = std::filesystem::path(params.model.path).stem().string(); + const std::string filepath = params.slot_save_path + "/" + model_stem; + + const llama_tokens & tokens = slot.prompt.tokens.get_text_tokens(); + const size_t token_count = slot.prompt.tokens.size(); + const size_t nwrite = llama_state_seq_save_file(impl->ctx, filepath.c_str(), slot.id, tokens.data(), token_count); + + slot_checkpoints_save(filepath, slot.prompt.checkpoints); + + SRV_INF("auto-saved slot %d (%zu tokens, %.1f MiB) to %s\n", + slot.id, token_count, (float) nwrite / (1024.0f * 1024.0f), filepath.c_str()); + } +} + +void server_context::auto_restore_slots() { + const auto & params = impl->params_base; + if (params.slot_save_path.empty()) { + return; + } + + const std::string model_stem = std::filesystem::path(params.model.path).stem().string(); + const std::string filepath = params.slot_save_path + "/" + model_stem; + + if (!std::filesystem::exists(filepath)) { + return; + } + + for (auto & slot : impl->slots) { + llama_tokens tokens; + 
tokens.resize(slot.n_ctx); + size_t token_count = 0; + const size_t nread = llama_state_seq_load_file(impl->ctx, filepath.c_str(), slot.id, tokens.data(), tokens.size(), &token_count); + + if (nread == 0) { + SRV_WRN("auto-restore failed for slot %d from %s\n", slot.id, filepath.c_str()); + continue; + } + + tokens.resize(token_count); + slot.prompt.tokens.clear(); + slot.prompt.tokens.insert(tokens); + + slot_checkpoints_load(filepath, slot.prompt.checkpoints); + + SRV_INF("auto-restored slot %d (%zu tokens, %.1f MiB) from %s\n", + slot.id, token_count, (float) nread / (1024.0f * 1024.0f), filepath.c_str()); + } +} + server_response_reader server_context::get_response_reader() { return impl->get_response_reader(); } diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 75f3d2de56..e63220a6ba 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -64,6 +64,11 @@ struct server_context { // terminate main loop (will unblock start_loop) void terminate(); + // auto-save/restore slot state for seamless model hot-swapping in router mode + // requires --slot-save-path to be set + void auto_save_slots(); + void auto_restore_slots(); + // get the underlaying llama_context, can return nullptr if sleeping // not thread-safe, should only be used from the main thread llama_context * get_llama_context() const; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0bd6fda17d..adbdc77bb8 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -273,6 +273,9 @@ int main(int argc, char ** argv) { LOG_INF("%s: model loaded\n", __func__); + // in router mode, restore previously saved slot state for this model + ctx_server.auto_restore_slots(); + shutdown_handler = [&](int) { // this will unblock start_loop() ctx_server.terminate(); @@ -318,6 +321,9 @@ int main(int argc, char ** argv) { // this call blocks the main thread until queue_tasks.terminate() is called ctx_server.start_loop(); + // in router 
mode, save slot state before exit so it can be restored on reload + ctx_server.auto_save_slots(); + clean_up(); if (ctx_http.thread.joinable()) { ctx_http.thread.join();