From f71b68ae4bcc22413f771c7010fe5e9353e5061c Mon Sep 17 00:00:00 2001
From: lonnie
Date: Sun, 1 Mar 2026 22:43:24 +0800
Subject: [PATCH] feat: add --cache-only flag to skip model re-download

Add a new --cache-only flag that sits between the
default behavior and --offline mode:

- Default: check etag, re-download if changed
- --cache-only: use cached file if exists, download new models only
- --offline: no network access, must have all files cached

This is useful for serving large models (50GB+) where unexpected
re-downloads due to upstream repo updates are problematic.

Fixes: ggerganov/llama.cpp/issues/20005
---
 common/arg.cpp      | 12 ++++++++++--
 common/common.h     |  1 +
 common/download.cpp | 37 +++++++++++++++++++++++++------------
 common/download.h   |  2 ++
 4 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 05f4a5244e..2d877c8ffc 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -291,13 +291,14 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
     }
 
     const bool offline = params.offline;
+    const bool cache_only = params.cache_only;
     std::string model_endpoint = get_model_endpoint();
     auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
 
     // prepare local path for caching
     auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
     auto preset_path = fs_get_cache_file(preset_fname);
-    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline, cache_only);
     const bool has_preset = status >= 200 && status < 400;
 
     // remote preset is optional, so we don't error out if not found
@@ -378,7 +379,7 @@ static handle_model_result common_params_handle_model(
 
     // then, download it if needed
     if (!model.url.empty()) {
-        bool ok = common_download_model(model, bearer_token, offline);
+        bool ok = common_download_model(model, bearer_token, offline, cache_only);
         if (!ok) {
             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
             exit(1);
@@ -3198,6 +3199,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.offline = true;
         }
     ).set_env("LLAMA_OFFLINE"));
+    add_opt(common_arg(
+        {"--cache-only"},
+        "Cache-only mode: download models if not cached, but never re-download or update cached models",
+        [](common_params & params) {
+            params.cache_only = true;
+        }
+    ).set_env("LLAMA_CACHE_ONLY"));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
diff --git a/common/common.h b/common/common.h
index c5a8037571..b538a15892 100644
--- a/common/common.h
+++ b/common/common.h
@@ -440,6 +440,7 @@ struct common_params {
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end   = -1; // layer range for control vector
     bool    offline                    = false;
+    bool    cache_only                 = false; // use cache only, never re-download
 
     int32_t ppl_stride      = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
diff --git a/common/download.cpp b/common/download.cpp
index 5ef60a4208..75d12f2991 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -440,18 +440,29 @@ int common_download_file_single(const std::string & url,
                                 const std::string & path,
                                 const std::string & bearer_token,
                                 bool offline,
+                                bool cache_only,
                                 const common_header_list & headers) {
-    if (!offline) {
-        return common_download_file_single_online(url, path, bearer_token, headers);
+    if (offline) {
+        // Original offline mode: no network access at all
+        if (!std::filesystem::exists(path)) {
+            LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+            return -1;
+        }
+        LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+        return 304; // Not Modified - fake cached response
     }
 
-    if (!std::filesystem::exists(path)) {
-        LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
-        return -1;
+    if (cache_only) {
+        // Cache-only mode: use cached file if exists, otherwise download
+        // but never re-download if etag changed
+        if (std::filesystem::exists(path)) {
+            LOG_INF("%s: using cached file (cache-only mode): %s\n", __func__, path.c_str());
+            return 304; // Not Modified - fake cached response
+        }
+        // File not cached, proceed with download
     }
 
-    LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
-    return 304; // Not Modified - fake cached response
+    return common_download_file_single_online(url, path, bearer_token, headers);
 }
 
 // download multiple files from remote URLs to local paths
@@ -459,6 +470,7 @@ int common_download_file_single(const std::string & url,
 static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
                                           const std::string & bearer_token,
                                           bool offline,
+                                          bool cache_only,
                                           const common_header_list & headers) {
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
@@ -468,8 +480,8 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
 
     for (auto const & item : urls) {
         futures_download.push_back(std::async(std::launch::async,
-            [&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
-                const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, headers);
+            [&bearer_token, offline, cache_only, &headers](const std::pair<std::string, std::string> & it) -> bool {
+                const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, cache_only, headers);
                 return is_http_status_ok(http_status);
             },
             item));
@@ -490,6 +502,7 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,