From f71b68ae4bcc22413f771c7010fe5e9353e5061c Mon Sep 17 00:00:00 2001
From: lonnie
Date: Sun, 1 Mar 2026 22:43:24 +0800
Subject: [PATCH] feat: add --cache-only flag to skip model re-download

Add a new --cache-only flag that sits between the
default behavior and --offline mode:

- Default: check etag, re-download if changed
- --cache-only: use cached file if exists, download new models only
- --offline: no network access, must have all files cached

This is useful for serving large models (50GB+) where unexpected
re-downloads due to upstream repo updates are problematic.

Fixes: ggerganov/llama.cpp/issues/20005
---
 common/arg.cpp      | 12 ++++++++++--
 common/common.h     |  1 +
 common/download.cpp | 37 +++++++++++++++++++++++++------------
 common/download.h   |  2 ++
 4 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 05f4a5244e..2d877c8ffc 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -291,13 +291,14 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
     }
 
     const bool offline = params.offline;
+    const bool cache_only = params.cache_only;
     std::string model_endpoint = get_model_endpoint();
     auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
 
     // prepare local path for caching
     auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
     auto preset_path = fs_get_cache_file(preset_fname);
-    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline, cache_only);
     const bool has_preset = status >= 200 && status < 400;
 
     // remote preset is optional, so we don't error out if not found
@@ -378,7 +379,7 @@ static handle_model_result common_params_handle_model(
 
     // then, download it if needed
     if (!model.url.empty()) {
-        bool ok = common_download_model(model, bearer_token, offline);
+        bool ok = common_download_model(model, bearer_token, offline, cache_only);
         if (!ok) {
             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
             exit(1);
@@ -3198,6 +3199,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.offline = true;
         }
     ).set_env("LLAMA_OFFLINE"));
+    add_opt(common_arg(
+        {"--cache-only"},
+        "Cache-only mode: download models if not cached, but never re-download or update cached models",
+        [](common_params & params) {
+            params.cache_only = true;
+        }
+    ).set_env("LLAMA_CACHE_ONLY"));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
diff --git a/common/common.h b/common/common.h
index c5a8037571..b538a15892 100644
--- a/common/common.h
+++ b/common/common.h
@@ -440,6 +440,7 @@ struct common_params {
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end   = -1; // layer range for control vector
     bool    offline                    = false;
+    bool    cache_only                 = false; // use cache only, never re-download
 
     int32_t ppl_stride      = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
diff --git a/common/download.cpp b/common/download.cpp
index 5ef60a4208..75d12f2991 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -440,18 +440,29 @@ int common_download_file_single(const std::string & url,
                                 const std::string & path,
                                 const std::string & bearer_token,
                                 bool offline,
+                                bool cache_only,
                                 const common_header_list & headers) {
-    if (!offline) {
-        return common_download_file_single_online(url, path, bearer_token, headers);
+    if (offline) {
+        // Original offline mode: no network access at all
+        if (!std::filesystem::exists(path)) {
+            LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+            return -1;
+        }
+        LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+        return 304; // Not Modified - fake cached response
     }
 
-    if (!std::filesystem::exists(path)) {
-        LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
-        return -1;
+    if (cache_only) {
+        // Cache-only mode: use cached file if exists, otherwise download
+        // but never re-download if etag changed
+        if (std::filesystem::exists(path)) {
+            LOG_INF("%s: using cached file (cache-only mode): %s\n", __func__, path.c_str());
+            return 304; // Not Modified - fake cached response
+        }
+        // File not cached, proceed with download
     }
 
-    LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
-    return 304; // Not Modified - fake cached response
+    return common_download_file_single_online(url, path, bearer_token, headers);
 }
 
 // download multiple files from remote URLs to local paths
@@ -459,6 +470,7 @@ int common_download_file_single(const std::string & url,
 static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
                                           const std::string & bearer_token,
                                           bool offline,
+                                          bool cache_only,
                                           const common_header_list & headers) {
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
@@ -468,8 +480,8 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
 
     for (auto const & item : urls) {
         futures_download.push_back(std::async(std::launch::async,
-            [&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
-                const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, headers);
+            [&bearer_token, offline, cache_only, &headers](const std::pair<std::string, std::string> & it) -> bool {
+                const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, cache_only, headers);
                 return is_http_status_ok(http_status);
             },
             item));
@@ -490,6 +502,7 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,