diff --git a/common/hf-cache.cpp b/common/hf-cache.cpp index 6645aceb25..3fb9f3ffc8 100644 --- a/common/hf-cache.cpp +++ b/common/hf-cache.cpp @@ -325,9 +325,15 @@ hf_files get_repo_files(const std::string & repo_id, if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) { file.oid = item["lfs"]["oid"].get(); } + if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) { + file.size = item["lfs"]["size"].get(); + } } else if (item.contains("oid") && item["oid"].is_string()) { file.oid = item["oid"].get(); } + if (file.size == 0 && item.contains("size") && item["size"].is_number()) { + file.size = item["size"].get(); + } if (!file.oid.empty() && !is_valid_oid(file.oid)) { LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str()); @@ -487,6 +493,34 @@ std::string finalize_file(const hf_file & file) { // delete everything after this line, one day +// copied from download.cpp without the tag part +struct gguf_split_info { + std::string prefix; // tag included + int index; + int count; +}; + +static gguf_split_info get_gguf_split_info(const std::string & path) { + static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase); + std::smatch m; + + std::string prefix = path; + if (!string_remove_suffix(prefix, ".gguf")) { + return {}; + } + + int index = 1; + int count = 1; + + if (std::regex_match(prefix, m, re_split)) { + index = std::stoi(m[2].str()); + count = std::stoi(m[3].str()); + prefix = m[1].str(); + } + + return {std::move(prefix), index, count}; +} + static std::pair parse_manifest_name(std::string & filename) { static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)"); std::smatch match; @@ -504,25 +538,30 @@ static std::string make_old_cache_filename(const std::string & owner, return result; } -static void migrate_single_file(const fs::path & old_cache, - const std::string & owner, - const std::string & repo, - const nl::json & node, - const hf_files & files) { +struct migrate_file { + std::string path; + std::string sha256; + size_t size; + fs::path old_path; + fs::path etag_path; + const hf_file * file; +}; - if (!node.contains("rfilename") || - !node.contains("lfs") || - !node["lfs"].contains("sha256")) { - return; - } +using migrate_files = std::vector; - std::string path = node["rfilename"]; - std::string sha256 = node["lfs"]["sha256"]; +static bool collect_file(const fs::path & old_cache, + const std::string & owner, + const std::string & repo, + const std::string & path, + const std::string & sha256, + const hf_files & files, + migrate_files & to_migrate) { + + const hf_file * file = nullptr; - const hf_file * file_info = nullptr; for (const auto & f : files) { if (f.path == path) { - file_info = &f; + file = &f; break; } } @@ -532,41 +571,105 @@ static void migrate_single_file(const fs::path & old_cache, fs::path etag_path = old_path.string() + ".etag"; if (!fs::exists(old_path)) { - if (fs::exists(etag_path)) { - LOG_WRN("%s: %s is orphan, deleting...\n", __func__, etag_path.string().c_str()); - fs::remove(etag_path); + if (file && fs::exists(file->final_path)) { + return true; } - return; + LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str()); + return false; } - if (!file_info) { - LOG_WRN("%s: %s not found in current repo, ignoring...\n", __func__, old_filename.c_str()); - return; - } else if (!sha256.empty() && !file_info->oid.empty() && sha256 != file_info->oid) { - LOG_WRN("%s: %s is not up to date (sha256 mismatch), ignoring...\n", __func__, old_filename.c_str()); - return; + if (!file) { + LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str()); + return false; } + if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) { + LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str()); + return false; + } + + if (file->size > 0) { + size_t size = fs::file_size(old_path); + if (size != file->size) { + LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size); + return false; + } + } + + to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file}); + return true; +} + +static bool collect_files(const fs::path & old_cache, + const std::string & owner, + const std::string & repo, + const nl::json & node, + const hf_files & files, + migrate_files & to_migrate) { + + if (!node.contains("rfilename") || + !node.contains("lfs") || + !node["lfs"].contains("sha256")) { + return true; + } + + std::string path = node["rfilename"]; + std::string sha256 = node["lfs"]["sha256"]; + + auto split = get_gguf_split_info(path); + + if (split.count <= 1) { + return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate); + } + + std::vector> splits; + + for (const auto & f : files) { + auto split_f = get_gguf_split_info(f.path); + if (split_f.count == split.count && split_f.prefix == split.prefix) { + // sadly the manifest only provides the sha256 of the first file (index == 1) + // the rest will be verified using the size... + std::string f_sha256 = (split_f.index == 1) ? sha256 : ""; + splits.emplace_back(f.path, f_sha256); + } + } + + if ((int)splits.size() != split.count) { + LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size()); + return false; + } + + for (const auto & [f_path, f_sha256] : splits) { + if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) { + return false; + } + } + + return true; +} + +static bool migrate_file(const migrate_file & file) { std::error_code ec; - fs::path new_path(file_info->local_path); + fs::path new_path(file.file->local_path); fs::create_directories(new_path.parent_path(), ec); if (!fs::exists(new_path, ec)) { - fs::rename(old_path, new_path, ec); + fs::rename(file.old_path, new_path, ec); if (ec) { - fs::copy_file(old_path, new_path, ec); + fs::copy_file(file.old_path, new_path, ec); if (ec) { - LOG_WRN("%s: failed to move/copy %s: %s\n", __func__, old_path.string().c_str(), ec.message().c_str()); - return; + LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str()); + return false; } } - fs::remove(old_path, ec); + fs::remove(file.old_path, ec); } - fs::remove(etag_path, ec); + fs::remove(file.etag_path, ec); - std::string filename = finalize_file(*file_info); - LOG_INF("%s: migrated %s -> %s\n", __func__, old_filename.c_str(), filename.c_str()); + std::string filename = finalize_file(*file.file); + LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str()); + return true; } void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) { @@ -614,19 +717,43 @@ void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) { continue; } + migrate_files to_migrate; + bool ok = true; + try { std::ifstream manifest(entry.path()); auto json = nl::json::parse(manifest); - for (const char * key : {"ggufFile", "mmprojFile"}) { if (json.contains(key)) { - migrate_single_file(old_cache, owner, repo, json[key], files); + if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) { + ok = false; + break; + } } } } catch (const std::exception & e) { LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what()); continue; } + + if (!ok) { + LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__); + continue; + } + + for (const auto & file : to_migrate) { + if (!migrate_file(file)) { + ok = false; + break; + } + } + + if (!ok) { + LOG_WRN("%s: migration failed: could not migrate all files\n", __func__); + continue; + } + + LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str()); fs::remove(entry.path()); } } diff --git a/common/hf-cache.h b/common/hf-cache.h index ee2e98494a..9e46f97743 100644 --- a/common/hf-cache.h +++ b/common/hf-cache.h @@ -14,6 +14,7 @@ struct hf_file { std::string final_path; std::string oid; std::string repo_id; + size_t size = 0; // only for the migration }; using hf_files = std::vector;