diff --git a/common/arg.cpp b/common/arg.cpp
index 26c790c7e0..9c0e6fbe78 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2255,7 +2255,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             std::vector<std::string> split_arg{ it, {} };
             if (split_arg.size() >= llama_max_devices()) {
                 throw std::invalid_argument(
-                    string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
                 );
             }
             for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -2295,10 +2295,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_FIT"));
     add_opt(common_arg(
-        { "-fitt", "--fit-target" }, "MiB",
-        string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
-        [](common_params & params, int value) {
-            params.fit_params_target = value * size_t(1024*1024);
+        { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
+        string_format("target margin per device for --fit, comma-separated list of values, "
+            "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
+        [](common_params & params, const std::string & value) {
+            std::string arg_next = value;
+
+            // split string by , and /
+            const std::regex regex{ R"([,/]+)" };
+            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+            if (split_arg.size() >= llama_max_devices()) {
+                throw std::invalid_argument(
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+                );
+            }
+            if (split_arg.size() == 1) {
+                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+                return;
+            }
+            for (size_t i = 0; i < split_arg.size(); i++) {
+                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+            }
         }
     ).set_env("LLAMA_ARG_FIT_TARGET"));
     add_opt(common_arg(
diff --git a/common/common.cpp b/common/common.cpp
index 34fa3b5a42..744f0b4eeb 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
     if (params.fit_params) {
         LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
             params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
     }
 
diff --git a/common/common.h b/common/common.h
index d55a6b71fb..7794c0268b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -332,12 +332,14 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    int32_t n_gpu_layers       = -1;                 // number of layers to store in VRAM, -1 is auto, <= -2 is all
-    int32_t main_gpu           = 0;                  // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]  = {0};                // how split tensors should be distributed across GPUs
-    bool    fit_params         = true;               // whether to fit unset model/context parameters to free device memory
-    size_t  fit_params_target  = 1024 * 1024*1024;   // margin per device in bytes for fitting parameters to free memory
-    int32_t fit_params_min_ctx = 4096;               // minimum context size to set when trying to reduce memory use
+    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
+    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+
+    // margin per device in bytes for fitting parameters to free memory:
+    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
diff --git a/include/llama.h b/include/llama.h
index edc4c871a1..12e4e57d0e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -495,7 +495,7 @@ extern "C" {
             struct llama_context_params * cparams,
             float * tensor_split,                                            // writable buffer for tensor split, needs at least llama_max_devices elements
             struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-            size_t margin,                                                   // margin of memory to leave per device in bytes
+            size_t * margins,                                                // margins of memory to leave per device in bytes
             uint32_t n_ctx_min,                                              // minimum context size to set when trying to reduce memory use
             enum ggml_log_level log_level);                                  // minimum log level to print during fitting, lower levels go to debug log
 
diff --git a/src/llama.cpp b/src/llama.cpp
index dfefb3d2b5..33f51a2389 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -147,9 +147,8 @@ class llama_params_fit_exception : public std::runtime_error {
 static void llama_params_fit_impl(
         const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
         float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+        size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
     constexpr int64_t MiB = 1024*1024;
-    const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
     typedef std::vector<llama_device_memory_data> dmds_t;
 
     const llama_model_params default_mparams = llama_model_default_params();
@@ -168,6 +167,12 @@ static void llama_params_fit_impl(
         return;
     }
 
+    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
+    margins.reserve(nd);
+    for (size_t id = 0; id < nd; id++) {
+        margins.push_back(margins_s[id]);
+    }
+
     std::vector<std::string> dev_names;
     {
         dev_names.reserve(nd);
@@ -187,9 +192,10 @@ static void llama_params_fit_impl(
 
     int64_t sum_free            = 0;
     int64_t sum_projected_free  = 0;
-    int64_t min_projected_free  = INT64_MAX;
     int64_t sum_projected_used  = 0;
     int64_t sum_projected_model = 0;
+    std::vector<int64_t> projected_free_per_device;
+    projected_free_per_device.reserve(nd);
 
     if (nd > 1) {
         LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -199,45 +205,63 @@ static void llama_params_fit_impl(
         const int64_t projected_used = dmd.mb.total();
         const int64_t projected_free = dmd.free - projected_used;
+        projected_free_per_device.push_back(projected_free);
 
         sum_free            += dmd.free;
         sum_projected_used  += projected_used;
         sum_projected_free  += projected_free;
-        min_projected_free   = std::min(min_projected_free, projected_free);
         sum_projected_model += dmd.mb.model;
 
         if (nd > 1) {
-            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
-                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
-                projected_free >= 0 ? "surplus" : "deficit");
+            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
         }
     }
     assert(sum_free >= 0 && sum_projected_used >= 0);
     LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
         __func__, sum_projected_used/MiB, sum_free/MiB);
 
-    if (min_projected_free >= margin) {
-        if (nd == 1) {
+    if (nd == 1) {
+        if (projected_free_per_device[0] >= margins[0]) {
             LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                __func__, min_projected_free/MiB, margin/MiB);
+                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+            return;
+        }
+    } else {
+        bool changes_needed = false;
+        for (size_t id = 0; id < nd; id++) {
+            if (projected_free_per_device[id] < margins[id]) {
+                changes_needed = true;
+                break;
+            }
+        }
+        if (!changes_needed) {
+            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
             return;
         }
-        LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
-            __func__, min_projected_free/MiB, margin/MiB);
-        return;
     }
 
     // step 2: try reducing memory use by reducing the context size
     {
-        int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
+        int64_t global_surplus = sum_projected_free;
+        for (size_t id = 0; id < nd; id++) {
+            global_surplus -= margins[id];
+        }
         if (global_surplus < 0) {
-            LLAMA_LOG_INFO(nd == 1 ?
-                "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
-                "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
-                __func__, margin/MiB, -global_surplus/MiB);
+            if (nd == 1) {
+                LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
+                    __func__, margins[0]/MiB, -global_surplus/MiB);
+            } else {
+                LLAMA_LOG_INFO(
+                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
+                    __func__, -global_surplus/MiB);
+            }
             if (cparams->n_ctx == 0) {
                 if (hp_nct > n_ctx_min) {
-                    int64_t sum_used_target = sum_free - nd*margin_s;
+                    int64_t sum_used_target = sum_free;
+                    for (size_t id = 0; id < nd; id++) {
+                        sum_used_target -= margins[id];
+                    }
 
                     if (nd > 1) {
                         // for multiple devices we need to be more conservative in terms of how much context we think can fit:
                         //   - for dense models only whole layers can be assigned to devices
@@ -448,9 +472,9 @@ static void llama_params_fit_impl(
         const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
             path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
 
-        for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
-            global_surplus_cpu_moe += dmd.free;
-            global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
+        for (size_t id = 0; id < nd; id++) {
+            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
+            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
         }
 
         if (global_surplus_cpu_moe > 0) {
@@ -469,7 +493,7 @@ static void llama_params_fit_impl(
         std::vector<int64_t> targets; // maximum acceptable memory use per device
         targets.reserve(nd);
        for (size_t id = 0; id < nd; id++) {
-            targets.push_back(dmds_full[id].free - margin);
+            targets.push_back(dmds_full[id].free - margins[id]);
             LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
         }
 
@@ -701,11 +725,11 @@ static void llama_params_fit_impl(
 enum llama_params_fit_status llama_params_fit(
         const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
         float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+        size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
     const int64_t t0_us = llama_time_us();
     llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
     try {
-        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
+        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
         LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
     } catch (const llama_params_fit_exception & e) {
         LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp
index c7e7748ca9..f9d9cb34c7 100644
--- a/tools/fit-params/fit-params.cpp
+++ b/tools/fit-params/fit-params.cpp
@@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
     auto mparams = common_model_params_to_llama(params);
     auto cparams = common_context_params_to_llama(params);
     const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
         params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
     if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
        LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
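
Below is a minimal sketch of how a standalone caller could use the updated llama_params_fit() signature after this change, analogous to what common.cpp and tools/fit-params now do. The fit_example() wrapper, the buffer sizes, and the 1 GiB / 512 MiB margin values are illustrative assumptions, and llama_max_tensor_buft_overrides() is assumed to be available as a counterpart to llama_max_devices(), as the comment in llama.h implies.

    // Illustrative sketch only: per-device margins are now passed as a size_t array
    // (one entry per device, at least llama_max_devices() elements) instead of a single scalar.
    #include <vector>

    #include "llama.h"

    static bool fit_example(const char * model_path) {
        llama_model_params   mparams = llama_model_default_params();
        llama_context_params cparams = llama_context_default_params();

        // writable buffers required by llama_params_fit, sized as documented in llama.h
        std::vector<float> tensor_split(llama_max_devices(), 0.0f);
        std::vector<llama_model_tensor_buft_override> buft_overrides(llama_max_tensor_buft_overrides());

        // margins in bytes, one per device: keep 1 GiB free everywhere, but only 512 MiB on device 0
        std::vector<size_t> margins(llama_max_devices(), 1024u*1024u*1024u);
        margins[0] = 512u*1024u*1024u;

        const llama_params_fit_status status = llama_params_fit(
            model_path, &mparams, &cparams,
            tensor_split.data(), buft_overrides.data(),
            margins.data(),        // previously a single size_t margin
            /*n_ctx_min =*/ 4096,
            GGML_LOG_LEVEL_ERROR);

        return status == LLAMA_PARAMS_FIT_STATUS_SUCCESS;
    }

On the CLI side the same margins are filled in by --fit-target (env LLAMA_ARG_FIT_TARGET): a single value such as -fitt 1024 is broadcast to every device, while a comma-separated list such as -fitt 512,1024 sets per-device targets in MiB.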