llama_fit_params: return enum for fail vs. error (#18374)
This commit is contained in:
parent
9045c9afe5
commit
a52dc60ba3
|
|
@ -467,10 +467,16 @@ extern "C" {
|
||||||
// Frees all allocated memory
|
// Frees all allocated memory
|
||||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||||
|
|
||||||
|
enum llama_params_fit_status {
|
||||||
|
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
||||||
|
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
|
||||||
|
LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
|
||||||
|
};
|
||||||
|
|
||||||
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
|
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
|
||||||
// returns true if the parameters could be successfully modified to fit device memory
|
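// returns LLAMA_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory, otherwise LLAMA_PARAMS_FIT_STATUS_FAILURE or LLAMA_PARAMS_FIT_STATUS_ERROR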
|
||||||
// this function is NOT thread safe because it modifies the global llama logger state
|
// this function is NOT thread safe because it modifies the global llama logger state
|
||||||
LLAMA_API bool llama_params_fit(
|
LLAMA_API enum llama_params_fit_status llama_params_fit(
|
||||||
const char * path_model,
|
const char * path_model,
|
||||||
struct llama_model_params * mparams,
|
struct llama_model_params * mparams,
|
||||||
struct llama_context_params * cparams,
|
struct llama_context_params * cparams,
|
||||||
|
|
|
||||||
|
|
@ -140,6 +140,10 @@ enum layer_fraction_t {
|
||||||
};
|
};
|
||||||
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
|
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
|
||||||
|
|
||||||
|
class llama_params_fit_exception : public std::runtime_error {
|
||||||
|
using std::runtime_error::runtime_error;
|
||||||
|
};
|
||||||
|
|
||||||
static void llama_params_fit_impl(
|
static void llama_params_fit_impl(
|
||||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||||
|
|
@ -281,28 +285,28 @@ static void llama_params_fit_impl(
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
|
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
|
||||||
throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
|
throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
|
||||||
}
|
}
|
||||||
if (nd > 1) {
|
if (nd > 1) {
|
||||||
if (!tensor_split) {
|
if (!tensor_split) {
|
||||||
throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
|
throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
|
||||||
}
|
}
|
||||||
if (mparams->tensor_split) {
|
if (mparams->tensor_split) {
|
||||||
for (size_t id = 0; id < nd; id++) {
|
for (size_t id = 0; id < nd; id++) {
|
||||||
if (mparams->tensor_split[id] != 0.0f) {
|
if (mparams->tensor_split[id] != 0.0f) {
|
||||||
throw std::runtime_error("model_params::tensor_split already set by user, abort");
|
throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||||
throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
|
throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!tensor_buft_overrides) {
|
if (!tensor_buft_overrides) {
|
||||||
throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
|
throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
|
||||||
}
|
}
|
||||||
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
|
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
|
||||||
throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
|
throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
|
||||||
}
|
}
|
||||||
|
|
||||||
// step 3: iteratively fill the back to front with "dense" layers
|
// step 3: iteratively fill the back to front with "dense" layers
|
||||||
|
|
@ -385,7 +389,7 @@ static void llama_params_fit_impl(
|
||||||
tensor_buft_overrides[itbo].buft = nullptr;
|
tensor_buft_overrides[itbo].buft = nullptr;
|
||||||
itbo++;
|
itbo++;
|
||||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
||||||
throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
|
throw llama_params_fit_exception("llama_params_fit_n_tensor_buft_overrides() == "
|
||||||
+ std::to_string(ntbo) + " is insufficient for model\n");
|
+ std::to_string(ntbo) + " is insufficient for model\n");
|
||||||
}
|
}
|
||||||
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
|
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
|
||||||
|
|
@ -683,22 +687,25 @@ static void llama_params_fit_impl(
|
||||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llama_params_fit(
|
enum llama_params_fit_status llama_params_fit(
|
||||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||||
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||||
const int64_t t0_us = llama_time_us();
|
const int64_t t0_us = llama_time_us();
|
||||||
bool ok = true;
|
llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
|
||||||
try {
|
try {
|
||||||
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
|
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
|
||||||
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
|
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
|
||||||
} catch (const std::runtime_error & e) {
|
} catch (const llama_params_fit_exception & e) {
|
||||||
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
||||||
ok = false;
|
status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
|
||||||
|
} catch (const std::runtime_error & e) {
|
||||||
|
LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
|
||||||
|
status = LLAMA_PARAMS_FIT_STATUS_ERROR;
|
||||||
}
|
}
|
||||||
const int64_t t1_us = llama_time_us();
|
const int64_t t1_us = llama_time_us();
|
||||||
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
|
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
|
||||||
return ok;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
|
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
|
||||||
|
|
|
||||||
|
|
@ -26,10 +26,10 @@ int main(int argc, char ** argv) {
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
auto mparams = common_model_params_to_llama(params);
|
auto mparams = common_model_params_to_llama(params);
|
||||||
auto cparams = common_context_params_to_llama(params);
|
auto cparams = common_context_params_to_llama(params);
|
||||||
const bool success = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
||||||
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||||
if (!success) {
|
if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
|
||||||
LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
|
LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue