context : refactor model_supports_compute_type to lambda

keep the "// llama_context //" comment banner looking nice
commit 233f5ab82d
parent d38923a1ec
@@ -14,32 +14,6 @@
 #include <limits>
 #include <stdexcept>
 
-static bool model_supports_compute_type(enum llm_arch arch, ggml_type compute_type) {
-    // F32 is always supported - it's the default/safe precision
-    if (compute_type == GGML_TYPE_F32) {
-        return true;
-    }
-
-    // Nowadays FP16 and BF16 support is model-specific.
-    // Add models here as their required ops are 'compute_type' implemented and validated.
-    // Example (uncomment when ready):
-    // if (compute_type == GGML_TYPE_F16 || compute_type == GGML_TYPE_BF16) {
-    //     switch (arch) {
-    //         case LLM_ARCH_QWEN2:
-    //         case LLM_ARCH_QWEN2MOE:
-    //         case LLM_ARCH_QWEN3:
-    //             // ... other validated models
-    //             return true;
-    //         default:
-    //             return false;
-    //     }
-    // }
-
-    // No models enabled yet for non-F32 compute types
-    (void)arch;
-    return false;
-}
-
 //
 // llama_context
 //
@@ -192,15 +166,33 @@ llama_context::llama_context(
             break;
     }
 
-    // check if the model supports the requested compute type
-    if (cparams.compute_type != GGML_TYPE_F32) {
-        if (!model_supports_compute_type(model.arch, cparams.compute_type)) {
-            LLAMA_LOG_WARN("%s: model arch '%s' does not yet support compute_type %s, "
-                    "falling back to F32. To enable, the required ops must be implemented first.\n",
-                    __func__, llm_arch_name(model.arch),
-                    ggml_type_name(cparams.compute_type));
-            cparams.compute_type = GGML_TYPE_F32;
-        }
-    }
+    // Nowadays FP16 and BF16 support is model-specific.
+    // Add models here as their required ops are 'compute_type' implemented and validated.
+    auto model_supports_compute_type = [&](ggml_type ct) -> bool {
+        if (ct == GGML_TYPE_F32) {
+            return true; // F32 is always supported
+        }
+        // Example (uncomment when ready):
+        // if (ct == GGML_TYPE_F16 || ct == GGML_TYPE_BF16) {
+        //     switch (model.arch) {
+        //         case LLM_ARCH_QWEN2:
+        //         case LLM_ARCH_QWEN2MOE:
+        //         case LLM_ARCH_QWEN3:
+        //             return true;
+        //         default:
+        //             return false;
+        //     }
+        // }
+        (void)model.arch; // no models enabled yet for non-F32 compute types
+        return false;
+    };
+
+    if (!model_supports_compute_type(cparams.compute_type)) {
+        LLAMA_LOG_WARN("%s: model arch '%s' does not yet support compute_type %s, "
+                "falling back to F32. To enable, the required ops must be implemented first.\n",
+                __func__, llm_arch_name(model.arch),
+                ggml_type_name(cparams.compute_type));
+        cparams.compute_type = GGML_TYPE_F32;
+    }
 
     // with causal attention, the batch size is limited by the context size
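For reference, here is a minimal, self-contained sketch of the pattern this diff applies: a file-scope helper that needed the architecture passed in explicitly becomes a lambda defined at its only call site, capturing the surrounding state by reference. All names in the sketch (ctx_t, arch_t, type_t, supports_compute_type) are illustrative stand-ins, not the actual llama.cpp declarations; the real code uses llm_arch, ggml_type, and cparams as shown in the hunks above.

// illustrative sketch only -- not llama.cpp code
#include <cstdio>

enum class arch_t { qwen2, qwen3, other };
enum class type_t { f32, f16, bf16 };

static const char * type_name(type_t t) {
    switch (t) {
        case type_t::f32:  return "f32";
        case type_t::f16:  return "f16";
        case type_t::bf16: return "bf16";
    }
    return "?";
}

struct ctx_t {
    arch_t arch;
    type_t compute_type;

    ctx_t(arch_t a, type_t requested) : arch(a), compute_type(requested) {
        // the lambda captures `this`, so `arch` no longer has to be a parameter
        auto supports_compute_type = [&](type_t ct) -> bool {
            if (ct == type_t::f32) {
                return true; // F32 is always supported
            }
            (void)arch;      // no architectures enabled yet for non-F32 types
            return false;
        };

        if (!supports_compute_type(compute_type)) {
            std::printf("compute type %s not supported yet, falling back to f32\n",
                        type_name(compute_type));
            compute_type = type_t::f32;
        }
    }
};

int main() {
    ctx_t ctx(arch_t::qwen3, type_t::f16); // triggers the fallback message
    std::printf("effective compute type: %s\n", type_name(ctx.compute_type));
    return 0;
}

Defining the check as a capturing lambda keeps it next to its single call site and drops the explicit arch parameter, which is what lets the commit remove the helper from file scope and leave the "// llama_context //" banner directly after the includes.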