context : add model_supports_compute_type check; non-F32 compute types are still model-specific.

shaobo.xie 2026-02-12 12:19:44 +08:00
parent e30dc63cb6
commit f47e50a18b
1 changed file with 37 additions and 0 deletions


@@ -14,6 +14,32 @@
#include <limits>
#include <stdexcept>
static bool model_supports_compute_type(enum llm_arch arch, ggml_type compute_type) {
    // F32 is always supported - it's the default/safe precision
    if (compute_type == GGML_TYPE_F32) {
        return true;
    }

    // FP16 and BF16 support is currently model-specific.
    // Add models here once their required ops are implemented and validated for the requested compute_type.
    // Example (uncomment when ready):
    // if (compute_type == GGML_TYPE_F16 || compute_type == GGML_TYPE_BF16) {
    //     switch (arch) {
    //         case LLM_ARCH_QWEN2:
    //         case LLM_ARCH_QWEN2MOE:
    //         case LLM_ARCH_QWEN3:
    //             // ... other validated models
    //             return true;
    //         default:
    //             return false;
    //     }
    // }

    // No models enabled yet for non-F32 compute types
    (void) arch;

    return false;
}
//
// llama_context
//
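For illustration only, this is roughly what the whitelist could look like once an architecture's F16/BF16 ops are implemented and validated. The Qwen2 case is simply the commented example from the hunk above; nothing is actually enabled by this commit:

static bool model_supports_compute_type(enum llm_arch arch, ggml_type compute_type) {
    // F32 is always supported
    if (compute_type == GGML_TYPE_F32) {
        return true;
    }

    // hypothetical: allow F16/BF16 only for architectures whose ops have been validated
    if (compute_type == GGML_TYPE_F16 || compute_type == GGML_TYPE_BF16) {
        switch (arch) {
            case LLM_ARCH_QWEN2: // example only - not validated yet
                return true;
            default:
                return false;
        }
    }

    return false;
}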
@@ -166,6 +192,17 @@ llama_context::llama_context(
            break;
    }

    // check if the model supports the requested compute type
    if (cparams.compute_type != GGML_TYPE_F32) {
        if (!model_supports_compute_type(model.arch, cparams.compute_type)) {
            LLAMA_LOG_WARN("%s: model arch '%s' does not yet support compute_type %s, "
                    "falling back to F32. To enable it, the required ops must be implemented first.\n",
                    __func__, llm_arch_name(model.arch),
                    ggml_type_name(cparams.compute_type));
            cparams.compute_type = GGML_TYPE_F32;
        }
    }

    // with causal attention, the batch size is limited by the context size
    cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
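
From the caller's side the fallback is transparent apart from the warning. A minimal usage sketch, assuming this branch exposes the requested type as a compute_type field on llama_context_params (the field name is inferred from cparams.compute_type above, not confirmed) and using the current llama.cpp API names:

#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.compute_type = GGML_TYPE_F16; // assumed field added by this branch

    // for an arch not in the whitelist, the constructor logs the warning above
    // and the context is created with F32 compute
    llama_context * ctx = llama_init_from_model(model, cparams);

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}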