Update attn_qkv schema; make llama_ftype_get_default_type return GGML_TYPE_COUNT on an invalid ftype instead of throwing, and move the throw into the caller (llama_model_quantize_impl)

This commit is contained in:
Colin Kealty 2026-03-10 11:24:24 -04:00
parent 2015dea820
commit 544745c034
4 changed files with 1907 additions and 367 deletions

View File

@ -355,7 +355,7 @@ static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tenso
return return_type;
}
ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
@ -782,7 +782,7 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ3_S:
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
default: return GGML_TYPE_COUNT;
}
}
@ -874,6 +874,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
ggml_type default_type = llama_ftype_get_default_type(ftype);
if (default_type == GGML_TYPE_COUNT) {
throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
// mmap consistently increases speed on Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff