diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index dd6c978b94..e35b4573f3 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -553,11 +553,12 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
     return quantize;
 }
 
-static ggml_type get_tensor_target_type(
+static ggml_type tensor_get_target_type(
         quantize_state_impl & qs,
         const llama_model_quantize_params * params,
         const ggml_tensor * tensor,
-        ggml_type default_type
+        ggml_type default_type,
+        bool update_stats // whether to update the qs counters
 ) {
     ggml_type new_type = default_type;
     // get more optimal quantization type based on the tensor shape, layer, etc.
@@ -597,7 +598,9 @@ static ggml_type get_tensor_target_type(
             LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
         } else {
-            ++qs.n_k_quantized;
+            if (update_stats) {
+                ++qs.n_k_quantized;
+            }
         }
 
         if (convert_incompatible_tensor) {
@@ -623,7 +626,9 @@ static ggml_type get_tensor_target_type(
                 new_type = GGML_TYPE_F16;
             }
             LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
-            ++qs.n_fallback;
+            if (update_stats) {
+                ++qs.n_fallback;
+            }
         }
     }
 }
@@ -851,7 +856,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         // TODO: we could save this per-tensor and correlate it with the vector of tensors so we
         //       don't have to call this function again later (currently twice per tensor)
-        ggml_type target_type = get_tensor_target_type(qs, params, tensor, default_type);
+        ggml_type target_type = tensor_get_target_type(qs, params, tensor, default_type, false);
 
         if (!params->imatrix &&
             tensor_allows_quantization(params, model.arch, tensor) &&
@@ -957,7 +962,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         // if so, what will be the target type?
         if (do_quantize) {
-            new_type = get_tensor_target_type(qs, params, tensor, default_type);
+            new_type = tensor_get_target_type(qs, params, tensor, default_type, true);
             // If we've decided to quantize to the same type the tensor is already
            // in then there's nothing to do.
             do_quantize = tensor->type != new_type;
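
Note: the new update_stats flag exists because tensor_get_target_type() is now called twice per tensor, once in the earlier per-tensor pass (with false, next to the !params->imatrix check) and once when choosing the final type (with true), so only the second call should bump qs.n_k_quantized / qs.n_fallback. A minimal self-contained sketch of that pattern, using hypothetical stand-ins (quant_state, pick_target_type) rather than the actual llama.cpp types:

// Sketch only: illustrates the update_stats pattern with hypothetical types,
// not the real llama.cpp quantize_state_impl / ggml_type machinery.
#include <cstdio>

struct quant_state {
    int n_k_quantized = 0; // tensors that took a k-quant type
    int n_fallback    = 0; // tensors that needed a fallback type
};

enum class qtype { q4_k, q8_0 };

// Stand-in for tensor_get_target_type(): picks a type and only touches the
// counters when update_stats is true, so a "probe" call does not skew them.
static qtype pick_target_type(quant_state & qs, int n_cols, bool update_stats) {
    if (n_cols % 256 != 0) { // shape not compatible with k-quants
        if (update_stats) {
            ++qs.n_fallback;
        }
        return qtype::q8_0;
    }
    if (update_stats) {
        ++qs.n_k_quantized;
    }
    return qtype::q4_k;
}

int main() {
    quant_state qs;
    const int cols[] = { 4096, 1000, 256 };

    for (int n : cols) {
        // first pass: query only, counters untouched
        (void) pick_target_type(qs, n, /*update_stats=*/false);
        // second pass: the real decision, counters updated exactly once
        (void) pick_target_type(qs, n, /*update_stats=*/true);
    }

    // prints "k-quantized: 2, fallback: 1"
    std::printf("k-quantized: %d, fallback: %d\n", qs.n_k_quantized, qs.n_fallback);
    return 0;
}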