correct fallback logic
commit 3c1f94a49d
parent 521a13e6c6
@@ -89,7 +89,6 @@ struct quantize_state_impl {
     int i_ffn_gate = 0;
     int i_ffn_up   = 0;
 
-    int n_k_quantized = 0;
     int n_fallback = 0;
 
     bool has_imatrix = false;
@@ -493,6 +492,7 @@ static ggml_type llama_tensor_get_type(
             if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
                 new_type = GGML_TYPE_F16;
             }
+            ++qs.n_fallback;
         }
     }
 }
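Note on the hunk above: the added ++qs.n_fallback sits after the block-size check, inside an enclosing conditional that is not visible in this hunk, so a tensor that reaches that branch is counted once as a fallback whether or not it ends up as F16. Below is a minimal self-contained sketch of that counting rule; block_size(), pick_type() and the integer type codes are stand-ins invented for illustration, not the real ggml API (the real code uses ggml_blck_size() and GGML_TYPE_F16).

    #include <cstdint>
    #include <cstdio>

    static int64_t block_size(int /*type*/) { return 32; }   // stand-in for ggml_blck_size()

    // Mirror of the hunk: inside the branch that handles shape-incompatible tensors,
    // drop to the fallback type when the row length is not a multiple of the block
    // size, then count the tensor as a fallback.
    static int pick_type(int64_t ne0, int wanted, int fallback, bool needs_handling, int & n_fallback) {
        int new_type = wanted;
        if (needs_handling) {
            if (ne0 % block_size(new_type) != 0) {
                new_type = fallback;      // GGML_TYPE_F16 in the real code
            }
            ++n_fallback;
        }
        return new_type;
    }

    int main() {
        int n_fallback = 0;
        pick_type(4096, 1, 0, false, n_fallback);  // compatible tensor: not counted
        pick_type(1000, 1, 0, true,  n_fallback);  // 1000 % 32 != 0: falls back and is counted
        printf("n_fallback = %d\n", n_fallback);   // prints 1
    }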
@@ -505,20 +505,6 @@ static ggml_type llama_tensor_get_type(
     return new_type;
 }
 
-// update internal quantization state statistics based on the tensor name
-static void llama_tensor_update_stats(quantize_state_impl & qs, const std::string & name) {
-    if (name.find("attn_v.weight") != std::string::npos ||
-        name.find("attn_kv_b.weight") != std::string::npos) {
-        ++qs.i_attention_wv;
-    } else if (name.find("ffn_down") != std::string::npos) {
-        ++qs.i_ffn_down;
-    } else if (name.find("ffn_gate") != std::string::npos) {
-        ++qs.i_ffn_gate;
-    } else if (name.find("ffn_up") != std::string::npos) {
-        ++qs.i_ffn_up;
-    }
-}
-
 static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     if (nthread < 2) {
         // single-thread
@@ -867,7 +853,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             tensor_requires_imatrix(tensor, target_type)
         ) {
             if (params->dry_run) {
-                will_require_imatrix = true; // set flag for warning later, but continue with dry run
+                // set flag for warning later, but continue with dry run
+                will_require_imatrix = true;
             } else {
                 LLAMA_LOG_ERROR("\n============================================================================\n"
                         " ERROR: this quantization requires an importance matrix!\n"
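The hunk above only moves the trailing comment onto its own line; the behaviour is unchanged: when a tensor/type combination needs an importance matrix, a dry run records the fact and keeps going, while a real run aborts with the error banner. A small sketch of that control flow, assuming hypothetical stand-ins (requires_imatrix() here plays the role of tensor_requires_imatrix(), and fprintf replaces LLAMA_LOG_ERROR):

    #include <cstdio>

    // Hypothetical stand-in for tensor_requires_imatrix(): does quantizing this
    // tensor to the target type need an importance matrix?
    static bool requires_imatrix(int target_type) { return target_type == 2; }

    // Dry runs only set a flag so a warning can be printed later; real runs abort.
    static bool check_tensor(int target_type, bool dry_run, bool & will_require_imatrix) {
        if (requires_imatrix(target_type)) {
            if (dry_run) {
                // set flag for warning later, but continue with dry run
                will_require_imatrix = true;
            } else {
                fprintf(stderr, "ERROR: this quantization requires an importance matrix!\n");
                return false;
            }
        }
        return true;
    }

    int main() {
        bool will_require_imatrix = false;
        check_tensor(2, /*dry_run=*/true, will_require_imatrix);  // flag set, no abort
        printf("will_require_imatrix = %d\n", will_require_imatrix);
    }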
@@ -880,6 +867,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         }
     }
 
+    qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights
    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // Set split info if needed
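This added reset is the heart of the "correct fallback logic" fix: the preliminary loop over the weights also goes through llama_tensor_get_type(), which now bumps qs.n_fallback, so the counter must be zeroed before the real quantization loop or the final warning would double-count. A tiny self-contained sketch of that two-pass pattern; the stats struct, get_type() and the incompatible[] array are invented for illustration:

    #include <cstdio>

    struct stats { int n_fallback = 0; };

    // Hypothetical type chooser that bumps the counter as a side effect,
    // the way llama_tensor_get_type() does after this commit.
    static int get_type(stats & qs, bool incompatible) {
        if (incompatible) { ++qs.n_fallback; return 0; }
        return 1;
    }

    int main() {
        stats qs;
        const bool incompatible[] = { false, true, false, true };

        // preliminary loop over the weights (e.g. to plan the output)
        for (bool bad : incompatible) get_type(qs, bad);

        // may have been falsely incremented by the preliminary loop over weights
        qs.n_fallback = 0;

        // real quantization loop
        for (bool bad : incompatible) get_type(qs, bad);

        printf("n_fallback = %d\n", qs.n_fallback);  // prints 2, not 4
    }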
@@ -964,15 +952,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         ggml_type new_type = default_type;
 
-        // if so, what will be the target type?
+        // if so, what will be the new type?
         if (do_quantize) {
             new_type = llama_tensor_get_type(qs, params, tensor, default_type);
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             do_quantize = tensor->type != new_type;
         }
 
-        llama_tensor_update_stats(qs, name);
+        // count stats for this tensor based on its name
+        if (name.find("attn_v.weight") != std::string::npos ||
+            name.find("attn_kv_b.weight") != std::string::npos) {
+            ++qs.i_attention_wv;
+        } else if (name.find("ffn_down") != std::string::npos) {
+            ++qs.i_ffn_down;
+        } else if (name.find("ffn_gate") != std::string::npos) {
+            ++qs.i_ffn_gate;
+        } else if (name.find("ffn_up") != std::string::npos) {
+            ++qs.i_ffn_up;
+        }
     }
 
     void * new_data;
     size_t new_size;
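The inlined block above performs the same substring matching as the llama_tensor_update_stats() helper removed earlier: each tensor name advances one of the per-category counters so the type chooser knows which layer index it is on. A standalone sketch of that matching order (the counters struct and the example tensor names are illustrative; the blk.N.* naming follows the usual GGUF convention):

    #include <cstdio>
    #include <string>

    struct counters {
        int i_attention_wv = 0, i_ffn_down = 0, i_ffn_gate = 0, i_ffn_up = 0;
    };

    // Same classification as the inlined block: the first matching substring wins.
    static void update_stats(counters & qs, const std::string & name) {
        if (name.find("attn_v.weight") != std::string::npos ||
            name.find("attn_kv_b.weight") != std::string::npos) {
            ++qs.i_attention_wv;
        } else if (name.find("ffn_down") != std::string::npos) {
            ++qs.i_ffn_down;
        } else if (name.find("ffn_gate") != std::string::npos) {
            ++qs.i_ffn_gate;
        } else if (name.find("ffn_up") != std::string::npos) {
            ++qs.i_ffn_up;
        }
    }

    int main() {
        counters qs;
        update_stats(qs, "blk.0.attn_v.weight");
        update_stats(qs, "blk.0.ffn_gate.weight");
        update_stats(qs, "blk.0.ffn_up.weight");
        printf("wv=%d down=%d gate=%d up=%d\n",
               qs.i_attention_wv, qs.i_ffn_down, qs.i_ffn_gate, qs.i_ffn_up);
    }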
@@ -1131,8 +1129,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     if (qs.n_fallback > 0) {
-        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
-                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+        LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n",
+                __func__, qs.n_fallback);
     }
 }
 
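With n_k_quantized gone from quantize_state_impl, the warning can no longer report "X of Y tensor(s)"; it now reports only the fallback count itself. Distilled to a standalone snippet (plain fprintf instead of LLAMA_LOG_WARN, and the count of 3 is just an example value):

    #include <cstdio>

    static void report_fallbacks(const char * func, int n_fallback) {
        if (n_fallback > 0) {
            fprintf(stderr, "%s: WARNING: %d tensor(s) required fallback quantization\n",
                    func, n_fallback);
        }
    }

    int main() {
        report_fallbacks("llama_model_quantize_impl", 3);  // hypothetical count
    }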
@@ -1143,7 +1141,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 llama_model_quantize_params llama_model_quantize_default_params() {
     llama_model_quantize_params result = {
         /*.nthread =*/ 0,
-        /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q8_0,
         /*.output_tensor_type =*/ GGML_TYPE_COUNT,
         /*.token_embedding_type =*/ GGML_TYPE_COUNT,
         /*.allow_requantize =*/ false,
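The final hunk changes the ftype that llama_model_quantize_default_params() hands back. A hedged usage sketch of how a caller would pick up those defaults and override them before quantizing; it assumes the usual llama.h entry points (llama_model_quantize_default_params() and llama_model_quantize(), which returns 0 on success) and is not taken from this commit, so details may differ in this tree:

    // Sketch only: assumes the public llama.h API.
    #include "llama.h"

    int quantize_file(const char * fname_inp, const char * fname_out) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        // the default ftype is now LLAMA_FTYPE_MOSTLY_Q8_0 per the hunk above; override as needed
        params.ftype   = LLAMA_FTYPE_MOSTLY_Q5_1;
        params.nthread = 8;
        return llama_model_quantize(fname_inp, fname_out, &params) == 0 ? 0 : 1;
    }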