refactor: pass quantize_state_impl to the quantization helpers by pointer and count tensor stats only in the main loop
commit a26db356c9 (parent 3c1f94a49d)
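The diff below moves the quantize_state_impl definition above its first use, extends it with a do_count flag, heap-allocates it with std::make_unique, and switches the helper functions from taking the state by reference to taking it by pointer; tensor statistics are now collected only during the main loop over the weights. A minimal, self-contained sketch of that ownership and gating pattern (model_t, params_t and state_impl are illustrative stand-ins, not types from this codebase):

    #include <memory>
    #include <cstdio>

    // Hypothetical stand-ins for llama_model / llama_model_quantize_params.
    struct model_t  { int n_layer = 32; };
    struct params_t { bool dry_run = false; };

    struct state_impl {
        const model_t  & model;
        const params_t * params;
        int  n_fallback = 0;
        bool do_count   = false;   // when false, helpers skip updating the counters

        state_impl(const model_t & model, const params_t * params)
            : model(model), params(params) {}
    };

    // Helpers receive the state by pointer, mirroring the new
    // llama_tensor_get_type_impl(quantize_state_impl * qs, ...) signature.
    static void record_fallback(state_impl * qs) {
        if (qs->do_count) {
            ++qs->n_fallback;
        }
    }

    int main() {
        model_t  model;
        params_t params;

        // The owner keeps the state in a unique_ptr and hands raw pointers to the helpers.
        auto qs = std::make_unique<state_impl>(model, &params);

        record_fallback(qs.get());   // preliminary pass: do_count == false, nothing recorded
        qs->do_count = true;         // start counting stats for the main loop
        record_fallback(qs.get());   // main loop: counted

        std::printf("n_fallback = %d\n", qs->n_fallback);   // prints 1
        return 0;
    }

Because the preliminary pass no longer touches the counters, the old "reset n_fallback before the main loop" workaround can be dropped, which is exactly what the later hunks do.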
@@ -11,6 +11,7 @@
 #include <mutex>
 #include <regex>
 #include <thread>
+#include <memory>
 #include <unordered_map>
 
 // Quantization types. Changes to this struct must be replicated in quantize.cpp
@@ -19,6 +20,35 @@ struct tensor_quantization {
 ggml_type quant = GGML_TYPE_COUNT;
 };
 
+struct quantize_state_impl {
+const llama_model & model;
+const llama_model_quantize_params * params;
+
+int n_attention_wv = 0;
+int n_ffn_down = 0;
+int n_ffn_gate = 0;
+int n_ffn_up = 0;
+int i_attention_wv = 0;
+int i_ffn_down = 0;
+int i_ffn_gate = 0;
+int i_ffn_up = 0;
+
+int n_fallback = 0;
+
+bool has_imatrix = false;
+
+// used to figure out if a model shares tok_embd with the output weight
+bool has_output = false;
+
+// if this flag is false, the code will skip updating this struct
+bool do_count = false;
+
+quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
+: model(model)
+, params(params)
+{}
+};
+
 static void zeros(std::ofstream & file, size_t n) {
 char zero = 0;
 for (size_t i = 0; i < n; ++i) {
@@ -76,32 +106,6 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map<
 return orig_name;
 }
 
-struct quantize_state_impl {
-const llama_model & model;
-const llama_model_quantize_params * params;
-
-int n_attention_wv = 0;
-int n_ffn_down = 0;
-int n_ffn_gate = 0;
-int n_ffn_up = 0;
-int i_attention_wv = 0;
-int i_ffn_down = 0;
-int i_ffn_gate = 0;
-int i_ffn_up = 0;
-
-int n_fallback = 0;
-
-bool has_imatrix = false;
-
-// used to figure out if a model shares tok_embd with the output weight
-bool has_output = false;
-
-quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
-: model(model)
-, params(params)
-{}
-};
-
 static void llama_tensor_dequantize_impl(
 ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
 const size_t nelements, const int nthread
@@ -177,7 +181,7 @@ static void llama_tensor_dequantize_impl(
 // internal standard logic for selecting the target tensor type for a specific
 // quantization mixture & model architecture
 static ggml_type llama_tensor_get_type_impl(
-quantize_state_impl & qs,
+quantize_state_impl * qs,
 ggml_type new_type,
 const ggml_tensor * tensor,
 const llama_ftype ftype
@@ -185,13 +189,13 @@ static ggml_type llama_tensor_get_type_impl(
 const std::string name = ggml_get_name(tensor);
 
 // TODO: avoid hardcoded tensor names - use the TN_* constants
-const llm_arch arch = qs.model.arch;
+const llm_arch arch = qs->model.arch;
 const auto tn = LLM_TN(arch);
 
 auto use_more_bits = [](int i_layer, int n_layers) -> bool {
 return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
 };
-const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+const int n_expert = std::max(1, (int)qs->model.hparams.n_expert);
 auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
 if (n_expert > 1) {
 // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
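The use_more_bits lambda shown above marks the first eighth of the layers, the last eighth, and every third layer of the middle stretch for higher-bit quantization. A small standalone check of which layers that selects (the 32-layer count is only an illustration, not taken from the commit):

    #include <cstdio>

    // Same predicate as the use_more_bits lambda in the hunk above.
    static bool use_more_bits(int i_layer, int n_layers) {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
    }

    int main() {
        const int n_layers = 32;                 // illustrative layer count
        for (int i = 0; i < n_layers; ++i) {
            if (use_more_bits(i, n_layers)) {
                std::printf("%d ", i);           // layers that get the higher-bit type
            }
        }
        std::printf("\n");
        return 0;
    }

For 32 layers this prints 0 1 2 3 6 9 12 15 18 21 24 27 28 29 30 31.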
@@ -210,9 +214,9 @@ static ggml_type llama_tensor_get_type_impl(
 
 // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
 // with the quantization of the output tensor
-if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
-if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
-new_type = qs.params->output_tensor_type;
+if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs->has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
+if (qs->params->output_tensor_type < GGML_TYPE_COUNT) {
+new_type = qs->params->output_tensor_type;
 } else {
 const int64_t nx = tensor->ne[0];
 const int64_t qk_k = ggml_blck_size(new_type);
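The output_tensor_type < GGML_TYPE_COUNT test above is how the code checks whether the caller supplied an explicit override: the enum's one-past-the-last value is treated as "not set". A tiny sketch of that convention, using a hypothetical enum rather than ggml_type:

    // Hypothetical enum following the same "_COUNT means unset" convention as ggml_type.
    enum example_type { EX_TYPE_Q4_K, EX_TYPE_Q6_K, EX_TYPE_F16, EX_TYPE_COUNT };

    // Values below EX_TYPE_COUNT are explicit user choices; EX_TYPE_COUNT means "no override".
    static example_type resolve(example_type requested, example_type computed_default) {
        return requested < EX_TYPE_COUNT ? requested : computed_default;
    }

    int main() {
        example_type t1 = resolve(EX_TYPE_Q6_K,  EX_TYPE_Q4_K);  // override honoured -> EX_TYPE_Q6_K
        example_type t2 = resolve(EX_TYPE_COUNT, EX_TYPE_Q4_K);  // no override -> EX_TYPE_Q4_K
        return (t1 == EX_TYPE_Q6_K && t2 == EX_TYPE_Q4_K) ? 0 : 1;
    }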
@@ -241,8 +245,8 @@ static ggml_type llama_tensor_get_type_impl(
 new_type = GGML_TYPE_Q8_0;
 }
 } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
-if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
-new_type = qs.params->token_embedding_type;
+if (qs->params->token_embedding_type < GGML_TYPE_COUNT) {
+new_type = qs->params->token_embedding_type;
 } else {
 if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -261,19 +265,19 @@ static ggml_type llama_tensor_get_type_impl(
 } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
 ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
 if (name.find("attn_v.weight") != std::string::npos) {
-if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+if (qs->model.hparams.n_gqa() >= 4 || qs->model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
 else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
 }
-else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+else if (qs->model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
 new_type = GGML_TYPE_Q4_K;
 }
 else if (name.find("ffn_down") != std::string::npos) {
-if (qs.i_ffn_down < qs.n_ffn_down/8) {
+if (qs->i_ffn_down < qs->n_ffn_down/8) {
 new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
 }
 }
 else if (name.find("attn_output.weight") != std::string::npos) {
-if (qs.model.hparams.n_expert == 8) {
+if (qs->model.hparams.n_expert == 8) {
 new_type = GGML_TYPE_Q5_K;
 } else {
 if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
@@ -282,43 +286,43 @@ static ggml_type llama_tensor_get_type_impl(
 }
 } else if (name.find("attn_v.weight") != std::string::npos) {
 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
-new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
 }
-else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs->model.hparams.n_gqa() >= 4) {
 new_type = GGML_TYPE_Q4_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs->has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
 }
-else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
+else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs->model.hparams.n_gqa() >= 4) {
 new_type = GGML_TYPE_Q4_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
 new_type = GGML_TYPE_Q4_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+new_type = qs->i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs->model.hparams.n_gqa() >= 4) {
 new_type = GGML_TYPE_Q5_K;
 }
 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-if (qs.model.type == LLM_TYPE_70B) {
+use_more_bits(qs->i_attention_wv, qs->n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs->i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+if (qs->model.type == LLM_TYPE_70B) {
 // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
 // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
 // nearly negligible increase in model size by quantizing this tensor with more bits:
 if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
 }
-if (qs.model.hparams.n_expert == 8) {
+if (qs->model.hparams.n_expert == 8) {
 // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
 // TODO: explore better strategies
 new_type = GGML_TYPE_Q8_0;
 }
 } else if (name.find("attn_k.weight") != std::string::npos) {
-if (qs.model.hparams.n_expert == 8) {
+if (qs->model.hparams.n_expert == 8) {
 // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
 // TODO: explore better strategies
 new_type = GGML_TYPE_Q8_0;
@@ -337,13 +341,13 @@ static ggml_type llama_tensor_get_type_impl(
 new_type = GGML_TYPE_IQ2_S;
 }
 } else if (name.find("ffn_down") != std::string::npos) {
-auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+auto info = layer_info(qs->i_ffn_down, qs->n_ffn_down, name.c_str());
 int i_layer = info.first, n_layer = info.second;
 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
 if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
 }
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs->has_imatrix) {
 new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -352,7 +356,7 @@ static ggml_type llama_tensor_get_type_impl(
 : GGML_TYPE_Q3_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
-(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+(qs->model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
 new_type = GGML_TYPE_Q4_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -366,7 +370,7 @@ static ggml_type llama_tensor_get_type_impl(
 if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
 }
 }
-else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs->has_imatrix) {
 new_type = GGML_TYPE_Q5_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
@@ -374,7 +378,7 @@ static ggml_type llama_tensor_get_type_impl(
 new_type = GGML_TYPE_Q5_K;
 }
 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
-&& qs.has_imatrix && i_layer < n_layer/8) {
+&& qs->has_imatrix && i_layer < n_layer/8) {
 // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
 // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
 // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
@@ -382,7 +386,7 @@ static ggml_type llama_tensor_get_type_impl(
 }
 } else if (name.find("attn_output.weight") != std::string::npos) {
 if (arch != LLM_ARCH_FALCON) {
-if (qs.model.hparams.n_expert == 8) {
+if (qs->model.hparams.n_expert == 8) {
 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
 ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
 ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
@@ -408,14 +412,14 @@ static ggml_type llama_tensor_get_type_impl(
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
 }
 else if (name.find("ffn_gate") != std::string::npos) {
-auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
+auto info = layer_info(qs->i_ffn_gate, qs->n_ffn_gate, name.c_str());
 int i_layer = info.first, n_layer = info.second;
 if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
 new_type = GGML_TYPE_IQ3_XXS;
 }
 }
 else if (name.find("ffn_up") != std::string::npos) {
-auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
+auto info = layer_info(qs->i_ffn_up, qs->n_ffn_up, name.c_str());
 int i_layer = info.first, n_layer = info.second;
 if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
 new_type = GGML_TYPE_IQ3_XXS;
@@ -426,7 +430,7 @@ static ggml_type llama_tensor_get_type_impl(
 
 // determine the ggml_type that this tensor should be quantized to
 static ggml_type llama_tensor_get_type(
-quantize_state_impl & qs,
+quantize_state_impl * qs,
 const llama_model_quantize_params * params,
 const ggml_tensor * tensor,
 const ggml_type default_type
@@ -492,7 +496,9 @@ static ggml_type llama_tensor_get_type(
 if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
 new_type = GGML_TYPE_F16;
 }
-++qs.n_fallback;
+if (qs->do_count) {
+++qs->n_fallback;
+}
 }
 }
 }
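The surrounding context falls back to a different type when tensor->ne[0] (the row length) is not divisible by the block size of the chosen quantization type, and with this change the fallback is only counted when do_count is set. A toy sketch of the divisibility check itself, with made-up block sizes standing in for ggml_blck_size():

    #include <cstdio>

    // Hypothetical block sizes; the real values come from ggml_blck_size() in ggml.
    static int blck_size(int type) {
        switch (type) {
            case 0:  return 256;  // a k-quant-style type quantized in blocks of 256 values
            case 1:  return 32;   // a smaller-block type
            default: return 1;    // plain float formats have no blocking constraint
        }
    }

    // Mirrors the shape check above: a row of n elements can only use `type`
    // if its length is divisible by that type's block size.
    static bool row_compatible(long long n, int type) {
        return n % blck_size(type) == 0;
    }

    int main() {
        std::printf("%d\n", row_compatible(4096, 0));  // 4096 % 256 == 0 -> 1
        std::printf("%d\n", row_compatible(4000, 0));  // 4000 % 256 != 0 -> 0, would trigger the fallback
        return 0;
    }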
@@ -708,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 model.load_hparams(ml);
 model.load_stats (ml);
 
-quantize_state_impl qs(model, params);
+auto qs = std::make_unique<quantize_state_impl>(model, params);
 
 if (params->only_copy) {
 ftype = ml.ftype;
@@ -718,7 +724,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
 if (imatrix_data) {
 LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
-qs.has_imatrix = true;
+qs->has_imatrix = true;
 // check imatrix for nans or infs
 for (const auto & kv : *imatrix_data) {
 for (float f : kv.second) {
@@ -830,15 +836,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 const ggml_tensor * tensor = it->tensor;
 const std::string name = tensor->name;
 
-// TODO: avoid hardcoded tensor names - use the TN_* constants
-if (name.find("attn_v.weight") != std::string::npos ||
-name.find("attn_qkv.weight") != std::string::npos ||
-name.find("attn_kv_b.weight")!= std::string::npos) {
-++qs.n_attention_wv;
-} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
-qs.has_output = true;
-}
-
 // populate the original tensors so we get an initial meta data
 uint16_t i_split = params->keep_split ? it->idx : 0;
 if (!ctx_outs[i_split]) {
@@ -867,9 +864,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 }
 }
 
-qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights
-qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
-
 // Set split info if needed
 if (n_split > 1) {
 for (size_t i = 0; i < ctx_outs.size(); ++i) {
@@ -918,6 +912,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 size_t total_size_org = 0;
 size_t total_size_new = 0;
 
+qs->n_ffn_down = qs->n_ffn_gate = qs->n_ffn_up = (int)model.hparams.n_layer;
+qs->do_count = true; // we start counting stats for the main loop
+
 // iterate over all weights (main loop)
 for (const auto * it : weights) {
 const auto & weight = *it;
@@ -931,6 +928,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 const std::string name = ggml_get_name(tensor);
 const size_t tensor_size = ggml_nbytes(tensor);
 
+// TODO: avoid hardcoded tensor names - use the TN_* constants
+if (name.find("attn_v.weight") != std::string::npos ||
+name.find("attn_qkv.weight") != std::string::npos ||
+name.find("attn_kv_b.weight")!= std::string::npos) {
+++qs->n_attention_wv;
+} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+qs->has_output = true;
+}
+
 if (!params->dry_run) {
 if (!ml.use_mmap) {
 if (read_data.size() < tensor_size) {
@@ -962,13 +968,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 // count stats for this tensor based on its name
 if (name.find("attn_v.weight") != std::string::npos ||
 name.find("attn_kv_b.weight") != std::string::npos) {
-++qs.i_attention_wv;
+++qs->i_attention_wv;
 } else if (name.find("ffn_down") != std::string::npos) {
-++qs.i_ffn_down;
+++qs->i_ffn_down;
 } else if (name.find("ffn_gate") != std::string::npos) {
-++qs.i_ffn_gate;
+++qs->i_ffn_gate;
 } else if (name.find("ffn_up") != std::string::npos) {
-++qs.i_ffn_up;
+++qs->i_ffn_up;
 }
 }
 
@@ -1128,9 +1134,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 );
 }
 
-if (qs.n_fallback > 0) {
+if (qs->n_fallback > 0) {
 LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n",
-__func__, qs.n_fallback);
+__func__, qs->n_fallback);
 }
 }
 