add --dry-run to llama-quantize
This commit is contained in:
parent
0d22288f00
commit
56c27b13ad
|
|
@ -735,24 +735,31 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
};
|
||||
|
||||
const auto tn = LLM_TN(model.arch);
|
||||
new_ofstream(0);
|
||||
|
||||
// no output file for --dry-run
|
||||
if (!params->dry_run) {
|
||||
new_ofstream(0);
|
||||
}
|
||||
|
||||
for (const auto * it : tensors) {
|
||||
const auto & weight = *it;
|
||||
ggml_tensor * tensor = weight.tensor;
|
||||
if (weight.idx != cur_split && params->keep_split) {
|
||||
if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
|
||||
close_ofstream();
|
||||
new_ofstream(weight.idx);
|
||||
}
|
||||
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
if (!ml.use_mmap) {
|
||||
if (read_data.size() < ggml_nbytes(tensor)) {
|
||||
read_data.resize(ggml_nbytes(tensor));
|
||||
if (!params->dry_run) {
|
||||
if (!ml.use_mmap) {
|
||||
if (read_data.size() < ggml_nbytes(tensor)) {
|
||||
read_data.resize(ggml_nbytes(tensor));
|
||||
}
|
||||
tensor->data = read_data.data();
|
||||
}
|
||||
tensor->data = read_data.data();
|
||||
ml.load_data_for(tensor);
|
||||
}
|
||||
ml.load_data_for(tensor);
|
||||
|
||||
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
|
||||
++idx, ml.n_tensors,
|
||||
|
|
@ -900,126 +907,148 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
quantize = tensor->type != new_type;
|
||||
}
|
||||
|
||||
if (!quantize) {
|
||||
new_type = tensor->type;
|
||||
new_data = tensor->data;
|
||||
new_size = ggml_nbytes(tensor);
|
||||
LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
} else {
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
||||
if (it == imatrix_data->end()) {
|
||||
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
||||
} else {
|
||||
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
||||
imatrix = it->second.data();
|
||||
} else {
|
||||
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
||||
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
|
||||
|
||||
// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
|
||||
// this is a significant error and it may be a good idea to abort the process if this happens,
|
||||
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
|
||||
// tok_embd should be ignored in this case, since it always causes this warning
|
||||
if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
|
||||
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
|
||||
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((new_type == GGML_TYPE_IQ2_XXS ||
|
||||
new_type == GGML_TYPE_IQ2_XS ||
|
||||
new_type == GGML_TYPE_IQ2_S ||
|
||||
new_type == GGML_TYPE_IQ1_S ||
|
||||
(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
|
||||
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
||||
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
||||
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
||||
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
|
||||
LLAMA_LOG_ERROR("============================================================\n\n");
|
||||
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
|
||||
}
|
||||
|
||||
float * f32_data;
|
||||
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
f32_data = (float *) tensor->data;
|
||||
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
||||
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
||||
// we have now decided on the target type for this tensor
|
||||
// the --dry-run option calculates the final quantization size without quantizing
|
||||
if (params->dry_run) {
|
||||
if (quantize) {
|
||||
new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
|
||||
ggml_nbytes(tensor)/1024.0/1024.0,
|
||||
new_size/1024.0/1024.0,
|
||||
ggml_type_name(new_type));
|
||||
} else {
|
||||
llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
|
||||
f32_data = (float *) f32_conv_buf.data();
|
||||
new_size = ggml_nbytes(tensor);
|
||||
LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0);
|
||||
}
|
||||
total_size_org += ggml_nbytes(tensor);
|
||||
total_size_new += new_size;
|
||||
continue;
|
||||
} else {
|
||||
// no --dry-run, perform quantization
|
||||
if (!quantize) {
|
||||
new_type = tensor->type;
|
||||
new_data = tensor->data;
|
||||
new_size = ggml_nbytes(tensor);
|
||||
LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
} else {
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
|
||||
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
||||
fflush(stdout);
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
||||
if (it == imatrix_data->end()) {
|
||||
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
||||
} else {
|
||||
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
||||
imatrix = it->second.data();
|
||||
} else {
|
||||
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
||||
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
|
||||
|
||||
if (work.size() < (size_t)nelements * 4) {
|
||||
work.resize(nelements * 4); // upper bound on size
|
||||
}
|
||||
new_data = work.data();
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
static const int64_t min_chunk_size = 32 * 512;
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
|
||||
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
||||
|
||||
// quantize each expert separately since they have different importance matrices
|
||||
new_size = 0;
|
||||
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
||||
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
||||
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
||||
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
||||
|
||||
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
||||
|
||||
// TODO: temporary sanity check that the F16 -> MXFP4 is lossless
|
||||
#if 0
|
||||
if (new_type == GGML_TYPE_MXFP4) {
|
||||
auto * x = f32_data_03;
|
||||
|
||||
//LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
|
||||
std::vector<float> deq(nrows*n_per_row);
|
||||
const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
|
||||
qtype->to_float(new_data_03, deq.data(), deq.size());
|
||||
|
||||
double err = 0.0f;
|
||||
for (int i = 0; i < (int) deq.size(); ++i) {
|
||||
err += fabsf(deq[i] - x[i]);
|
||||
//if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
|
||||
if (deq[i] != x[i]) {
|
||||
LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
|
||||
// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
|
||||
// this is a significant error and it may be a good idea to abort the process if this happens,
|
||||
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
|
||||
// tok_embd should be ignored in this case, since it always causes this warning
|
||||
if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
|
||||
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
|
||||
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
|
||||
}
|
||||
}
|
||||
}
|
||||
//LLAMA_LOG_INFO("err = %f\n", err);
|
||||
GGML_ASSERT(err == 0.00000);
|
||||
}
|
||||
if ((new_type == GGML_TYPE_IQ2_XXS ||
|
||||
new_type == GGML_TYPE_IQ2_XS ||
|
||||
new_type == GGML_TYPE_IQ2_S ||
|
||||
new_type == GGML_TYPE_IQ1_S ||
|
||||
(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
|
||||
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
||||
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
||||
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
||||
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
|
||||
LLAMA_LOG_ERROR("============================================================\n\n");
|
||||
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
|
||||
}
|
||||
|
||||
float * f32_data;
|
||||
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
f32_data = (float *) tensor->data;
|
||||
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
||||
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
||||
} else {
|
||||
llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
|
||||
f32_data = (float *) f32_conv_buf.data();
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
||||
fflush(stdout);
|
||||
|
||||
if (work.size() < (size_t)nelements * 4) {
|
||||
work.resize(nelements * 4); // upper bound on size
|
||||
}
|
||||
new_data = work.data();
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
static const int64_t min_chunk_size = 32 * 512;
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
|
||||
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
||||
|
||||
// quantize each expert separately since they have different importance matrices
|
||||
new_size = 0;
|
||||
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
||||
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
||||
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
||||
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
||||
|
||||
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
||||
|
||||
// TODO: temporary sanity check that the F16 -> MXFP4 is lossless
|
||||
#if 0
|
||||
if (new_type == GGML_TYPE_MXFP4) {
|
||||
auto * x = f32_data_03;
|
||||
|
||||
//LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
|
||||
std::vector<float> deq(nrows*n_per_row);
|
||||
const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
|
||||
qtype->to_float(new_data_03, deq.data(), deq.size());
|
||||
|
||||
double err = 0.0f;
|
||||
for (int i = 0; i < (int) deq.size(); ++i) {
|
||||
err += fabsf(deq[i] - x[i]);
|
||||
//if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
|
||||
if (deq[i] != x[i]) {
|
||||
LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
|
||||
}
|
||||
}
|
||||
//LLAMA_LOG_INFO("err = %f\n", err);
|
||||
GGML_ASSERT(err == 0.00000);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
}
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
}
|
||||
total_size_org += ggml_nbytes(tensor);
|
||||
total_size_new += new_size;
|
||||
total_size_org += ggml_nbytes(tensor);
|
||||
total_size_new += new_size;
|
||||
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
|
||||
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *) new_data, new_size);
|
||||
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
||||
// write tensor data + padding
|
||||
fout.write((const char *) new_data, new_size);
|
||||
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
||||
} // no --dry-run
|
||||
} // iterate over tensors
|
||||
|
||||
if (!params->dry_run) {
|
||||
close_ofstream();
|
||||
}
|
||||
close_ofstream();
|
||||
|
||||
LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
|
||||
|
|
|
|||
|
|
@ -626,7 +626,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
llama_backend_init();
|
||||
|
||||
// parse command line arguments
|
||||
// parse command line arguments
|
||||
const std::string fname_inp = argv[arg_idx];
|
||||
arg_idx++;
|
||||
std::string fname_out;
|
||||
|
|
@ -634,22 +634,26 @@ int main(int argc, char ** argv) {
|
|||
std::string ftype_str;
|
||||
std::string suffix = ".gguf";
|
||||
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
||||
std::string fpath;
|
||||
const size_t pos = fname_inp.find_last_of("/\\");
|
||||
if (pos != std::string::npos) {
|
||||
fpath = fname_inp.substr(0, pos + 1);
|
||||
}
|
||||
// argv[arg_idx] is the ftype directly: <input> <ftype>
|
||||
if (!params.dry_run) {
|
||||
std::string fpath;
|
||||
const size_t pos = fname_inp.find_last_of("/\\");
|
||||
if (pos != std::string::npos) {
|
||||
fpath = fname_inp.substr(0, pos + 1);
|
||||
}
|
||||
|
||||
// export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
|
||||
fname_out = fpath + "ggml-model-" + ftype_str;
|
||||
if (!params.keep_split) {
|
||||
fname_out += suffix;
|
||||
// export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
|
||||
fname_out = fpath + "ggml-model-" + ftype_str;
|
||||
if (!params.keep_split) {
|
||||
fname_out += suffix;
|
||||
}
|
||||
}
|
||||
arg_idx++;
|
||||
if (ftype_str == "COPY") {
|
||||
params.only_copy = true;
|
||||
}
|
||||
} else {
|
||||
// argv[arg_idx] is not a valid ftype, so treat it as output path: <input> <output> <ftype>
|
||||
fname_out = argv[arg_idx];
|
||||
if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
|
||||
fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
|
||||
|
|
@ -692,14 +696,21 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
|
||||
fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
|
||||
return 1;
|
||||
if (!params.dry_run) {
|
||||
if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
|
||||
fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
|
||||
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
|
||||
if (params.dry_run) {
|
||||
fprintf(stderr, "%s: calculating quantization size for '%s' as %s", __func__, fname_inp.c_str(), ftype_str.c_str());
|
||||
} else {
|
||||
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
|
||||
}
|
||||
|
||||
if (params.nthread > 0) {
|
||||
fprintf(stderr, " using %d threads", params.nthread);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue