Merge ce0ad2986b into 05fa625eac
commit 17c81c4ee6
@@ -393,6 +393,7 @@ extern "C" {
         void * kv_overrides;                 // pointer to vector containing overrides
         void * tensor_types;                 // pointer to vector containing tensor types
         void * prune_layers;                 // pointer to vector containing layer indices to prune
+        bool dry_run;                        // calculate and show the final quantization size without performing quantization
     } llama_model_quantize_params;
 
     typedef struct llama_logit_bias {
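For context, a minimal sketch of how a caller might exercise the new dry_run field through the public C API once this patch is applied. llama_model_quantize_default_params() and llama_model_quantize() are the existing entry points in llama.h; the file names and the chosen ftype are placeholders, not part of the patch.

// Hypothetical caller, assuming this patch: estimate the quantized size
// without writing any output file.
#include "llama.h"

#include <cstdio>

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type (example)
    params.dry_run = true;                      // new field: size estimate only

    // With dry_run set, the output path is still required by the signature,
    // but no file should be written.
    if (llama_model_quantize("model-f32.gguf", "model-quant.gguf", &params) != 0) {
        fprintf(stderr, "quantization dry run failed\n");
        return 1;
    }
    return 0;
}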
@@ -109,9 +109,9 @@ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
 
 std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
     char buf[256];
-    snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
+    snprintf(buf, sizeof(buf), "%6" PRId64, t->ne[0]);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, t->ne[i]);
     }
     return buf;
 }
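The only change in this hunk is the printf field width, from %5 to %6, presumably so six-digit tensor dimensions still align in the log output. A self-contained illustration of the effect, with an arbitrary example value (not part of the patch):

// Standalone illustration of the %5 -> %6 width change.
#include <cinttypes>
#include <cstdio>

int main() {
    const int64_t ne = 4096; // example dimension, chosen arbitrarily
    char buf[64];
    snprintf(buf, sizeof(buf), "%5" PRId64, ne);
    printf("old: [%s]\n", buf); // old: [ 4096]  (padded to width 5)
    snprintf(buf, sizeof(buf), "%6" PRId64, ne);
    printf("new: [%s]\n", buf); // new: [  4096] (padded to width 6)
    return 0;
}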
File diff suppressed because it is too large
@@ -120,7 +120,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
     printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n");
-    printf("       [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("       [--prune-layers] [--keep-split] [--override-kv] [--dry-run]\n");
     printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
     printf("  --allow-requantize\n");
     printf("      allow requantizing tensors that have already been quantized\n");
@@ -156,7 +156,10 @@ static void usage(const char * executable) {
     printf("      generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      override model metadata by key in the quantized model. may be specified multiple times.\n");
-    printf("      WARNING: this is an advanced option, use with care.\n\n");
+    printf("      WARNING: this is an advanced option, use with care.\n");
+    printf("  --dry-run\n");
+    printf("      calculate and show the final quantization size without performing quantization\n");
+    printf("      example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n");
     printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
     printf("-----------------------------------------------------------------------------\n");
     printf("      allowed quantization types\n");
@@ -532,6 +535,8 @@ int main(int argc, char ** argv) {
             if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--dry-run") == 0) {
+            params.dry_run = true;
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
            params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -630,22 +635,26 @@ int main(int argc, char ** argv) {
         std::string ftype_str;
         std::string suffix = ".gguf";
         if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
-            std::string fpath;
-            const size_t pos = fname_inp.find_last_of("/\\");
-            if (pos != std::string::npos) {
-                fpath = fname_inp.substr(0, pos + 1);
-            }
+            // argv[arg_idx] is the ftype directly: <input> <ftype>
+            if (!params.dry_run) {
+                std::string fpath;
+                const size_t pos = fname_inp.find_last_of("/\\");
+                if (pos != std::string::npos) {
+                    fpath = fname_inp.substr(0, pos + 1);
+                }
 
-            // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
-            fname_out = fpath + "ggml-model-" + ftype_str;
-            if (!params.keep_split) {
-                fname_out += suffix;
+                // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+                fname_out = fpath + "ggml-model-" + ftype_str;
+                if (!params.keep_split) {
+                    fname_out += suffix;
+                }
             }
             arg_idx++;
             if (ftype_str == "COPY") {
                 params.only_copy = true;
             }
         } else {
+            // argv[arg_idx] is not a valid ftype, so treat it as output path: <input> <output> <ftype>
             fname_out = argv[arg_idx];
             if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
                 fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
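Restated outside the diff, the default-output-name rule that the dry-run branch now skips: strip the input's directory, emit ggml-model-<ftype>, and append .gguf only when the output is not split. The helper name below is hypothetical, introduced only for illustration:

// Hypothetical helper restating the naming logic above; not part of the patch.
#include <cstdio>
#include <string>

static std::string default_output_name(const std::string & fname_inp,
                                       const std::string & ftype_str,
                                       bool keep_split) {
    std::string fpath;
    const size_t pos = fname_inp.find_last_of("/\\");
    if (pos != std::string::npos) {
        fpath = fname_inp.substr(0, pos + 1); // keep the trailing separator
    }
    std::string fname_out = fpath + "ggml-model-" + ftype_str;
    if (!keep_split) {
        fname_out += ".gguf"; // split output gets per-shard suffixes instead
    }
    return fname_out;
}

int main() {
    // e.g. "models/model-f32.gguf" + Q4_K -> "models/ggml-model-Q4_K.gguf"
    printf("%s\n", default_output_name("models/model-f32.gguf", "Q4_K", false).c_str());
    return 0;
}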
@@ -677,25 +686,31 @@ int main(int argc, char ** argv) {
             }
         }
 
     if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
          params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
         fprintf(stderr, "\n==========================================================================================================\n");
         fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
         fprintf(stderr, "==========================================================================================================\n\n\n");
         return 1;
     }
 
-    if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
-        fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
-        return 1;
+    if (!params.dry_run) {
+        if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
+            fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
     }
 
     print_build_info();
 
-    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
+    if (params.dry_run) {
+        fprintf(stderr, "%s: calculating quantization size for '%s' as %s", __func__, fname_inp.c_str(), ftype_str.c_str());
+    } else {
+        fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
+    }
 
     if (params.nthread > 0) {
         fprintf(stderr, " using %d threads", params.nthread);
     }
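The non-throwing overload of std::filesystem::equivalent() used in the guard above returns false and sets the error code when either path does not exist, which is why the check is safe to run before any output file has been created. A minimal standalone sketch, with placeholder paths:

// Standalone sketch of the self-overwrite guard; paths are placeholders.
#include <cstdio>
#include <filesystem>
#include <system_error>

int main() {
    std::error_code ec;
    // true only if both paths resolve to the same existing file;
    // a missing path sets ec and yields false instead of throwing.
    if (std::filesystem::equivalent("model-f32.gguf", "model-f32.gguf", ec)) {
        fprintf(stderr, "error: input and output files are the same\n");
        return 1;
    }
    return 0;
}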