Merge ce0ad2986b into 05fa625eac
commit 17c81c4ee6
@@ -393,6 +393,7 @@ extern "C" {
         void * kv_overrides;                 // pointer to vector containing overrides
         void * tensor_types;                 // pointer to vector containing tensor types
         void * prune_layers;                 // pointer to vector containing layer indices to prune
+        bool dry_run;                        // calculate and show the final quantization size without performing quantization
     } llama_model_quantize_params;
 
     typedef struct llama_logit_bias {
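For context, a minimal sketch of how a caller might exercise the new dry_run field through the public C API once this patch is applied. llama_model_quantize_default_params() and llama_model_quantize() are the existing entry points in llama.h; the file names and the chosen ftype are placeholders, not part of the patch.

// Hypothetical caller, assuming this patch: estimate the quantized size
// without writing any output file.
#include "llama.h"

#include <cstdio>

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type (example)
    params.dry_run = true;                      // new field: size estimate only

    // With dry_run set, the output path is still required by the signature,
    // but no file should be written.
    if (llama_model_quantize("model-f32.gguf", "model-quant.gguf", &params) != 0) {
        fprintf(stderr, "quantization dry run failed\n");
        return 1;
    }
    return 0;
}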
@@ -109,9 +109,9 @@ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
 
 std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
     char buf[256];
-    snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
+    snprintf(buf, sizeof(buf), "%6" PRId64, t->ne[0]);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, t->ne[i]);
     }
     return buf;
 }
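The only change in this hunk is the printf field width, from %5 to %6, presumably so six-digit tensor dimensions still align in the log output. A self-contained illustration of the effect, with an arbitrary example value (not part of the patch):

// Standalone illustration of the %5 -> %6 width change.
#include <cinttypes>
#include <cstdio>

int main() {
    const int64_t ne = 4096; // example dimension, chosen arbitrarily
    char buf[64];
    snprintf(buf, sizeof(buf), "%5" PRId64, ne);
    printf("old: [%s]\n", buf); // old: [ 4096]  (padded to width 5)
    snprintf(buf, sizeof(buf), "%6" PRId64, ne);
    printf("new: [%s]\n", buf); // new: [  4096] (padded to width 6)
    return 0;
}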
File diff suppressed because it is too large
@@ -120,7 +120,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
     printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n");
-    printf("       [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("       [--prune-layers] [--keep-split] [--override-kv] [--dry-run]\n");
     printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
     printf("  --allow-requantize\n");
     printf("      allow requantizing tensors that have already been quantized\n");
@@ -156,7 +156,10 @@ static void usage(const char * executable) {
     printf("      generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      override model metadata by key in the quantized model. may be specified multiple times.\n");
-    printf("      WARNING: this is an advanced option, use with care.\n\n");
+    printf("      WARNING: this is an advanced option, use with care.\n");
+    printf("  --dry-run\n");
+    printf("      calculate and show the final quantization size without performing quantization\n");
+    printf("      example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n");
     printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
     printf("-----------------------------------------------------------------------------\n");
     printf("      allowed quantization types\n");
@@ -532,6 +535,8 @@ int main(int argc, char ** argv) {
             if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--dry-run") == 0) {
+            params.dry_run = true;
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
            params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -630,22 +635,26 @@ int main(int argc, char ** argv) {
         std::string ftype_str;
         std::string suffix = ".gguf";
         if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
-            std::string fpath;
-            const size_t pos = fname_inp.find_last_of("/\\");
-            if (pos != std::string::npos) {
-                fpath = fname_inp.substr(0, pos + 1);
-            }
+            // argv[arg_idx] is the ftype directly: <input> <ftype>
+            if (!params.dry_run) {
+                std::string fpath;
+                const size_t pos = fname_inp.find_last_of("/\\");
+                if (pos != std::string::npos) {
+                    fpath = fname_inp.substr(0, pos + 1);
+                }
 
-            // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
-            fname_out = fpath + "ggml-model-" + ftype_str;
-            if (!params.keep_split) {
-                fname_out += suffix;
+                // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+                fname_out = fpath + "ggml-model-" + ftype_str;
+                if (!params.keep_split) {
+                    fname_out += suffix;
+                }
             }
             arg_idx++;
             if (ftype_str == "COPY") {
                 params.only_copy = true;
             }
         } else {
+            // argv[arg_idx] is not a valid ftype, so treat it as output path: <input> <output> <ftype>
             fname_out = argv[arg_idx];
             if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
                 fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
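Restated outside the diff, the default-output-name rule that the dry-run branch now skips: strip the input's directory, emit ggml-model-<ftype>, and append .gguf only when the output is not split. The helper name below is hypothetical, introduced only for illustration:

// Hypothetical helper restating the naming logic above; not part of the patch.
#include <cstdio>
#include <string>

static std::string default_output_name(const std::string & fname_inp,
                                       const std::string & ftype_str,
                                       bool keep_split) {
    std::string fpath;
    const size_t pos = fname_inp.find_last_of("/\\");
    if (pos != std::string::npos) {
        fpath = fname_inp.substr(0, pos + 1); // keep the trailing separator
    }
    std::string fname_out = fpath + "ggml-model-" + ftype_str;
    if (!keep_split) {
        fname_out += ".gguf"; // split output gets per-shard suffixes instead
    }
    return fname_out;
}

int main() {
    // e.g. "models/model-f32.gguf" + Q4_K -> "models/ggml-model-Q4_K.gguf"
    printf("%s\n", default_output_name("models/model-f32.gguf", "Q4_K", false).c_str());
    return 0;
}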
@@ -677,25 +686,31 @@ int main(int argc, char ** argv) {
             }
         }
 
     if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
          params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
         fprintf(stderr, "\n==========================================================================================================\n");
         fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
         fprintf(stderr, "==========================================================================================================\n\n\n");
         return 1;
     }
 
-    if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
-        fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
-        return 1;
+    if (!params.dry_run) {
+        if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
+            fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
     }
 
     print_build_info();
 
-    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
+    if (params.dry_run) {
+        fprintf(stderr, "%s: calculating quantization size for '%s' as %s", __func__, fname_inp.c_str(), ftype_str.c_str());
+    } else {
+        fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
+    }
 
     if (params.nthread > 0) {
         fprintf(stderr, " using %d threads", params.nthread);
     }
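The non-throwing overload of std::filesystem::equivalent() used in the guard above returns false and sets the error code when either path does not exist, which is why the check is safe to run before any output file has been created. A minimal standalone sketch, with placeholder paths:

// Standalone sketch of the self-overwrite guard; paths are placeholders.
#include <cstdio>
#include <filesystem>
#include <system_error>

int main() {
    std::error_code ec;
    // true only if both paths resolve to the same existing file;
    // a missing path sets ec and yields false instead of throwing.
    if (std::filesystem::equivalent("model-f32.gguf", "model-f32.gguf", ec)) {
        fprintf(stderr, "error: input and output files are the same\n");
        return 1;
    }
    return 0;
}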