Merge feda897fa2 into 9e2e2198b0

2026-03-15 23:55:07 +02:00 · 2026-03-15 23:55:07 +02:00 · 04eb22b1fe
parent 9e2e2198b0 feda897fa2
commit 04eb22b1fe
4 changed files with 1813 additions and 534 deletions
--- a/include/llama.h
+++ b/include/llama.h
@ -393,9 +393,16 @@ extern "C" {
        bool keep_split;                      // quantize to the same number of shards
        bool dry_run;                         // calculate and show the final quantization size without performing quantization
        void * imatrix;                       // pointer to importance matrix data
+        void * activations;                   // pointer to activations data
+        void * statistics;                    // pointer to statistics data
        void * kv_overrides;                  // pointer to vector containing overrides
        void * tensor_types;                  // pointer to vector containing tensor types
        void * prune_layers;                  // pointer to vector containing layer indices to prune
+        float target_bpw;                     // target bits per weight (bpw)
+        int64_t target_size;                  // target file size in bytes
+        bool save_state;                      // keep bpw state file
+        void * state_file;                    // pointer to bpw state file
+        float importance_pct;                 // identify up to pct% of tensors as important
    } llama_model_quantize_params;

    typedef struct llama_logit_bias {
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
--- a/tools/quantize/README.md
+++ b/tools/quantize/README.md
@ -56,8 +56,13 @@ Options:
 * `--keep-split` will generate the quantized model in the same shards as the input file otherwise it will produce a single quantized file

 Advanced options:
-* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times.
+* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times
 * `--prune-layers` prune (remove) the layers in the list
+* `--target-bpw` automatically choose quant types to meet an overall bits per weight (bpw) target
+* `--target-size` automatically choose quant types to meet a file size target
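+* `--ignore-tensor-importance` during target computation, treat each tensor equally instead of prioritizing some. It may yield better quality for some models
+* `--importance-pct` during target computation, identify up to this percentage (0.0 to 100.0) of tensors as important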
+* `--save-state` save the target computation to a file. By default, it saves to `<model name>-<model hash>-mse.bpw_state` unless `--state-file` is also specified
+* `--state-file` file name to load from / save to target computations
 * `--override-kv` option to override model metadata by key in the quantized model. May be specified multiple times

 Examples:
@ -97,59 +102,64 @@ Examples:
 ./llama-quantize --imatrix imatrix.gguf --override-kv qwen3moe.expert_used_count=int:16 --prune-layers 20,21,22 input-model-f32.gguf pruned-model-f32.gguf copy 8
 ```

+```bash
+# quantize model targeting a specific bpw average and save the target computations to the default file. Model type is optional and can be omitted
+./llama-quantize --target-bpw 4.5678 --save-state --imatrix imatrix.gguf input-model-f32.gguf 8
+```
+
+```bash
+# quantize model targeting a specific file size and save the target computations to a custom file. Model type is optional and can be omitted
+./llama-quantize --target-size 1.5gb --save-state --state-file my-state-file.dat --imatrix imatrix.gguf input-model-f32.gguf 8
+```
+
+```bash
+# quantize model targeting a specific bpw average reusing previous target computations
+./llama-quantize --target-bpw 2.5 --state-file my-state-file.dat --imatrix imatrix.gguf input-model-f32.gguf 8
+```
+
 ## Memory/Disk Requirements

 When running the larger models, make sure you have enough disk space to store all the intermediate files.
 As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For example (Llama 3.1):

 | Model | Original size | Quantized size (Q4_K_M) |
-| ----: | ------------: | ----------------------: |
+|------:|--------------:|------------------------:|
 |    8B |       32.1 GB |                  4.9 GB |
 |   70B |      280.9 GB |                 43.1 GB |
 |  405B |    1,625.1 GB |                249.1 GB |

-
 ## Quantization

 Several quantization methods are supported. They differ in the resulting model disk size and inference speed. For example,

 ### [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)

-| Measure                     | IQ1_S        | IQ1_M        | IQ2_XXS      | IQ2_XS        | IQ2_S         | IQ2_M        |
-| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ |
-| bits/weight                 |       2.0042 |       2.1460 |       2.3824 |        2.5882 |        2.7403 |       2.9294 |
-| size (GiB)                  |       1.87   |       2.01   |       2.23   |        2.42   |        2.56   |       2.74   |
-| prompt processing t/s @ 512 | 858.88 ±1.22 | 847.99 ±0.47 | 852.39 ±0.85 | 826.99 ±12.51 | 783.55 ±13.73 | 787.68 ±7.00 |
-| text generation t/s @ 128   |  79.73 ±0.79 |  72.92 ±0.14 |  79.86 ±0.22 |  78.04 ±0.46  |  77.30 ±2.47  |  74.44 ±0.15 |
-
-| Measure                     | IQ3_XXS      | IQ3_XS       | IQ3_S        | IQ3_M         | IQ4_XS        | IQ4_NL       |
-| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ |
-| bits/weight                 |       3.2548 |       3.4977 |       3.6606 |        3.7628 |        4.4597 |       4.6818 |
-| size (GiB)                  |       3.04   |       3.27   |       3.42   |        3.52   |        4.17   |       4.38   |
-| prompt processing t/s @ 512 | 813.88 ±6.53 | 708.71 ±1.26 | 798.78 ±8.81 | 768.70 ±13.73 | 771.80 ±11.38 | 806.03 ±7.07 |
-| text generation t/s @ 128   |  73.95 ±0.20 |  71.67 ±0.54 |  69.31 ±0.63 |  70.15 ±0.33  |  77.51 ±0.20  |  76.63 ±0.28 |
-
-
-| Measure                     | Q2_K_S       | Q2_K         | Q3_K_S       | Q3_K_M       | Q3_K_L       | Q4_K_S       |
-| --------------------------- | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ |
-| bits/weight                 |       2.9697 |       3.1593 |       3.6429 |       3.9960 |       4.2979 |       4.6672 |
-| size (GiB)                  |       2.78   |       2.95   |       3.41   |       3.74   |       4.02   |       4.36   |
-| prompt processing t/s @ 512 | 798.91 ±6.40 | 784.45 ±7.85 | 752.17 ±7.94 | 783.44 ±9.92 | 761.17 ±7.55 | 818.55 ±9.58 |
-| text generation t/s @ 128   |  90.01 ±0.12 |  79.85 ±0.20 |  69.84 ±0.18 |  71.68 ±0.22 |  69.38 ±0.49 |  76.71 ±0.20 |
-
-| Measure                     | Q4_K_S       | Q4_K_M        | Q5_K_S       | Q5_K_M       | Q6_K          | Q8_0         |
-| --------------------------- | ------------ | ------------- | ------------ | ------------ | ------------- | ------------ |
-| bits/weight                 |       4.6672 |        4.8944 |       5.5704 |       5.7036 |        6.5633 |       8.5008 |
-| size (GiB)                  |       4.36   |        4.58   |       5.21   |       5.33   |        6.14   |       7.95   |
-| prompt processing t/s @ 512 | 818.55 ±9.58 | 821.81 ±21.44 | 752.52 ±0.99 | 758.69 ±7.43 | 812.01 ±10.82 | 865.09 ±8.30 |
-| text generation t/s @ 128   |  76.71 ±0.20 |  71.93 ±1.52  |  69.53 ±0.18 |  67.23 ±1.08 |  58.67 ±3.13  |  50.93 ±0.08 |
-
-| Measure                     | F16          |
-| --------------------------- | ------------ |
-| bits/weight                 |      16.0005 |
-| size (GiB)                  |      14.96   |
-| prompt processing t/s @ 512 | 923.49 ±0.53 |
-| text generation t/s @ 128   |  29.17 ±0.04 |
+| Quant Type | bits/weight | size (GiB) | prompt processing t/s @ 512 | text generation t/s @ 128 |
+|:----------:|------------:|-----------:|----------------------------:|--------------------------:|
+|   IQ1_S    |      2.0042 |       1.87 |                858.88 ±1.22 |               79.73 ±0.79 |
+|   IQ1_M    |      2.1460 |       2.01 |                847.99 ±0.47 |               72.92 ±0.14 |
+|  IQ2_XXS   |      2.3824 |       2.23 |                852.39 ±0.85 |               79.86 ±0.22 |
+|   IQ2_XS   |      2.5882 |       2.42 |               826.99 ±12.51 |               78.04 ±0.46 |
+|   IQ2_S    |      2.7403 |       2.56 |               783.55 ±13.73 |               77.30 ±2.47 |
+|   IQ2_M    |      2.9294 |       2.74 |                787.68 ±7.00 |               74.44 ±0.15 |
+|  IQ3_XXS   |      3.2548 |       3.04 |                813.88 ±6.53 |               73.95 ±0.20 |
+|   IQ3_XS   |      3.4977 |       3.27 |                708.71 ±1.26 |               71.67 ±0.54 |
+|   IQ3_S    |      3.6606 |       3.42 |                798.78 ±8.81 |               69.31 ±0.63 |
+|   IQ3_M    |      3.7628 |       3.52 |               768.70 ±13.73 |               70.15 ±0.33 |
+|   IQ4_XS   |      4.4597 |       4.17 |               771.80 ±11.38 |               77.51 ±0.20 |
+|   IQ4_NL   |      4.6818 |       4.38 |                806.03 ±7.07 |               76.63 ±0.28 |
+|   Q2_K_S   |      2.9697 |       2.78 |                798.91 ±6.40 |               90.01 ±0.12 |
+|    Q2_K    |      3.1593 |       2.95 |                784.45 ±7.85 |               79.85 ±0.20 |
+|   Q3_K_S   |      3.6429 |       3.41 |                752.17 ±7.94 |               69.84 ±0.18 |
+|   Q3_K_M   |      3.9960 |       3.74 |                783.44 ±9.92 |               71.68 ±0.22 |
+|   Q3_K_L   |      4.2979 |       4.02 |                761.17 ±7.55 |               69.38 ±0.49 |
+|   Q4_K_S   |      4.6672 |       4.36 |                818.55 ±9.58 |               76.71 ±0.20 |
+|   Q4_K_M   |      4.8944 |       4.58 |               821.81 ±21.44 |               71.93 ±1.52 |
+|   Q5_K_S   |      5.5704 |       5.21 |                752.52 ±0.99 |               69.53 ±0.18 |
+|   Q5_K_M   |      5.7036 |       5.33 |                758.69 ±7.43 |               67.23 ±1.08 |
+|    Q6_K    |      6.5633 |       6.14 |               812.01 ±10.82 |               58.67 ±3.13 |
+|    Q8_0    |      8.5008 |       7.95 |                865.09 ±8.30 |               50.93 ±0.08 |
+|    F16     |     16.0005 |      14.96 |                923.49 ±0.53 |               29.17 ±0.04 |

 ## Background information on llama-quantize

--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@ -8,15 +8,12 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
-#include <vector>
+#include <filesystem>
+#include <fstream>
+#include <map>
 #include <string>
 #include <unordered_map>
-#include <map>
-#include <fstream>
-#include <cmath>
-#include <cctype>
-#include <algorithm>
-#include <filesystem>
+#include <vector>

 // result of parsing --tensor-type option
 // (changes to this struct must be reflected in src/llama-quant.cpp)
@ -246,7 +243,11 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector<std
    return m_last_call;
 }

-static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static int load_imatrix(const std::string & imatrix_file,
+    std::vector<std::string> & imatrix_datasets,
+    std::unordered_map<std::string, std::vector<float>> & values_data,
+    std::unordered_map<std::string, std::vector<float>> & activations_data,
+    std::unordered_map<std::string, std::vector<float>> & statistics_data) {

    struct ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
@ -256,7 +257,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
    struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params);
    if (!ctx_gguf) {
        fprintf(stderr, "%s: imatrix file '%s' is using old format\n", __func__, imatrix_file.c_str());
-        return load_legacy_imatrix(imatrix_file, imatrix_datasets, imatrix_data);
+        return load_legacy_imatrix(imatrix_file, imatrix_datasets, values_data);
    }
    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
    if (n_entries < 1) {
@ -278,11 +279,13 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin

    const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx);

-    const std::string sums_suffix{ ".in_sum2" };
+    const std::string sums_suffix{ ".in_sum" };
+    const std::string sums2_suffix{ ".in_sum2" };
    const std::string counts_suffix{ ".counts" };
+    const std::string stats_suffix{ ".stats" };

    // Using an ordered map to get a deterministic iteration order.
-    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
+    std::map<std::string, std::tuple<struct ggml_tensor *, struct ggml_tensor *, struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;

    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
        std::string name = cur->name;
@ -290,11 +293,17 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
        if (name.empty()) { continue; }

        if (string_remove_suffix(name, sums_suffix)) {
+            // in_sum
+            std::get<0>(sums_counts_for[std::move(name)]) = cur;
+        } else if (string_remove_suffix(name, sums2_suffix)) {
            // in_sum2
-            sums_counts_for[std::move(name)].first = cur;
+            std::get<1>(sums_counts_for[std::move(name)]) = cur;
        } else if (string_remove_suffix(name, counts_suffix)) {
            // counts
-            sums_counts_for[std::move(name)].second = cur;
+            std::get<2>(sums_counts_for[std::move(name)]) = cur;
+        } else if (string_remove_suffix(name, stats_suffix)) {
+            // stats
+            std::get<3>(sums_counts_for[std::move(name)]) = cur;
        } else {
            // ignore other tensors
        }
@ -302,32 +311,55 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin

    for (const auto & sc : sums_counts_for) {
        const        std::string & name   = sc.first;
-        const struct ggml_tensor * sums   = sc.second.first;
-        const struct ggml_tensor * counts = sc.second.second;
+        const struct ggml_tensor * sums   = std::get<0>(sc.second);
+        const struct ggml_tensor * sums2  = std::get<1>(sc.second);
+        const struct ggml_tensor * counts = std::get<2>(sc.second);
+        const struct ggml_tensor * stats = std::get<3>(sc.second);

-        if (!sums || !counts) {
+        // check sums2 and counts are present, and that sums and sums2 have the same shape
+        if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) {
            fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str());
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            exit(1);
        }

-        const int64_t ne0 = sums->ne[0];
-        const int64_t ne1 = sums->ne[1];
+        const int64_t ne0 = sums2->ne[0];
+        const int64_t ne1 = sums2->ne[1];

-        auto & e = imatrix_data[name];
-        e.resize(ggml_nelements(sums));
+        auto & activations = activations_data[name];
+        auto & values = values_data[name];
+        if (sums) {
+            activations.resize(ggml_nelements(sums));
+        }
+        if (stats) {
+            auto & statistics = statistics_data[name];
+            statistics.resize(ggml_nelements(stats));
+            if (stats->type == GGML_TYPE_F32) {
+                std::memcpy(statistics.data(), stats->data, ggml_nelements(stats) * sizeof(float));
+            } else {
+                fprintf(stderr, "%s: unsupported .stats type '%s' for '%s' - ignoring entry\n",
+                    __func__, ggml_type_name(stats->type), name.c_str());
+                statistics.clear();
+                statistics_data.erase(name);
+            }
+
+        }
+
+        values.resize(ggml_nelements(sums2));
        float max_count = 0.0f;
        for (int64_t j = 0; j < ne1; ++j) {
            const float count = ((const float *) counts->data)[j];
            if (count > 0.0f) {
                for (int64_t i = 0; i < ne0; ++i) {
-                    e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count;
+                    values[j*ne0 + i] = ((const float *) sums2->data)[j*ne0 + i] / count;
+                    if (sums) { activations[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; }
                }
            } else {
                // Partial imatrix data, this tensor never got any input during calibration
                for (int64_t i = 0; i < ne0; ++i) {
-                    e[j*ne0 + i] = 1;
+                    values[j*ne0 + i] = 1;
+                    if (sums) { activations[j*ne0 + i] = 0; }
                }
            }
            if (count > max_count) {
@ -335,7 +367,8 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
            }
        }
        if (getenv("LLAMA_TRACE")) {
-            printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", __func__, int(e.size()), int(max_count), int(max_count / chunk_size), name.c_str());
+            printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n",
+                __func__, int(values.size()), int(max_count), int(max_count / chunk_size), name.c_str());
        }
    }

@ -352,7 +385,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
    }
    printf("]\n");

-    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk);
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(values_data.size()), imatrix_file.c_str(), m_last_chunk);

    gguf_free(ctx_gguf);
    ggml_free(ctx);
@ -364,41 +397,73 @@ static int prepare_imatrix(const std::string & imatrix_file,
        std::vector<std::string> & imatrix_dataset,
        const std::vector<std::string> & included_weights,
        const std::vector<std::string> & excluded_weights,
-        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+        std::unordered_map<std::string, std::vector<float>> & values_data,
+        std::unordered_map<std::string, std::vector<float>> & activations_data,
+        std::unordered_map<std::string, std::vector<float>> & statistics_data) {
    int m_last_call = -1;
    if (!imatrix_file.empty()) {
-        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
+        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data, statistics_data);
    }
-    if (imatrix_data.empty()) {
+    if (values_data.empty()) {
        return m_last_call;
    }
    if (!excluded_weights.empty()) {
        for (const auto & name : excluded_weights) {
-            for (auto it = imatrix_data.begin(); it != imatrix_data.end();) {
-                auto pos = it->first.find(name);
+            for (auto vt = values_data.begin(); vt != values_data.end();) {
+                auto pos = vt->first.find(name);
                if (pos != std::string::npos) {
-                    it = imatrix_data.erase(it);
+                    vt = values_data.erase(vt);
                } else {
-                    ++it;
+                    ++vt;
+                }
+            }
+            for (auto at = activations_data.begin(); at != activations_data.end();) {
+                auto pos = at->first.find(name);
+                if (pos != std::string::npos) {
+                    at = activations_data.erase(at);
+                } else {
+                    ++at;
+                }
+            }
+            for (auto st = statistics_data.begin(); st != statistics_data.end();) {
+                auto pos = st->first.find(name);
+                if (pos != std::string::npos) {
+                    st = statistics_data.erase(st);
+                } else {
+                    ++st;
                }
            }
        }
    }
    if (!included_weights.empty()) {
-        std::unordered_map<std::string, std::vector<float>> tmp;
+        std::unordered_map<std::string, std::vector<float>> tmp_values;
+        std::unordered_map<std::string, std::vector<float>> tmp_activations;
+        std::unordered_map<std::string, std::vector<float>> tmp_statistics;
        for (const auto & name : included_weights) {
-            for (auto & e : imatrix_data) {
+            for (auto & e : values_data) {
                auto pos = e.first.find(name);
                if (pos != std::string::npos) {
-                    tmp.emplace(std::move(e));
+                    tmp_values.emplace(std::move(e));
+                }
+            }
+            for (auto & a : activations_data) {
+                auto pos = a.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp_activations.emplace(std::move(a));
+                }
+            }
+            for (auto & s : statistics_data) {
+                auto pos = s.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp_statistics.emplace(std::move(s));
                }
            }
        }
-        imatrix_data = std::move(tmp);
-    }
-    if (!imatrix_data.empty()) {
-        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
+        values_data = std::move(tmp_values);
+        activations_data = std::move(tmp_activations);
+        statistics_data = std::move(tmp_statistics);
    }
+
    return m_last_call;
 }

@ -489,6 +554,109 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
    return true;
 }

+// Parses the argument of --target-bpw.
+//
+// data       : NUL-terminated option value (may be nullptr)
+// target_bpw : receives the parsed value; only written when parsing succeeds
+//
+// Returns true if data is, in its entirety, a number in [0.0, 16.0].
+// Otherwise prints a diagnostic to stdout and returns false.
+static bool parse_target_bpw(const char * data, float & target_bpw) {
+    if (!data) {
+        printf("\n%s: no target bits per weight (bpw) provided\n\n", __func__);
+        return false;
+    }
+
+    float  bpw      = 0.0f;
+    size_t n_parsed = 0;
+    try {
+        bpw = std::stof(data, &n_parsed);
+    } catch (const std::exception &) {
+        // std::invalid_argument / std::out_of_range: reported below as "not valid"
+        n_parsed = 0;
+    }
+
+    // reject unparsable input and trailing junk (e.g. "4,5" would otherwise be read as 4)
+    if (n_parsed == 0 || data[n_parsed] != '\0') {
+        printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data);
+        return false;
+    }
+
+    // written as a negated in-range test so that NaN, which compares false against everything, is rejected too
+    if (!(bpw >= 0.0f && bpw <= 16.0f)) {
+        printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__);
+        return false;
+    }
+
+    target_bpw = bpw;
+    return true;
+}
+
+// Parses the argument of --importance-pct.
+//
+// data           : NUL-terminated option value (may be nullptr)
+// importance_pct : receives the parsed value; only written when parsing succeeds
+//
+// Returns true if data is, in its entirety, a number in [0.0, 100.0].
+// Otherwise prints a diagnostic to stdout and returns false.
+static bool parse_importance_pct(const char * data, float & importance_pct) {
+    if (!data) {
+        printf("\n%s: no tensor importance %% provided\n\n", __func__);
+        return false;
+    }
+
+    float  pct      = 0.0f;
+    size_t n_parsed = 0;
+    try {
+        pct = std::stof(data, &n_parsed);
+    } catch (const std::exception &) {
+        // std::invalid_argument / std::out_of_range: reported below as "not valid"
+        n_parsed = 0;
+    }
+
+    // reject unparsable input and trailing junk (e.g. "12,5" would otherwise be read as 12)
+    if (n_parsed == 0 || data[n_parsed] != '\0') {
+        printf("\n%s: '%s' is not valid. Tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__, data);
+        return false;
+    }
+
+    // written as a negated in-range test so that NaN, which compares false against everything, is rejected too
+    if (!(pct >= 0.0f && pct <= 100.0f)) {
+        printf("\n%s: tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__);
+        return false;
+    }
+
+    importance_pct = pct;
+    return true;
+}
+
+// Parses the argument of --target-size: a non-negative number optionally followed by a
+// case-insensitive unit (b, k/kb, m/mb, g/gb, t/tb; 1 kb = 1024 bytes).
+//
+// data        : NUL-terminated option value (may be nullptr)
+// target_size : receives the size in bytes; only written when parsing succeeds
+//
+// Fractional values are scaled before being truncated to whole bytes, so "1.5gb" is
+// 1610612736 bytes. Returns false (after printing a diagnostic to stdout) on missing,
+// negative, non-finite or too-large values and on unknown units.
+static bool parse_target_size(const char * data, int64_t & target_size) {
+    if (!data) {
+        printf("\n%s: no target file size provided\n\n", __func__);
+        return false;
+    }
+
+    char * end = nullptr;
+    const double val = std::strtod(data, &end);
+    // strtod accepts "nan" and "inf"; converting either to an integer would be undefined behaviour
+    if (end == data || !std::isfinite(val) || val < 0) {
+        printf("\n%s: invalid target file size '%s'\n\n", __func__, data);
+        return false;
+    }
+
+    std::string suffix(end);
+    // tolower takes an int that must be representable as unsigned char
+    for (auto & c : suffix) { c = (char) std::tolower((unsigned char) c); }
+
+    int64_t mul = 0;
+    if (suffix.empty() || suffix == "b") {
+        mul = 1;
+    } else if (suffix == "k" || suffix == "kb") {
+        mul = 1024;
+    } else if (suffix == "m" || suffix == "mb") {
+        mul = 1024 * 1024;
+    } else if (suffix == "g" || suffix == "gb") {
+        mul = 1024 * 1024 * 1024;
+    } else if (suffix == "t" || suffix == "tb") {
+        mul = 1024LL * 1024 * 1024 * 1024;
+    } else {
+        printf("\n%s: invalid unit '%s' in '%s'. Allowed: b, kb, mb, gb, tb (kilo = 1024 bytes)\n\n", __func__, suffix.c_str(), data);
+        return false;
+    }
+
+    // scale first, truncate afterwards: casting val before multiplying would drop the fractional part
+    const double bytes = val * (double) mul;
+    // 2^63: the first value that no longer fits into int64_t
+    if (bytes >= 9223372036854775808.0) {
+        printf("\n%s: invalid target file size '%s'\n\n", __func__, data);
+        return false;
+    }
+
+    target_size = (int64_t) bytes;
+    return true;
+}
+
+// Maps a bits-per-weight value to the name of a quantization type.
+//
+// Returns the name stored under the smallest table key that is >= bpw. Values above the
+// largest key fall back to the last (16 bpw) entry instead of dereferencing end().
+static const char * get_ftype(const float bpw) {
+    // built once; keys are in ascending order so lower_bound() finds the first entry that is >= bpw
+    static const std::map<float, const char *> quant_bpw = {
+        {1.5625, "IQ1_S"},
+        {1.7500, "IQ1_M"},
+        {2.0625, "IQ2_XXS"},
+        {2.3125, "IQ2_XS"},
+        {2.5625, "IQ2_S"},
+        {2.6250, "Q2_K"},
+        {3.0625, "IQ3_XXS"},
+        {3.4375, "Q3_K"},
+        {4.2500, "IQ4_XS"},
+        {4.5000, "Q4_K"},
+        {5.5000, "Q5_K"},
+        {6.5625, "Q6_K"},
+        {8.5000, "Q8_0"},
+#ifdef GGML_USE_METAL
+        {16.0000, "F16"}
+#else
+        {16.0000, "BF16"}
+#endif
+    };
+
+    const auto it = quant_bpw.lower_bound(bpw);
+    if (it == quant_bpw.end()) {
+        // bpw is larger than every key: use the widest type rather than invoking undefined behaviour
+        return quant_bpw.rbegin()->second;
+    }
+
+    return it->second;
+}
+
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");

@ -504,6 +672,9 @@ int main(int argc, char ** argv) {
    std::vector<llama_model_kv_override> kv_overrides;
    std::vector<tensor_type_option> tensor_type_opts;
    std::vector<int> prune_layers;
+    float target_bpw = -1.0f;
+    int64_t target_size = -1;
+    float importance_pct = 0.0f;

    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@ -534,6 +705,26 @@ int main(int argc, char ** argv) {
            if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_type_opts)) {
                usage(argv[0]);
            }
+        } else if (strcmp(argv[arg_idx], "--target-bpw") == 0) {
+            if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--target-size") == 0) {
+            if (arg_idx == argc-1 || !parse_target_size(argv[++arg_idx], target_size)) {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--importance-pct") == 0) {
+            if (arg_idx == argc-1 || !parse_importance_pct(argv[++arg_idx], importance_pct)) {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--save-state") == 0) {
+            params.save_state = true;
+        } else if (strcmp(argv[arg_idx], "--state-file") == 0) {
+            if (arg_idx < argc-1) {
+                params.state_file = argv[++arg_idx];
+            } else {
+                usage(argv[0]);
+            }
        } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
            if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
                usage(argv[0]);
@ -582,10 +773,12 @@ int main(int argc, char ** argv) {
    }

    std::vector<std::string> imatrix_datasets;
-    std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
-    if (!imatrix_data.empty()) {
-        params.imatrix = &imatrix_data;
+    std::unordered_map<std::string, std::vector<float>> values_data;
+    std::unordered_map<std::string, std::vector<float>> activations_data;
+    std::unordered_map<std::string, std::vector<float>> statistics_data;
+    int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data, statistics_data);
+    if (!values_data.empty()) {
+        params.imatrix = &values_data;
        {
            llama_model_kv_override kvo;
            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
@ -608,7 +801,7 @@ int main(int argc, char ** argv) {
            llama_model_kv_override kvo;
            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.val_i64 = imatrix_data.size();
+            kvo.val_i64 = values_data.size();
            kv_overrides.emplace_back(std::move(kvo));
        }

@ -620,6 +813,12 @@ int main(int argc, char ** argv) {
            kv_overrides.emplace_back(std::move(kvo));
        }
    }
+    if (!activations_data.empty()) {
+        params.activations = &activations_data;
+    }
+    if (!statistics_data.empty()) {
+        params.statistics = &statistics_data;
+    }
    if (!kv_overrides.empty()) {
        kv_overrides.emplace_back();
        kv_overrides.back().key[0] = 0;
@ -631,6 +830,15 @@ int main(int argc, char ** argv) {
    if (!prune_layers.empty()) {
        params.prune_layers = &prune_layers;
    }
+    if (target_bpw != -1.0f) {
+        params.target_bpw = target_bpw;
+    }
+    if (target_size != -1) {
+        params.target_size = target_size;
+    }
+    if (importance_pct != 0.0f) {
+        params.importance_pct = importance_pct;
+    }

    llama_backend_init();

@ -641,6 +849,7 @@ int main(int argc, char ** argv) {

    std::string ftype_str;
    std::string suffix = ".gguf";
+    std::vector<const char *> tmp_argv(argv, argv + argc);
    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
        // argv[arg_idx] is the ftype directly: <input> <ftype>
        if (!params.dry_run) {
@ -668,7 +877,15 @@ int main(int argc, char ** argv) {
        }
        arg_idx++;

-        if (argc <= arg_idx) {
+        // If --target-bpw or --target-size are set, select a quantization type unless user specifies type and threads
+        if (argc - arg_idx <= 1 && (params.target_bpw != -1.0f || params.target_size != -1)) {
+            auto * ftype = params.target_bpw != -1.0f ? const_cast<char *>(get_ftype(params.target_bpw)) : const_cast<char *>("F16");
+            if (argc == arg_idx) { tmp_argv.push_back(ftype); }
+            else { tmp_argv.insert(tmp_argv.end() - 1, ftype); }
+            tmp_argv.push_back(nullptr);
+            argv = const_cast<char **>(tmp_argv.data());
+            argc++;
+        } else if (argc <= arg_idx) {
            fprintf(stderr, "%s: missing ftype\n", __func__);
            return 1;
        }