This commit is contained in:
Ed Addario 2026-01-02 23:47:03 +02:00 committed by GitHub
commit f9be5e628c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 1504 additions and 156 deletions

View File

@ -378,9 +378,14 @@ extern "C" {
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data
void * activations; // pointer to activations data
void * kv_overrides; // pointer to vector containing overrides
void * tensor_types; // pointer to vector containing tensor types
void * prune_layers; // pointer to vector containing layer indices to prune
float target_bpw; // target bits per weight (bpw)
bool keep_bpw_state; // keep bpw state file
void * bpw_state; // pointer to bpw state file
bool no_importance; // allocate target bpw budget equitably across all tensors
} llama_model_quantize_params;
typedef struct llama_logit_bias {

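For orientation, a minimal sketch (not part of this commit) of how a caller might populate the new `llama_model_quantize_params` fields before invoking `llama_model_quantize`. The two maps are assumed to have been filled by an imatrix loader such as the one in `llama-quantize` further below; the 4.567 bpw target is purely illustrative.

```cpp
// Sketch only: wiring the new llama_model_quantize_params fields.
// Assumes "llama.h" from this commit; the maps mirror what tools/quantize builds.
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>
#include "llama.h"

static uint32_t quantize_to_bpw(const char * fname_inp, const char * fname_out,
                                std::unordered_map<std::string, std::vector<float>> & values,
                                std::unordered_map<std::string, std::vector<float>> & activations) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.imatrix        = &values;      // importance matrix values (means of the .in_sum2 rows)
    params.activations    = &activations; // mean activations (from the optional .in_sum rows)
    params.target_bpw     = 4.567f;       // aim for ~4.567 bits per weight overall
    params.keep_bpw_state = true;         // keep the cached bpw computations on disk
    params.no_importance  = false;        // keep importance-weighted budget allocation
    // ftype is left at its default here; --target-bpw drives the per-tensor type selection
    return llama_model_quantize(fname_inp, fname_out, &params);
}
```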
File diff suppressed because it is too large

View File

@ -56,8 +56,10 @@ Options:
* `--keep-split` will generate the quantized model in the same shards as the input file; otherwise it will produce a single quantized file
Advanced options:
* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times.
* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times
* `--prune-layers` prune (remove) the layers in the list
* `--target-bpw` automatically choose quant types so that the overall model size matches a given bits per weight (bpw) average
* `--no-importance` during bpw computation, allocate the bpw budget equally across all tensors instead of prioritizing the more important ones. May yield better quality for some models
* `--override-kv` option to override model metadata by key in the quantized model. May be specified multiple times
Examples:
@ -97,59 +99,54 @@ Examples:
./llama-quantize --imatrix imatrix.gguf --override-kv qwen3moe.expert_used_count=int:16 --prune-layers 20,21,22 input-model-f32.gguf pruned-model-f32.gguf copy 8
```
```bash
# quantize model targeting a specific bpw average and save the bpw computations to the default state file. The quantization type is optional and can be omitted
./llama-quantize --target-bpw 4.567 --keep-bpw-state --imatrix imatrix.gguf input-model-f32.gguf 8
```
## Memory/Disk Requirements
When running the larger models, make sure you have enough disk space to store all the intermediate files.
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For example (Llama 3.1):
| Model | Original size | Quantized size (Q4_K_M) |
| ----: | ------------: | ----------------------: |
|------:|--------------:|------------------------:|
| 8B | 32.1 GB | 4.9 GB |
| 70B | 280.9 GB | 43.1 GB |
| 405B | 1,625.1 GB | 249.1 GB |
## Quantization
Several quantization methods are supported. They differ in the resulting model disk size and inference speed. For example,
### [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
| Measure | IQ1_S | IQ1_M | IQ2_XXS | IQ2_XS | IQ2_S | IQ2_M |
| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ |
| bits/weight | 2.0042 | 2.1460 | 2.3824 | 2.5882 | 2.7403 | 2.9294 |
| size (GiB) | 1.87 | 2.01 | 2.23 | 2.42 | 2.56 | 2.74 |
| prompt processing t/s @ 512 | 858.88 ±1.22 | 847.99 ±0.47 | 852.39 ±0.85 | 826.99 ±12.51 | 783.55 ±13.73 | 787.68 ±7.00 |
| text generation t/s @ 128 | 79.73 ±0.79 | 72.92 ±0.14 | 79.86 ±0.22 | 78.04 ±0.46 | 77.30 ±2.47 | 74.44 ±0.15 |
| Measure | IQ3_XXS | IQ3_XS | IQ3_S | IQ3_M | IQ4_XS | IQ4_NL |
| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ |
| bits/weight | 3.2548 | 3.4977 | 3.6606 | 3.7628 | 4.4597 | 4.6818 |
| size (GiB) | 3.04 | 3.27 | 3.42 | 3.52 | 4.17 | 4.38 |
| prompt processing t/s @ 512 | 813.88 ±6.53 | 708.71 ±1.26 | 798.78 ±8.81 | 768.70 ±13.73 | 771.80 ±11.38 | 806.03 ±7.07 |
| text generation t/s @ 128 | 73.95 ±0.20 | 71.67 ±0.54 | 69.31 ±0.63 | 70.15 ±0.33 | 77.51 ±0.20 | 76.63 ±0.28 |
| Measure | Q2_K_S | Q2_K | Q3_K_S | Q3_K_M | Q3_K_L | Q4_K_S |
| --------------------------- | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ |
| bits/weight | 2.9697 | 3.1593 | 3.6429 | 3.9960 | 4.2979 | 4.6672 |
| size (GiB) | 2.78 | 2.95 | 3.41 | 3.74 | 4.02 | 4.36 |
| prompt processing t/s @ 512 | 798.91 ±6.40 | 784.45 ±7.85 | 752.17 ±7.94 | 783.44 ±9.92 | 761.17 ±7.55 | 818.55 ±9.58 |
| text generation t/s @ 128 | 90.01 ±0.12 | 79.85 ±0.20 | 69.84 ±0.18 | 71.68 ±0.22 | 69.38 ±0.49 | 76.71 ±0.20 |
| Measure | Q4_K_S | Q4_K_M | Q5_K_S | Q5_K_M | Q6_K | Q8_0 |
| --------------------------- | ------------ | ------------- | ------------ | ------------ | ------------- | ------------ |
| bits/weight | 4.6672 | 4.8944 | 5.5704 | 5.7036 | 6.5633 | 8.5008 |
| size (GiB) | 4.36 | 4.58 | 5.21 | 5.33 | 6.14 | 7.95 |
| prompt processing t/s @ 512 | 818.55 ±9.58 | 821.81 ±21.44 | 752.52 ±0.99 | 758.69 ±7.43 | 812.01 ±10.82 | 865.09 ±8.30 |
| text generation t/s @ 128 | 76.71 ±0.20 | 71.93 ±1.52 | 69.53 ±0.18 | 67.23 ±1.08 | 58.67 ±3.13 | 50.93 ±0.08 |
| Measure | F16 |
| --------------------------- | ------------ |
| bits/weight | 16.0005 |
| size (GiB) | 14.96 |
| prompt processing t/s @ 512 | 923.49 ±0.53 |
| text generation t/s @ 128 | 29.17 ±0.04 |
| Quant Type | bits/weight | size (GiB) | prompt processing t/s @ 512 | text generation t/s @ 128 |
|:----------:|------------:|-----------:|----------------------------:|--------------------------:|
| IQ1_S | 2.0042 | 1.87 | 858.88 ±1.22 | 79.73 ±0.79 |
| IQ1_M | 2.1460 | 2.01 | 847.99 ±0.47 | 72.92 ±0.14 |
| IQ2_XXS | 2.3824 | 2.23 | 852.39 ±0.85 | 79.86 ±0.22 |
| IQ2_XS | 2.5882 | 2.42 | 826.99 ±12.51 | 78.04 ±0.46 |
| IQ2_S | 2.7403 | 2.56 | 783.55 ±13.73 | 77.30 ±2.47 |
| IQ2_M | 2.9294 | 2.74 | 787.68 ±7.00 | 74.44 ±0.15 |
| IQ3_XXS | 3.2548 | 3.04 | 813.88 ±6.53 | 73.95 ±0.20 |
| IQ3_XS | 3.4977 | 3.27 | 708.71 ±1.26 | 71.67 ±0.54 |
| IQ3_S | 3.6606 | 3.42 | 798.78 ±8.81 | 69.31 ±0.63 |
| IQ3_M | 3.7628 | 3.52 | 768.70 ±13.73 | 70.15 ±0.33 |
| IQ4_XS | 4.4597 | 4.17 | 771.80 ±11.38 | 77.51 ±0.20 |
| IQ4_NL | 4.6818 | 4.38 | 806.03 ±7.07 | 76.63 ±0.28 |
| Q2_K_S | 2.9697 | 2.78 | 798.91 ±6.40 | 90.01 ±0.12 |
| Q2_K | 3.1593 | 2.95 | 784.45 ±7.85 | 79.85 ±0.20 |
| Q3_K_S | 3.6429 | 3.41 | 752.17 ±7.94 | 69.84 ±0.18 |
| Q3_K_M | 3.9960 | 3.74 | 783.44 ±9.92 | 71.68 ±0.22 |
| Q3_K_L | 4.2979 | 4.02 | 761.17 ±7.55 | 69.38 ±0.49 |
| Q4_K_S | 4.6672 | 4.36 | 818.55 ±9.58 | 76.71 ±0.20 |
| Q4_K_M | 4.8944 | 4.58 | 821.81 ±21.44 | 71.93 ±1.52 |
| Q5_K_S | 5.5704 | 5.21 | 752.52 ±0.99 | 69.53 ±0.18 |
| Q5_K_M | 5.7036 | 5.33 | 758.69 ±7.43 | 67.23 ±1.08 |
| Q6_K | 6.5633 | 6.14 | 812.01 ±10.82 | 58.67 ±3.13 |
| Q8_0 | 8.5008 | 7.95 | 865.09 ±8.30 | 50.93 ±0.08 |
| F16 | 16.0005 | 14.96 | 923.49 ±0.53 | 29.17 ±0.04 |
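As a sanity check on the table, the size column follows directly from the bits/weight column: size ≈ bpw × parameter count / 8 bytes. A small sketch below assumes roughly 8.03B parameters for Llama-3.1-8B (an assumed figure, not stated in this README).

```cpp
// Rough size check for one row of the table above.
// The ~8.03B parameter count for Llama-3.1-8B is an assumption.
#include <cstdio>

int main() {
    const double n_params = 8.03e9;  // assumed parameter count
    const double bpw      = 4.8944;  // Q4_K_M row above
    const double gib      = bpw * n_params / 8.0 / (1024.0 * 1024.0 * 1024.0);
    std::printf("estimated Q4_K_M size: %.2f GiB\n", gib); // ~4.58 GiB, matching the table
    return 0;
}
```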
## Background information on llama-quantize

View File

@ -118,21 +118,27 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable);
printf(" [--target-bpw n] [--no-importance] [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type]\n");
printf(" [--prune-layers] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: disable k-quant mixtures and quantize all tensors to the same type\n");
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. Example: --tensor-type attn_q=q8_0\n");
printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n");
printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
printf(" Advanced option to remove all tensors from the given layers\n");
printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n");
printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
printf(" --no-importance: distribute bpw budget equitably across all tensors\n");
printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n");
printf(" --keep-bpw-state: save the bpw computations to <model name>-<model hash>.bpw_state\n");
printf(" --bpw-state: file name to use instead of default\n");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@ -215,7 +221,10 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector<std
return m_last_call;
}
static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
static int load_imatrix(const std::string & imatrix_file,
std::vector<std::string> & imatrix_datasets,
std::unordered_map<std::string, std::vector<float>> & values_data,
std::unordered_map<std::string, std::vector<float>> & activations_data) {
struct ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
@ -225,7 +234,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params);
if (!ctx_gguf) {
fprintf(stderr, "%s: imatrix file '%s' is using old format\n", __func__, imatrix_file.c_str());
return load_legacy_imatrix(imatrix_file, imatrix_datasets, imatrix_data);
return load_legacy_imatrix(imatrix_file, imatrix_datasets, values_data);
}
const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
if (n_entries < 1) {
@ -247,11 +256,12 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx);
const std::string sums_suffix{ ".in_sum2" };
const std::string sums_suffix{ ".in_sum" };
const std::string sums2_suffix{ ".in_sum2" };
const std::string counts_suffix{ ".counts" };
// Using an ordered map to get a deterministic iteration order.
std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
std::map<std::string, std::tuple<struct ggml_tensor *, struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string name = cur->name;
@ -259,44 +269,55 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
if (name.empty()) { continue; }
if (string_remove_suffix(name, sums_suffix)) {
// in_sum
std::get<0>(sums_counts_for[std::move(name)]) = cur;
} else if (string_remove_suffix(name, sums2_suffix)) {
// in_sum2
sums_counts_for[std::move(name)].first = cur;
std::get<1>(sums_counts_for[std::move(name)]) = cur;
} else if (string_remove_suffix(name, counts_suffix)) {
// counts
sums_counts_for[std::move(name)].second = cur;
} else {
std::get<2>(sums_counts_for[std::move(name)]) = cur;
} else {
// ignore other tensors
}
}
for (const auto & sc : sums_counts_for) {
const std::string & name = sc.first;
const struct ggml_tensor * sums = sc.second.first;
const struct ggml_tensor * counts = sc.second.second;
const struct ggml_tensor * sums = std::get<0>(sc.second);
const struct ggml_tensor * sums2 = std::get<1>(sc.second);
const struct ggml_tensor * counts = std::get<2>(sc.second);
if (!sums || !counts) {
// check sums2 and counts are present, and that sums and sums2 have the same shape
if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) {
fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str());
gguf_free(ctx_gguf);
ggml_free(ctx);
exit(1);
}
const int64_t ne0 = sums->ne[0];
const int64_t ne1 = sums->ne[1];
const int64_t ne0 = sums2->ne[0];
const int64_t ne1 = sums2->ne[1];
auto & e = imatrix_data[name];
e.resize(ggml_nelements(sums));
auto & activations = activations_data[name];
auto & values = values_data[name];
if (sums) {
activations.resize(ggml_nelements(sums));
}
values.resize(ggml_nelements(sums2));
float max_count = 0.0f;
for (int64_t j = 0; j < ne1; ++j) {
const float count = ((const float *) counts->data)[j];
if (count > 0.0f) {
for (int64_t i = 0; i < ne0; ++i) {
e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count;
values[j*ne0 + i] = ((const float *) sums2->data)[j*ne0 + i] / count;
if (sums) { activations[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; }
}
} else {
// Partial imatrix data, this tensor never got any input during calibration
for (int64_t i = 0; i < ne0; ++i) {
e[j*ne0 + i] = 1;
values[j*ne0 + i] = 1;
if (sums) { activations[j*ne0 + i] = 0; }
}
}
if (count > max_count) {
@ -304,7 +325,8 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
}
}
if (getenv("LLAMA_TRACE")) {
printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", __func__, int(e.size()), int(max_count), int(max_count / chunk_size), name.c_str());
printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n",
__func__, int(values.size()), int(max_count), int(max_count / chunk_size), name.c_str());
}
}
@ -321,7 +343,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
}
printf("]\n");
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk);
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(values_data.size()), imatrix_file.c_str(), m_last_chunk);
gguf_free(ctx_gguf);
ggml_free(ctx);
@ -333,41 +355,56 @@ static int prepare_imatrix(const std::string & imatrix_file,
std::vector<std::string> & imatrix_dataset,
const std::vector<std::string> & included_weights,
const std::vector<std::string> & excluded_weights,
std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
std::unordered_map<std::string, std::vector<float>> & values_data,
std::unordered_map<std::string, std::vector<float>> & activations_data) {
int m_last_call = -1;
if (!imatrix_file.empty()) {
m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data);
}
if (imatrix_data.empty()) {
if (values_data.empty()) {
return m_last_call;
}
if (!excluded_weights.empty()) {
for (const auto & name : excluded_weights) {
for (auto it = imatrix_data.begin(); it != imatrix_data.end();) {
auto pos = it->first.find(name);
for (auto vt = values_data.begin(); vt != values_data.end();) {
auto pos = vt->first.find(name);
if (pos != std::string::npos) {
it = imatrix_data.erase(it);
vt = values_data.erase(vt);
} else {
++it;
++vt;
}
}
for (auto at = activations_data.begin(); at != activations_data.end();) {
auto pos = at->first.find(name);
if (pos != std::string::npos) {
at = activations_data.erase(at);
} else {
++at;
}
}
}
}
if (!included_weights.empty()) {
std::unordered_map<std::string, std::vector<float>> tmp;
std::unordered_map<std::string, std::vector<float>> tmp_values;
std::unordered_map<std::string, std::vector<float>> tmp_activations;
for (const auto & name : included_weights) {
for (auto & e : imatrix_data) {
for (auto & e : values_data) {
auto pos = e.first.find(name);
if (pos != std::string::npos) {
tmp.emplace(std::move(e));
tmp_values.emplace(std::move(e));
}
}
for (auto & a : activations_data) {
auto pos = a.first.find(name);
if (pos != std::string::npos) {
tmp_activations.emplace(std::move(a));
}
}
}
imatrix_data = std::move(tmp);
}
if (!imatrix_data.empty()) {
printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
values_data = std::move(tmp_values);
activations_data = std::move(tmp_activations);
}
return m_last_call;
}
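A standalone sketch of the filter semantics in `prepare_imatrix` (tensor names below are illustrative only): both `--include-weights` and `--exclude-weights` use plain substring matching via `std::string::find`, so a pattern such as `ffn_down` matches that tensor in every layer.

```cpp
// Substring-based exclusion, mirroring the loops above; tensor names are made up.
#include <cstdio>
#include <iterator>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
    std::unordered_map<std::string, std::vector<float>> values = {
        {"blk.0.attn_q.weight", {}}, {"blk.0.ffn_down.weight", {}}, {"output.weight", {}},
    };
    const std::string excluded = "ffn_down";
    for (auto it = values.begin(); it != values.end();) {
        it = (it->first.find(excluded) != std::string::npos) ? values.erase(it) : std::next(it);
    }
    for (const auto & v : values) { std::printf("kept: %s\n", v.first.c_str()); } // ffn_down dropped
    return 0;
}
```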
@ -441,6 +478,52 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
return true;
}
static bool parse_target_bpw(const char * data, float & target_bpw) {
if (!data) {
printf("\n%s: no target bits per weight (bpw) provided\n\n", __func__);
return false;
}
try {
target_bpw = std::stof(data);
if (target_bpw < 0.0f || target_bpw > 16.0f) {
printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__);
return false;
}
}
catch (const std::exception & e) {
printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data);
return false;
}
return true;
}
static const char * get_ftype(const float bpw) {
const std::map<float, const char *> quant_bpw = {
{1.5625, "IQ1_S"},
{1.7500, "IQ1_M"},
{2.0625, "IQ2_XXS"},
{2.3125, "IQ2_XS"},
{2.5625, "IQ2_S"},
{2.6250, "Q2_K"},
{3.0625, "IQ3_XXS"},
{3.4375, "Q3_K"},
{4.2500, "IQ4_XS"},
{4.5000, "Q4_K"},
{5.5000, "Q5_K"},
{6.5625, "Q6_K"},
{8.5000, "Q8_0"},
#ifdef GGML_USE_METAL
{16.0000, "F16"}
#else
{16.0000, "BF16"}
#endif
};
return quant_bpw.lower_bound(bpw)->second;
}
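For reference, a tiny standalone check of the fallback mapping (reproducing only a few entries from the table above): `std::map::lower_bound` returns the first entry whose bpw is not less than the target, so the chosen ftype is the smallest listed quant at or above the requested bpw; a target of 4.567, for instance, falls back to Q5_K.

```cpp
// Minimal check of the lower_bound lookup used by get_ftype(); only a few entries
// are reproduced, and the caller is assumed to keep bpw <= 16 so lower_bound
// never returns end().
#include <cstdio>
#include <map>

int main() {
    const std::map<float, const char *> quant_bpw = {
        {4.2500f, "IQ4_XS"}, {4.5000f, "Q4_K"}, {5.5000f, "Q5_K"}, {16.0000f, "BF16"},
    };
    for (float bpw : {4.25f, 4.567f, 6.0f}) {
        std::printf("%.3f bpw -> %s\n", bpw, quant_bpw.lower_bound(bpw)->second);
    }
    return 0; // prints IQ4_XS, Q5_K, BF16
}
```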
int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@ -454,6 +537,7 @@ int main(int argc, char ** argv) {
std::vector<llama_model_kv_override> kv_overrides;
std::vector<tensor_quantization> tensor_types;
std::vector<int> prune_layers;
float target_bpw = -1.0f;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@ -480,6 +564,20 @@ int main(int argc, char ** argv) {
if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--target-bpw") == 0) {
if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--no-importance") == 0) {
params.no_importance = true;
} else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) {
params.keep_bpw_state = true;
} else if (strcmp(argv[arg_idx], "--bpw-state") == 0) {
if (arg_idx < argc-1) {
params.bpw_state = argv[++arg_idx];
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
usage(argv[0]);
@ -526,10 +624,11 @@ int main(int argc, char ** argv) {
}
std::vector<std::string> imatrix_datasets;
std::unordered_map<std::string, std::vector<float>> imatrix_data;
int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
if (!imatrix_data.empty()) {
params.imatrix = &imatrix_data;
std::unordered_map<std::string, std::vector<float>> values_data;
std::unordered_map<std::string, std::vector<float>> activations_data;
int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data);
if (!values_data.empty()) {
params.imatrix = &values_data;
{
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
@ -552,7 +651,7 @@ int main(int argc, char ** argv) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.val_i64 = imatrix_data.size();
kvo.val_i64 = values_data.size();
kv_overrides.emplace_back(std::move(kvo));
}
@ -564,6 +663,9 @@ int main(int argc, char ** argv) {
kv_overrides.emplace_back(std::move(kvo));
}
}
if (!activations_data.empty()) {
params.activations = &activations_data;
}
if (!kv_overrides.empty()) {
kv_overrides.emplace_back();
kv_overrides.back().key[0] = 0;
@ -575,6 +677,9 @@ int main(int argc, char ** argv) {
if (!prune_layers.empty()) {
params.prune_layers = &prune_layers;
}
if (target_bpw != -1.0f) {
params.target_bpw = target_bpw;
}
llama_backend_init();
@ -585,6 +690,7 @@ int main(int argc, char ** argv) {
std::string ftype_str;
std::string suffix = ".gguf";
std::vector<const char *> tmp_argv(argv, argv + argc);
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
std::string fpath;
const size_t pos = fname_inp.find_last_of("/\\");
@ -608,7 +714,15 @@ int main(int argc, char ** argv) {
}
arg_idx++;
if (argc <= arg_idx) {
// select quantization type if target_bpw is set unless user specifies type and threads
if (argc - arg_idx <= 1 && params.target_bpw != -1.0f) {
auto * ftype = const_cast<char *>(get_ftype(params.target_bpw));
if (argc == arg_idx) { tmp_argv.push_back(ftype); }
else { tmp_argv.insert(tmp_argv.end() - 1, ftype); }
tmp_argv.push_back(nullptr);
argv = const_cast<char **>(tmp_argv.data());
argc++;
} else if (argc <= arg_idx) {
fprintf(stderr, "%s: missing ftype\n", __func__);
return 1;
}
@ -637,7 +751,7 @@ int main(int argc, char ** argv) {
params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && values_data.empty()) {
fprintf(stderr, "\n==========================================================================================================\n");
fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
fprintf(stderr, "==========================================================================================================\n\n\n");