commit 04eb22b1fe
Ed Addario 2026-03-15 23:55:07 +02:00, committed by GitHub
4 changed files with 1813 additions and 534 deletions


@ -393,9 +393,16 @@ extern "C" {
bool keep_split; // quantize to the same number of shards
bool dry_run; // calculate and show the final quantization size without performing quantization
void * imatrix; // pointer to importance matrix data
void * activations; // pointer to activations data
void * statistics; // pointer to statistics data
void * kv_overrides; // pointer to vector containing overrides
void * tensor_types; // pointer to vector containing tensor types
void * prune_layers; // pointer to vector containing layer indices to prune
float target_bpw; // target bits per weight (bpw)
int64_t target_size; // target file size in bytes
bool save_state; // keep bpw state file
void * state_file; // pointer to bpw state file
float importance_pct; // identify up to pct% of tensors as important
} llama_model_quantize_params;
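For orientation, a minimal sketch (not part of this diff) of how a caller might drive the new fields through the public API, assuming the usual `llama_model_quantize_default_params()` and `llama_model_quantize()` entry points; error handling elided:

```cpp
#include "llama.h"

// Sketch: quantize with an overall bits-per-weight target via the C API.
static uint32_t quantize_to_bpw(const char * fname_in, const char * fname_out) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.target_bpw = 4.5f;  // aim for ~4.5 bits per weight overall
    params.save_state = true;  // keep the bpw state file so later runs can reuse it
    // params.imatrix / params.activations / params.statistics would normally be
    // pointed at maps loaded from an imatrix GGUF before this call (see below).
    return llama_model_quantize(fname_in, fname_out, &params);  // 0 on success
}
```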
typedef struct llama_logit_bias {

File diff suppressed because it is too large


@ -56,8 +56,13 @@ Options:
* `--keep-split` will generate the quantized model in the same shards as the input file; otherwise, it will produce a single quantized file
Advanced options:
* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times
* `--prune-layers` prune (remove) the layers in the list
* `--target-bpw` automatically choose quant types to meet an overall bits per weight (bpw) target
* `--target-size` automatically choose quant types to meet a file size target
* `--ignore-tensor-importance` during target computation, treat each tensor equally instead of prioritizing some; this may yield better quality for some models
* `--save-state` save the target computations to a file. By default, it saves to `<model name>-<model hash>-mse.bpw_state` unless `--state-file` is also specified
* `--state-file` file name to load target computations from, or save them to
* `--override-kv` option to override model metadata by key in the quantized model. May be specified multiple times
Examples:
@ -97,59 +102,64 @@ Examples:
./llama-quantize --imatrix imatrix.gguf --override-kv qwen3moe.expert_used_count=int:16 --prune-layers 20,21,22 input-model-f32.gguf pruned-model-f32.gguf copy 8
```
```bash
# quantize model targeting a specific bpw average and save the target computations to the default state file. The quantization type is optional and can be omitted (the trailing 8 is the thread count)
./llama-quantize --target-bpw 4.5678 --save-state --imatrix imatrix.gguf input-model-f32.gguf 8
```
```bash
# quantize model targeting a specific file size and save the target computations to a custom state file. The quantization type is optional and can be omitted
./llama-quantize --target-size 1.5gb --save-state --state-file my-state-file.dat --imatrix imatrix.gguf input-model-f32.gguf 8
```
```bash
# quantize model targeting a specific bpw average, reusing previous target computations
./llama-quantize --target-bpw 2.5 --state-file my-state-file.dat --imatrix imatrix.gguf input-model-f32.gguf 8
```
## Memory/Disk Requirements
When running the larger models, make sure you have enough disk space to store all the intermediate files.
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For example (Llama 3.1):
| Model | Original size | Quantized size (Q4_K_M) |
|------:|--------------:|------------------------:|
| 8B | 32.1 GB | 4.9 GB |
| 70B | 280.9 GB | 43.1 GB |
| 405B | 1,625.1 GB | 249.1 GB |
## Quantization
Several quantization methods are supported. They differ in the resulting model disk size and inference speed. For example,
### [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
| Quant Type | bits/weight | size (GiB) | prompt processing t/s @ 512 | text generation t/s @ 128 |
|:----------:|------------:|-----------:|----------------------------:|--------------------------:|
| IQ1_S | 2.0042 | 1.87 | 858.88 ±1.22 | 79.73 ±0.79 |
| IQ1_M | 2.1460 | 2.01 | 847.99 ±0.47 | 72.92 ±0.14 |
| IQ2_XXS | 2.3824 | 2.23 | 852.39 ±0.85 | 79.86 ±0.22 |
| IQ2_XS | 2.5882 | 2.42 | 826.99 ±12.51 | 78.04 ±0.46 |
| IQ2_S | 2.7403 | 2.56 | 783.55 ±13.73 | 77.30 ±2.47 |
| IQ2_M | 2.9294 | 2.74 | 787.68 ±7.00 | 74.44 ±0.15 |
| IQ3_XXS | 3.2548 | 3.04 | 813.88 ±6.53 | 73.95 ±0.20 |
| IQ3_XS | 3.4977 | 3.27 | 708.71 ±1.26 | 71.67 ±0.54 |
| IQ3_S | 3.6606 | 3.42 | 798.78 ±8.81 | 69.31 ±0.63 |
| IQ3_M | 3.7628 | 3.52 | 768.70 ±13.73 | 70.15 ±0.33 |
| IQ4_XS | 4.4597 | 4.17 | 771.80 ±11.38 | 77.51 ±0.20 |
| IQ4_NL | 4.6818 | 4.38 | 806.03 ±7.07 | 76.63 ±0.28 |
| Q2_K_S | 2.9697 | 2.78 | 798.91 ±6.40 | 90.01 ±0.12 |
| Q2_K | 3.1593 | 2.95 | 784.45 ±7.85 | 79.85 ±0.20 |
| Q3_K_S | 3.6429 | 3.41 | 752.17 ±7.94 | 69.84 ±0.18 |
| Q3_K_M | 3.9960 | 3.74 | 783.44 ±9.92 | 71.68 ±0.22 |
| Q3_K_L | 4.2979 | 4.02 | 761.17 ±7.55 | 69.38 ±0.49 |
| Q4_K_S | 4.6672 | 4.36 | 818.55 ±9.58 | 76.71 ±0.20 |
| Q4_K_M | 4.8944 | 4.58 | 821.81 ±21.44 | 71.93 ±1.52 |
| Q5_K_S | 5.5704 | 5.21 | 752.52 ±0.99 | 69.53 ±0.18 |
| Q5_K_M | 5.7036 | 5.33 | 758.69 ±7.43 | 67.23 ±1.08 |
| Q6_K | 6.5633 | 6.14 | 812.01 ±10.82 | 58.67 ±3.13 |
| Q8_0 | 8.5008 | 7.95 | 865.09 ±8.30 | 50.93 ±0.08 |
| F16 | 16.0005 | 14.96 | 923.49 ±0.53 | 29.17 ±0.04 |
## Background information on llama-quantize


@ -8,15 +8,12 @@
#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
// result of parsing --tensor-type option
// (changes to this struct must be reflected in src/llama-quant.cpp)
@ -246,7 +243,11 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector<std
return m_last_call;
}
static int load_imatrix(const std::string & imatrix_file,
std::vector<std::string> & imatrix_datasets,
std::unordered_map<std::string, std::vector<float>> & values_data,
std::unordered_map<std::string, std::vector<float>> & activations_data,
std::unordered_map<std::string, std::vector<float>> & statistics_data) {
struct ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
@ -256,7 +257,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params);
if (!ctx_gguf) {
fprintf(stderr, "%s: imatrix file '%s' is using old format\n", __func__, imatrix_file.c_str());
return load_legacy_imatrix(imatrix_file, imatrix_datasets, values_data);
}
const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
if (n_entries < 1) {
@ -278,11 +279,13 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx);
const std::string sums_suffix{ ".in_sum2" };
const std::string sums_suffix{ ".in_sum" };
const std::string sums2_suffix{ ".in_sum2" };
const std::string counts_suffix{ ".counts" };
const std::string stats_suffix{ ".stats" };
// Using an ordered map to get a deterministic iteration order.
std::map<std::string, std::tuple<struct ggml_tensor *, struct ggml_tensor *, struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
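// tuple slots, keyed by base tensor name:
//   <0> ".in_sum" (activation sums), <1> ".in_sum2" (squared sums), <2> ".counts", <3> ".stats"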
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string name = cur->name;
@ -290,11 +293,17 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
if (name.empty()) { continue; }
if (string_remove_suffix(name, sums_suffix)) {
// in_sum
std::get<0>(sums_counts_for[std::move(name)]) = cur;
} else if (string_remove_suffix(name, sums2_suffix)) {
// in_sum2
std::get<1>(sums_counts_for[std::move(name)]) = cur;
} else if (string_remove_suffix(name, counts_suffix)) {
// counts
std::get<2>(sums_counts_for[std::move(name)]) = cur;
} else if (string_remove_suffix(name, stats_suffix)) {
// stats
std::get<3>(sums_counts_for[std::move(name)]) = cur;
} else {
// ignore other tensors
}
@ -302,32 +311,55 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
for (const auto & sc : sums_counts_for) {
const std::string & name = sc.first;
const struct ggml_tensor * sums = std::get<0>(sc.second);
const struct ggml_tensor * sums2 = std::get<1>(sc.second);
const struct ggml_tensor * counts = std::get<2>(sc.second);
const struct ggml_tensor * stats = std::get<3>(sc.second);
// check sums2 and counts are present, and that sums and sums2 have the same shape
if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) {
fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str());
gguf_free(ctx_gguf);
ggml_free(ctx);
exit(1);
}
const int64_t ne0 = sums2->ne[0];
const int64_t ne1 = sums2->ne[1];
auto & activations = activations_data[name];
auto & values = values_data[name];
if (sums) {
activations.resize(ggml_nelements(sums));
}
if (stats) {
auto & statistics = statistics_data[name];
statistics.resize(ggml_nelements(stats));
if (stats->type == GGML_TYPE_F32) {
std::memcpy(statistics.data(), stats->data, ggml_nelements(stats) * sizeof(float));
} else {
fprintf(stderr, "%s: unsupported .stats type '%s' for '%s' - ignoring entry\n",
__func__, ggml_type_name(stats->type), name.c_str());
statistics.clear();
statistics_data.erase(name);
}
}
values.resize(ggml_nelements(sums2));
float max_count = 0.0f;
for (int64_t j = 0; j < ne1; ++j) {
const float count = ((const float *) counts->data)[j];
if (count > 0.0f) {
for (int64_t i = 0; i < ne0; ++i) {
values[j*ne0 + i] = ((const float *) sums2->data)[j*ne0 + i] / count;
if (sums) { activations[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; }
}
} else {
// Partial imatrix data, this tensor never got any input during calibration
for (int64_t i = 0; i < ne0; ++i) {
values[j*ne0 + i] = 1;
if (sums) { activations[j*ne0 + i] = 0; }
}
}
if (count > max_count) {
@ -335,7 +367,8 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
}
}
if (getenv("LLAMA_TRACE")) {
printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", __func__, int(e.size()), int(max_count), int(max_count / chunk_size), name.c_str());
printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n",
__func__, int(values.size()), int(max_count), int(max_count / chunk_size), name.c_str());
}
}
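// In effect, for each row j and element i:
//   values[j*ne0 + i]      = in_sum2[j*ne0 + i] / counts[j]   (mean squared activation)
//   activations[j*ne0 + i] = in_sum[j*ne0 + i]  / counts[j]   (mean activation, when present)
// Rows that saw no calibration input fall back to values = 1, activations = 0.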
@ -352,7 +385,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
}
printf("]\n");
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk);
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(values_data.size()), imatrix_file.c_str(), m_last_chunk);
gguf_free(ctx_gguf);
ggml_free(ctx);
@ -364,41 +397,73 @@ static int prepare_imatrix(const std::string & imatrix_file,
std::vector<std::string> & imatrix_dataset,
const std::vector<std::string> & included_weights,
const std::vector<std::string> & excluded_weights,
std::unordered_map<std::string, std::vector<float>> & values_data,
std::unordered_map<std::string, std::vector<float>> & activations_data,
std::unordered_map<std::string, std::vector<float>> & statistics_data) {
int m_last_call = -1;
if (!imatrix_file.empty()) {
m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data, statistics_data);
}
if (values_data.empty()) {
return m_last_call;
}
if (!excluded_weights.empty()) {
for (const auto & name : excluded_weights) {
for (auto vt = values_data.begin(); vt != values_data.end();) {
auto pos = vt->first.find(name);
if (pos != std::string::npos) {
vt = values_data.erase(vt);
} else {
++vt;
}
}
for (auto at = activations_data.begin(); at != activations_data.end();) {
auto pos = at->first.find(name);
if (pos != std::string::npos) {
at = activations_data.erase(at);
} else {
++at;
}
}
for (auto st = statistics_data.begin(); st != statistics_data.end();) {
auto pos = st->first.find(name);
if (pos != std::string::npos) {
st = statistics_data.erase(st);
} else {
++st;
}
}
}
}
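The same erase-if-the-key-contains-`name` pass now runs over three maps. A small helper template (hypothetical, not in this commit) would capture the pattern once:

```cpp
// Hypothetical helper: erase every entry whose key contains `name`.
template <typename Map>
static void erase_matching(Map & m, const std::string & name) {
    for (auto it = m.begin(); it != m.end();) {
        if (it->first.find(name) != std::string::npos) {
            it = m.erase(it);  // erase returns the next valid iterator
        } else {
            ++it;
        }
    }
}
```

With it, the loop body above reduces to `erase_matching(values_data, name); erase_matching(activations_data, name); erase_matching(statistics_data, name);`.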
if (!included_weights.empty()) {
std::unordered_map<std::string, std::vector<float>> tmp_values;
std::unordered_map<std::string, std::vector<float>> tmp_activations;
std::unordered_map<std::string, std::vector<float>> tmp_statistics;
for (const auto & name : included_weights) {
for (auto & e : values_data) {
auto pos = e.first.find(name);
if (pos != std::string::npos) {
tmp_values.emplace(std::move(e));
}
}
for (auto & a : activations_data) {
auto pos = a.first.find(name);
if (pos != std::string::npos) {
tmp_activations.emplace(std::move(a));
}
}
for (auto & s : statistics_data) {
auto pos = s.first.find(name);
if (pos != std::string::npos) {
tmp_statistics.emplace(std::move(s));
}
}
}
values_data = std::move(tmp_values);
activations_data = std::move(tmp_activations);
statistics_data = std::move(tmp_statistics);
}
return m_last_call;
}
@ -489,6 +554,109 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
return true;
}
static bool parse_target_bpw(const char * data, float & target_bpw) {
if (!data) {
printf("\n%s: no target bits per weight (bpw) provided\n\n", __func__);
return false;
}
try {
target_bpw = std::stof(data);
if (target_bpw < 0.0f || target_bpw > 16.0f) {
printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__);
return false;
}
}
catch (const std::exception & e) {
printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data);
return false;
}
return true;
}
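// Note: std::stof parses a leading numeric prefix, so input like "4.5bpw" is
// accepted as 4.5f; only fully non-numeric input throws and is rejected here.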
static bool parse_importance_pct(const char * data, float & importance_pct) {
if (!data) {
printf("\n%s: no tensor importance %% provided\n\n", __func__);
return false;
}
try {
importance_pct = std::stof(data);
if (importance_pct < 0.0f || importance_pct > 100.0f) {
printf("\n%s: tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__);
return false;
}
}
catch (const std::exception & e) {
printf("\n%s: '%s' is not valid. Tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__, data);
return false;
}
return true;
}
static bool parse_target_size(const char * data, int64_t & target_size) {
if (!data) {
printf("\n%s: no target file size provided\n\n", __func__);
return false;
}
char * end = nullptr;
const double val = std::strtod(data, &end);
if (end == data || val < 0) {
printf("\n%s: invalid target file size '%s'\n\n", __func__, data);
return false;
}
std::string suffix(end);
for (auto & c : suffix) { c = (char) std::tolower((unsigned char) c); } // cast avoids UB for negative char values
int64_t mul = 0;
if (suffix.empty() || suffix == "b") {
mul = 1;
} else if (suffix == "k" || suffix == "kb") {
mul = 1024;
} else if (suffix == "m" || suffix == "mb") {
mul = 1024 * 1024;
} else if (suffix == "g" || suffix == "gb") {
mul = 1024 * 1024 * 1024;
} else if (suffix == "t" || suffix == "tb") {
mul = 1024LL * 1024 * 1024 * 1024;
} else {
printf("\n%s: invalid unit '%s' in '%s'. Allowed: b, kb, mb, gb, tb (kilo = 1024 bytes)\n\n", __func__, suffix.c_str(), data);
return false;
}
target_size = (int64_t)(val * (double) mul); // multiply before truncating so fractional sizes like "1.5gb" survive
return true;
}
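// Expected behaviour with the 1024-based units above (illustrative, not a test):
//   parse_target_size("1.5gb", sz) -> true,  sz == 1610612736 (1.5 * 1024^3)
//   parse_target_size("750m",  sz) -> true,  sz == 786432000  (750 * 1024^2)
//   parse_target_size("-3kb",  sz) -> false  (negative sizes are rejected)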
static const char * get_ftype(const float bpw) {
const std::map<float, const char *> quant_bpw = {
{1.5625, "IQ1_S"},
{1.7500, "IQ1_M"},
{2.0625, "IQ2_XXS"},
{2.3125, "IQ2_XS"},
{2.5625, "IQ2_S"},
{2.6250, "Q2_K"},
{3.0625, "IQ3_XXS"},
{3.4375, "Q3_K"},
{4.2500, "IQ4_XS"},
{4.5000, "Q4_K"},
{5.5000, "Q5_K"},
{6.5625, "Q6_K"},
{8.5000, "Q8_0"},
#ifdef GGML_USE_METAL
{16.0000, "F16"}
#else
{16.0000, "BF16"}
#endif
};
const auto it = quant_bpw.lower_bound(bpw);
return it != quant_bpw.end() ? it->second : quant_bpw.rbegin()->second; // clamp to the largest type if bpw exceeds the table
}
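// lower_bound selects the first entry whose bpw is >= the requested target, e.g.:
//   get_ftype(2.5f) -> "IQ2_S" (first key >= 2.5 is 2.5625)
//   get_ftype(4.5f) -> "Q4_K"  (exact match)
//   get_ftype(4.6f) -> "Q5_K"  (first key >= 4.6 is 5.5000)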
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
@ -504,6 +672,9 @@ int main(int argc, char ** argv) {
std::vector<llama_model_kv_override> kv_overrides;
std::vector<tensor_type_option> tensor_type_opts;
std::vector<int> prune_layers;
float target_bpw = -1.0f;
int64_t target_size = -1;
float importance_pct = 0.0f;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@ -534,6 +705,26 @@ int main(int argc, char ** argv) {
if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_type_opts)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--target-bpw") == 0) {
if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--target-size") == 0) {
if (arg_idx == argc-1 || !parse_target_size(argv[++arg_idx], target_size)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--importance-pct") == 0) {
if (arg_idx == argc-1 || !parse_importance_pct(argv[++arg_idx], importance_pct)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--save-state") == 0) {
params.save_state = true;
} else if (strcmp(argv[arg_idx], "--state-file") == 0) {
if (arg_idx < argc-1) {
params.state_file = argv[++arg_idx];
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
usage(argv[0]);
@ -582,10 +773,12 @@ int main(int argc, char ** argv) {
}
std::vector<std::string> imatrix_datasets;
std::unordered_map<std::string, std::vector<float>> values_data;
std::unordered_map<std::string, std::vector<float>> activations_data;
std::unordered_map<std::string, std::vector<float>> statistics_data;
int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data, statistics_data);
if (!values_data.empty()) {
params.imatrix = &values_data;
{
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
@ -608,7 +801,7 @@ int main(int argc, char ** argv) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.val_i64 = values_data.size();
kv_overrides.emplace_back(std::move(kvo));
}
@ -620,6 +813,12 @@ int main(int argc, char ** argv) {
kv_overrides.emplace_back(std::move(kvo));
}
}
if (!activations_data.empty()) {
params.activations = &activations_data;
}
if (!statistics_data.empty()) {
params.statistics = &statistics_data;
}
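// Both maps travel through the opaque `void *` slots of llama_model_quantize_params
// (see llama.h above); the quantization code presumably casts them back to
// std::unordered_map<std::string, std::vector<float>>.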
if (!kv_overrides.empty()) {
kv_overrides.emplace_back();
kv_overrides.back().key[0] = 0;
@ -631,6 +830,15 @@ int main(int argc, char ** argv) {
if (!prune_layers.empty()) {
params.prune_layers = &prune_layers;
}
if (target_bpw != -1.0f) {
params.target_bpw = target_bpw;
}
if (target_size != -1) {
params.target_size = target_size;
}
if (importance_pct != 0.0f) {
params.importance_pct = importance_pct;
}
llama_backend_init();
@ -641,6 +849,7 @@ int main(int argc, char ** argv) {
std::string ftype_str;
std::string suffix = ".gguf";
std::vector<const char *> tmp_argv(argv, argv + argc);
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
// argv[arg_idx] is the ftype directly: <input> <ftype>
if (!params.dry_run) {
@ -668,7 +877,15 @@ int main(int argc, char ** argv) {
}
arg_idx++;
// If --target-bpw or --target-size is set, select a quantization type unless the user specified type and threads
if (argc - arg_idx <= 1 && (params.target_bpw != -1.0f || params.target_size != -1)) {
auto * ftype = params.target_bpw != -1.0f ? const_cast<char *>(get_ftype(params.target_bpw)) : const_cast<char *>("F16");
if (argc == arg_idx) { tmp_argv.push_back(ftype); }
else { tmp_argv.insert(tmp_argv.end() - 1, ftype); }
tmp_argv.push_back(nullptr);
argv = const_cast<char **>(tmp_argv.data());
argc++;
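// e.g. `llama-quantize --target-bpw 4.5678 --imatrix imatrix.gguf in-f32.gguf 8`
// effectively becomes `... in-f32.gguf Q5_K 8`: get_ftype(4.5678) supplies the
// fallback type and the trailing "8" is still parsed as the thread count.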
} else if (argc <= arg_idx) {
fprintf(stderr, "%s: missing ftype\n", __func__);
return 1;
}