Merge feda897fa2 into 9e2e2198b0
This commit is contained in:
commit
04eb22b1fe
|
|
@ -393,9 +393,16 @@ extern "C" {
|
|||
bool keep_split; // quantize to the same number of shards
|
||||
bool dry_run; // calculate and show the final quantization size without performing quantization
|
||||
void * imatrix; // pointer to importance matrix data
|
||||
void * activations; // pointer to activations data
|
||||
void * statistics; // pointer to statistics data
|
||||
void * kv_overrides; // pointer to vector containing overrides
|
||||
void * tensor_types; // pointer to vector containing tensor types
|
||||
void * prune_layers; // pointer to vector containing layer indices to prune
|
||||
float target_bpw; // target bits per weight (bpw)
|
||||
int64_t target_size; // target file size in bytes
|
||||
bool save_state; // keep bpw state file
|
||||
void * state_file; // pointer to bpw state file
|
||||
float importance_pct; // identify up to pct% of tensors as important
|
||||
} llama_model_quantize_params;
|
||||
|
||||
typedef struct llama_logit_bias {
|
||||
|
|
|
|||
1949
src/llama-quant.cpp
1949
src/llama-quant.cpp
File diff suppressed because it is too large
Load Diff
|
|
@ -56,8 +56,13 @@ Options:
|
|||
* `--keep-split` will generate the quantized model in the same shards as the input file otherwise it will produce a single quantized file
|
||||
|
||||
Advanced options:
|
||||
* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times.
|
||||
* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times
|
||||
* `--prune-layers` prune (remove) the layers in the list
|
||||
* `--target-bpw` automatically choose quant types to meet an overall bits per weight (bpw) target
|
||||
* `--target-size` automatically choose quant types to meet a file size target
|
||||
* `--ignore-tensor-importance` during target computation, treat each tensor equally instead of prioritizing some. It may yield better quality for some models
|
||||
* `--save-state` save the target computation to a file. By default, it saves to `<model name>-<model hash>-mse.bpw_state` unless `--state-file` is also specified
|
||||
* `--state-file` name of the file to load target computations from / save them to
|
||||
* `--override-kv` option to override model metadata by key in the quantized model. May be specified multiple times
|
||||
|
||||
Examples:
|
||||
|
|
@ -97,59 +102,64 @@ Examples:
|
|||
./llama-quantize --imatrix imatrix.gguf --override-kv qwen3moe.expert_used_count=int:16 --prune-layers 20,21,22 input-model-f32.gguf pruned-model-f32.gguf copy 8
|
||||
```
|
||||
|
||||
```bash
|
||||
# quantize model targeting a specific bpw average and save the target computations to the default file. Model type is optional and can be omitted
|
||||
./llama-quantize --target-bpw 4.5678 --save-state --imatrix imatrix.gguf input-model-f32.gguf 8
|
||||
```
|
||||
|
||||
```bash
|
||||
# quantize model targeting a specific file size and save the target computations to a custom file. Model type is optional and can be omitted
|
||||
./llama-quantize --target-size 1.5gb --save-state --state-file my-state-file.dat --imatrix imatrix.gguf input-model-f32.gguf 8
|
||||
```
|
||||
|
||||
```bash
|
||||
# quantize model targeting a specific bpw average reusing previous target computations
|
||||
./llama-quantize --target-bpw 2.5 --state-file my-state-file.dat --imatrix imatrix.gguf input-model-f32.gguf 8
|
||||
```
|
||||
|
||||
## Memory/Disk Requirements
|
||||
|
||||
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
||||
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For example (Llama 3.1):
|
||||
|
||||
| Model | Original size | Quantized size (Q4_K_M) |
|
||||
| ----: | ------------: | ----------------------: |
|
||||
|------:|--------------:|------------------------:|
|
||||
| 8B | 32.1 GB | 4.9 GB |
|
||||
| 70B | 280.9 GB | 43.1 GB |
|
||||
| 405B | 1,625.1 GB | 249.1 GB |
|
||||
|
||||
|
||||
## Quantization
|
||||
|
||||
Several quantization methods are supported. They differ in the resulting model disk size and inference speed. For example,
|
||||
|
||||
### [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
|
||||
|
||||
| Measure | IQ1_S | IQ1_M | IQ2_XXS | IQ2_XS | IQ2_S | IQ2_M |
|
||||
| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ |
|
||||
| bits/weight | 2.0042 | 2.1460 | 2.3824 | 2.5882 | 2.7403 | 2.9294 |
|
||||
| size (GiB) | 1.87 | 2.01 | 2.23 | 2.42 | 2.56 | 2.74 |
|
||||
| prompt processing t/s @ 512 | 858.88 ±1.22 | 847.99 ±0.47 | 852.39 ±0.85 | 826.99 ±12.51 | 783.55 ±13.73 | 787.68 ±7.00 |
|
||||
| text generation t/s @ 128 | 79.73 ±0.79 | 72.92 ±0.14 | 79.86 ±0.22 | 78.04 ±0.46 | 77.30 ±2.47 | 74.44 ±0.15 |
|
||||
|
||||
| Measure | IQ3_XXS | IQ3_XS | IQ3_S | IQ3_M | IQ4_XS | IQ4_NL |
|
||||
| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ |
|
||||
| bits/weight | 3.2548 | 3.4977 | 3.6606 | 3.7628 | 4.4597 | 4.6818 |
|
||||
| size (GiB) | 3.04 | 3.27 | 3.42 | 3.52 | 4.17 | 4.38 |
|
||||
| prompt processing t/s @ 512 | 813.88 ±6.53 | 708.71 ±1.26 | 798.78 ±8.81 | 768.70 ±13.73 | 771.80 ±11.38 | 806.03 ±7.07 |
|
||||
| text generation t/s @ 128 | 73.95 ±0.20 | 71.67 ±0.54 | 69.31 ±0.63 | 70.15 ±0.33 | 77.51 ±0.20 | 76.63 ±0.28 |
|
||||
|
||||
|
||||
| Measure | Q2_K_S | Q2_K | Q3_K_S | Q3_K_M | Q3_K_L | Q4_K_S |
|
||||
| --------------------------- | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ |
|
||||
| bits/weight | 2.9697 | 3.1593 | 3.6429 | 3.9960 | 4.2979 | 4.6672 |
|
||||
| size (GiB) | 2.78 | 2.95 | 3.41 | 3.74 | 4.02 | 4.36 |
|
||||
| prompt processing t/s @ 512 | 798.91 ±6.40 | 784.45 ±7.85 | 752.17 ±7.94 | 783.44 ±9.92 | 761.17 ±7.55 | 818.55 ±9.58 |
|
||||
| text generation t/s @ 128 | 90.01 ±0.12 | 79.85 ±0.20 | 69.84 ±0.18 | 71.68 ±0.22 | 69.38 ±0.49 | 76.71 ±0.20 |
|
||||
|
||||
| Measure | Q4_K_S | Q4_K_M | Q5_K_S | Q5_K_M | Q6_K | Q8_0 |
|
||||
| --------------------------- | ------------ | ------------- | ------------ | ------------ | ------------- | ------------ |
|
||||
| bits/weight | 4.6672 | 4.8944 | 5.5704 | 5.7036 | 6.5633 | 8.5008 |
|
||||
| size (GiB) | 4.36 | 4.58 | 5.21 | 5.33 | 6.14 | 7.95 |
|
||||
| prompt processing t/s @ 512 | 818.55 ±9.58 | 821.81 ±21.44 | 752.52 ±0.99 | 758.69 ±7.43 | 812.01 ±10.82 | 865.09 ±8.30 |
|
||||
| text generation t/s @ 128 | 76.71 ±0.20 | 71.93 ±1.52 | 69.53 ±0.18 | 67.23 ±1.08 | 58.67 ±3.13 | 50.93 ±0.08 |
|
||||
|
||||
| Measure | F16 |
|
||||
| --------------------------- | ------------ |
|
||||
| bits/weight | 16.0005 |
|
||||
| size (GiB) | 14.96 |
|
||||
| prompt processing t/s @ 512 | 923.49 ±0.53 |
|
||||
| text generation t/s @ 128 | 29.17 ±0.04 |
|
||||
| Quant Type | bits/weight | size (GiB) | prompt processing t/s @ 512 | text generation t/s @ 128 |
|
||||
|:----------:|------------:|-----------:|----------------------------:|--------------------------:|
|
||||
| IQ1_S | 2.0042 | 1.87 | 858.88 ±1.22 | 79.73 ±0.79 |
|
||||
| IQ1_M | 2.1460 | 2.01 | 847.99 ±0.47 | 72.92 ±0.14 |
|
||||
| IQ2_XXS | 2.3824 | 2.23 | 852.39 ±0.85 | 79.86 ±0.22 |
|
||||
| IQ2_XS | 2.5882 | 2.42 | 826.99 ±12.51 | 78.04 ±0.46 |
|
||||
| IQ2_S | 2.7403 | 2.56 | 783.55 ±13.73 | 77.30 ±2.47 |
|
||||
| IQ2_M | 2.9294 | 2.74 | 787.68 ±7.00 | 74.44 ±0.15 |
|
||||
| IQ3_XXS | 3.2548 | 3.04 | 813.88 ±6.53 | 73.95 ±0.20 |
|
||||
| IQ3_XS | 3.4977 | 3.27 | 708.71 ±1.26 | 71.67 ±0.54 |
|
||||
| IQ3_S | 3.6606 | 3.42 | 798.78 ±8.81 | 69.31 ±0.63 |
|
||||
| IQ3_M | 3.7628 | 3.52 | 768.70 ±13.73 | 70.15 ±0.33 |
|
||||
| IQ4_XS | 4.4597 | 4.17 | 771.80 ±11.38 | 77.51 ±0.20 |
|
||||
| IQ4_NL | 4.6818 | 4.38 | 806.03 ±7.07 | 76.63 ±0.28 |
|
||||
| Q2_K_S | 2.9697 | 2.78 | 798.91 ±6.40 | 90.01 ±0.12 |
|
||||
| Q2_K | 3.1593 | 2.95 | 784.45 ±7.85 | 79.85 ±0.20 |
|
||||
| Q3_K_S | 3.6429 | 3.41 | 752.17 ±7.94 | 69.84 ±0.18 |
| Q3_K_M | 3.9960 | 3.74 | 783.44 ±9.92 | 71.68 ±0.22 |
|
||||
| Q3_K_L | 4.2979 | 4.02 | 761.17 ±7.55 | 69.38 ±0.49 |
|
||||
| Q4_K_S | 4.6672 | 4.36 | 818.55 ±9.58 | 76.71 ±0.20 |
|
||||
| Q4_K_M | 4.8944 | 4.58 | 821.81 ±21.44 | 71.93 ±1.52 |
|
||||
| Q5_K_S | 5.5704 | 5.21 | 752.52 ±0.99 | 69.53 ±0.18 |
|
||||
| Q5_K_M | 5.7036 | 5.33 | 758.69 ±7.43 | 67.23 ±1.08 |
|
||||
| Q6_K | 6.5633 | 6.14 | 812.01 ±10.82 | 58.67 ±3.13 |
|
||||
| Q8_0 | 8.5008 | 7.95 | 865.09 ±8.30 | 50.93 ±0.08 |
|
||||
| F16 | 16.0005 | 14.96 | 923.49 ±0.53 | 29.17 ±0.04 |
|
||||
|
||||
## Background information on llama-quantize
|
||||
|
||||
|
|
|
|||
|
|
@ -8,15 +8,12 @@
|
|||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
#include <fstream>
|
||||
#include <cmath>
|
||||
#include <cctype>
|
||||
#include <algorithm>
|
||||
#include <filesystem>
|
||||
#include <vector>
|
||||
|
||||
// result of parsing --tensor-type option
|
||||
// (changes to this struct must be reflected in src/llama-quant.cpp)
|
||||
|
|
@ -246,7 +243,11 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector<std
|
|||
return m_last_call;
|
||||
}
|
||||
|
||||
static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
|
||||
static int load_imatrix(const std::string & imatrix_file,
|
||||
std::vector<std::string> & imatrix_datasets,
|
||||
std::unordered_map<std::string, std::vector<float>> & values_data,
|
||||
std::unordered_map<std::string, std::vector<float>> & activations_data,
|
||||
std::unordered_map<std::string, std::vector<float>> & statistics_data) {
|
||||
|
||||
struct ggml_context * ctx = nullptr;
|
||||
struct gguf_init_params meta_gguf_params = {
|
||||
|
|
@ -256,7 +257,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
|
|||
struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params);
|
||||
if (!ctx_gguf) {
|
||||
fprintf(stderr, "%s: imatrix file '%s' is using old format\n", __func__, imatrix_file.c_str());
|
||||
return load_legacy_imatrix(imatrix_file, imatrix_datasets, imatrix_data);
|
||||
return load_legacy_imatrix(imatrix_file, imatrix_datasets, values_data);
|
||||
}
|
||||
const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
|
||||
if (n_entries < 1) {
|
||||
|
|
@ -278,11 +279,13 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
|
|||
|
||||
const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx);
|
||||
|
||||
const std::string sums_suffix{ ".in_sum2" };
|
||||
const std::string sums_suffix{ ".in_sum" };
|
||||
const std::string sums2_suffix{ ".in_sum2" };
|
||||
const std::string counts_suffix{ ".counts" };
|
||||
const std::string stats_suffix{ ".stats" };
|
||||
|
||||
// Using an ordered map to get a deterministic iteration order.
|
||||
std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
|
||||
std::map<std::string, std::tuple<struct ggml_tensor *, struct ggml_tensor *, struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
|
||||
|
||||
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string name = cur->name;
|
||||
|
|
@ -290,11 +293,17 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
|
|||
if (name.empty()) { continue; }
|
||||
|
||||
if (string_remove_suffix(name, sums_suffix)) {
|
||||
// in_sum
|
||||
std::get<0>(sums_counts_for[std::move(name)]) = cur;
|
||||
} else if (string_remove_suffix(name, sums2_suffix)) {
|
||||
// in_sum2
|
||||
sums_counts_for[std::move(name)].first = cur;
|
||||
std::get<1>(sums_counts_for[std::move(name)]) = cur;
|
||||
} else if (string_remove_suffix(name, counts_suffix)) {
|
||||
// counts
|
||||
sums_counts_for[std::move(name)].second = cur;
|
||||
std::get<2>(sums_counts_for[std::move(name)]) = cur;
|
||||
} else if (string_remove_suffix(name, stats_suffix)) {
|
||||
// stats
|
||||
std::get<3>(sums_counts_for[std::move(name)]) = cur;
|
||||
} else {
|
||||
// ignore other tensors
|
||||
}
|
||||
|
|
@ -302,32 +311,55 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
|
|||
|
||||
for (const auto & sc : sums_counts_for) {
|
||||
const std::string & name = sc.first;
|
||||
const struct ggml_tensor * sums = sc.second.first;
|
||||
const struct ggml_tensor * counts = sc.second.second;
|
||||
const struct ggml_tensor * sums = std::get<0>(sc.second);
|
||||
const struct ggml_tensor * sums2 = std::get<1>(sc.second);
|
||||
const struct ggml_tensor * counts = std::get<2>(sc.second);
|
||||
const struct ggml_tensor * stats = std::get<3>(sc.second);
|
||||
|
||||
if (!sums || !counts) {
|
||||
// check sums2 and counts are present, and that sums and sums2 have the same shape
|
||||
if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) {
|
||||
fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str());
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const int64_t ne0 = sums->ne[0];
|
||||
const int64_t ne1 = sums->ne[1];
|
||||
const int64_t ne0 = sums2->ne[0];
|
||||
const int64_t ne1 = sums2->ne[1];
|
||||
|
||||
auto & e = imatrix_data[name];
|
||||
e.resize(ggml_nelements(sums));
|
||||
auto & activations = activations_data[name];
|
||||
auto & values = values_data[name];
|
||||
if (sums) {
|
||||
activations.resize(ggml_nelements(sums));
|
||||
}
|
||||
if (stats) {
|
||||
auto & statistics = statistics_data[name];
|
||||
statistics.resize(ggml_nelements(stats));
|
||||
if (stats->type == GGML_TYPE_F32) {
|
||||
std::memcpy(statistics.data(), stats->data, ggml_nelements(stats) * sizeof(float));
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported .stats type '%s' for '%s' - ignoring entry\n",
|
||||
__func__, ggml_type_name(stats->type), name.c_str());
|
||||
statistics.clear();
|
||||
statistics_data.erase(name);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
values.resize(ggml_nelements(sums2));
|
||||
float max_count = 0.0f;
|
||||
for (int64_t j = 0; j < ne1; ++j) {
|
||||
const float count = ((const float *) counts->data)[j];
|
||||
if (count > 0.0f) {
|
||||
for (int64_t i = 0; i < ne0; ++i) {
|
||||
e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count;
|
||||
values[j*ne0 + i] = ((const float *) sums2->data)[j*ne0 + i] / count;
|
||||
if (sums) { activations[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; }
|
||||
}
|
||||
} else {
|
||||
// Partial imatrix data, this tensor never got any input during calibration
|
||||
for (int64_t i = 0; i < ne0; ++i) {
|
||||
e[j*ne0 + i] = 1;
|
||||
values[j*ne0 + i] = 1;
|
||||
if (sums) { activations[j*ne0 + i] = 0; }
|
||||
}
|
||||
}
|
||||
if (count > max_count) {
|
||||
|
|
@ -335,7 +367,8 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
|
|||
}
|
||||
}
|
||||
if (getenv("LLAMA_TRACE")) {
|
||||
printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", __func__, int(e.size()), int(max_count), int(max_count / chunk_size), name.c_str());
|
||||
printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n",
|
||||
__func__, int(values.size()), int(max_count), int(max_count / chunk_size), name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -352,7 +385,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
|
|||
}
|
||||
printf("]\n");
|
||||
|
||||
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk);
|
||||
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(values_data.size()), imatrix_file.c_str(), m_last_chunk);
|
||||
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx);
|
||||
|
|
@ -364,41 +397,73 @@ static int prepare_imatrix(const std::string & imatrix_file,
|
|||
std::vector<std::string> & imatrix_dataset,
|
||||
const std::vector<std::string> & included_weights,
|
||||
const std::vector<std::string> & excluded_weights,
|
||||
std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
|
||||
std::unordered_map<std::string, std::vector<float>> & values_data,
|
||||
std::unordered_map<std::string, std::vector<float>> & activations_data,
|
||||
std::unordered_map<std::string, std::vector<float>> & statistics_data) {
|
||||
int m_last_call = -1;
|
||||
if (!imatrix_file.empty()) {
|
||||
m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
|
||||
m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data, statistics_data);
|
||||
}
|
||||
if (imatrix_data.empty()) {
|
||||
if (values_data.empty()) {
|
||||
return m_last_call;
|
||||
}
|
||||
if (!excluded_weights.empty()) {
|
||||
for (const auto & name : excluded_weights) {
|
||||
for (auto it = imatrix_data.begin(); it != imatrix_data.end();) {
|
||||
auto pos = it->first.find(name);
|
||||
for (auto vt = values_data.begin(); vt != values_data.end();) {
|
||||
auto pos = vt->first.find(name);
|
||||
if (pos != std::string::npos) {
|
||||
it = imatrix_data.erase(it);
|
||||
vt = values_data.erase(vt);
|
||||
} else {
|
||||
++it;
|
||||
++vt;
|
||||
}
|
||||
}
|
||||
for (auto at = activations_data.begin(); at != activations_data.end();) {
|
||||
auto pos = at->first.find(name);
|
||||
if (pos != std::string::npos) {
|
||||
at = activations_data.erase(at);
|
||||
} else {
|
||||
++at;
|
||||
}
|
||||
}
|
||||
for (auto st = statistics_data.begin(); st != statistics_data.end();) {
|
||||
auto pos = st->first.find(name);
|
||||
if (pos != std::string::npos) {
|
||||
st = statistics_data.erase(st);
|
||||
} else {
|
||||
++st;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!included_weights.empty()) {
|
||||
std::unordered_map<std::string, std::vector<float>> tmp;
|
||||
std::unordered_map<std::string, std::vector<float>> tmp_values;
|
||||
std::unordered_map<std::string, std::vector<float>> tmp_activations;
|
||||
std::unordered_map<std::string, std::vector<float>> tmp_statistics;
|
||||
for (const auto & name : included_weights) {
|
||||
for (auto & e : imatrix_data) {
|
||||
for (auto & e : values_data) {
|
||||
auto pos = e.first.find(name);
|
||||
if (pos != std::string::npos) {
|
||||
tmp.emplace(std::move(e));
|
||||
tmp_values.emplace(std::move(e));
|
||||
}
|
||||
}
|
||||
for (auto & a : activations_data) {
|
||||
auto pos = a.first.find(name);
|
||||
if (pos != std::string::npos) {
|
||||
tmp_activations.emplace(std::move(a));
|
||||
}
|
||||
}
|
||||
for (auto & s : statistics_data) {
|
||||
auto pos = s.first.find(name);
|
||||
if (pos != std::string::npos) {
|
||||
tmp_statistics.emplace(std::move(s));
|
||||
}
|
||||
}
|
||||
}
|
||||
imatrix_data = std::move(tmp);
|
||||
}
|
||||
if (!imatrix_data.empty()) {
|
||||
printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
|
||||
values_data = std::move(tmp_values);
|
||||
activations_data = std::move(tmp_activations);
|
||||
statistics_data = std::move(tmp_statistics);
|
||||
}
|
||||
|
||||
return m_last_call;
|
||||
}
|
||||
|
||||
|
|
@ -489,6 +554,109 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
|
|||
return true;
|
||||
}
|
||||
|
||||
// Parse the value given to the --target-bpw option.
//
//   data       - NUL-terminated option value; may be null when the value is missing
//   target_bpw - receives the parsed number; written only when parsing succeeds
//
// Returns true when `data` is a number in [0.0, 16.0] with nothing following it.
// Otherwise prints a diagnostic to stdout and returns false.
static bool parse_target_bpw(const char * data, float & target_bpw) {
    if (!data) {
        printf("\n%s: no target bits per weight (bpw) provided\n\n", __func__);
        return false;
    }

    float  bpw      = 0.0f;
    size_t n_parsed = 0;
    bool   parsed   = false;
    try {
        bpw = std::stof(data, &n_parsed);
        // std::stof stops at the first character it cannot consume, so input such as
        // "4.5abc" would otherwise be silently accepted as 4.5
        parsed = data[n_parsed] == '\0';
    } catch (const std::exception &) {
        parsed = false;
    }

    if (!parsed) {
        printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data);
        return false;
    }

    // written as a negated "in range" test so that NaN (for which every comparison is false) is rejected too
    if (!(bpw >= 0.0f && bpw <= 16.0f)) {
        printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__);
        return false;
    }

    target_bpw = bpw;
    return true;
}
|
||||
|
||||
// Parse the value given to the --importance-pct option.
//
//   data           - NUL-terminated option value; may be null when the value is missing
//   importance_pct - receives the parsed number; written only when parsing succeeds
//
// Returns true when `data` is a number in [0.0, 100.0] with nothing following it.
// Otherwise prints a diagnostic to stdout and returns false.
static bool parse_importance_pct(const char * data, float & importance_pct) {
    if (!data) {
        printf("\n%s: no tensor importance %% provided\n\n", __func__);
        return false;
    }

    float  pct      = 0.0f;
    size_t n_parsed = 0;
    bool   parsed   = false;
    try {
        pct = std::stof(data, &n_parsed);
        // std::stof stops at the first character it cannot consume, so input such as
        // "25%" or "25abc" would otherwise be silently accepted as 25
        parsed = data[n_parsed] == '\0';
    } catch (const std::exception &) {
        parsed = false;
    }

    if (!parsed) {
        printf("\n%s: '%s' is not valid. Tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__, data);
        return false;
    }

    // written as a negated "in range" test so that NaN (for which every comparison is false) is rejected too
    if (!(pct >= 0.0f && pct <= 100.0f)) {
        printf("\n%s: tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__);
        return false;
    }

    importance_pct = pct;
    return true;
}
|
||||
|
||||
// Parse the value given to the --target-size option into a byte count.
//
//   data        - NUL-terminated option value: a non-negative decimal number optionally
//                 followed by a case-insensitive unit (b, k/kb, m/mb, g/gb, t/tb);
//                 may be null when the value is missing
//   target_size - receives the size in bytes; written only when parsing succeeds
//
// Units are binary multiples (1 kb = 1024 bytes). Fractional values such as "1.5gb"
// are scaled first and then truncated to whole bytes.
//
// Returns true on success. Otherwise prints a diagnostic to stdout and returns false.
static bool parse_target_size(const char * data, int64_t & target_size) {
    if (!data) {
        printf("\n%s: no target file size provided\n\n", __func__);
        return false;
    }

    char * end = nullptr;
    const double val = std::strtod(data, &end);
    // strtod also accepts "nan" and "inf"; neither is a usable size
    if (end == data || !std::isfinite(val) || val < 0) {
        printf("\n%s: invalid target file size '%s'\n\n", __func__, data);
        return false;
    }

    // whatever follows the number is the unit
    std::string suffix(end);
    // tolower() requires a value representable as unsigned char
    for (auto & c : suffix) { c = static_cast<char>(std::tolower(static_cast<unsigned char>(c))); }

    int64_t mul = 0;
    if (suffix.empty() || suffix == "b") {
        mul = 1;
    } else if (suffix == "k" || suffix == "kb") {
        mul = 1024;
    } else if (suffix == "m" || suffix == "mb") {
        mul = 1024 * 1024;
    } else if (suffix == "g" || suffix == "gb") {
        mul = 1024 * 1024 * 1024;
    } else if (suffix == "t" || suffix == "tb") {
        mul = 1024LL * 1024 * 1024 * 1024;
    } else {
        printf("\n%s: invalid unit '%s' in '%s'. Allowed: b, kb, mb, gb, tb (kilo = 1024 bytes)\n\n", __func__, suffix.c_str(), data);
        return false;
    }

    // scale in floating point BEFORE converting to an integer, otherwise the
    // fractional part is lost and "1.5gb" would become exactly 1 GiB
    const double bytes = val * static_cast<double>(mul);
    // converting an out-of-range double to int64_t is undefined behaviour
    if (bytes >= static_cast<double>(INT64_MAX)) {
        printf("\n%s: invalid target file size '%s'\n\n", __func__, data);
        return false;
    }

    target_size = static_cast<int64_t>(bytes);
    return true;
}
|
||||
|
||||
// Pick a quantization type name for a requested bits-per-weight value.
//
// Returns the name stored under the smallest table key that is >= bpw. When bpw
// exceeds every key, the entry with the largest key is returned instead of
// dereferencing end(). The returned pointer refers to a string literal.
static const char * get_ftype(const float bpw) {
    // presumably each key is the nominal bpw of the named type - confirm against the ggml type table
    // static: the table never changes, so build it once rather than on every call
    static const std::map<float, const char *> quant_bpw = {
        {1.5625f,  "IQ1_S"},
        {1.7500f,  "IQ1_M"},
        {2.0625f,  "IQ2_XXS"},
        {2.3125f,  "IQ2_XS"},
        {2.5625f,  "IQ2_S"},
        {2.6250f,  "Q2_K"},
        {3.0625f,  "IQ3_XXS"},
        {3.4375f,  "Q3_K"},
        {4.2500f,  "IQ4_XS"},
        {4.5000f,  "Q4_K"},
        {5.5000f,  "Q5_K"},
        {6.5625f,  "Q6_K"},
        {8.5000f,  "Q8_0"},
#ifdef GGML_USE_METAL
        {16.0000f, "F16"}
#else
        {16.0000f, "BF16"}
#endif
    };

    const auto it = quant_bpw.lower_bound(bpw);
    if (it == quant_bpw.end()) {
        // bpw is above the largest key: fall back to the widest type in the table
        return quant_bpw.rbegin()->second;
    }

    return it->second;
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
|
|
@ -504,6 +672,9 @@ int main(int argc, char ** argv) {
|
|||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
std::vector<tensor_type_option> tensor_type_opts;
|
||||
std::vector<int> prune_layers;
|
||||
float target_bpw = -1.0f;
|
||||
int64_t target_size = -1;
|
||||
float importance_pct = 0.0f;
|
||||
|
||||
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
||||
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
|
||||
|
|
@ -534,6 +705,26 @@ int main(int argc, char ** argv) {
|
|||
if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_type_opts)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--target-bpw") == 0) {
|
||||
if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--target-size") == 0) {
|
||||
if (arg_idx == argc-1 || !parse_target_size(argv[++arg_idx], target_size)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--importance-pct") == 0) {
|
||||
if (arg_idx == argc-1 || !parse_importance_pct(argv[++arg_idx], importance_pct)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--save-state") == 0) {
|
||||
params.save_state = true;
|
||||
} else if (strcmp(argv[arg_idx], "--state-file") == 0) {
|
||||
if (arg_idx < argc-1) {
|
||||
params.state_file = argv[++arg_idx];
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
|
||||
if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
|
||||
usage(argv[0]);
|
||||
|
|
@ -582,10 +773,12 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
std::vector<std::string> imatrix_datasets;
|
||||
std::unordered_map<std::string, std::vector<float>> imatrix_data;
|
||||
int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
|
||||
if (!imatrix_data.empty()) {
|
||||
params.imatrix = &imatrix_data;
|
||||
std::unordered_map<std::string, std::vector<float>> values_data;
|
||||
std::unordered_map<std::string, std::vector<float>> activations_data;
|
||||
std::unordered_map<std::string, std::vector<float>> statistics_data;
|
||||
int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data, statistics_data);
|
||||
if (!values_data.empty()) {
|
||||
params.imatrix = &values_data;
|
||||
{
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
|
||||
|
|
@ -608,7 +801,7 @@ int main(int argc, char ** argv) {
|
|||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.val_i64 = imatrix_data.size();
|
||||
kvo.val_i64 = values_data.size();
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
|
||||
|
|
@ -620,6 +813,12 @@ int main(int argc, char ** argv) {
|
|||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
}
|
||||
if (!activations_data.empty()) {
|
||||
params.activations = &activations_data;
|
||||
}
|
||||
if (!statistics_data.empty()) {
|
||||
params.statistics = &statistics_data;
|
||||
}
|
||||
if (!kv_overrides.empty()) {
|
||||
kv_overrides.emplace_back();
|
||||
kv_overrides.back().key[0] = 0;
|
||||
|
|
@ -631,6 +830,15 @@ int main(int argc, char ** argv) {
|
|||
if (!prune_layers.empty()) {
|
||||
params.prune_layers = &prune_layers;
|
||||
}
|
||||
if (target_bpw != -1.0f) {
|
||||
params.target_bpw = target_bpw;
|
||||
}
|
||||
if (target_size != -1) {
|
||||
params.target_size = target_size;
|
||||
}
|
||||
if (importance_pct != 0.0f) {
|
||||
params.importance_pct = importance_pct;
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
|
|
@ -641,6 +849,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
std::string ftype_str;
|
||||
std::string suffix = ".gguf";
|
||||
std::vector<const char *> tmp_argv(argv, argv + argc);
|
||||
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
||||
// argv[arg_idx] is the ftype directly: <input> <ftype>
|
||||
if (!params.dry_run) {
|
||||
|
|
@ -668,7 +877,15 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
arg_idx++;
|
||||
|
||||
if (argc <= arg_idx) {
|
||||
// If --target-bpw or --target-size are set, select a quantization type unless user specifies type and threads
|
||||
if (argc - arg_idx <= 1 && (params.target_bpw != -1.0f || params.target_size != -1)) {
|
||||
auto * ftype = params.target_bpw != -1.0f ? const_cast<char *>(get_ftype(params.target_bpw)) : const_cast<char *>("F16");
|
||||
if (argc == arg_idx) { tmp_argv.push_back(ftype); }
|
||||
else { tmp_argv.insert(tmp_argv.end() - 1, ftype); }
|
||||
tmp_argv.push_back(nullptr);
|
||||
argv = const_cast<char **>(tmp_argv.data());
|
||||
argc++;
|
||||
} else if (argc <= arg_idx) {
|
||||
fprintf(stderr, "%s: missing ftype\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue