Update with latest changes to state counters
This commit is contained in:
parent
506a49006c
commit
3948227d23
37
CLAUDE.md
37
CLAUDE.md
|
|
@ -1 +1,36 @@
|
|||
IMPORTANT: Ensure you’ve thoroughly reviewed the [AGENTS.md](AGENTS.md) file before beginning any work.
|
||||
IMPORTANT: Ensure you've thoroughly reviewed the [AGENTS.md](AGENTS.md) file before beginning any work.
|
||||
|
||||
# AI Policy
|
||||
- AI is assistive only; AI-generated PRs are restricted per AGENTS.md
|
||||
- Contributor reviews and writes code themselves
|
||||
|
||||
# Code Style & Conventions
|
||||
- snake_case naming; optimize for longest common prefix
|
||||
- 4 spaces indentation, brackets on same line
|
||||
- `void * ptr`, `int & a` (space around pointer/reference)
|
||||
- Avoid templates and fancy STL
|
||||
- Use sized integer types (`int32_t`) in public API
|
||||
- See [CONTRIBUTING.md](CONTRIBUTING.md) for full guidelines, naming, and PR process
|
||||
|
||||
# ggml Tensor Conventions
|
||||
- Data stored in row-major order
|
||||
- Dimension 0 = columns, dimension 1 = rows, dimension 2 = matrices
|
||||
- **Matrix multiply is unconventional**: `C = ggml_mul_mat(ctx, A, B)` means `C^T = A * B^T`
|
||||
|
||||
# Quantization
|
||||
- See [docs/quantization/](docs/quantization/) for comprehensive documentation
|
||||
- See [docs/quantization/09-adding-new-types.md](docs/quantization/09-adding-new-types.md) for adding new types
|
||||
|
||||
## Key Files
|
||||
- `ggml/include/ggml.h`: type enums (`ggml_type`)
|
||||
- `ggml/src/ggml-common.h`: block structures
|
||||
- `ggml/src/ggml-quants.c`: reference quantize/dequantize implementations
|
||||
- `tools/quantize/quantize.cpp`: CLI tool
|
||||
- `src/llama-quant.cpp`: core quantization engine
|
||||
|
||||
## Quantization Families
|
||||
- **Q**: simple uniform quantization
|
||||
- **K**: super-block quantization (multiple sub-blocks per super-block)
|
||||
- **IQ**: importance-weighted quantization
|
||||
- **T**: ternary quantization
|
||||
- **MXFP**: microscaling floating-point quantization (OCP MX formats)
|
||||
|
|
|
|||
|
|
@ -13,22 +13,6 @@
|
|||
#include <thread>
|
||||
#include <unordered_map>
|
||||
|
||||
// tensor categorization - used to avoid repeated string matching in quantization logic.
|
||||
// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
|
||||
enum class tensor_category {
|
||||
TOKEN_EMBD,
|
||||
ATTENTION_Q,
|
||||
ATTENTION_V,
|
||||
ATTENTION_K,
|
||||
ATTENTION_QKV,
|
||||
ATTENTION_KV_B,
|
||||
ATTENTION_OUTPUT,
|
||||
FFN_UP,
|
||||
FFN_GATE,
|
||||
FFN_DOWN,
|
||||
OUTPUT,
|
||||
OTHER
|
||||
};
|
||||
|
||||
static void zeros(std::ofstream & file, size_t n) {
|
||||
char zero = 0;
|
||||
|
|
@ -150,15 +134,6 @@ static bool category_is_attn_v(tensor_category cat) {
|
|||
cat == tensor_category::ATTENTION_KV_B;
|
||||
}
|
||||
|
||||
// per-tensor metadata, computed in the preliminary loop and used in the main loop
|
||||
struct tensor_metadata {
|
||||
ggml_type target_type;
|
||||
tensor_category category;
|
||||
std::string remapped_imatrix_name;
|
||||
bool allows_quantization;
|
||||
bool requires_imatrix;
|
||||
};
|
||||
|
||||
//
|
||||
// dequantization
|
||||
//
|
||||
|
|
@ -355,6 +330,7 @@ static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tenso
|
|||
return return_type;
|
||||
}
|
||||
|
||||
// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch
|
||||
static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
|
|
@ -604,15 +580,8 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
|
|||
return new_type;
|
||||
}
|
||||
|
||||
// public API: compute category from tensor name and delegate to _impl
|
||||
ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
tensor_category category = tensor_get_category(name);
|
||||
return llama_tensor_get_type_impl(qs, new_type, tensor, ftype, category);
|
||||
}
|
||||
|
||||
// outer wrapper: determine the ggml_type that this tensor should be quantized to (used internally by llama_model_quantize_impl)
|
||||
static ggml_type llama_tensor_get_type_internal(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
|
||||
// outer wrapper: determine the ggml_type that this tensor should be quantized to
|
||||
ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
|
||||
if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
|
||||
return tensor->type;
|
||||
}
|
||||
|
|
@ -845,15 +814,16 @@ const char * llama_ftype_to_name(llama_ftype ftype) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
void init_quantize_state_counters(quantize_state_impl & qs, const std::vector<std::string> & tensor_names) {
|
||||
for (const auto & name : tensor_names) {
|
||||
tensor_category cat = tensor_get_category(name);
|
||||
void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata) {
|
||||
for (auto & tm : metadata) {
|
||||
tensor_category cat = tensor_get_category(tm.name);
|
||||
tm.category = cat;
|
||||
|
||||
if (category_is_attn_v(cat)) {
|
||||
++qs.n_attention_wv;
|
||||
}
|
||||
|
||||
if (tensor_name_match_output_weight(name.c_str())) {
|
||||
if (cat == tensor_category::OUTPUT) {
|
||||
qs.has_tied_embeddings = false;
|
||||
}
|
||||
}
|
||||
|
|
@ -996,15 +966,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
});
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<std::string> tensor_names;
|
||||
tensor_names.reserve(tensors.size());
|
||||
for (const auto * it : tensors) {
|
||||
tensor_names.emplace_back(ggml_get_name(it->tensor));
|
||||
}
|
||||
init_quantize_state_counters(qs, tensor_names);
|
||||
// compute tensor metadata once and cache it
|
||||
std::vector<tensor_metadata> metadata(tensors.size());
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
metadata[i].name = ggml_get_name(tensors[i]->tensor);
|
||||
}
|
||||
|
||||
// initialize quantization state counters and metadata categories
|
||||
init_quantize_state_counters(qs, metadata);
|
||||
|
||||
int idx = 0;
|
||||
uint16_t n_split = 1;
|
||||
|
||||
|
|
@ -1017,25 +987,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
std::vector<gguf_context_ptr> ctx_outs(n_split);
|
||||
ctx_outs[0] = std::move(ctx_out);
|
||||
|
||||
// compute tensor metadata once and cache it
|
||||
std::vector<tensor_metadata> metadata(tensors.size());
|
||||
|
||||
// initialize quantization state before preliminary loop (counters for use_more_bits)
|
||||
{
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
const auto cat = tensor_get_category(tensors[i]->tensor->name);
|
||||
if (category_is_attn_v(cat)) {
|
||||
++qs.n_attention_wv;
|
||||
}
|
||||
if (cat == tensor_category::OUTPUT) {
|
||||
qs.has_tied_embeddings = false;
|
||||
}
|
||||
metadata[i].category = cat; // save and re-use the category while we're at it
|
||||
}
|
||||
// these also need to be set to n_layer by default
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
|
||||
}
|
||||
|
||||
// flag for --dry-run
|
||||
bool will_require_imatrix = false;
|
||||
|
||||
|
|
@ -1059,7 +1010,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
|
||||
|
||||
if (metadata[i].allows_quantization) {
|
||||
metadata[i].target_type = llama_tensor_get_type_internal(qs, params, tensor, default_type, metadata[i]);
|
||||
metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
|
||||
} else {
|
||||
metadata[i].target_type = tensor->type;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,6 +12,33 @@
|
|||
|
||||
struct llama_model;
|
||||
|
||||
// tensor categorization - used to avoid repeated string matching in quantization logic.
|
||||
// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
|
||||
enum class tensor_category {
|
||||
TOKEN_EMBD,
|
||||
ATTENTION_Q,
|
||||
ATTENTION_V,
|
||||
ATTENTION_K,
|
||||
ATTENTION_QKV,
|
||||
ATTENTION_KV_B,
|
||||
ATTENTION_OUTPUT,
|
||||
FFN_UP,
|
||||
FFN_GATE,
|
||||
FFN_DOWN,
|
||||
OUTPUT,
|
||||
OTHER
|
||||
};
|
||||
|
||||
// per-tensor metadata, computed in the preliminary loop and used in the main loop
|
||||
struct tensor_metadata {
|
||||
std::string name;
|
||||
ggml_type target_type;
|
||||
tensor_category category;
|
||||
std::string remapped_imatrix_name;
|
||||
bool allows_quantization;
|
||||
bool requires_imatrix;
|
||||
};
|
||||
|
||||
// result of parsing --tensor-type option
|
||||
// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
|
||||
struct tensor_type_option {
|
||||
|
|
@ -56,7 +83,7 @@ struct quantize_state_impl {
|
|||
}
|
||||
};
|
||||
|
||||
ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype);
|
||||
ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm);
|
||||
ggml_type llama_ftype_get_default_type(llama_ftype ftype);
|
||||
|
||||
// Ftype name <-> enum conversions.
|
||||
|
|
@ -64,9 +91,9 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype);
|
|||
llama_ftype llama_ftype_from_name(const char * name);
|
||||
const char * llama_ftype_to_name(llama_ftype ftype);
|
||||
|
||||
// Initialize quantize_state_impl counters by scanning tensor names.
|
||||
// tensor_names: all quantizable weight tensor names in the model.
|
||||
void init_quantize_state_counters(quantize_state_impl & qs, const std::vector<std::string> & tensor_names);
|
||||
// Initialize quantize_state_impl counters and populate tensor_metadata categories.
|
||||
// metadata: vector with name fields already set, will have category field populated.
|
||||
void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata);
|
||||
|
||||
// Returns true if this tensor should be quantized (based on name, dims, params).
|
||||
bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor);
|
||||
|
|
|
|||
|
|
@ -515,7 +515,8 @@ static std::string detect_gguf_filename(const std::string & repo, const std::str
|
|||
static std::optional<gguf_remote_model> fetch_and_parse(
|
||||
const std::string & repo,
|
||||
const std::string & filename,
|
||||
const std::string & cache_path) {
|
||||
const std::string & cache_path,
|
||||
bool verbose) {
|
||||
std::string url = "https://huggingface.co/" + repo + "/resolve/main/" + filename;
|
||||
|
||||
// Progressive download inspired by RangeView.fetchChunk()
|
||||
|
|
@ -524,7 +525,9 @@ static std::optional<gguf_remote_model> fetch_and_parse(
|
|||
const size_t max_chunk = 64 * 1024 * 1024;
|
||||
|
||||
while (chunk_size <= max_chunk) {
|
||||
fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
|
||||
if (verbose) {
|
||||
fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
|
||||
}
|
||||
|
||||
char range_buf[64];
|
||||
snprintf(range_buf, sizeof(range_buf), "bytes=0-%zu", chunk_size - 1);
|
||||
|
|
@ -565,7 +568,8 @@ static std::optional<gguf_remote_model> fetch_or_cached(
|
|||
const std::string & repo,
|
||||
const std::string & filename,
|
||||
const std::string & cdir,
|
||||
const std::string & repo_part) {
|
||||
const std::string & repo_part,
|
||||
bool verbose) {
|
||||
std::string fname_part = sanitize_for_path(filename);
|
||||
std::string cache_path = cdir + "/" + repo_part + "--" + fname_part + ".partial";
|
||||
|
||||
|
|
@ -574,20 +578,23 @@ static std::optional<gguf_remote_model> fetch_or_cached(
|
|||
if (std::filesystem::exists(cache_path) && read_file(cache_path, cached)) {
|
||||
auto result = gguf_parse_meta(cached);
|
||||
if (result.has_value()) {
|
||||
fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
|
||||
if (verbose) {
|
||||
fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fs_create_directory_with_parents(cdir);
|
||||
return fetch_and_parse(repo, filename, cache_path);
|
||||
return fetch_and_parse(repo, filename, cache_path, verbose);
|
||||
}
|
||||
|
||||
std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
||||
const std::string & repo,
|
||||
const std::string & quant,
|
||||
const std::string & cache_dir) {
|
||||
const std::string & cache_dir,
|
||||
bool verbose) {
|
||||
std::string cdir = cache_dir.empty() ? get_default_cache_dir() : cache_dir;
|
||||
std::string repo_part = sanitize_for_path(repo);
|
||||
|
||||
|
|
@ -597,7 +604,7 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
|||
return std::nullopt;
|
||||
}
|
||||
|
||||
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part);
|
||||
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part, verbose);
|
||||
if (!model_opt.has_value()) {
|
||||
fprintf(stderr, "gguf_fetch: failed to fetch %s\n", filename.c_str());
|
||||
return std::nullopt;
|
||||
|
|
@ -612,8 +619,10 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
|||
return std::nullopt;
|
||||
}
|
||||
|
||||
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
|
||||
model.n_split, model.n_split - 1);
|
||||
if (verbose) {
|
||||
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
|
||||
model.n_split, model.n_split - 1);
|
||||
}
|
||||
|
||||
for (int i = 2; i <= model.n_split; i++) {
|
||||
char num_buf[6], total_buf[6];
|
||||
|
|
@ -621,7 +630,7 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
|||
snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
|
||||
std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
|
||||
|
||||
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part);
|
||||
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
|
||||
if (!shard.has_value()) {
|
||||
fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str());
|
||||
return std::nullopt;
|
||||
|
|
|
|||
|
|
@ -39,4 +39,5 @@ struct gguf_remote_model {
|
|||
std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
||||
const std::string & repo,
|
||||
const std::string & quant = "Q8_0",
|
||||
const std::string & cache_dir = ""); // empty = default
|
||||
const std::string & cache_dir = "", // empty = default
|
||||
bool verbose = true);
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -2,10 +2,8 @@
|
|||
# n_embd=2560, n_ff=10240, n_vocab=262144, n_layer=34, n_head=8, n_head_kv=4
|
||||
|
||||
[F32] f32
|
||||
token_embd.weight q6_K
|
||||
|
||||
[F16] f16
|
||||
token_embd.weight q6_K
|
||||
|
||||
[Q4_0] q4_0
|
||||
token_embd.weight q6_K
|
||||
|
|
@ -1205,7 +1203,6 @@ blk.33.attn_output.weight iq2_xxs
|
|||
blk.33.attn_v.weight q2_K
|
||||
|
||||
[BF16] bf16
|
||||
token_embd.weight q6_K
|
||||
|
||||
[TQ1_0] tq1_0
|
||||
token_embd.weight q6_K
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -2,10 +2,8 @@
|
|||
# n_embd=8192, n_ff=28672, n_vocab=128256, n_layer=80, n_head=64, n_head_kv=8
|
||||
|
||||
[F32] f32
|
||||
output.weight q6_K
|
||||
|
||||
[F16] f16
|
||||
output.weight q6_K
|
||||
|
||||
[Q4_0] q4_0
|
||||
output.weight q6_K
|
||||
|
|
@ -3324,7 +3322,6 @@ blk.79.attn_v.weight q4_K
|
|||
output.weight q5_K
|
||||
|
||||
[BF16] bf16
|
||||
output.weight q6_K
|
||||
|
||||
[TQ1_0] tq1_0
|
||||
token_embd.weight q4_K
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -2,10 +2,8 @@
|
|||
# n_embd=1024, n_ff=3072, n_vocab=151936, n_layer=28, n_head=16, n_head_kv=8
|
||||
|
||||
[F32] f32
|
||||
output.weight q6_K
|
||||
|
||||
[F16] f16
|
||||
output.weight q6_K
|
||||
|
||||
[Q4_0] q4_0
|
||||
output.weight q6_K
|
||||
|
|
@ -1013,7 +1011,6 @@ blk.27.attn_output.weight iq2_xxs
|
|||
blk.27.attn_v.weight q2_K
|
||||
|
||||
[BF16] bf16
|
||||
output.weight q6_K
|
||||
|
||||
[TQ1_0] tq1_0
|
||||
output.weight q6_K
|
||||
|
|
|
|||
|
|
@ -2,10 +2,8 @@
|
|||
# n_embd=5120, n_ff=17408, n_vocab=151936, n_layer=40, n_head=40, n_head_kv=8
|
||||
|
||||
[F32] f32
|
||||
output.weight q6_K
|
||||
|
||||
[F16] f16
|
||||
output.weight q6_K
|
||||
|
||||
[Q4_0] q4_0
|
||||
output.weight q6_K
|
||||
|
|
@ -1613,7 +1611,6 @@ blk.39.attn_output.weight iq2_xxs
|
|||
blk.39.attn_v.weight q4_K
|
||||
|
||||
[BF16] bf16
|
||||
output.weight q6_K
|
||||
|
||||
[TQ1_0] tq1_0
|
||||
output.weight q6_K
|
||||
|
|
|
|||
|
|
@ -2,10 +2,8 @@
|
|||
# n_embd=2048, n_ff=5120, n_vocab=151936, n_layer=48, n_head=16, n_head_kv=2, n_expert=512
|
||||
|
||||
[F32] f32
|
||||
output.weight q6_K
|
||||
|
||||
[F16] f16
|
||||
output.weight q6_K
|
||||
|
||||
[Q4_0] q4_0
|
||||
output.weight q6_K
|
||||
|
|
@ -1790,7 +1788,6 @@ blk.47.attn_v.weight q4_K
|
|||
output.weight q5_K
|
||||
|
||||
[BF16] bf16
|
||||
output.weight q6_K
|
||||
|
||||
[TQ1_0] tq1_0
|
||||
token_embd.weight q4_K
|
||||
|
|
|
|||
|
|
@ -2,10 +2,8 @@
|
|||
# n_embd=5120, n_ff=17408, n_vocab=248320, n_layer=64, n_head=24, n_head_kv=4
|
||||
|
||||
[F32] f32
|
||||
output.weight q6_K
|
||||
|
||||
[F16] f16
|
||||
output.weight q6_K
|
||||
|
||||
[Q4_0] q4_0
|
||||
output.weight q6_K
|
||||
|
|
@ -1898,7 +1896,6 @@ blk.63.attn_output.weight iq2_xxs
|
|||
blk.63.attn_v.weight q4_K
|
||||
|
||||
[BF16] bf16
|
||||
output.weight q6_K
|
||||
|
||||
[TQ1_0] tq1_0
|
||||
output.weight q6_K
|
||||
|
|
|
|||
|
|
@ -2,10 +2,8 @@
|
|||
# n_embd=4096, n_ff=0, n_vocab=248320, n_layer=60, n_head=32, n_head_kv=2, n_expert=512
|
||||
|
||||
[F32] f32
|
||||
output.weight q6_K
|
||||
|
||||
[F16] f16
|
||||
output.weight q6_K
|
||||
|
||||
[Q4_0] q4_0
|
||||
output.weight q6_K
|
||||
|
|
@ -2205,7 +2203,6 @@ blk.59.attn_output.weight iq2_xxs
|
|||
blk.59.attn_v.weight q4_K
|
||||
|
||||
[BF16] bf16
|
||||
output.weight q6_K
|
||||
|
||||
[TQ1_0] tq1_0
|
||||
output.weight q6_K
|
||||
|
|
|
|||
|
|
@ -2,10 +2,8 @@
|
|||
# n_embd=4096, n_ff=11264, n_vocab=128896, n_layer=45, n_head=64, n_head_kv=8, n_expert=288
|
||||
|
||||
[F32] f32
|
||||
output.weight q6_K
|
||||
|
||||
[F16] f16
|
||||
output.weight q6_K
|
||||
|
||||
[Q4_0] q4_0
|
||||
output.weight q6_K
|
||||
|
|
@ -2078,7 +2076,6 @@ blk.44.attn_output.weight iq2_xxs
|
|||
blk.44.attn_v.weight q4_K
|
||||
|
||||
[BF16] bf16
|
||||
output.weight q6_K
|
||||
|
||||
[TQ1_0] tq1_0
|
||||
output.weight q6_K
|
||||
|
|
|
|||
|
|
@ -268,21 +268,20 @@ static std::vector<std::pair<std::string, ggml_type>> compute_quant_types(llama_
|
|||
|
||||
quantize_state_impl qs(mdl, &qparams);
|
||||
|
||||
std::vector<std::string> names;
|
||||
names.reserve(tensors.size());
|
||||
for (const auto & mt : tensors) {
|
||||
names.push_back(mt.tensor->name);
|
||||
std::vector<tensor_metadata> metadata(tensors.size());
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
metadata[i].name = tensors[i].tensor->name;
|
||||
}
|
||||
init_quantize_state_counters(qs, names);
|
||||
init_quantize_state_counters(qs, metadata);
|
||||
|
||||
ggml_type default_type = llama_ftype_get_default_type(ftype);
|
||||
|
||||
std::vector<std::pair<std::string, ggml_type>> result;
|
||||
result.reserve(tensors.size());
|
||||
|
||||
for (const auto & mt : tensors) {
|
||||
ggml_type got = llama_tensor_get_type(qs, default_type, mt.tensor, ftype);
|
||||
result.push_back({ mt.tensor->name, got });
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
ggml_type got = llama_tensor_get_type(qs, &qparams, tensors[i].tensor, default_type, metadata[i]);
|
||||
result.push_back({ metadata[i].name, got });
|
||||
}
|
||||
|
||||
return result;
|
||||
|
|
@ -408,7 +407,7 @@ static bool run_test_section(llama_model & mdl,
|
|||
}
|
||||
|
||||
if (got != expected) {
|
||||
printf(" FAIL %-50s expected %s, got %s\n", name.c_str(), ggml_type_name(expected), ggml_type_name(got));
|
||||
printf(" FAIL %-50s %-10s expected %s, got %s\n", name.c_str(), llama_ftype_to_name(section.ftype), ggml_type_name(expected), ggml_type_name(got));
|
||||
all_pass = false;
|
||||
}
|
||||
}
|
||||
|
|
@ -432,8 +431,7 @@ static int run_remote_tests(const std::string & snapshot_dir, const char * argv0
|
|||
std::string name = model_name_from_repo(spec.repo);
|
||||
printf("=== %s ===\n", name.c_str());
|
||||
|
||||
fprintf(stderr, "Fetching model metadata for %s from %s...\n", name.c_str(), spec.repo);
|
||||
auto result = gguf_fetch_model_meta(spec.repo, spec.quant);
|
||||
auto result = gguf_fetch_model_meta(spec.repo, spec.quant, "", false);
|
||||
if (!result.has_value()) {
|
||||
printf(" SKIP (could not fetch model metadata)\n\n");
|
||||
total_skip++;
|
||||
|
|
@ -506,5 +504,8 @@ int main(int argc, char ** argv) {
|
|||
return run_generate(snapshot_dir);
|
||||
}
|
||||
|
||||
// suppress llama log warnings during test (e.g. tensor type fallback messages)
|
||||
llama_log_set([](enum ggml_log_level, const char *, void *) {}, nullptr);
|
||||
|
||||
return run_remote_tests(snapshot_dir, argv[0]);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue