Add code comments
This commit is contained in:
parent
5f7bba7828
commit
b6d718a4a6
|
|
@ -623,7 +623,7 @@ static void signal_handler(int) {
|
||||||
bpw_stop.store(true, std::memory_order_relaxed);
|
bpw_stop.store(true, std::memory_order_relaxed);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns tensor type overrides to meet a global bpw target
|
// Returns tensor type overrides that meet a global bpw target
|
||||||
static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
llama_model_loader & ml,
|
llama_model_loader & ml,
|
||||||
const llama_model & model,
|
const llama_model & model,
|
||||||
|
|
@ -650,6 +650,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
}
|
}
|
||||||
} signal_guard;
|
} signal_guard;
|
||||||
|
|
||||||
|
// Error and bias projection per GGML_TYPE per tensor
|
||||||
struct candidate_types {
|
struct candidate_types {
|
||||||
ggml_type type = GGML_TYPE_COUNT;
|
ggml_type type = GGML_TYPE_COUNT;
|
||||||
float bpw = 0.0f;
|
float bpw = 0.0f;
|
||||||
|
|
@ -659,6 +660,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
double proj = 0.0;
|
double proj = 0.0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Per‑tensor quantization mix that satisfies a global bpw target
|
||||||
struct tensor_info {
|
struct tensor_info {
|
||||||
const llama_model_loader::llama_tensor_weight * w = nullptr;
|
const llama_model_loader::llama_tensor_weight * w = nullptr;
|
||||||
std::vector<candidate_types> candidate;
|
std::vector<candidate_types> candidate;
|
||||||
|
|
@ -697,22 +699,33 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d;
|
constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d;
|
||||||
const char * func = __func__;
|
const char * func = __func__;
|
||||||
|
|
||||||
|
// Tensor size in bytes for a given type
|
||||||
auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
|
auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
|
||||||
const int64_t n_per_row = t->ne[0];
|
const int64_t n_per_row = t->ne[0];
|
||||||
const size_t row_sz = ggml_row_size(typ, n_per_row);
|
const size_t row_sz = ggml_row_size(typ, n_per_row);
|
||||||
return (size_t)ggml_nrows(t) * row_sz;
|
return (size_t)ggml_nrows(t) * row_sz;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Tensor bpw for a given type
|
||||||
auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double {
|
auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double {
|
||||||
const size_t bytes = tensor_bytes(t, typ);
|
const size_t bytes = tensor_bytes(t, typ);
|
||||||
return (double)bytes * 8.0 / (double)ggml_nelements(t);
|
return (double)bytes * 8.0 / (double)ggml_nelements(t);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Check if tensor is compatible with quantization type
|
||||||
auto is_compatible = [](const ggml_tensor * t, const ggml_type typ) -> bool {
|
auto is_compatible = [](const ggml_tensor * t, const ggml_type typ) -> bool {
|
||||||
const int64_t blck = ggml_blck_size(typ);
|
const int64_t blck = ggml_blck_size(typ);
|
||||||
return blck <= 1 || (t->ne[0] % blck) == 0;
|
return blck <= 1 || (t->ne[0] % blck) == 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Get suitable fallback for type
|
||||||
|
auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type {
|
||||||
|
if (is_compatible(t, typ)) { return typ; }
|
||||||
|
const ggml_type fb = fallback_type(typ);
|
||||||
|
return is_compatible(t, fb) ? fb : GGML_TYPE_F16;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Check if tensor is an IQ type
|
||||||
auto is_iq = [](const enum ggml_type t) {
|
auto is_iq = [](const enum ggml_type t) {
|
||||||
switch (t) {
|
switch (t) {
|
||||||
case GGML_TYPE_IQ1_S:
|
case GGML_TYPE_IQ1_S:
|
||||||
|
|
@ -730,12 +743,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type {
|
// Check if tensor can be quantized
|
||||||
if (is_compatible(t, typ)) { return typ; }
|
|
||||||
const ggml_type fb = fallback_type(typ);
|
|
||||||
return is_compatible(t, fb) ? fb : GGML_TYPE_F16;
|
|
||||||
};
|
|
||||||
|
|
||||||
auto can_quantize = [&](const ggml_tensor * t) -> bool {
|
auto can_quantize = [&](const ggml_tensor * t) -> bool {
|
||||||
if (ggml_n_dims(t) < 2) { return false; } // skip 1D tensors
|
if (ggml_n_dims(t) < 2) { return false; } // skip 1D tensors
|
||||||
return is_quantizable(ggml_get_name(t), model.arch, params);
|
return is_quantizable(ggml_get_name(t), model.arch, params);
|
||||||
|
|
@ -750,6 +758,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
size_t n_elements = 0;
|
size_t n_elements = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// DJB2 hashing algorithm
|
||||||
auto djb2_hash = [&](const uint8_t * data, const size_t n) -> uint64_t {
|
auto djb2_hash = [&](const uint8_t * data, const size_t n) -> uint64_t {
|
||||||
uint64_t h = 5381;
|
uint64_t h = 5381;
|
||||||
for (size_t i = 0; i < n; ++i) {
|
for (size_t i = 0; i < n; ++i) {
|
||||||
|
|
@ -758,6 +767,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
return h ? h : arbitrary_magic;
|
return h ? h : arbitrary_magic;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Get model ID from metadata hash
|
||||||
auto metadata_id = [&](const gguf_context * ctx) -> uint64_t {
|
auto metadata_id = [&](const gguf_context * ctx) -> uint64_t {
|
||||||
const size_t sz = gguf_get_meta_size(ctx);
|
const size_t sz = gguf_get_meta_size(ctx);
|
||||||
std::vector<uint8_t> buf(sz);
|
std::vector<uint8_t> buf(sz);
|
||||||
|
|
@ -794,6 +804,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Serializes vector<tensor_info> to disk
|
||||||
auto save_bpw_state = [&](const std::vector<tensor_info> & all_vec) {
|
auto save_bpw_state = [&](const std::vector<tensor_info> & all_vec) {
|
||||||
const std::string tmp = checkpoint_file + ".tmp";
|
const std::string tmp = checkpoint_file + ".tmp";
|
||||||
std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc);
|
std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc);
|
||||||
|
|
@ -832,6 +843,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
LLAMA_LOG_INFO("%s: saved progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
|
LLAMA_LOG_INFO("%s: saved progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Deserializes vector<tensor_info> from disk
|
||||||
auto load_bpw_state = [&]() -> std::unordered_map<std::string, saved_info> {
|
auto load_bpw_state = [&]() -> std::unordered_map<std::string, saved_info> {
|
||||||
std::unordered_map<std::string, saved_info> out;
|
std::unordered_map<std::string, saved_info> out;
|
||||||
std::ifstream ifs(checkpoint_file, std::ios::binary);
|
std::ifstream ifs(checkpoint_file, std::ios::binary);
|
||||||
|
|
@ -890,6 +902,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
return out;
|
return out;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Deletes checkpoint file unless --keep-bpw-state is set
|
||||||
auto delete_bpw_state = [&] {
|
auto delete_bpw_state = [&] {
|
||||||
std::ifstream ifs(checkpoint_file);
|
std::ifstream ifs(checkpoint_file);
|
||||||
if (ifs.good() && !params->keep_bpw_state) {
|
if (ifs.good() && !params->keep_bpw_state) {
|
||||||
|
|
@ -898,6 +911,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Check for user interrupt and save progress
|
||||||
auto check_signal_handler = [&](const std::vector<tensor_info> & all_vec) {
|
auto check_signal_handler = [&](const std::vector<tensor_info> & all_vec) {
|
||||||
if (bpw_stop.load(std::memory_order_relaxed)) {
|
if (bpw_stop.load(std::memory_order_relaxed)) {
|
||||||
LLAMA_LOG_INFO("\n%s: saving progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
|
LLAMA_LOG_INFO("\n%s: saving progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
|
||||||
|
|
@ -1161,7 +1175,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
|
|
||||||
const auto bpw_data = load_bpw_state();
|
const auto bpw_data = load_bpw_state();
|
||||||
|
|
||||||
// Parallelize tensor processing - courtesy of https://github.com/ddh0
|
// Parallelize tensor processing (courtesy of https://github.com/ddh0)
|
||||||
auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw,
|
auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw,
|
||||||
std::vector<no_init<uint8_t>> & thread_local_buffer,
|
std::vector<no_init<uint8_t>> & thread_local_buffer,
|
||||||
std::mutex & loader_mutex,
|
std::mutex & loader_mutex,
|
||||||
|
|
@ -1555,6 +1569,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0);
|
size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0);
|
||||||
size_t budget_bytes = target_total_bytes >= nq_bytes ? target_total_bytes - nq_bytes : min_bytes;
|
size_t budget_bytes = target_total_bytes >= nq_bytes ? target_total_bytes - nq_bytes : min_bytes;
|
||||||
|
|
||||||
|
// Get the types' override
|
||||||
auto emit_overrides = [&]() -> std::unordered_map<std::string, ggml_type> {
|
auto emit_overrides = [&]() -> std::unordered_map<std::string, ggml_type> {
|
||||||
std::unordered_map<std::string, ggml_type> overrides;
|
std::unordered_map<std::string, ggml_type> overrides;
|
||||||
LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func);
|
LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func);
|
||||||
|
|
@ -1592,7 +1607,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
return important;
|
return important;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Lagrangian relaxation to minimise error subject to a bpw target constraint
|
// Lagrangian relaxation to minimize error subject to a bpw target constraint
|
||||||
auto lagrange_penalty = [&](const double mu, std::vector<int> & choice, size_t & bytes, double & err) {
|
auto lagrange_penalty = [&](const double mu, std::vector<int> & choice, size_t & bytes, double & err) {
|
||||||
choice.resize(all.size());
|
choice.resize(all.size());
|
||||||
bytes = 0;
|
bytes = 0;
|
||||||
|
|
@ -1636,7 +1651,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
|
|
||||||
lagrange_penalty(mu_lo, choice_lo, bytes_lo, err_lo);
|
lagrange_penalty(mu_lo, choice_lo, bytes_lo, err_lo);
|
||||||
|
|
||||||
// increase mu until we get under budget or hit a safety cap
|
// Increase mu until we get under budget or hit a safety cap
|
||||||
{
|
{
|
||||||
int expand = 0;
|
int expand = 0;
|
||||||
size_t prev_bytes_hi = std::numeric_limits<size_t>::max();
|
size_t prev_bytes_hi = std::numeric_limits<size_t>::max();
|
||||||
|
|
@ -1741,7 +1756,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
delete_bpw_state(); // we're done, clear any checkpoint
|
delete_bpw_state();
|
||||||
|
|
||||||
return emit_overrides();
|
return emit_overrides();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue