Move the regex functionality back into llama-quant.cpp by using a unique_ptr to a new struct 'compiled_tensor_type_patterns' that holds the compiled patterns
This commit is contained in:
parent
8ebfe03f95
commit
4a2f648db2
|
|
@ -134,6 +134,26 @@ static bool category_is_attn_v(tensor_category cat) {
|
||||||
cat == tensor_category::ATTENTION_KV_B;
|
cat == tensor_category::ATTENTION_KV_B;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct compiled_tensor_type_patterns {
|
||||||
|
std::vector<std::pair<std::regex, ggml_type>> patterns;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Compile the user-provided tensor type override patterns once up front:
// std::regex construction is expensive, and the compiled patterns are reused
// for every tensor later on (see llama_tensor_get_type).
// params->tensor_types, when set, points at a std::vector<tensor_type_option>
// of (name-regex, ggml_type) overrides; tensor_type_patterns stays null when
// there are no overrides, so callers can use it as an "any overrides?" flag.
quantize_state_impl::quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
    : model(model), params(params)
{
    if (params->tensor_types) {
        const auto & tensor_types = *static_cast<const std::vector<tensor_type_option> *>(params->tensor_types);
        if (!tensor_types.empty()) {
            tensor_type_patterns = std::make_unique<compiled_tensor_type_patterns>();
            // one compiled regex per override - reserve to avoid reallocations
            tensor_type_patterns->patterns.reserve(tensor_types.size());
            for (const auto & [tname, qtype] : tensor_types) {
                tensor_type_patterns->patterns.emplace_back(std::regex(tname), qtype);
            }
        }
    }
}
|
||||||
|
|
||||||
|
// Defaulted out-of-line: the unique_ptr<compiled_tensor_type_patterns> member
// needs the complete type to destroy it, and that type is defined only in
// this translation unit (the header just forward-declares it).
quantize_state_impl::~quantize_state_impl() = default;
|
||||||
|
|
||||||
//
|
//
|
||||||
// dequantization
|
// dequantization
|
||||||
//
|
//
|
||||||
|
|
@ -598,9 +618,9 @@ ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quan
|
||||||
if (!params->pure && ggml_is_quantized(default_type)) {
|
if (!params->pure && ggml_is_quantized(default_type)) {
|
||||||
// if the user provided tensor types - use those
|
// if the user provided tensor types - use those
|
||||||
bool manual = false;
|
bool manual = false;
|
||||||
if (!qs.tensor_type_patterns.empty()) {
|
if (qs.tensor_type_patterns) {
|
||||||
const std::string tensor_name(tensor->name);
|
const std::string tensor_name(tensor->name);
|
||||||
for (const auto & [pattern, qtype] : qs.tensor_type_patterns) {
|
for (const auto & [pattern, qtype] : qs.tensor_type_patterns->patterns) {
|
||||||
if (std::regex_search(tensor_name, pattern)) {
|
if (std::regex_search(tensor_name, pattern)) {
|
||||||
if (qtype != new_type) {
|
if (qtype != new_type) {
|
||||||
LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n",
|
LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n",
|
||||||
|
|
@ -940,8 +960,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
const auto * it = tensors[i];
|
const auto * it = tensors[i];
|
||||||
const struct ggml_tensor * tensor = it->tensor;
|
const struct ggml_tensor * tensor = it->tensor;
|
||||||
|
|
||||||
metadata[i].category = tensor_get_category(name);
|
|
||||||
|
|
||||||
uint16_t i_split = params->keep_split ? it->idx : 0;
|
uint16_t i_split = params->keep_split ? it->idx : 0;
|
||||||
if (!ctx_outs[i_split]) {
|
if (!ctx_outs[i_split]) {
|
||||||
ctx_outs[i_split].reset(gguf_init_empty());
|
ctx_outs[i_split].reset(gguf_init_empty());
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@
|
||||||
|
|
||||||
#include "llama-arch.h"
|
#include "llama-arch.h"
|
||||||
|
|
||||||
#include <regex>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
|
@ -46,6 +46,8 @@ struct tensor_type_option {
|
||||||
ggml_type type = GGML_TYPE_COUNT;
|
ggml_type type = GGML_TYPE_COUNT;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Forward declaration only; the definition lives in llama-quant.cpp so this
// header does not need to pull in <regex>.
struct compiled_tensor_type_patterns;
|
||||||
|
|
||||||
struct quantize_state_impl {
|
struct quantize_state_impl {
|
||||||
const llama_model & model;
|
const llama_model & model;
|
||||||
const llama_model_quantize_params * params;
|
const llama_model_quantize_params * params;
|
||||||
|
|
@ -67,20 +69,11 @@ struct quantize_state_impl {
|
||||||
// used to figure out if a model has tied embeddings (tok_embd shares weights with output)
|
// used to figure out if a model has tied embeddings (tok_embd shares weights with output)
|
||||||
bool has_tied_embeddings = true; // assume tied until we see output.weight
|
bool has_tied_embeddings = true; // assume tied until we see output.weight
|
||||||
|
|
||||||
// tensor type override patterns (compiled once, used twice)
|
// tensor type override patterns (compiled once, used in llama_tensor_get_type)
|
||||||
std::vector<std::pair<std::regex, ggml_type>> tensor_type_patterns;
|
std::unique_ptr<compiled_tensor_type_patterns> tensor_type_patterns;
|
||||||
|
|
||||||
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params):
|
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params);
|
||||||
model(model), params(params)
|
~quantize_state_impl();
|
||||||
{
|
|
||||||
// compile regex patterns once - they are expensive
|
|
||||||
if (params->tensor_types) {
|
|
||||||
const auto & tensor_types = *static_cast<const std::vector<tensor_type_option> *>(params->tensor_types);
|
|
||||||
for (const auto & [tname, qtype] : tensor_types) {
|
|
||||||
tensor_type_patterns.emplace_back(std::regex(tname), qtype);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm);
|
ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue