llama-quant: add support for mmproj (#16592)
* llama-quant: add support for mmproj

* Update src/llama.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* check prefix instead

* small fix

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent 5acd455460
commit 3e3cb19f64
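The net effect of this commit is that an mmproj (CLIP multimodal projector) GGUF can be passed to llama-quantize directly, while attempting to use such a file as the main model still fails with a clear error. As a minimal sketch of the same flow through the public llama.h API that the llama-quantize tool wraps (the file names and thread count below are illustrative, not from the commit):

// Sketch: quantize an mmproj GGUF via llama_model_quantize().
// File names are placeholders; error handling is reduced to a return-code check.
#include "llama.h"
#include <cstdint>
#include <cstdio>

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q8_0; // target quantization type
    params.nthread = 4;                       // worker threads

    // With this change, an mmproj/CLIP GGUF is accepted here as well.
    const uint32_t rc = llama_model_quantize("mmproj-model-f16.gguf",
                                             "mmproj-model-Q8_0.gguf", &params);
    if (rc != 0) {
        fprintf(stderr, "quantization failed (code %u)\n", rc);
        return 1;
    }
    return 0;
}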
@@ -5,6 +5,7 @@
 #include <map>
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,             "clip"         }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA,            "llama"        },
     { LLM_ARCH_LLAMA4,           "llama4"       },
     { LLM_ARCH_DECI,             "deci"         },
@@ -275,6 +276,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {
@@ -9,6 +9,7 @@
 //
 
 enum llm_arch {
+    LLM_ARCH_CLIP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,
@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }
 
@@ -20013,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
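The detection above relies on a standard C++ idiom: std::string::rfind(prefix, 0) can only succeed if the match starts at position 0, so comparing the result with 0 acts as a starts-with test without requiring C++20's std::string::starts_with. A minimal sketch (the helper name and tensor names are illustrative, not from the commit):

// Sketch of the "mm." prefix check: rfind(prefix, 0) == 0 is true only when
// the string begins with the prefix, because the search is anchored at index 0.
#include <cassert>
#include <string>

static bool is_mmproj_tensor(const std::string & name) {
    return name.rfind("mm.", 0) == 0;
}

int main() {
    assert( is_mmproj_tensor("mm.2.weight"));          // projector tensor (illustrative)
    assert(!is_mmproj_tensor("blk.0.attn_q.weight"));  // regular LLM tensor (illustrative)
    assert(!is_mmproj_tensor("token_embd.mm.weight")); // "mm." not at the start
    return 0;
}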
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
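Like the other exclusions in this function, the new rule uses the accumulate-with-&= pattern: each condition can only clear the quantize flag, never set it back, so a tensor stays at its original precision if any rule matches. A small sketch of that pattern (function and tensor names are illustrative):

// Sketch of the &=-based exclusion filter: once any rule matches,
// the tensor is no longer considered for quantization.
#include <cassert>
#include <string>

static bool should_quantize(const std::string & name) {
    bool quantize = true;
    quantize &= name.find("attn_rel_b.weight") == std::string::npos; // T5 relative position bias
    quantize &= name.find(".position_embd.")   == std::string::npos; // multimodal position embeddings
    return quantize;
}

int main() {
    assert( should_quantize("blk.0.ffn_down.weight"));  // regular weight (illustrative)
    assert(!should_quantize("v.position_embd.weight")); // kept at original precision (illustrative)
    return 0;
}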
|
@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
||||||
} catch(const std::exception & e) {
|
} catch(const std::exception & e) {
|
||||||
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
|
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
|
||||||
}
|
}
|
||||||
|
if (model.arch == LLM_ARCH_CLIP) {
|
||||||
|
throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
model.load_vocab(ml);
|
model.load_vocab(ml);
|
||||||
} catch(const std::exception & e) {
|
} catch(const std::exception & e) {
|
||||||
|
|
|
||||||