add indexer gguf params
This commit is contained in:
parent
9e4e556cc0
commit
64184c1236
|
|
@ -8716,6 +8716,11 @@ class GlmMoeDsaModel(DeepseekV2Model):
|
||||||
if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
|
if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
|
||||||
self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
|
self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
|
||||||
|
|
||||||
|
# DSA indexer parameters
|
||||||
|
self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"])
|
||||||
|
self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"])
|
||||||
|
self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"])
|
||||||
|
|
||||||
def modify_tensors(self, data_torch, name, bid):
|
def modify_tensors(self, data_torch, name, bid):
|
||||||
yield from super().modify_tensors(data_torch, name, bid)
|
yield from super().modify_tensors(data_torch, name, bid)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -180,6 +180,11 @@ class Keys:
|
||||||
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
|
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
|
||||||
TEMPERATURE_SCALE = "{arch}.attention.temperature_scale"
|
TEMPERATURE_SCALE = "{arch}.attention.temperature_scale"
|
||||||
|
|
||||||
|
class Indexer:
    # Metadata key templates for the attention indexer parameters.
    # "{arch}" is a placeholder that the writer fills in with
    # str.format(arch=...) before emitting the key.
    HEAD_COUNT = "{arch}.attention.indexer.head_count"
    KEY_LENGTH = "{arch}.attention.indexer.key_length"
    TOP_K = "{arch}.attention.indexer.top_k"
|
||||||
|
|
||||||
class Rope:
|
class Rope:
|
||||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||||
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
|
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
|
||||||
|
|
|
||||||
|
|
@ -768,6 +768,15 @@ class GGUFWriter:
|
||||||
def add_value_length_mla(self, length: int) -> None:
|
def add_value_length_mla(self, length: int) -> None:
|
||||||
self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length)
|
self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length)
|
||||||
|
|
||||||
|
def add_indexer_head_count(self, count: int) -> None:
    """Write the indexer head count as a uint32 metadata value.

    The key is ``Keys.Attention.Indexer.HEAD_COUNT`` formatted with this
    writer's ``arch``.

    Args:
        count: Number of indexer heads, as a single integer. The value is
            passed straight to ``add_uint32``, so a per-layer sequence is
            not accepted here (the annotation previously suggested it was).
    """
    self.add_uint32(Keys.Attention.Indexer.HEAD_COUNT.format(arch=self.arch), count)
|
||||||
|
|
||||||
|
def add_indexer_key_length(self, length: int) -> None:
    """Write the indexer key length as a uint32 metadata value.

    The key is ``Keys.Attention.Indexer.KEY_LENGTH`` formatted with this
    writer's ``arch``.
    """
    metadata_key = Keys.Attention.Indexer.KEY_LENGTH.format(arch=self.arch)
    self.add_uint32(metadata_key, length)
|
||||||
|
|
||||||
|
def add_indexer_top_k(self, top_k: int) -> None:
    """Write the indexer top-k setting as a uint32 metadata value.

    The key is ``Keys.Attention.Indexer.TOP_K`` formatted with this
    writer's ``arch``.
    """
    metadata_key = Keys.Attention.Indexer.TOP_K.format(arch=self.arch)
    self.add_uint32(metadata_key, top_k)
|
||||||
|
|
||||||
def add_max_alibi_bias(self, bias: float) -> None:
|
def add_max_alibi_bias(self, bias: float) -> None:
|
||||||
self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)
|
self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2627,6 +2627,10 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||||
{LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
{LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||||
{LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
{LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||||
{LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
{LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||||
|
{LLM_TENSOR_INDEXER_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||||
|
{LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||||
|
{LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||||
|
{LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||||
// NextN/MTP tensors are currently ignored (reserved for future MTP support)
|
// NextN/MTP tensors are currently ignored (reserved for future MTP support)
|
||||||
// These tensors only exist in the last layer(s) and are treated as output tensors
|
// These tensors only exist in the last layer(s) and are treated as output tensors
|
||||||
{LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
{LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||||
|
|
|
||||||
|
|
@ -5526,10 +5526,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
|
||||||
|
|
||||||
// DSA indexer
|
// DSA indexer
|
||||||
layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {n_embd_head_k_mla}, flags);
|
layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {n_embd_head_k}, flags);
|
||||||
layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {n_embd_head_k_mla}, flags);
|
layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {n_embd_head_k}, flags);
|
||||||
layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, n_head}, flags);
|
layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, n_head}, flags);
|
||||||
layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, n_embd_head_k_mla}, flags);
|
layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, n_embd_head_k}, flags);
|
||||||
layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
|
layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
|
||||||
|
|
||||||
if (i < (int) hparams.n_layer_dense_lead) {
|
if (i < (int) hparams.n_layer_dense_lead) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue