This commit is contained in:
fairydreaming 2026-04-01 07:48:12 +02:00 committed by GitHub
commit cd9691014c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
26 changed files with 4204 additions and 8 deletions

View File

@ -831,6 +831,8 @@ class ModelBase:
gguf.MODEL_TENSOR.SSM_CONV1D_Q,
gguf.MODEL_TENSOR.SSM_CONV1D_K,
gguf.MODEL_TENSOR.SSM_CONV1D_V,
# DSA indexer weights should be F32
gguf.MODEL_TENSOR.INDEXER_PROJ,
)
)
or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
@ -8737,6 +8739,147 @@ class DeepseekV2Model(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register(
"DeepseekV32ForCausalLM",
)
class DeepseekV32Model(TextModel):
model_arch = gguf.MODEL_ARCH.DEEPSEEK32
# TODO @ngxson : remove this when we support MTP for deepseek models
skip_mtp = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
def set_vocab(self):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
assert tokenizer.add_bos_token, "Change value of add_bos_token to true in tokenizer_config.json file."
self._set_vocab_gpt2()
def set_gguf_parameters(self):
# note: deepseek32 using MLA converts into MQA (ie: GQA with 1 group)
self.hparams["num_key_value_heads"] = 1
super().set_gguf_parameters()
hparams = self.hparams
# first_k_dense_replace: number of leading layers using dense FFN instead of MoE
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
# note: deepseek32 using MLA converts into MQA with larger heads, then decompresses to MHA
self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
# MoE parameters (required by C++ code for DEEPSEEK32 arch)
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
# [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
# note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
# ref https://github.com/ggml-org/llama.cpp/pull/17945
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all)
# NextN/MTP prediction layers
if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
# DSA indexer parameters
self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"])
self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"])
self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"])
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("language_model."):
name = name.replace("language_model.", "")
# rename e_score_correction_bias tensors
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
# skip Multi-Token Prediction (MTP) layers
if self.skip_mtp:
block_count = self.hparams["num_hidden_layers"]
match = re.match(r"model.layers.(\d+)", name)
if match and int(match.group(1)) >= block_count:
return
# process the experts separately
if name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
yield from super().modify_tensors(data_torch, merged_name, bid)
return
else:
return
# note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
if name.endswith("kv_b_proj.weight"):
name_kb = name.replace("kv_b_proj", "k_b_proj")
name_vb = name.replace("kv_b_proj", "v_b_proj")
n_head_kv = self.hparams["num_key_value_heads"]
v_head_dim = self.hparams["v_head_dim"]
qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
k_b = k_b.transpose(1, 2)
yield from super().modify_tensors(k_b, name_kb, bid)
yield from super().modify_tensors(v_b, name_vb, bid)
return
yield from super().modify_tensors(data_torch, name, bid)
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register(
"Mistral3ForConditionalGeneration",
"Ministral3ForCausalLM",

View File

@ -559,6 +559,8 @@ extern "C" {
GGML_OP_RWKV_WKV7,
GGML_OP_SOLVE_TRI,
GGML_OP_GATED_DELTA_NET,
GGML_OP_HADAMARD,
GGML_OP_SCATTER,
GGML_OP_UNARY,
@ -2481,6 +2483,23 @@ extern "C" {
struct ggml_tensor * beta,
struct ggml_tensor * state);
GGML_API struct ggml_tensor * ggml_hadamard(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n);
GGML_API struct ggml_tensor * ggml_scatter(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * ids,
float c);
GGML_API struct ggml_tensor * ggml_scatter_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * ids,
float c);
// custom operators
typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);

View File

@ -2031,6 +2031,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_gated_delta_net(params, tensor);
} break;
case GGML_OP_HADAMARD:
{
ggml_compute_forward_hadamard(params, tensor);
} break;
case GGML_OP_SCATTER:
{
ggml_compute_forward_scatter(params, tensor);
} break;
case GGML_OP_MAP_CUSTOM1:
{
ggml_compute_forward_map_custom1(params, tensor);
@ -2353,6 +2361,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_RWKV_WKV6:
case GGML_OP_GATED_LINEAR_ATTN:
case GGML_OP_RWKV_WKV7:
case GGML_OP_HADAMARD:
case GGML_OP_SCATTER:
{
n_tasks = n_threads;
} break;

View File

@ -2232,8 +2232,42 @@ static void ggml_compute_forward_fill_f32(const ggml_compute_params * params, gg
}
}
static void ggml_compute_forward_fill_f16(const ggml_compute_params * params, ggml_tensor * dst) {
const ggml_fp16_t c = GGML_CPU_FP32_TO_FP16(ggml_get_op_params_f32(dst, 0));
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
GGML_TENSOR_LOCALS(size_t, nb, dst, nb);
const auto [ir0, ir1] = get_thread_range(params, dst);
for (int64_t ir = ir0; ir < ir1; ++ir) {
const int64_t i03 = ir/(ne2*ne1);
const int64_t i02 = (ir - i03*ne2*ne1)/ne1;
const int64_t i01 = (ir - i03*ne2*ne1 - i02*ne1);
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1);
ggml_vec_set_f16(ne0, dst_ptr, c);
}
}
void ggml_compute_forward_fill(const ggml_compute_params * params, ggml_tensor * dst) {
ggml_compute_forward_fill_f32(params, dst);
const ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_fill_f32(params, dst);
} break;
case GGML_TYPE_F16:
{
ggml_compute_forward_fill_f16(params, dst);
} break;
default:
{
GGML_ABORT("unsupported type for ggml_compute_forward_fill: %s", ggml_type_name(src0->type));
}
}
}
// ggml_compute_tri
@ -11217,3 +11251,227 @@ void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_
}
}
}
// ggml_compute_forward_hadamard
// Based on a source code from: https://github.com/ikawrakow/ik_llama.cpp
// Copyright (C) 2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
template <typename T>
void fast_ht(int n, T * values) {
constexpr float ksqrt2 = 0.707106781f;
float scale = 1;
for (int h = 1; h < n; h <<= 1) {
for (int i = 0; i < n; i += 2*h) {
for (int j = i; j < i + h; ++j) {
T x = values[j], y = values[j + h];
values[j+0] = x + y;
values[j+h] = x - y;
}
}
scale *= ksqrt2;
}
for (int i = 0; i < n; ++i) values[i] *= scale;
}
static void ggml_compute_forward_hadamard_f32(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
const int ith = params->ith;
const int nth = params->nth;
int nh = dst->op_params[0];
GGML_ASSERT(nh > 1 && ((nh & (nh - 1)) == 0)); // power of 2
GGML_ASSERT(dst->ne[0] % nh == 0);
int nc = dst->ne[0]/nh;
int nr = ggml_nrows(dst) * nc;
int npt = (nr + nth - 1)/nth;
int first = npt*ith;
int last = std::min(first + npt, nr);
for (int ir = first; ir < last; ++ir) {
int i3 = ir / (dst->ne[1] * dst->ne[2] * nc);
int i2 = (ir - i3*dst->ne[1] * dst->ne[2] * nc)/(dst->ne[1] * nc);
int i1 = (ir - i3*dst->ne[1] * dst->ne[2] * nc - i2*dst->ne[1]*nc)/nc;
int ic = (ir - i3*dst->ne[1] * dst->ne[2] * nc - i2*dst->ne[1]*nc - i1*nc);
auto x = (const float *)((const char *)src0->data + i3*src0->nb[3] + i2*src0->nb[2] + i1*src0->nb[1]) + ic*nh;
auto y = ( float *)(( char *)dst->data + i3*dst->nb[3] + i2*dst->nb[2] + i1*dst->nb[1]) + ic*nh;
memcpy(y, x, nh*sizeof(float));
fast_ht(nh, y);
}
}
void ggml_compute_forward_hadamard(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_hadamard_f32(params, dst);
} break;
default:
{
GGML_ABORT("fatal error");
}
}
}
// ggml_compute_forward_scatter
static void ggml_compute_forward_scatter_f32(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const float c = ggml_get_op_params_f32(dst, 0);
const bool inplace = ggml_get_op_params_i32(dst, 1);
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
const float * src0_ptr = (float *) ((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 );
const int32_t * ids_ptr = (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
// copy whole row from src0
if (!inplace) {
ggml_vec_cpy_f32(ne00, dst_ptr, src0_ptr);
}
// set dst elements indicated by indices in src1 to c
for (int j = 0; j < ne10; ++j) {
int id = ids_ptr[j];
GGML_ASSERT(id >= 0 && id < ne00);
dst_ptr[id] = c;
}
}
}
static void ggml_compute_forward_scatter_f16(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const ggml_fp16_t c = GGML_CPU_FP32_TO_FP16(ggml_get_op_params_f32(dst, 0));
const bool inplace = ggml_get_op_params_i32(dst, 1);
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(dst->type == GGML_TYPE_F16);
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 );
const int32_t * ids_ptr = (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
// copy whole row from src0
if (!inplace) {
// ggml_vec_cpy_f16(ne00, dst_ptr, src0_ptr)
for (int i = 0; i < ne00; ++i) {
dst_ptr[i] = src0_ptr[i];
}
}
// set dst elements indicated by indices in src1 to c
for (int j = 0; j < ne10; ++j) {
int id = ids_ptr[j];
GGML_ASSERT(id >= 0 && id < ne00);
dst_ptr[id] = c;
}
}
}
void ggml_compute_forward_scatter(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_scatter_f32(params, dst);
} break;
case GGML_TYPE_F16:
{
ggml_compute_forward_scatter_f16(params, dst);
} break;
default:
{
GGML_ABORT("unsupported type for ggml_compute_forward_scatter: %s", ggml_type_name(src0->type));
}
}
}

View File

@ -103,6 +103,8 @@ void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, s
void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_hadamard(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_scatter(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);

View File

@ -61,6 +61,8 @@
#include "ggml-cuda/tri.cuh"
#include "ggml-cuda/cumsum.cuh"
#include "ggml-cuda/fill.cuh"
#include "ggml-cuda/hadamard.cuh"
#include "ggml-cuda/scatter.cuh"
#include "ggml.h"
#include <algorithm>
@ -2815,6 +2817,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_FILL:
ggml_cuda_op_fill(ctx, dst);
break;
case GGML_OP_HADAMARD:
ggml_cuda_op_hadamard(ctx, dst);
break;
case GGML_OP_SCATTER:
ggml_cuda_op_scatter(ctx, dst);
break;
default:
return false;
}
@ -5057,8 +5065,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_TRI:
case GGML_OP_DIAG:
case GGML_OP_SOLVE_TRI:
case GGML_OP_SCATTER:
return true;
case GGML_OP_HADAMARD: {
int nh = op->op_params[0];
return (nh == 64 || nh == 128 || nh == 256) && op->ne[0] % nh == 0 && op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
}
default:
return false;
}

View File

@ -0,0 +1,73 @@
// Copyright (C) 2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
#include "hadamard.cuh"
template <int nh>
static __global__ void hadamard_f32(const char * src, char * dst, int ne0,
size_t nb01, size_t nb02, size_t nb03, size_t nb1, size_t nb2, size_t nb3) {
constexpr float ksqrt2 = 0.707106781f;
int nc = ne0/nh;
int ii1 = blockIdx.x;
int i1 = ii1 / nc;
int ic = ii1 % nc;
int i2 = blockIdx.y;
int i3 = blockIdx.z;
int tid = threadIdx.x;
const float * x = (const float *)((const char *)src + i1*nb01 + i2*nb02 + i3*nb03) + ic*nh;
float * y = ( float *)((const char *)dst + i1*nb1 + i2*nb2 + i3*nb3) + ic*nh;
__shared__ float ys[nh];
ys[2*tid+0] = x[2*tid+0] + x[2*tid+1];
ys[2*tid+1] = x[2*tid+0] - x[2*tid+1];
float scale = ksqrt2;
#pragma unroll
for (int h = 2; h < nh; h <<= 1) {
__syncthreads();
int ii = tid/h, jj = tid%h;
int j = 2*h*ii+jj;
float u = ys[j], v = ys[j+h];
ys[j+0] = u + v;
ys[j+h] = u - v;
scale *= ksqrt2;
}
__syncthreads();
y[2*tid+0] = ys[2*tid+0] * scale;
y[2*tid+1] = ys[2*tid+1] * scale;
}
static void hadamard_f32_cuda(int nh, const char * x, char * y, int ne0, int ne1, int ne2, int ne3,
size_t nb01, size_t nb02, size_t nb03, size_t nb1, size_t nb2, size_t nb3, cudaStream_t stream) {
int nc = ne0/nh;
int nrows = nc*ne1;
dim3 num_blocks = dim3(nrows, ne2, ne3);
switch (nh) {
case 64: hadamard_f32< 64><<<num_blocks, 32, 0, stream>>>(x, y, ne0, nb01, nb02, nb03, nb1, nb2, nb3); break;
case 128: hadamard_f32<128><<<num_blocks, 64, 0, stream>>>(x, y, ne0, nb01, nb02, nb03, nb1, nb2, nb3); break;
case 256: hadamard_f32<256><<<num_blocks, 128, 0, stream>>>(x, y, ne0, nb01, nb02, nb03, nb1, nb2, nb3); break;
default: GGML_ABORT("Unsupported Hadamard block size");
}
}
void ggml_cuda_op_hadamard(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_are_same_shape(src, dst));
int nh = dst->op_params[0];
GGML_ASSERT(dst->ne[0] % nh == 0);
GGML_ASSERT(nh > 1 && ((nh & (nh - 1)) == 0)); // power of 2
hadamard_f32_cuda(nh, (const char *)src->data, (char *)dst->data, src->ne[0], src->ne[1], src->ne[2], src->ne[3],
src->nb[1], src->nb[2], src->nb[3], dst->nb[1], dst->nb[2], dst->nb[3], ctx.stream());
}

View File

@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_op_hadamard(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@ -0,0 +1,88 @@
#include "scatter.cuh"
#include "convert.cuh"
template <typename T>
static __global__ void scatter_kernel(
const int32_t * src0, T * dst, const T c,
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
size_t nb1, size_t nb2, size_t nb3,
size_t nb01, size_t nb02, size_t nb03
) {
const int64_t total_blocks = ne01 * ne02 * ne03;
for (int64_t block_idx = blockIdx.x; block_idx < total_blocks; block_idx += gridDim.x) {
const int64_t i1 = block_idx % ne01;
const int64_t i2 = (block_idx / ne01) % ne02;
const int64_t i3 = block_idx / (ne01 * ne02);
T * dst_row = (T *)((char *)dst + i1*nb1 + i2*nb2 + i3*nb3);
const int * src0_row = (const int *)((const char *)src0 + i1*nb01 + i2*nb02 + i3*nb03);
for (int64_t i0 = threadIdx.x; i0 < ne00; i0 += blockDim.x) {
const int32_t id = src0_row[i0];
dst_row[id] = c;
}
}
}
void ggml_cuda_op_scatter(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(dst->type == src0->type);
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(nb10 == sizeof(int32_t));
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(dst));
float c = ggml_get_op_params_f32(dst, 0);
bool inplace = ggml_get_op_params_i32(dst, 1);
// step 1 - copy whole src0 to dst
if (!inplace) {
cudaStream_t main_stream = ctx.stream();
char * dst_ddc = (char *) dst->data;
char * src0_ddc = (char *) src0->data;
CUDA_CHECK(cudaMemcpyAsync(dst_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
}
// step 2 - set elements in dst indicated by ids to c
const int32_t * src1_d = (const int32_t *) src1->data;
void * dst_d = dst->data;
int threads = std::min((int) ne10, 512); // ids
int64_t total_blocks = ne11 * ne12 * ne13;
int blocks = (int) std::min((int64_t) 65535, total_blocks);
switch (dst->type) {
case GGML_TYPE_F32:
scatter_kernel<<<blocks, threads, 0, ctx.stream()>>>(
src1_d, (float *) dst_d, c,
ne10, ne11, ne12, ne13,
nb1, nb2, nb3,
nb11, nb12, nb13
);
break;
case GGML_TYPE_F16:
scatter_kernel<<<blocks, threads, 0, ctx.stream()>>>(
src1_d, (half *) dst_d, ggml_cuda_cast<half>(c),
ne10, ne11, ne12, ne13,
nb1, nb2, nb3,
nb11, nb12, nb13
);
break;
default:
GGML_ABORT("unsupported type");
}
}

View File

@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_op_scatter(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@ -1040,6 +1040,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"RWKV_WKV7",
"SOLVE_TRI",
"GATED_DELTA_NET",
"HADAMARD",
"SCATTER",
"UNARY",
@ -1057,7 +1059,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GLU",
};
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
static_assert(GGML_OP_COUNT == 98, "GGML_OP_COUNT != 98");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@ -1150,6 +1152,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"rwkv_wkv7(r, w, k, v, a, b, s)",
"A X = B, A triangular, solve X",
"gated_delta_net(q, k, v, g, beta, s)",
"hadamard(x)",
"scatter(x,ids,c)",
"unary(x)",
@ -1167,7 +1171,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"glu(x)",
};
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
static_assert(GGML_OP_COUNT == 98, "GGML_OP_COUNT != 98");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -5189,7 +5193,7 @@ static struct ggml_tensor * ggml_fill_impl(
struct ggml_tensor * a,
float c,
bool inplace) {
GGML_ASSERT(a->type == GGML_TYPE_F32);
GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16);
GGML_ASSERT(ggml_is_contiguous(a));
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -6194,6 +6198,69 @@ struct ggml_tensor * ggml_gated_delta_net(
return result;
}
// ggml_hadamard
struct ggml_tensor * ggml_hadamard(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n) {
GGML_ASSERT(a->type == GGML_TYPE_F32); // will not bother implementing for other data types
GGML_ASSERT(n > 1); // no point in Hadamard transforms with less than 2 elements
GGML_ASSERT(a->ne[0] % n == 0);
GGML_ASSERT(n > 0 && ((n & (n - 1)) == 0)); // must be a power of 2
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
result->op = GGML_OP_HADAMARD;
result->src[0] = a;
result->op_params[0] = n;
return result;
}
// ggml_scatter
static struct ggml_tensor * ggml_scatter_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * ids,
float c,
bool inplace) {
GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16);
GGML_ASSERT(ids->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[1] == ids->ne[1]);
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params_f32(result, 0, c);
ggml_set_op_params_i32(result, 1, inplace ? 1 : 0);
result->op = GGML_OP_SCATTER;
result->src[0] = a;
result->src[1] = ids;
return result;
}
struct ggml_tensor * ggml_scatter(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * ids,
float c) {
return ggml_scatter_impl(ctx, a, ids, c, false);
}
struct ggml_tensor * ggml_scatter_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * ids,
float c) {
return ggml_scatter_impl(ctx, a, ids, c, true);
}
////////////////////////////////////////////////////////////////////////////////
struct ggml_hash_set ggml_hash_set_new(size_t size) {

View File

@ -440,6 +440,7 @@ class MODEL_ARCH(IntEnum):
DEEPSEEK = auto()
DEEPSEEK2 = auto()
DEEPSEEK2OCR = auto()
DEEPSEEK32 = auto()
CHATGLM = auto()
GLM4 = auto()
GLM4_MOE = auto()
@ -903,6 +904,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.DEEPSEEK: "deepseek",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr",
MODEL_ARCH.DEEPSEEK32: "deepseek32",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.GLM4: "glm4",
MODEL_ARCH.GLM4_MOE: "glm4moe",
@ -2720,6 +2722,46 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
MODEL_ARCH.DEEPSEEK32: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_A,
MODEL_TENSOR.ATTN_Q_B,
MODEL_TENSOR.ATTN_KV_A_MQA,
MODEL_TENSOR.ATTN_K_B,
MODEL_TENSOR.ATTN_V_B,
MODEL_TENSOR.ATTN_Q_A_NORM,
MODEL_TENSOR.ATTN_KV_A_NORM,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
MODEL_TENSOR.INDEXER_K_NORM,
MODEL_TENSOR.INDEXER_PROJ,
MODEL_TENSOR.INDEXER_ATTN_K,
MODEL_TENSOR.INDEXER_ATTN_Q_B,
# NextN/MTP tensors - preserved but unused
MODEL_TENSOR.NEXTN_EH_PROJ,
MODEL_TENSOR.NEXTN_EMBED_TOKENS,
MODEL_TENSOR.NEXTN_ENORM,
MODEL_TENSOR.NEXTN_HNORM,
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
],
MODEL_ARCH.ERNIE4_5_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@ -3833,6 +3875,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.DEEPSEEK32: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.CHATGLM: [
MODEL_TENSOR.ROPE_FREQS,
],

View File

@ -22,6 +22,8 @@ add_library(llama
llama-io.cpp
llama-kv-cache.cpp
llama-kv-cache-iswa.cpp
llama-ik-cache.cpp
llama-kv-cache-dsa.cpp
llama-memory.cpp
llama-memory-hybrid.cpp
llama-memory-hybrid-iswa.cpp
@ -57,6 +59,7 @@ add_library(llama
models/deci.cpp
models/deepseek.cpp
models/deepseek2.cpp
models/deepseek32.cpp
models/delta-net-base.cpp
models/dots1.cpp
models/dream.cpp

View File

@ -74,6 +74,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DEEPSEEK, "deepseek" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_DEEPSEEK2OCR, "deepseek2-ocr" },
{ LLM_ARCH_DEEPSEEK32, "deepseek32" },
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_GLM4, "glm4" },
{ LLM_ARCH_GLM4_MOE, "glm4moe" },
@ -1608,6 +1609,44 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_UP_SHEXP,
LLM_TENSOR_FFN_EXP_PROBS_B,
};
case LLM_ARCH_DEEPSEEK32:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q_A_NORM,
LLM_TENSOR_ATTN_KV_A_NORM,
LLM_TENSOR_ATTN_Q_A,
LLM_TENSOR_ATTN_Q_B,
LLM_TENSOR_ATTN_KV_A_MQA,
LLM_TENSOR_ATTN_KV_B,
LLM_TENSOR_ATTN_K_B,
LLM_TENSOR_ATTN_V_B,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_INDEXER_K_NORM,
LLM_TENSOR_INDEXER_PROJ,
LLM_TENSOR_INDEXER_ATTN_K,
LLM_TENSOR_INDEXER_ATTN_Q_B,
LLM_TENSOR_NEXTN_EH_PROJ,
LLM_TENSOR_NEXTN_EMBED_TOKENS,
LLM_TENSOR_NEXTN_ENORM,
LLM_TENSOR_NEXTN_HNORM,
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
};
case LLM_ARCH_PLM:
return {
LLM_TENSOR_TOKEN_EMBD,

View File

@ -78,6 +78,7 @@ enum llm_arch {
LLM_ARCH_DEEPSEEK,
LLM_ARCH_DEEPSEEK2,
LLM_ARCH_DEEPSEEK2OCR,
LLM_ARCH_DEEPSEEK32,
LLM_ARCH_CHATGLM,
LLM_ARCH_GLM4,
LLM_ARCH_GLM4_MOE,

View File

@ -6,6 +6,7 @@
#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-kv-cache-dsa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-hybrid-iswa.h"
#include "llama-memory-recurrent.h"
@ -31,6 +32,18 @@ static ggml_tensor * build_kq_mask(
return ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
}
static ggml_tensor * build_kq_mask(
ggml_context * ctx,
const llama_ik_cache_context * mctx,
const llama_ubatch & ubatch,
const llama_cparams & cparams) {
const auto n_kv = mctx->get_n_kv();
const auto n_tokens = ubatch.n_tokens;
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
return ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
}
static bool can_reuse_kq_mask(
ggml_tensor * kq_mask,
const llama_kv_cache_context * mctx,
@ -50,6 +63,25 @@ static bool can_reuse_kq_mask(
return res;
}
static bool can_reuse_kq_mask(
ggml_tensor * kq_mask,
const llama_ik_cache_context * mctx,
const llama_ubatch & ubatch,
const llama_cparams & cparams) {
const auto n_kv = mctx->get_n_kv();
const auto n_tokens = ubatch.n_tokens;
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
bool res = true;
res &= (kq_mask->ne[0] == n_kv);
res &= (kq_mask->ne[1] == n_tokens/n_stream);
res &= (kq_mask->ne[2] == 1);
res &= (kq_mask->ne[3] == n_stream);
return res;
}
// impl
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
@ -2159,6 +2191,112 @@ ggml_tensor * llm_graph_context::build_attn(
return cur;
}
ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_k * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * sinks,
ggml_tensor * v_mla,
ggml_tensor * top_k,
float kq_scale,
int il) const {
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache
ggml_build_forward_expand(gf, q_cur);
ggml_build_forward_expand(gf, v_cur);
ggml_build_forward_expand(gf, k_cur);
const auto * mctx_cur = inp->mctx;
// store to KV cache
{
const auto & k_idxs = inp->get_k_idxs();
ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
}
const auto & kq_mask = inp->get_kq_mask();
// prepare new kq mask - starts filled with -INFINITY
ggml_tensor * kq_mask_all = ggml_fill(ctx0, kq_mask, -INFINITY);
// modify it by unmasking tokens that are in top_k indices
ggml_tensor * kq_mask_top_k = ggml_scatter(ctx0, kq_mask_all, top_k, 0);
// combine with the original kq mask
kq_mask_top_k = ggml_add(ctx0, kq_mask_top_k, kq_mask);
ggml_tensor * q = q_cur;
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_top_k, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
}
if (wo_b) {
cur = ggml_add(ctx0, cur, wo_b);
}
return cur;
}
static std::unique_ptr<llm_graph_input_attn_ik> build_attn_inp_ik_impl(
ggml_context * ctx0,
const llama_ubatch & ubatch,
const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_ik_cache_context * mctx_cur) {
auto inp = std::make_unique<llm_graph_input_attn_ik>(hparams, cparams, mctx_cur);
{
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = inp->self_kq_mask;
}
return inp;
}
void llm_graph_input_attn_ik::set_input(const llama_ubatch * ubatch) {
mctx->set_input_k_idxs(self_k_idxs, ubatch);
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
bool llm_graph_input_attn_ik::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_ik_cache_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
res &= can_reuse_kq_mask(self_kq_mask, mctx, params.ubatch, params.cparams);
return res;
}
ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo,
@ -2281,6 +2419,17 @@ ggml_tensor * llm_graph_context::build_attn(
return cur;
}
std::pair<llm_graph_input_attn_k *, llm_graph_input_attn_ik *> llm_graph_context::build_attn_inp_k_dsa() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_dsa_context *>(mctx);
auto inp_k = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_base());
auto inp_ik = build_attn_inp_ik_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_ik());
return std::make_pair(
(llm_graph_input_attn_k *) res->add_input(std::move(inp_k)),
(llm_graph_input_attn_ik *) res->add_input(std::move(inp_ik)));
}
// TODO: maybe separate the inner implementation into a separate function
// like with the non-sliding window equivalent
// once sliding-window hybrid caches are a thing.

View File

@ -21,6 +21,7 @@ struct llama_cparams;
struct llama_memory_context_i;
class llama_kv_cache_context;
class llama_ik_cache_context;
class llama_kv_cache_iswa_context;
class llama_memory_recurrent_context;
class llama_memory_hybrid_context;
@ -350,6 +351,39 @@ public:
const llama_kv_cache_context * mctx;
};
// V-less input for the indexer KV cache
class llm_graph_input_attn_ik : public llm_graph_input_i {
public:
llm_graph_input_attn_ik(
const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_ik_cache_context * mctx) :
hparams(hparams),
cparams(cparams),
mctx(mctx) {
}
~llm_graph_input_attn_ik() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
const llama_hparams hparams;
const llama_cparams cparams;
const llama_ik_cache_context * mctx;
};
class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
public:
llm_graph_input_attn_kv_iswa(
@ -921,6 +955,20 @@ struct llm_graph_context {
float kq_scale,
int il) const;
ggml_tensor * build_attn(
llm_graph_input_attn_k * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
ggml_tensor * top_k, // [n_indexer_top_k, n_tokens]
float kq_scale,
int il) const;
llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
// note: if k_cur or v_cur are not provided, they will not be stored in the memory
@ -952,6 +1000,8 @@ struct llm_graph_context {
float kq_scale,
int il) const;
std::pair<llm_graph_input_attn_k *, llm_graph_input_attn_ik *> build_attn_inp_k_dsa() const;
//
// recurrent
//

1891
src/llama-ik-cache.cpp Normal file

File diff suppressed because it is too large Load Diff

306
src/llama-ik-cache.h Normal file
View File

@ -0,0 +1,306 @@
#pragma once
#include "llama-kv-cache.h"
#include "llama-batch.h"
#include "llama-graph.h"
#include "llama-kv-cells.h"
#include "llama-memory.h"
#include <unordered_map>
#include <vector>
struct llama_cparams;
struct llama_hparams;
struct llama_model;
struct llama_context;
//
// llama_ik_cache
//
class llama_ik_cache : public llama_memory_i {
public:
using stream_copy_info = llama_kv_cache::stream_copy_info;
using slot_info = llama_kv_cache::slot_info;
using slot_info_vec_t = std::vector<slot_info>;
llama_ik_cache(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
bool unified,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse);
~llama_ik_cache() = default;
//
// llama_memory_i
//
llama_memory_context_ptr init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
bool embd_all) override;
llama_memory_context_ptr init_full() override;
llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
bool get_can_shift() const override;
void clear(bool data) override;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
//
// llama_ik_cache specific API
//
uint32_t get_size() const;
uint32_t get_n_stream() const;
bool get_has_shift() const;
//
// graph_build API
//
uint32_t get_n_kv(const slot_info & sinfo) const;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
// store k_cur and v_cur in the cache based on the provided head location
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
//
// preparation API
//
// find places for the provided ubatches in the cache, returns the slot infos
// return empty vector on failure
slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info);
// find a slot of kv cells that can hold the ubatch
// if cont == true, then the slot must be continuous
// return empty slot_info on failure
slot_info find_slot(const llama_ubatch & ubatch, bool cont) const;
// emplace the ubatch context into slot: [sinfo.idxs[0...ubatch.n_tokens - 1]]
void apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch);
//
// input API
//
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
void set_input_k_shift(ggml_tensor * dst) const;
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
private:
const llama_model & model;
const llama_hparams & hparams;
struct kv_layer {
// layer index in the model
// note: can be different from the layer index in the KV cache
uint32_t il;
ggml_tensor * k;
std::vector<ggml_tensor *> k_stream;
};
bool v_trans = true; // the value tensor is transposed
const uint32_t n_seq_max = 1;
const uint32_t n_stream = 1;
// required padding
const uint32_t n_pad = 1;
// SWA
const uint32_t n_swa = 0;
// env: LLAMA_KV_CACHE_DEBUG
int debug = 0;
// this is the SWA type of the cache - not to be confused with the model SWA type
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
// ggml contexts for the KV cache along with the allocated backend buffers:
std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
// the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
// note: this is not part of the KV state and it's only used to speed-up the find_slot() method
std::vector<uint32_t> v_heads;
std::vector<llama_kv_cells> v_cells;
// maps from a sequence id to a stream id
std::vector<uint32_t> seq_to_stream;
// pending stream copies that will be applied during the next update
stream_copy_info sc_info;
std::vector<kv_layer> layers;
// model layer id -> KV cache layer id
std::unordered_map<int32_t, int32_t> map_layer_ids;
size_t total_size() const;
size_t size_k_bytes() const;
ggml_tensor * build_rope_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
float freq_scale,
uint32_t il) const;
ggml_cgraph * build_graph_shift(
llm_graph_result * res,
llama_context * lctx) const;
struct cell_ranges_t {
uint32_t strm;
std::vector<std::pair<uint32_t, uint32_t>> data; // ranges, from inclusive, to exclusive
};
void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1);
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
};
class llama_ik_cache_context : public llama_memory_context_i {
public:
// some shorthands
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
using stream_copy_info = llama_kv_cache::stream_copy_info;
// used for errors
llama_ik_cache_context(llama_memory_status status);
// used to create a full-cache context
llama_ik_cache_context(
llama_ik_cache * kv);
// used to create an update context
llama_ik_cache_context(
llama_ik_cache * kv,
llama_context * lctx,
bool do_shift,
stream_copy_info sc_info);
// used to create a batch processing context from a batch
llama_ik_cache_context(
llama_ik_cache * kv,
slot_info_vec_t sinfos,
std::vector<llama_ubatch> ubatches);
virtual ~llama_ik_cache_context();
//
// llama_memory_context_i
//
bool next() override;
bool apply() override;
llama_memory_status get_status() const override;
const llama_ubatch & get_ubatch() const override;
//
// llama_ik_cache_context specific API
//
uint32_t get_n_kv() const;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
// store k_cur and v_cur in the cache based on the provided head location
// note: the heads in k_cur and v_cur should be layed out contiguously in memory
// - k_cur [n_embd_head_k, n_head_k, n_tokens]
// - k_idxs [n_tokens]
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
// create destination indices for each head of the current batch for where it would be written in the KV cache
// the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
// helps understand the implementation logic of cpy_k
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
void set_input_k_shift (ggml_tensor * dst) const;
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
private:
llama_memory_status status;
llama_ik_cache * kv;
llama_context * lctx;
//
// update context
//
bool do_shift = false;
stream_copy_info sc_info;
//
// batch processing context
//
// the index of the cur ubatch to process
size_t i_cur = 0;
slot_info_vec_t sinfos;
std::vector<llama_ubatch> ubatches;
//
// data needed for building the compute graph for the current ubatch:
//
// a heuristic, to avoid attending the full cache if it is not yet utilized
// as the cache gets filled, the benefit from this heuristic disappears
int32_t n_kv;
};

251
src/llama-kv-cache-dsa.cpp Normal file
View File

@ -0,0 +1,251 @@
#include "llama-kv-cache-dsa.h"
#include "llama-impl.h"
#include "llama-batch.h"
#include "llama-model.h"
#include <algorithm>
#include <cassert>
//
// llama_kv_cache_dsa
//
llama_kv_cache_dsa::llama_kv_cache_dsa(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
bool unified,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse) :
n_stream(unified ? 1 : n_seq_max) {
LLAMA_LOG_INFO("%s: creating main KV cache, size = %u cells\n", __func__, kv_size);
kv_base = std::make_unique<llama_kv_cache>(
model, type_k, type_v,
v_trans, offload, unified, kv_size, n_seq_max, n_pad,
n_swa, swa_type, filter, reuse);
LLAMA_LOG_INFO("%s: creating indexer KV cache, size = %u cells\n", __func__, kv_size);
kv_ik = std::make_unique<llama_ik_cache>(
model, type_k, type_v,
v_trans, offload, unified, kv_size, n_seq_max, n_pad,
n_swa, swa_type, filter, reuse);
}
void llama_kv_cache_dsa::clear(bool data) {
kv_base->clear(data);
kv_ik ->clear(data);
}
bool llama_kv_cache_dsa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
bool res = true;
res = res & kv_base->seq_rm(seq_id, p0, p1);
res = res & kv_ik ->seq_rm(seq_id, p0, p1);
return res;
}
void llama_kv_cache_dsa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
kv_ik ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}
void llama_kv_cache_dsa::seq_keep(llama_seq_id seq_id) {
kv_base->seq_keep(seq_id);
kv_ik ->seq_keep(seq_id);
}
void llama_kv_cache_dsa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
kv_base->seq_add(seq_id, p0, p1, shift);
kv_ik ->seq_add(seq_id, p0, p1, shift);
}
void llama_kv_cache_dsa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
kv_base->seq_div(seq_id, p0, p1, d);
kv_ik ->seq_div(seq_id, p0, p1, d);
}
llama_pos llama_kv_cache_dsa::seq_pos_min(llama_seq_id seq_id) const {
return kv_base->seq_pos_min(seq_id);
}
llama_pos llama_kv_cache_dsa::seq_pos_max(llama_seq_id seq_id) const {
return kv_base->seq_pos_max(seq_id);
}
std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_dsa::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
for (const auto & buft_size : kv_ik->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}
llama_memory_context_ptr llama_kv_cache_dsa::init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
bool embd_all) {
GGML_UNUSED(embd_all);
do {
balloc.split_reset();
std::vector<llama_ubatch> ubatches;
while (true) {
auto ubatch = n_stream == 1 ? balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true);
if (ubatch.n_tokens == 0) {
break;
}
ubatches.push_back(std::move(ubatch)); // NOLINT
}
if (balloc.get_n_used() < balloc.get_n_tokens()) {
// failed to find a suitable split
break;
}
auto sinfos_base = kv_base->prepare(ubatches);
if (sinfos_base.empty()) {
break;
}
auto sinfos_ik = kv_ik->prepare(ubatches);
if (sinfos_ik.empty()) {
break;
}
assert(sinfos_base.size() == sinfos_ik.size());
return std::make_unique<llama_kv_cache_dsa_context>(
this, std::move(sinfos_base), std::move(sinfos_ik), std::move(ubatches));
} while (false);
return std::make_unique<llama_kv_cache_dsa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
llama_memory_context_ptr llama_kv_cache_dsa::init_full() {
return std::make_unique<llama_kv_cache_dsa_context>(this);
}
llama_memory_context_ptr llama_kv_cache_dsa::init_update(llama_context * lctx, bool optimize) {
return std::make_unique<llama_kv_cache_dsa_context>(this, lctx, optimize);
}
bool llama_kv_cache_dsa::get_can_shift() const {
return kv_base->get_can_shift() &&
kv_ik->get_can_shift() &&
kv_base->get_size() == kv_ik->get_size();
}
void llama_kv_cache_dsa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
kv_base->state_write(io, seq_id, flags);
kv_ik->state_write(io, seq_id, flags);
}
void llama_kv_cache_dsa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
kv_base->state_read(io, seq_id, flags);
kv_ik->state_read(io, seq_id, flags);
}
llama_kv_cache * llama_kv_cache_dsa::get_base() const {
return kv_base.get();
}
llama_ik_cache * llama_kv_cache_dsa::get_ik() const {
return kv_ik.get();
}
//
// llama_kv_cache_dsa_context
//
llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(llama_memory_status status) : status(status) {}
llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(
llama_kv_cache_dsa * kv) :
ctx_base(kv->get_base()->init_full()),
ctx_ik(kv->get_ik()->init_full()),
status(llama_memory_status_combine(ctx_base->get_status(), ctx_ik->get_status())) {
}
llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(
llama_kv_cache_dsa * kv,
llama_context * lctx,
bool optimize) :
ctx_base(kv->get_base()->init_update(lctx, optimize)),
ctx_ik(kv->get_ik()->init_update(lctx, optimize)),
status(llama_memory_status_combine(ctx_base->get_status(), ctx_ik->get_status())) {
}
llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(
llama_kv_cache_dsa * kv,
slot_info_vec_t sinfos_base,
slot_info_vec_t sinfos_ik,
std::vector<llama_ubatch> ubatches) :
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
ctx_base(new llama_kv_cache_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
ctx_ik(new llama_ik_cache_context(kv->get_ik(), std::move(sinfos_ik), this->ubatches)),
status(llama_memory_status_combine(ctx_base->get_status(), ctx_ik->get_status())) {
}
llama_kv_cache_dsa_context:: ~llama_kv_cache_dsa_context() = default;
bool llama_kv_cache_dsa_context::next() {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
ctx_base->next();
ctx_ik ->next();
if (++i_next >= ubatches.size()) {
return false;
}
return true;
}
bool llama_kv_cache_dsa_context::apply() {
assert(!llama_memory_status_is_fail(status));
bool res = true;
res = res & ctx_base->apply();
res = res & ctx_ik ->apply();
return res;
}
llama_memory_status llama_kv_cache_dsa_context::get_status() const {
return status;
}
const llama_ubatch & llama_kv_cache_dsa_context::get_ubatch() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return ubatches[i_next];
}
const llama_kv_cache_context * llama_kv_cache_dsa_context::get_base() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return static_cast<const llama_kv_cache_context *>(ctx_base.get());
}
const llama_ik_cache_context * llama_kv_cache_dsa_context::get_ik() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return static_cast<const llama_ik_cache_context *>(ctx_ik.get());
}

137
src/llama-kv-cache-dsa.h Normal file
View File

@ -0,0 +1,137 @@
#pragma once
#include "llama-kv-cache.h"
#include "llama-ik-cache.h"
#include <vector>
//
// llama_kv_cache_dsa
//
// utilizes two KV cache instances: llama_kv_cache and llama_ik_cache
// the first instance is for caching key tensors of the model,
// the second instance is for caching lightning indexer key tensors
class llama_kv_cache_dsa : public llama_memory_i {
public:
llama_kv_cache_dsa(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
bool unified,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse);
~llama_kv_cache_dsa() = default;
//
// llama_memory_i
//
llama_memory_context_ptr init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
bool embd_all) override;
llama_memory_context_ptr init_full() override;
llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
bool get_can_shift() const override;
void clear(bool data) override;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
//
// llama_kv_cache_dsa specific API
//
llama_kv_cache * get_base() const;
llama_ik_cache * get_ik () const;
private:
const uint32_t n_stream = 1;
std::unique_ptr<llama_kv_cache> kv_base;
std::unique_ptr<llama_ik_cache> kv_ik;
};
class llama_kv_cache_dsa_context : public llama_memory_context_i {
public:
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
// used for errors
llama_kv_cache_dsa_context(llama_memory_status status);
// used to create a full-cache context
llama_kv_cache_dsa_context(
llama_kv_cache_dsa * kv);
// used to create an update context
llama_kv_cache_dsa_context(
llama_kv_cache_dsa * kv,
llama_context * lctx,
bool optimize);
// used to create a batch processing context from a batch
llama_kv_cache_dsa_context(
llama_kv_cache_dsa * kv,
slot_info_vec_t sinfos_base,
slot_info_vec_t sinfos_ik,
std::vector<llama_ubatch> ubatches);
virtual ~llama_kv_cache_dsa_context();
//
// llama_memory_context_i
//
bool next() override;
bool apply() override;
llama_memory_status get_status() const override;
const llama_ubatch & get_ubatch() const override;
//
// llama_kv_cache_dsa_context specific API
//
const llama_kv_cache_context * get_base() const;
const llama_ik_cache_context * get_ik() const;
private:
//llama_kv_cache_dsa * kv;
// the index of the next ubatch to process
size_t i_next = 0;
std::vector<llama_ubatch> ubatches;
const llama_memory_context_ptr ctx_base;
const llama_memory_context_ptr ctx_ik;
const llama_memory_status status;
};

View File

@ -8,6 +8,7 @@
#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-kv-cache-dsa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-hybrid-iswa.h"
#include "llama-memory-recurrent.h"
@ -144,6 +145,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_310B_A15B: return "310B.A15B";
case LLM_TYPE_355B_A32B: return "355B.A32B";
case LLM_TYPE_397B_A17B: return "397B.A17B";
case LLM_TYPE_685B_A37B: return "685B.A37B";
case LLM_TYPE_744B_A40B: return "744B.A40B";
case LLM_TYPE_E2B: return "E2B";
case LLM_TYPE_E4B: return "E4B";
@ -1641,6 +1643,56 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_DEEPSEEK32:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
hparams.f_norm_eps = 1e-6; // eps for layer norm
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
// MoE parameters
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
// deepseek MLA parameters
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
// DSA parameters
ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k);
// Expert gating function
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// cancel the factor from the convert script
hparams.rope_yarn_log_mul /= 0.1f;
}
// NextN/MTP parameters
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
// TODO: when MTP is implemented, this should probably be updated if needed
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
switch (hparams.n_layer) {
case 61: type = LLM_TYPE_685B_A37B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_PLM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -5032,6 +5084,108 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
case LLM_ARCH_DEEPSEEK32:
{
const bool is_mla = hparams.is_mla();
if (!is_mla) {
throw std::runtime_error("DEEPSEEK32 architecture requires MLA");
}
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv;
const int64_t n_ff_exp = hparams.n_ff_exp;
const int64_t n_expert_shared = hparams.n_expert_shared;
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
// try to load output.weight, if not found, use token_embd (tied embeddings)
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (!output) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) {
int flags = 0;
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
// skip all tensors in the NextN layers
// TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
}
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags);
layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags);
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags);
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags);
// note: only old legacy GGUF files will have the unsplit wkv_b tensor in
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
// DSA indexer
layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags);
layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags);
layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags);
layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags);
layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
if (i < (int) hparams.n_layer_dense_lead) {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
} else {
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
if (n_expert == 0) {
throw std::runtime_error("n_expert must be > 0");
}
if (n_expert_used == 0) {
throw std::runtime_error("n_expert_used must be > 0");
}
// MoE branch
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
// Shared expert branch
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, flags);
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
}
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
// Optional tensors
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
}
}
} break;
case LLM_ARCH_PLM:
{
const int64_t n_embd_head_qk_rope = hparams.n_rot();
@ -7975,7 +8129,7 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
}
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_DEEPSEEK32 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
@ -8162,6 +8316,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
{
res = nullptr;
} break;
case LLM_ARCH_DEEPSEEK32:
{
res = new llama_kv_cache_dsa(
*this,
params.type_k,
params.type_v,
!cparams.flash_attn,
cparams.offload_kqv,
cparams.kv_unified,
cparams.n_ctx_seq,
cparams.n_seq_max,
1,
hparams.n_swa,
hparams.swa_type,
nullptr,
nullptr);
} break;
// Models that need standard caching should rely on recurrent/hybrid
// checks
default:
@ -8558,6 +8729,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_deepseek2>(*this, params);
} break;
case LLM_ARCH_DEEPSEEK32:
{
llm = std::make_unique<llm_build_deepseek32>(*this, params);
} break;
case LLM_ARCH_CHATGLM:
{
llm = std::make_unique<llm_build_chatglm>(*this, params);
@ -8954,6 +9129,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DEEPSEEK:
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_DEEPSEEK2OCR:
case LLM_ARCH_DEEPSEEK32:
case LLM_ARCH_PLM:
case LLM_ARCH_CHATGLM:
case LLM_ARCH_GRANITE:

View File

@ -135,6 +135,7 @@ enum llm_type {
LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
LLM_TYPE_355B_A32B, // GLM-4.5
LLM_TYPE_397B_A17B, // Qwen3.5
LLM_TYPE_685B_A37B, // DeepSeek V3.2
LLM_TYPE_744B_A40B, // GLM-5
LLM_TYPE_E2B,
LLM_TYPE_E4B,

351
src/models/deepseek32.cpp Normal file
View File

@ -0,0 +1,351 @@
#include "models.h"
#include "llama-kv-cache.h"
#include "llama-ik-cache.h"
llm_build_deepseek32::llm_build_deepseek32(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const bool is_mla = hparams.is_mla();
GGML_ASSERT(is_mla);
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
GGML_UNUSED(n_embd_head_v);
const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
const int64_t n_indexer_head = hparams.indexer_n_head;
const int64_t n_embd_indexer_head = hparams.indexer_head_size;
const int64_t n_embd_indexer_head_rope = hparams.n_rot();
const int64_t n_embd_indexer_head_nope = n_embd_indexer_head - n_embd_indexer_head_rope;
const uint32_t n_indexer_top_k = hparams.indexer_top_k;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation.
// And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
GGML_ASSERT(ext_factor >= 0.0f);
const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
// use the original attn_factor to pre-scale the kq_scale
const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
ggml_tensor * cur;
ggml_tensor * inpL;
// {n_embd, n_tokens}
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
std::pair<llm_graph_input_attn_k*, llm_graph_input_attn_ik*> inp_attn_dsa = build_attn_inp_k_dsa();
auto * inp_attn_k = inp_attn_dsa.first;
auto * inp_attn_ik = inp_attn_dsa.second;
ggml_tensor * inp_out_ids = build_inp_out_ids();
int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers;
for (int il = 0; il < effective_n_layers; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self_attention
{
ggml_tensor * qr = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(qr, "qr", il);
qr = build_norm(qr, model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
cb(qr, "qr", il);
ggml_tensor * top_k = nullptr;
// lightning indexer
{
ggml_tensor * indexer_q = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_q_b, qr);
cb(indexer_q, "indexer_q", il);
// split into {n_embd_indexer_head_rope, n_indexer_head, n_tokens}
ggml_tensor * indexer_q_pe =
ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_rope, n_indexer_head, n_tokens,
ggml_row_size(indexer_q->type, n_embd_indexer_head),
ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head, 0);
cb(indexer_q_pe, "indexer_q_pe", il);
// and {n_embd_indexer_head_nope, n_indexer_head, n_tokens}
ggml_tensor * indexer_q_nope =
ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_nope, n_indexer_head, n_tokens,
ggml_row_size(indexer_q->type, n_embd_indexer_head),
ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head,
ggml_row_size(indexer_q->type, n_embd_indexer_head_nope));
cb(indexer_q_nope, "indexer_q_nope", il);
indexer_q_pe = ggml_rope_ext(ctx0, indexer_q_pe, inp_pos, nullptr, n_rot,
LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(indexer_q_pe, "indexer_q_pe", il);
// {n_embd_indexer_head_rope + n_embd_indexer_head_nope, n_head, n_tokens}
indexer_q = ggml_concat(ctx0, indexer_q_pe, indexer_q_nope, 0);
cb(indexer_q, "indexer_q", il);
ggml_tensor * indexer_k = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_k, cur);
cb(indexer_k, "indexer_k", il);
indexer_k = build_norm(indexer_k, model.layers[il].indexer_k_norm, model.layers[il].indexer_k_norm_b, LLM_NORM, il);
cb(indexer_k, "indexer_k", il);
// split into {n_embd_indexer_head_rope, 1, n_tokens}
ggml_tensor * indexer_k_pe =
ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_rope, 1, n_tokens,
ggml_row_size(indexer_k->type, n_embd_indexer_head),
ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1, 0);
cb(indexer_k_pe, "indexer_k_pe", il);
// and {n_embd_indexer_head_nope, 1, n_tokens}
ggml_tensor * indexer_k_nope =
ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_nope, 1, n_tokens,
ggml_row_size(indexer_k->type, n_embd_indexer_head),
ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1,
ggml_row_size(indexer_k->type, n_embd_indexer_head_nope));
cb(indexer_k_nope, "indexer_k_nope", il);
indexer_k_pe = ggml_rope_ext(ctx0, indexer_k_pe, inp_pos, nullptr, n_rot,
LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(indexer_k_pe, "indexer_k_pe", il);
// {n_embd_indexer_head_rope + n_embd_indexer_head_nope, 1, n_tokens}
indexer_k = ggml_concat(ctx0, indexer_k_pe, indexer_k_nope, 0);
cb(indexer_k, "indexer_k", il);
// perform Hadamard transform on indexer q and k
indexer_q = ggml_hadamard(ctx0, indexer_q, n_embd_indexer_head);
cb(indexer_q, "indexer_q", il);
indexer_k = ggml_hadamard(ctx0, indexer_k, n_embd_indexer_head);
cb(indexer_k, "indexer_k", il);
// store indexer keys to KV cache
const auto * mctx_cur = inp_attn_ik->mctx;
const auto & k_idxs = inp_attn_ik->get_k_idxs();
ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, indexer_k, k_idxs, il));
// prepare indexer weights
ggml_tensor * indexer_weights = ggml_mul_mat(ctx0, model.layers[il].indexer_proj, cur);
cb(indexer_weights, "indexer_weights", il);
indexer_weights = ggml_scale(ctx0, indexer_weights, 1.0f / sqrtf(float(n_indexer_head)));
cb(indexer_weights, "indexer_weights", il);
// get cached indexer keys
indexer_k = mctx_cur->get_k(ctx0, il);
// split the batch into streams if needed
const auto n_stream = indexer_k->ne[3];
indexer_q = ggml_view_4d(ctx0, indexer_q, indexer_q->ne[0], indexer_q->ne[1], indexer_q->ne[2]/n_stream, n_stream, indexer_q->nb[1], indexer_q->nb[2], indexer_q->nb[3]/n_stream, 0);
indexer_weights = ggml_view_4d(ctx0, indexer_weights, indexer_weights->ne[0], indexer_weights->ne[1]/n_stream, indexer_weights->ne[2], n_stream, indexer_weights->nb[1], indexer_weights->nb[2]/n_stream, indexer_weights->nb[3]/n_stream, 0);
// calculate indexer kq
indexer_q = ggml_permute(ctx0, indexer_q, 0, 2, 1, 3);
cb(indexer_q, "indexer_q", il);
indexer_k = ggml_permute(ctx0, indexer_k, 0, 2, 1, 3);
cb(indexer_k, "indexer_k", il);
ggml_tensor * indexer_kq = ggml_mul_mat(ctx0, indexer_k, indexer_q);
cb(indexer_kq, "indexer_kq", il);
// ReLU requires contiguous tensors
indexer_kq = ggml_cont(ctx0, ggml_permute(ctx0, indexer_kq, 2, 1, 0, 3));
cb(indexer_kq, "indexer_kq", il);
// apply ReLU
ggml_tensor * indexer_score = ggml_relu(ctx0, indexer_kq);
cb(indexer_score, "indexer_score", il);
// multiply scores by indexer weights
indexer_score = ggml_mul(ctx0, indexer_score, indexer_weights);
cb(indexer_score, "indexer_score", il);
// sum by q n_indexer_head dimension
indexer_score = ggml_sum_rows(ctx0, indexer_score);
cb(indexer_score, "indexer_score", il);
indexer_score = ggml_permute(ctx0, indexer_score, 2, 1, 0, 3);
cb(indexer_score, "indexer_score", il);
indexer_score = ggml_cont(ctx0, indexer_score);
cb(indexer_score, "indexer_score", il);
// TODO maybe pre-scale indexer weights, so we won't have to do it here
indexer_score = ggml_scale(ctx0, indexer_score, 1.0f / sqrtf(float(n_embd_indexer_head)));
cb(indexer_score, "indexer_score", il);
// mask indexer scores
ggml_tensor * indexer_kq_mask = inp_attn_ik->get_kq_mask();
indexer_score = ggml_add(ctx0, indexer_score, indexer_kq_mask);
cb(indexer_score, "indexer_score", il);
// get indices of top k indexer scores
uint32_t n_top_k = indexer_score->ne[0] < n_indexer_top_k ? indexer_score->ne[0] : n_indexer_top_k;
top_k = ggml_cont(ctx0, ggml_top_k(ctx0, indexer_score, n_top_k));
cb(top_k, "top_k", il);
}
ggml_tensor * q = ggml_mul_mat(ctx0, model.layers[il].wq_b, qr);
cb(q, "q", il);
// split into {n_embd_head_qk_nope, n_head, n_tokens}
ggml_tensor * q_nope =
ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
ggml_row_size(q->type, n_embd_head_k) * n_head, 0);
cb(q_nope, "q_nope", il);
// and {n_embd_head_qk_rope, n_head, n_tokens}
ggml_tensor * q_pe = ggml_view_3d(
ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
ggml_row_size(q->type, n_embd_head_k) * n_head, ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il);
ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_cmpr_pe, "kv_cmpr_pe", il);
// split into {kv_lora_rank, n_tokens}
ggml_tensor * kv_cmpr =
ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
cb(kv_cmpr, "kv_cmpr", il);
// and {n_embd_head_qk_rope, 1, n_tokens}
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
cb(k_pe, "k_pe", il);
q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(q_pe, "q_pe", il);
k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(k_pe, "k_pe", il);
kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
cb(kv_cmpr, "kv_cmpr", il);
// MLA attention
{
// {n_embd_head_qk_nope, n_tokens, n_head}
q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
cb(q_nope, "q_nope_perm", il);
// {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
cb(q_nope_absorbed, "q_nope_absorbed", il);
// {kv_lora_rank, n_head, n_tokens}
q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
// {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
// note: rope must go first for in-place context shifting in build_rope_shift()
ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
cb(Qcur, "Qcur", il);
kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
cb(kv_cmpr, "kv_cmpr_reshape", il);
// {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
cb(Kcur, "Kcur", il);
// {kv_lora_rank, 1, n_tokens}
ggml_tensor * Vcur = kv_cmpr;
cb(Vcur, "Vcur", il);
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
cur = build_attn(inp_attn_k,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, top_k, kq_scale, il);
}
}
if (il == effective_n_layers - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
if ((uint32_t) il < hparams.n_layer_dense_lead) {
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE branch
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il,
nullptr,
model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);
// FFN shared expert
{
ggml_tensor * ffn_shexp =
build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
}
}
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@ -186,12 +186,16 @@ struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_deepseek : public llm_graph_context {
llm_build_deepseek(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_deepseek2 : public llm_graph_context {
llm_build_deepseek2(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_deepseek : public llm_graph_context {
llm_build_deepseek(const llama_model & model, const llm_graph_params & params);
struct llm_build_deepseek32 : public llm_graph_context {
llm_build_deepseek32(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_dots1 : public llm_graph_context {

View File

@ -6655,6 +6655,97 @@ struct test_diag : public test_case {
}
};
// GGML_OP_HADAMARD
struct test_hadamard : public test_case {
const ggml_type type_a;
const std::array<int64_t, 4> ne_a;
int nh;
std::string vars() override {
return VARS_TO_STR3(type_a, ne_a, nh);
}
test_hadamard(ggml_type type_a = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {128, 10, 10, 10},
int nh = 128)
: type_a(type_a), ne_a(ne_a), nh(nh) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type_a, 4, ne_a.data());
ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_hadamard(ctx, a, nh);
ggml_set_name(out, "out");
return out;
}
void initialize_tensors(ggml_context * ctx) override {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
init_tensor_uniform(t, -1.0f, 1.0f);
}
}
};
// GGML_OP_SCATTER
struct test_scatter : public test_case {
const ggml_type type_a;
const ggml_type type_ids;
const std::array<int64_t, 4> ne_a;
const std::array<int64_t, 4> ne_ids;
float c;
bool inplace;
std::string vars() override {
return VARS_TO_STR6(type_a, type_ids, ne_a, ne_ids, c, inplace);
}
test_scatter(ggml_type type_a = GGML_TYPE_F32,
ggml_type type_ids = GGML_TYPE_I32,
std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
std::array<int64_t, 4> ne_ids = {3, 10, 10, 10},
float c = 2.0f,
bool inplace = false)
: type_a(type_a), type_ids(type_ids), ne_a(ne_a), ne_ids(ne_ids), c(c), inplace(inplace) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type_a, 4, ne_a.data());
ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * ids = ggml_new_tensor(ctx, type_ids, 4, ne_ids.data());
ggml_set_param(ids);
ggml_set_name(ids, "ids");
ggml_tensor * out;
if (inplace) {
out = ggml_scatter_inplace(ctx, a, ids, c);
} else {
out = ggml_scatter(ctx, a, ids, c);
}
ggml_set_name(out, "out");
return out;
}
void initialize_tensors(ggml_context * ctx) override {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->type == GGML_TYPE_I32) {
// ids
const int num_pos_ids = ggml_nelements(t);
std::vector<int32_t> data(num_pos_ids);
for (int i = 0; i < num_pos_ids; i++) {
data[i] = rand() % ne_a[0];
}
ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int));
} else {
init_tensor_uniform(t);
}
}
}
};
// Deserializable generic test case
struct input_tensor {
ggml_type type;
@ -8717,6 +8808,19 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_falcon(2));
#endif
// hadamard
test_cases.emplace_back(new test_hadamard());
// scatter
test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {10, 1, 1, 1}, {3, 1, 1, 1}, 0.0f, true));
test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {10, 1, 1, 1}, {3, 1, 1, 1}, 0.0f, false));
test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {10, 10, 10, 10}, {3, 10, 10, 10}, 0.0f, true));
test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {10, 10, 10, 10}, {3, 10, 10, 10}, 0.0f, false));
test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {10, 1, 1, 1}, {3, 1, 1, 1}, 0.0f, true));
test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {10, 1, 1, 1}, {3, 1, 1, 1}, 0.0f, false));
test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {10, 10, 10, 10}, {3, 10, 10, 10}, 0.0f, true));
test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {10, 10, 10, 10}, {3, 10, 10, 10}, 0.0f, false));
return test_cases;
}
#ifdef _MSC_VER
@ -8992,6 +9096,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 128, 1024, 1)); // 4h PP-1024
test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 32, 128, 64, 1, 1, false, true)); // KDA PP-64
// hadamard
test_cases.emplace_back(new test_hadamard());
// scatter
test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {65536, 1, 1, 1}, {2048, 1, 1, 1}, 0.0f, true));
test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {65536, 1, 1, 1}, {2048, 1, 1, 1}, 0.0f, false));
test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {65536, 1, 1, 1}, {2048, 1, 1, 1}, 0.0f, true));
test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {65536, 1, 1, 1}, {2048, 1, 1, 1}, 0.0f, false));
return test_cases;
}