model : Qwen3 Next (#16095)

* Qwen3 Next - cleaned up version

* Whitespaces and stuff

* Correct minor errors

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Misc. fixes.

* Clean up code, add missing hybrid qualifier

* Did someone transpose the SOLVE_TRI result matrix? Perhaps...

* Whitespace

* Proper tensors for cb calls

* Use llama-graph.h vertical alignment

* BROKEN: chunking

* Set new tensors as inputs.

* Proper chunk logic

* It's the circle of life...

* More shenanigans for n_seq > 1

* Nail in the coffin?

* Fix Windows build

* Eh, one fails on Windows, the other fails on Mac... just use general capture.

* quant : cleanup

* model : cleanup

* qwen3 : cleanup

* cont : cleanup

* cont : cleanup

* ggml : revert change

* qwen3 : cleanup

* cont : cleanup

* Readd cmath

* qwen3 : fix typo

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Usual suspects

* fix my bad suggestion

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Author: Piotr Wilkin (ilintar), committed by GitHub, 2025-11-28 12:02:56 +01:00
Commit: ff55414c42 (parent: 73955f7d2a)
16 changed files with 1345 additions and 19 deletions

@@ -4183,6 +4183,36 @@ class Qwen3MoeModel(Qwen2MoeModel):
         super().set_vocab()
 
 
+@ModelBase.register("Qwen3NextForCausalLM")
+class Qwen3NextModel(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3NEXT
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_ssm_conv_kernel(self.hparams["linear_conv_kernel_dim"])
+        self.gguf_writer.add_ssm_state_size(self.hparams["linear_key_head_dim"])
+        self.gguf_writer.add_ssm_group_count(self.hparams["linear_num_key_heads"])
+        self.gguf_writer.add_ssm_time_step_rank(self.hparams["linear_num_value_heads"])
+        self.gguf_writer.add_ssm_inner_size(self.hparams["linear_value_head_dim"] * self.hparams["linear_num_value_heads"])
+
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("mtp"):
+            return []  # ignore MTP layers for now
+
+        if name.endswith(".A_log"):
+            data_torch = -torch.exp(data_torch)
+        elif name.endswith(".dt_bias"):
+            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+        elif "conv1d" in name:
+            data_torch = data_torch.squeeze()
+        elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
+            data_torch = data_torch + 1
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("RND1")
 class RND1Model(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.RND1
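For reference, a minimal sketch (not part of the commit) of how the GGUF SSM keys above are derived from the HF hparams; the numeric values are illustrative assumptions, not taken from a real config:

# Illustrative only: hypothetical hparams for a Qwen3-Next-style config.
hparams = {
    "linear_conv_kernel_dim": 4,
    "linear_key_head_dim": 128,
    "linear_num_key_heads": 16,
    "linear_value_head_dim": 128,
    "linear_num_value_heads": 32,
    "head_dim": 256,
    "partial_rotary_factor": 0.25,
}

ssm_conv_kernel = hparams["linear_conv_kernel_dim"]                                     # 4
ssm_state_size  = hparams["linear_key_head_dim"]                                        # 128
ssm_group_count = hparams["linear_num_key_heads"]                                       # 16
ssm_dt_rank     = hparams["linear_num_value_heads"]                                     # 32
ssm_inner_size  = hparams["linear_value_head_dim"] * hparams["linear_num_value_heads"]  # 4096
rope_dims       = int(hparams["head_dim"] * hparams["partial_rotary_factor"])           # 64

print(ssm_conv_kernel, ssm_state_size, ssm_group_count, ssm_dt_rank, ssm_inner_size, rope_dims)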

@@ -4,6 +4,11 @@ set -e
 # First try command line argument, then environment variable, then file
 CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
+MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}"
+
+if [ -z "$MODEL_TESTING_PROMPT" ]; then
+    MODEL_TESTING_PROMPT="Hello, my name is"
+fi
 
 # Final check if we have a model path
 if [ -z "$CONVERTED_MODEL" ]; then
@@ -14,7 +19,8 @@ if [ -z "$CONVERTED_MODEL" ]; then
 fi
 
 echo $CONVERTED_MODEL
+echo $MODEL_TESTING_PROMPT
 
 cmake --build ../../build --target llama-logits -j8
-../../build/bin/llama-logits -m "$CONVERTED_MODEL" "Hello, my name is"
+../../build/bin/llama-logits -m "$CONVERTED_MODEL" "$MODEL_TESTING_PROMPT"

@@ -184,8 +184,12 @@ model_name = os.path.basename(model_path)
 # of using AutoModelForCausalLM.
 print(f"Model class: {model.__class__.__name__}")
 
-prompt = "Hello, my name is"
-input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+device = next(model.parameters()).device
+if os.getenv("MODEL_TESTING_PROMPT"):
+    prompt = os.getenv("MODEL_TESTING_PROMPT")
+else:
+    prompt = "Hello, my name is"
+input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
 
 print(f"Input tokens: {input_ids}")
 print(f"Input text: {repr(prompt)}")

@@ -9766,7 +9766,8 @@ static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params
                 }
 
                 const float diag = A_batch[i00 * n + i00];
-                GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix");
+                assert(diag != 0.0f && "Zero diagonal in triangular matrix");
                 X_batch[i00 * k + i01] = (B_batch[i00 * k + i01] - sum) / diag;
             }
         }
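The kernel above performs a triangular solve by substitution. A rough NumPy sketch of the same recurrence (assuming a lower-triangular A and forward substitution; the actual kernel's batching and memory layout differ):

import numpy as np

def solve_tri_lower(A, B):
    # Solve A @ X = B for X, with A lower-triangular (n x n) and B (n x k),
    # by forward substitution: each row uses only already-solved rows.
    n, k = B.shape
    X = np.zeros_like(B)
    for i in range(n):
        s = A[i, :i] @ X[:i, :]  # contribution of already-solved rows, shape (k,)
        assert A[i, i] != 0.0, "Zero diagonal in triangular matrix"
        X[i, :] = (B[i, :] - s) / A[i, i]
    return X

A = np.tril(np.random.rand(4, 4)) + np.eye(4)
B = np.random.rand(4, 3)
print(np.allclose(A @ solve_tri_lower(A, B), B))  # True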

@@ -366,6 +366,7 @@ class MODEL_ARCH(IntEnum):
     QWEN2VL        = auto()
     QWEN3          = auto()
     QWEN3MOE       = auto()
+    QWEN3NEXT      = auto()
     QWEN3VL        = auto()
     QWEN3VLMOE     = auto()
     PHI2           = auto()
@@ -531,6 +532,7 @@ class MODEL_TENSOR(IntEnum):
     SSM_D          = auto()
     SSM_NORM       = auto()
     SSM_OUT        = auto()
+    SSM_BETA_ALPHA = auto()  # qwen3next
     TIME_MIX_W0    = auto()
     TIME_MIX_W1    = auto()
     TIME_MIX_W2    = auto()
@@ -736,6 +738,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.QWEN2VL:        "qwen2vl",
    MODEL_ARCH.QWEN3:          "qwen3",
    MODEL_ARCH.QWEN3MOE:       "qwen3moe",
+   MODEL_ARCH.QWEN3NEXT:      "qwen3next",
    MODEL_ARCH.QWEN3VL:        "qwen3vl",
    MODEL_ARCH.QWEN3VLMOE:     "qwen3vlmoe",
    MODEL_ARCH.PHI2:           "phi2",
@@ -900,6 +903,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.SSM_D:          "blk.{bid}.ssm_d",
    MODEL_TENSOR.SSM_NORM:       "blk.{bid}.ssm_norm",
    MODEL_TENSOR.SSM_OUT:        "blk.{bid}.ssm_out",
+   MODEL_TENSOR.SSM_BETA_ALPHA: "blk.{bid}.ssm_ba",
    MODEL_TENSOR.TIME_MIX_W0:    "blk.{bid}.time_mix_w0",
    MODEL_TENSOR.TIME_MIX_W1:    "blk.{bid}.time_mix_w1",
    MODEL_TENSOR.TIME_MIX_W2:    "blk.{bid}.time_mix_w2",
@@ -1569,6 +1573,35 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.QWEN3NEXT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.ATTN_GATE,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_INP_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.SSM_A,
+        MODEL_TENSOR.SSM_CONV1D,
+        MODEL_TENSOR.SSM_DT,
+        MODEL_TENSOR.SSM_NORM,
+        MODEL_TENSOR.SSM_IN,
+        MODEL_TENSOR.SSM_BETA_ALPHA,
+        MODEL_TENSOR.SSM_OUT,
+    ],
     MODEL_ARCH.QWEN3VL: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,

@@ -676,6 +676,7 @@ class TensorNameMap:
             "backbone.layers.{bid}.mixer.in_proj",         # mamba
             "model.layers.{bid}.mamba.in_proj",            # jamba falcon-h1 granite-hybrid
             "model.layers.layers.{bid}.mixer.in_proj",     # plamo2
+            "model.layers.{bid}.linear_attn.in_proj_qkvz", # qwen3next
         ),

         MODEL_TENSOR.SSM_CONV1D: (
@@ -683,6 +684,7 @@ class TensorNameMap:
             "backbone.layers.{bid}.mixer.conv1d",          # mamba
             "model.layers.{bid}.mamba.conv1d",             # jamba falcon-h1 granite-hybrid
             "model.layers.layers.{bid}.mixer.conv1d",      # plamo2
+            "model.layers.{bid}.linear_attn.conv1d",       # qwen3next
         ),

         MODEL_TENSOR.SSM_X: (
@@ -697,6 +699,7 @@ class TensorNameMap:
             "backbone.layers.{bid}.mixer.dt_proj",         # mamba
             "model.layers.{bid}.mamba.dt_proj",            # jamba falcon-h1 granite-hybrid
             "model.layers.layers.{bid}.mixer.dt_proj",     # plamo2
+            "model.layers.{bid}.linear_attn.dt_proj",      # qwen3next
         ),

         MODEL_TENSOR.SSM_DT_NORM: (
@@ -709,6 +712,7 @@ class TensorNameMap:
             "backbone.layers.{bid}.mixer.A_log",           # mamba
             "model.layers.{bid}.mamba.A_log",              # jamba falcon-h1 granite-hybrid
             "model.layers.layers.{bid}.mixer.A_log",       # plamo2
+            "model.layers.{bid}.linear_attn.A_log",        # qwen3next
         ),

         MODEL_TENSOR.SSM_B_NORM: (
@@ -732,6 +736,7 @@ class TensorNameMap:
         MODEL_TENSOR.SSM_NORM: (
             "model.layers.{bid}.mamba.norm",               # falcon-h1 granite-hybrid
+            "model.layers.{bid}.linear_attn.norm",         # qwen3next
             "backbone.layers.{bid}.mixer.norm",            # mamba2
         ),

@@ -739,9 +744,14 @@ class TensorNameMap:
             "model.layers.{bid}.out_proj",                 # mamba-hf
             "backbone.layers.{bid}.mixer.out_proj",        # mamba
             "model.layers.{bid}.mamba.out_proj",           # jamba falcon-h1 granite-hybrid
+            "model.layers.{bid}.linear_attn.out_proj",     # qwen3next
             "model.layers.layers.{bid}.mixer.out_proj",    # plamo2
         ),

+        MODEL_TENSOR.SSM_BETA_ALPHA: (
+            "model.layers.{bid}.linear_attn.in_proj_ba",   # qwen3next
+        ),
+
         MODEL_TENSOR.TIME_MIX_W0: (
             "model.layers.{bid}.attention.w0",             # rwkv7
         ),
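To illustrate the effect of the new entries, a simplified sketch (not the gguf-py implementation) of how the qwen3next checkpoint names end up as GGUF tensor names, combining the mappings above with the TENSOR_NAMES table:

# Simplified illustration: HF name -> GGUF base name for the qwen3next linear-attention block.
hf_to_gguf = {
    "model.layers.{bid}.linear_attn.in_proj_qkvz": "blk.{bid}.ssm_in",
    "model.layers.{bid}.linear_attn.in_proj_ba":   "blk.{bid}.ssm_ba",
    "model.layers.{bid}.linear_attn.conv1d":       "blk.{bid}.ssm_conv1d",
    "model.layers.{bid}.linear_attn.dt_proj":      "blk.{bid}.ssm_dt",
    "model.layers.{bid}.linear_attn.A_log":        "blk.{bid}.ssm_a",
    "model.layers.{bid}.linear_attn.norm":         "blk.{bid}.ssm_norm",
    "model.layers.{bid}.linear_attn.out_proj":     "blk.{bid}.ssm_out",
}

def map_name(name: str, bid: int) -> str | None:
    for src, dst in hf_to_gguf.items():
        if name == src.format(bid=bid):
            return dst.format(bid=bid)
    return None

print(map_name("model.layers.3.linear_attn.in_proj_ba", 3))  # blk.3.ssm_ba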

@@ -114,6 +114,7 @@ add_library(llama
             models/qwen3vl.cpp
             models/qwen3vl-moe.cpp
             models/qwen3moe.cpp
+            models/qwen3next.cpp
             models/refact.cpp
             models/rnd1.cpp
             models/rwkv6-base.cpp

@@ -32,6 +32,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2VL,          "qwen2vl"          },
     { LLM_ARCH_QWEN3,            "qwen3"            },
     { LLM_ARCH_QWEN3MOE,         "qwen3moe"         },
+    { LLM_ARCH_QWEN3NEXT,        "qwen3next"        },
     { LLM_ARCH_QWEN3VL,          "qwen3vl"          },
     { LLM_ARCH_QWEN3VLMOE,       "qwen3vlmoe"       },
     { LLM_ARCH_PHI2,             "phi2"             },
@@ -829,6 +830,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS,         "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_QWEN3NEXT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,          "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,         "output_norm" },
+            { LLM_TENSOR_OUTPUT,              "output" },
+            { LLM_TENSOR_ATTN_NORM,           "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM,      "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q,              "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,         "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,              "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,         "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,              "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,            "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,            "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,        "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,       "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,       "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,         "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP,  "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,      "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,      "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,        "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_SSM_A,               "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_CONV1D,          "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT,              "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_BETA_ALPHA,      "blk.%d.ssm_ba" },
+            { LLM_TENSOR_SSM_IN,              "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_NORM,            "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT,             "blk.%d.ssm_out" },
+        },
+    },
     {
         LLM_ARCH_QWEN3VL,
         {
@@ -2556,6 +2589,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_X,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_DT,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_OUT,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W1,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W2,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_A1,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2754,6 +2788,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_QWEN3NEXT:
             return true;
         default:
             return false;

@@ -36,6 +36,7 @@ enum llm_arch {
    LLM_ARCH_QWEN2VL,
    LLM_ARCH_QWEN3,
    LLM_ARCH_QWEN3MOE,
+   LLM_ARCH_QWEN3NEXT,
    LLM_ARCH_QWEN3VL,
    LLM_ARCH_QWEN3VLMOE,
    LLM_ARCH_PHI2,
@@ -381,6 +382,7 @@ enum llm_tensor {
    LLM_TENSOR_SSM_D,
    LLM_TENSOR_SSM_NORM,
    LLM_TENSOR_SSM_OUT,
+   LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
    LLM_TENSOR_TIME_MIX_W0,
    LLM_TENSOR_TIME_MIX_W1,
    LLM_TENSOR_TIME_MIX_W2,

@@ -1,5 +1,6 @@
 #include "llama-context.h"
 
+#include "llama-arch.h"
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-io.h"
@@ -1386,6 +1387,9 @@ void llama_context::output_reorder() {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
+    if (model.arch == LLM_ARCH_QWEN3NEXT) {
+        return std::max<uint32_t>(8192u, 32u*model.n_tensors());
+    }
     return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }

@@ -6,7 +6,7 @@
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
-#define LLAMA_MAX_EXPERTS 384 // Kimi-K2
+#define LLAMA_MAX_EXPERTS 512 // Qwen3 Next
 
 enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,

@@ -2,7 +2,6 @@
 
 #include "llama-impl.h"
 #include "llama-mmap.h"
-#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
 
@@ -2225,6 +2224,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp,   false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
+
+                // Load linear attention (gated delta net) parameters
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
+
+                // Mark recurrent layers (linear attention layers)
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
+                }
+
+                switch (hparams.n_layer) {
+                    case 80: type = LLM_TYPE_80B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
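With the hard-coded interval of 4 above, three linear-attention layers alternate with one full-attention layer. A quick sketch of the resulting pattern (assuming full_attention_interval = 4, as the TODO notes):

# Layer pattern implied by `(i + 1) % 4 != 0`: every 4th layer is full attention,
# the rest are recurrent (gated delta net).
n_layer = 80  # the layer count handled by the switch above

recurrent = [(i + 1) % 4 != 0 for i in range(n_layer)]
print(recurrent[:8])   # [True, True, True, False, True, True, True, False]
print(sum(recurrent))  # 60 recurrent layers -> 20 full-attention layers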
@@ -6415,6 +6437,74 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
                 } break;
+            case LLM_ARCH_QWEN3NEXT:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                    }
+
+                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                    // Calculate dimensions from hyperparameters
+                    const int64_t head_k_dim = hparams.ssm_d_state;
+                    const int64_t head_v_dim = hparams.ssm_d_state;
+                    const int64_t n_k_heads  = hparams.ssm_n_group;
+                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
+                    const int64_t key_dim    = head_k_dim * n_k_heads;
+                    const int64_t value_dim  = head_v_dim * n_v_heads;
+                    const int64_t conv_dim   = key_dim * 2 + value_dim;
+
+                    // Calculate projection sizes
+                    const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
+                    const int64_t ba_dim   = n_v_heads * 2;
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+                        if (!hparams.is_recurrent(i)) {
+                            // Attention layers
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), { n_embd, n_embd_k_gqa }, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), { n_embd, n_embd_v_gqa }, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+                            // Q/K normalization for attention layers
+                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+                        } else {
+                            // Linear attention (gated delta net) specific tensors
+                            // Create tensors with calculated dimensions
+                            layer.ssm_in         = create_tensor(tn(LLM_TENSOR_SSM_IN,         "weight", i), { n_embd, qkvz_dim }, 0);
+                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A,                    i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
+                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
+                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
+                        }
+
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+
+                        // Shared experts
+                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
+                        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
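For orientation, the projection sizes computed above, evaluated for an illustrative set of SSM hparams (the numbers are assumptions for the sketch, not read from an actual GGUF):

# Illustrative only: how the derived dimensions above follow from the SSM hparams.
ssm_d_state = 128   # head_k_dim == head_v_dim
ssm_n_group = 16    # number of key heads
ssm_dt_rank = 32    # number of value heads

key_dim   = ssm_d_state * ssm_n_group     # 2048
value_dim = ssm_d_state * ssm_dt_rank     # 4096
conv_dim  = 2 * key_dim + value_dim       # 8192  -- q, k and v share the conv1d
qkvz_dim  = 2 * key_dim + 2 * value_dim   # 12288 -- q, k, v, z from in_proj_qkvz
ba_dim    = 2 * ssm_dt_rank               # 64    -- beta and alpha, one pair per value head

print(key_dim, value_dim, conv_dim, qkvz_dim, ba_dim)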
@@ -6685,6 +6775,7 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_FALCON_H1 ||
         arch == LLM_ARCH_PLAMO2 ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_QWEN3NEXT ||
         arch == LLM_ARCH_NEMOTRON_H) {
         LLAMA_LOG_INFO("%s: ssm_d_conv  = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
@@ -7427,6 +7518,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
             } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                llm = std::make_unique<llm_build_qwen3next>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -7653,6 +7748,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_COGVLM:
         case LLM_ARCH_PANGU_EMBED:
         case LLM_ARCH_AFMOE:
+        case LLM_ARCH_QWEN3NEXT:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:

@@ -113,6 +113,7 @@ enum llm_type {
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_230B_A10B, // Minimax M2
@@ -309,6 +310,9 @@ struct llama_layer {
     struct ggml_tensor * ssm_conv1d_b = nullptr;
     struct ggml_tensor * ssm_dt_b     = nullptr;
 
+    // qwen3next
+    struct ggml_tensor * ssm_beta_alpha = nullptr;
+
     // rwkv
     struct ggml_tensor * time_mix_w1 = nullptr;
     struct ggml_tensor * time_mix_w2 = nullptr;

@@ -681,7 +681,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
-        } else if (remapped_name != it.first) {
+        }
+
+        if (remapped_name != it.first) {
             ggml_set_name(it.second.tensor, remapped_name.c_str());
             LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
         }
@@ -726,13 +728,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
-        int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
+        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
         if (llama_model_has_encoder(&model)) {
-            // now n_attn_layer is the number of attention layers in the encoder
+            // now n_layer_attn is the number of attention layers in the encoder
             // for each decoder block, there are 2 attention layers
-            n_attn_layer += 2 * model.hparams.dec_n_layer;
+            n_layer_attn += 2 * model.hparams.dec_n_layer;
         }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
+
+        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
+        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
+
+        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
+
+        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
     }
 
     size_t total_size_org = 0;
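The adjusted assertion accounts for recurrent layers that carry no attention wv tensor. A rough sketch of the bookkeeping, under the assumption (mine, inferred from the new assert) that every layer reports a non-zero kv-head count:

# Rough sketch of the quantization-time layer accounting for a hybrid model,
# reusing the Qwen3 Next layer pattern from load_hparams.
n_layer = 80
n_layer_recr = sum((i + 1) % 4 != 0 for i in range(n_layer))  # 60 linear-attention layers
n_layer_attn = n_layer                                        # layers counted via kv heads (assumed)
pruned_attention_w = 0

expected_n_attention_wv = n_layer_attn - pruned_attention_w - n_layer_recr
print(expected_n_attention_wv)  # 20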

@@ -2,8 +2,9 @@
 
 #include "../llama-model.h"
 #include "../llama-graph.h"
-#include "../llama-memory-recurrent.h"
+
+// TODO: remove in follow-up PR - move to .cpp files
+#include "../llama-memory-recurrent.h"
 
 #include <cmath>
 
 struct llm_graph_context_mamba : public llm_graph_context {
@@ -421,7 +422,56 @@ struct llm_build_qwen3vl : public llm_graph_context {
 struct llm_build_qwen3vlmoe : public llm_graph_context {
     llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_qwen3next : public llm_graph_context_mamba {
+    llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
+
+private:
+    ggml_tensor * build_layer_attn(
+            llm_graph_input_attn_kv * inp_attn,
+            ggml_tensor * cur,
+            ggml_tensor * inp_pos,
+            int il);
+
+    ggml_tensor * build_layer_attn_linear(
+            llm_graph_input_rs * inp,
+            ggml_tensor * cur,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
+
+    ggml_tensor * build_layer_ffn(
+            ggml_tensor * cur,
+            int il);
+
+    ggml_tensor * build_delta_net_recurrent(
+            ggml_tensor * q,
+            ggml_tensor * k,
+            ggml_tensor * v,
+            ggml_tensor * g,
+            ggml_tensor * beta,
+            ggml_tensor * state,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
+
+    ggml_tensor * build_delta_net_chunking(
+            ggml_tensor * q,
+            ggml_tensor * k,
+            ggml_tensor * v,
+            ggml_tensor * g,
+            ggml_tensor * beta,
+            ggml_tensor * state,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
+
+    ggml_tensor * build_norm_gated(
+            ggml_tensor * input,
+            ggml_tensor * weights,
+            ggml_tensor * gate,
+            int layer);
+
+    const llama_model & model;
+};
+
 struct llm_build_qwen : public llm_graph_context {
     llm_build_qwen(const llama_model & model, const llm_graph_params & params);

src/models/qwen3next.cpp (new file, 1042 lines)

File diff suppressed because it is too large.