diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index ce1500d699..1a6eb52485 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6305,11 +6305,9 @@ class SmolLM3Model(LlamaModel):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        # if self.model.config.no_rope_layers is not None:
-        #     self.gguf_writer.add_array("smollm3.no_rope_layers", self.model.config.no_rope_layers, gguf.GGUFValueType.INT32)
-        no_rope_layers = self.hparams.get("no_rope_layers")
-        if no_rope_layers is not None:
-            self.gguf_writer.add_array("smollm3.no_rope_layers", no_rope_layers)
+        no_rope_layer_interval = self.hparams.get("no_rope_layer_interval")
+        if no_rope_layer_interval is not None:
+            self.gguf_writer.add_uint32("no_rope_layer_interval", no_rope_layer_interval)
 
 
 ###### CONVERSION LOGIC ######
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index b2bcb8b01a..3d5225dd47 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -186,6 +186,9 @@ struct llama_hparams {
     // dimension of the recurrent state embeddings
     uint32_t n_embd_v_s() const;
 
+    // for NoPE interval
+    uint32_t no_rope_layer_interval = 0;
+
     bool is_swa(uint32_t il) const;
 };
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index c0be7e6932..b2e7668311 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -443,6 +443,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         return;
     }
 
+    if (arch == LLM_ARCH_SMOLLM3) {
+        ml.get_key("no_rope_layer_interval", hparams.no_rope_layer_interval);
+    }
+
     ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
@@ -13740,17 +13744,7 @@ struct llm_build_smollm3 : public llm_graph_context {
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-        // collect layers for which RoPE is disabled (metadata key: "smollm3.no_rope_layers")
-        std::vector<int32_t> no_rope_layers;
-        if (arch == LLM_ARCH_SMOLLM3) {
-            const int kid = gguf_find_key(model.meta, "smollm3.no_rope_layers");
-            if (kid != -1) {
-                const uint32_t n = gguf_get_arr_n(model.meta, kid);
-                no_rope_layers.resize(n);
-                const int nb = gguf_get_arr_data(model.meta, kid, no_rope_layers.data(), n * sizeof(int32_t));
-                GGML_ASSERT(nb == int(n * sizeof(int32_t)));
-            }
-        }
+        const uint32_t interval = hparams.no_rope_layer_interval;
 
         // token embeddings
         ggml_tensor * inpL = build_inp_embd(model.tok_embd);
@@ -13793,7 +13787,7 @@
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-            if (std::find(no_rope_layers.begin(), no_rope_layers.end(), il) == no_rope_layers.end()) {
+            if (interval == 0 || il % interval != 0) {
                 ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
                 Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,