model : add OLMo3 support (#16015)

* Add HF to gguf conversion logic for Olmo3

* Add Olmo3 implementation

* Update rope comment

* Fix indentation

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Apply suggestion from @CISC

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
Shane A 2025-09-17 00:01:58 -07:00 committed by GitHub
parent d5fabe3682
commit 85286f3548
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 68 additions and 4 deletions

View File

@ -6009,9 +6009,34 @@ class SeedOssModel(TextModel):
@ModelBase.register("Olmo2ForCausalLM") @ModelBase.register("Olmo2ForCausalLM")
@ModelBase.register("Olmo3ForCausalLM")
class Olmo2Model(TextModel): class Olmo2Model(TextModel):
model_arch = gguf.MODEL_ARCH.OLMO2 model_arch = gguf.MODEL_ARCH.OLMO2
def set_gguf_parameters(self):
super().set_gguf_parameters()
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
if "sliding_window" in self.hparams:
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
sliding_window_pattern = []
if "layer_types" in self.hparams:
sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
else:
# Olmo2 does not use sliding window attention.
# Olmo3 defaults to using sliding window for all layers except every 4th.
for i in range(self.hparams["num_hidden_layers"]):
sliding_window_pattern.append((i + 1) % 4 != 0)
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
@ModelBase.register("OlmoeForCausalLM") @ModelBase.register("OlmoeForCausalLM")
class OlmoeModel(TextModel): class OlmoeModel(TextModel):

View File

@ -1350,6 +1350,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 16: type = LLM_TYPE_1B; break; case 16: type = LLM_TYPE_1B; break;
case 32: type = LLM_TYPE_7B; break; case 32: type = LLM_TYPE_7B; break;
@ -12233,6 +12241,7 @@ struct llm_build_olmo : public llm_graph_context {
} }
}; };
template <bool iswa>
struct llm_build_olmo2 : public llm_graph_context { struct llm_build_olmo2 : public llm_graph_context {
llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
@ -12248,7 +12257,14 @@ struct llm_build_olmo2 : public llm_graph_context {
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv(); using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;
if constexpr (iswa) {
inp_attn = build_attn_inp_kv_iswa();
} else {
inp_attn = build_attn_inp_kv();
}
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
@ -12281,6 +12297,24 @@ struct llm_build_olmo2 : public llm_graph_context {
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
const bool is_swa = hparams.is_swa(il);
if (is_swa) {
// For sliding window layers, Olmo3 use regular rope with no yarn rope scaling.
// This is achieved here by setting freq_scale and attn_factor to 1.
// We also set ext_factor to 0 to avoid a few unnecessary computations.
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
0.0, 1.0, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
0.0, 1.0, beta_fast, beta_slow
);
} else {
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr, ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@ -12292,6 +12326,7 @@ struct llm_build_olmo2 : public llm_graph_context {
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
}
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -19131,7 +19166,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
} break; } break;
case LLM_ARCH_OLMO2: case LLM_ARCH_OLMO2:
{ {
llm = std::make_unique<llm_build_olmo2>(*this, params); if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
} else {
llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
}
} break; } break;
case LLM_ARCH_OLMOE: case LLM_ARCH_OLMOE:
{ {