diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 6509ca7f37..10c38d3691 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -129,7 +129,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
     { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
     { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
-
+
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
     { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index bf7aece8de..64e7831cf6 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -74,8 +74,8 @@ uint32_t llama_hparams::n_embd_r() const {
     // TODO: maybe support other convolution strides than 1
     // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
     // Corresponds to Mamba's conv_states size
-
-    // check if the architecture is using d_ssm
+
+    // check if the architecture is using d_ssm
     return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state);
 }
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index af37a83438..4eb0cd5d13 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -4600,7 +4600,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     if (output == NULL) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                     }
-
+
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
                         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
@@ -14738,7 +14738,7 @@ struct llm_build_falcon_h1 : public llm_graph_context {
             inpSA = ggml_add(ctx0, cur, inpSA);
             cb(cur, "layer_out", il);
-            if (il == n_layer - 1 && inp_out_ids) {
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }
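
A note on the llama_hparams::n_embd_r() hunk above: the returned value sizes the per-sequence convolution state as (ssm_d_conv - 1) stored columns, each of width ssm_d_inner + 2*ssm_n_group*ssm_d_state; the first column is not stored because it is shifted out on every step. Below is a minimal standalone sketch of that arithmetic only, using hypothetical hyperparameter values that are not taken from any particular model (the struct and function names here are illustrative, not the repo's own):

#include <cstdint>
#include <cstdio>

// Hypothetical SSM hyperparameters, named after the fields used in the hunk.
// The numeric values are placeholders chosen only to illustrate the math.
struct ssm_hparams {
    uint32_t ssm_d_conv  = 4;    // convolution kernel width
    uint32_t ssm_d_inner = 4096; // inner (expanded) dimension
    uint32_t ssm_n_group = 1;    // number of B/C groups
    uint32_t ssm_d_state = 128;  // state dimension per group
};

// Same expression as in the diff: only ssm_d_conv - 1 columns are kept,
// since the oldest column is shifted out before each convolution step.
static uint32_t n_embd_r_sketch(const ssm_hparams & hp) {
    return (hp.ssm_d_conv > 0 ? hp.ssm_d_conv - 1 : 0)
         * (hp.ssm_d_inner + 2*hp.ssm_n_group*hp.ssm_d_state);
}

int main() {
    ssm_hparams hp;
    // (4 - 1) * (4096 + 2*1*128) = 3 * 4352 = 13056 elements per sequence
    std::printf("conv state size per sequence: %u\n", (unsigned) n_embd_r_sketch(hp));
    return 0;
}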