diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 79c0e437d3..55a76f8248 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3015,7 +3015,6 @@ struct ggml_tensor * ggml_mul_mat(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
-    GGML_ASSERT(ggml_can_mul_mat(a, b));
     GGML_ASSERT(!ggml_is_transposed(a));
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 897c58ac14..860e558595 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -451,7 +451,6 @@ void llama_model::load_arch(llama_model_loader & ml) {
 }
 
 void llama_model::load_hparams(llama_model_loader & ml) {
-    const gguf_context * ctx = ml.meta.get();
 
     // get metadata as string
@@ -465,7 +464,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         gguf_kv.emplace(name, value);
     }
 
-
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
@@ -586,7 +584,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     }
 
     // arch-specific KVs
-    LLAMA_LOG_INFO("Switching Arch\n");
     switch (arch) {
         case LLM_ARCH_LLAMA:
             {
@@ -1901,6 +1898,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
 void llama_model::load_vocab(llama_model_loader & ml) {
     const auto kv = LLM_KV(arch);
+
     vocab.load(ml, kv);
 }
@@ -2045,7 +2043,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
         ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-        LLAMA_LOG_INFO("Creating Tensor: %s\n", tn.str().c_str());
+
         if (!t_meta) {
             if (flags & TENSOR_NOT_REQUIRED) {
                 return nullptr;
             }
@@ -2120,6 +2118,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
 
         ggml_backend_buffer_type_t buft = nullptr;
+
         // check overrides
         if (ml.tensor_buft_overrides) {
             std::string tensor_name = tn.str();
@@ -2167,6 +2166,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 first_moved_to_buft = buft;
             }
         }
+
         ggml_context * ctx = ctx_for_buft(buft);
 
         // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
@@ -2624,14 +2624,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            case LLM_ARCH_NOMIC_BERT_MOE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                   tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
-
                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
-                   tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
-
                    if (arch == LLM_ARCH_BERT) {
                        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
+
                        cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
                        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -2639,11 +2636,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
                    }
 
+                   tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+                   tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
 
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
-
+                       layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
 
                        if (!layer.wqkv) {
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
@@ -2657,8 +2657,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        }
 
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                       layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
@@ -2668,7 +2667,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                        } else {
-
                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
@@ -2683,7 +2681,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                        layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
-
                    }
                } break;
            case LLM_ARCH_MODERN_BERT:
@@ -7549,7 +7546,6 @@ struct llm_build_modern_bert : public llm_graph_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
        const int64_t n_tokens = ubatch.n_tokens;
-       const int64_t n_ff = hparams.n_ff();
 
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 0b6c8c73e2..21420389ec 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1661,13 +1661,10 @@ private:
 void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     struct gguf_context * ctx = ml.meta.get();
 
-    LLAMA_LOG_INFO("Determining Vocab Type\n");
     // determine vocab type
     {
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
         ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
-        LLAMA_LOG_INFO("pre tokenizer model: %s\n", tokenizer_pre.c_str());
-        LLAMA_LOG_INFO("tokenizer model: %s\n", tokenizer_model.c_str());
 
         ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
diff --git a/src/llama.cpp b/src/llama.cpp
index 024e142453..34906cdb62 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -126,7 +126,6 @@ static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
         if (!model.load_tensors(ml)) {
             return -2;
         }
-
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
         return -1;