From 0f7daa9d1bce23b962d6c648dc4d7f71d338c8c6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 19:56:10 +0200 Subject: [PATCH] graph : move non-context related logic to llm_build_context ggml-ci --- src/llama-context.cpp | 620 +++++++++++------------------------------- src/llama-context.h | 118 ++++---- src/llama-graph.cpp | 12 +- src/llama-graph.h | 67 ++--- src/llama-model.cpp | 425 +++++++++++++++++++++++------ src/llama-model.h | 1 - 6 files changed, 579 insertions(+), 664 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8587f480fd..7ba86a2a7f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -71,26 +71,7 @@ void llama_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } -class llama_graph_input_pos : public llama_graph_input_i { -public: - llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} - virtual ~llama_graph_input_pos() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * pos = nullptr; // I32 [n_batch] - - const int64_t n_pos_per_token = 1; -}; - -void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) { - if (ubatch->pos && pos) { - const int64_t n_tokens = ubatch->n_tokens; - - ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); - } -} - +// I32 [n_batch, n_batch] class llama_graph_input_pos_bucket : public llama_graph_input_i { public: llama_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} @@ -98,19 +79,17 @@ public: void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] - const llama_hparams & hparams; }; void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { - if (pos_bucket) { + if (cur) { const int64_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing - int32_t * data = (int32_t *) pos_bucket->data; + int32_t * data = (int32_t *) cur->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -122,192 +101,6 @@ void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { } } -class llama_graph_input_out_ids : public llama_graph_input_i { -public: - llama_graph_input_out_ids( - const llama_hparams & hparams, - const llama_cparams & cparams, - int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} - virtual ~llama_graph_input_out_ids() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * out_ids; // I32 [n_outputs] - - const llama_hparams & hparams; - const llama_cparams & cparams; - - const int32_t n_outputs; -}; - -void llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(out_ids && "every model that can must skip unused outputs"); - - if (!out_ids) { - LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch->n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); - int32_t * data = (int32_t *) out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch->output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch->output[i]) { - data[n_outputs++] 
= i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } - } - } -} - -class llama_graph_input_mean : public llama_graph_input_i { -public: - llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {} - virtual ~llama_graph_input_mean() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * mean; // F32 [n_batch, n_batch] - - const llama_cparams & cparams; -}; - -void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) { - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(mean); - GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer)); - - float * data = (float *) mean->data; - memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch->n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } -} - -class llama_graph_input_cls : public llama_graph_input_i { -public: - llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} - virtual ~llama_graph_input_cls() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * cls; // I32 [n_batch] - - const llama_cparams & cparams; -}; - -void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) { - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(cls); - GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); - - uint32_t * data = (uint32_t *) cls->data; - memset(cls->data, 0, n_tokens * ggml_element_size(cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(cls); - GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); - - uint32_t * data = (uint32_t *) cls->data; - memset(cls->data, 0, n_tokens * ggml_element_size(cls)); - - std::vector last_pos(n_tokens, -1); - 
std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; - - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } - } - - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } - } - } -} - class llama_graph_input_attn_base : public llama_graph_input_attn_i { public: llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : @@ -1359,14 +1152,6 @@ int llama_context_base::decode(llama_batch & inp_batch) { return 0; } -// -// input -// - -int64_t llama_context_base::n_pos_per_token() const { - return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; -} - // // output // @@ -1535,6 +1320,10 @@ enum ggml_status llama_context_base::graph_compute( // graph build API // +int32_t llama_context_base::get_n_outputs() const { + return n_outputs; +} + void llama_context_base::build_cb( ggml_tensor * cur, const char * name, @@ -1650,6 +1439,117 @@ ggml_tensor * llama_context_base::build_rope_factors(int il) const { return model.layers[il].rope_short; } +llama_graph_input_ptr llama_context_base::build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) const { + const auto & hparams = model.hparams; + + const int64_t n_embd = hparams.n_embd; + + auto inp = std::make_shared(); + + auto & cur = inp->cur; + + if (ubatch.token) { + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + //cb(inp->tokens, "inp_tokens", -1); + ggml_set_input(inp->tokens); + + cur = ggml_get_rows(ctx0, tok_embd, inp->tokens); + + // apply lora for embedding tokens if needed + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( + ctx0, lw->b, // non-transposed lora_b + ggml_get_rows(ctx0, lw->a, inp->tokens) + ), scale); + + cur = ggml_add(ctx0, cur, inpL_delta); + } + } else { + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + cur = inp->embd; + ggml_set_input(inp->embd); + } + + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale); + } + + //cb(cur, "inp_embd", -1); + + return inp; +} + +llama_graph_input_ptr llama_context_base::build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(model.hparams); + + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + ggml_set_input(inp->cur); + + return inp; +} + +llama_graph_input_attn_ptr llama_context_base::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) const { + auto inp = std::make_shared(model.hparams, cparams); + + // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch + GGML_UNUSED(causal); + GGML_UNUSED(swa); + + inp->kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_kq_mask, 
"KQ_mask", -1); + ggml_set_input(inp->kq_mask); + + inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask; + + return inp; +} + +ggml_tensor * llama_context_base::build_attn( + llama_graph_input_attn_i * inp, + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + float kq_scale, + int il) const { + GGML_UNUSED(il); + + const auto & kq_mask = inp->get_kq_mask(); + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); + + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + //cb(k, "v", il); + + ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, false, kq_scale); + + return cur; +} + ggml_tensor * llama_context_base::build_rope_shift( ggml_context * ctx0, ggml_tensor * cur, @@ -1699,181 +1599,6 @@ ggml_tensor * llama_context_base::build_rope_shift( return tmp; } -ggml_tensor * llama_context_base::build_inp_embd( - llama_graph_result * res, - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) const { - const auto & hparams = model.hparams; - - const int64_t n_embd = hparams.n_embd; - - auto inp = std::make_shared(); - - struct ggml_tensor * inpL; - - if (ubatch.token) { - inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp->tokens, "inp_tokens", -1); - ggml_set_input(inp->tokens); - - inpL = ggml_get_rows(ctx0, tok_embd, inp->tokens); - - // apply lora for embedding tokens if needed - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); - if (lw == nullptr) { - continue; - } - - const float adapter_scale = lora.second; - const float scale = lw->get_scale(lora.first->alpha, adapter_scale); - - struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( - ctx0, lw->b, // non-transposed lora_b - ggml_get_rows(ctx0, lw->a, inp->tokens) - ), scale); - - inpL = ggml_add(ctx0, inpL, inpL_delta); - } - } else { - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = inp->embd; - ggml_set_input(inp->embd); - } - - // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); - } - - res->add_input(std::move(inp)); - - //cb(inpL, "inp_embd", -1); - - return inpL; -} - -ggml_tensor * llama_context_base::build_inp_pos( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(n_pos_per_token()); - - inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); - ggml_set_input(inp->pos); - - res->add_input(inp); - - return inp->pos; -} - -ggml_tensor * llama_context_base::build_inp_pos_bucket( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(model.hparams); - - inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - ggml_set_input(inp->pos_bucket); - - res->add_input(inp); - - return inp->pos_bucket; -} - -ggml_tensor * llama_context_base::build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const { - auto inp = std::make_shared(model.hparams, cparams, n_outputs); - - inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); - ggml_set_input(inp->out_ids); - - res->add_input(inp); - - return inp->out_ids; -} - -ggml_tensor * 
llama_context_base::build_inp_mean( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(cparams); - - inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - ggml_set_input(inp->mean); - - res->add_input(inp); - - return inp->mean; -} - -ggml_tensor * llama_context_base::build_inp_cls( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(cparams); - - inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp->cls); - - res->add_input(inp); - - return inp->cls; -} - -llama_graph_input_attn_ptr llama_context_base::build_attn_inp( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa) const { - auto inp = std::make_shared(model.hparams, cparams); - - // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch - GGML_UNUSED(causal); - GGML_UNUSED(swa); - - inp->kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_kq_mask, "KQ_mask", -1); - ggml_set_input(inp->kq_mask); - - inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask; - - res->add_input(inp); - - return inp; -} - -ggml_tensor * llama_context_base::build_attn( - llama_graph_input_attn_i * inp, - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_b, - float kq_scale, - int il) const { - GGML_UNUSED(il); - - const auto & kq_mask = inp->get_kq_mask(); - - ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); - //cb(q, "q", il); - - ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); - //cb(k, "k", il); - - ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); - //cb(k, "v", il); - - ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, false, kq_scale); - - return cur; -} - ggml_tensor * llama_context_base::build_attn_mha( ggml_context * ctx0, ggml_cgraph * gf, @@ -2485,6 +2210,7 @@ size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_i // llama_context_kv_self // +// I32 [n_kv, n_batch] class llama_graph_input_pos_bucket_kv : public llama_graph_input_i { public: llama_graph_input_pos_bucket_kv( @@ -2494,20 +2220,18 @@ public: void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] - const llama_hparams & hparams; const llama_kv_cache_unified * kv_self; }; void llama_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) { - if (pos_bucket) { + if (cur) { const int64_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing - int32_t * data = (int32_t *) pos_bucket->data; + int32_t * data = (int32_t *) cur->data; const int64_t n_kv = kv_self->n; @@ -3311,24 +3035,20 @@ ggml_cgraph * llama_context_kv_self::graph_init() { return llama_context_base::graph_init(); } -ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( - llama_graph_result * res, +llama_graph_input_ptr llama_context_kv_self::build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const { auto inp = std::make_shared(model.hparams, kv_self.get()); const auto n_kv = kv_self->n; - inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - 
ggml_set_input(inp->pos_bucket); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + ggml_set_input(inp->cur); - res->inputs.push_back(inp); - - return inp->pos_bucket; + return inp; } llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -3359,8 +3079,6 @@ llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; } - res->add_input(inp); - return inp; } @@ -3833,6 +3551,7 @@ size_t llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_se // llama_context_recurrent // +// I32 [kv_size] class llama_graph_input_s_copy : public llama_graph_input_i { public: llama_graph_input_s_copy(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} @@ -3840,8 +3559,6 @@ public: void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * s_copy; // I32 [kv_size] - llama_kv_cache_recurrent * kv_self; }; @@ -3850,9 +3567,9 @@ void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { const int64_t n_kv = kv_self->n; - if (s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); - int32_t * data = (int32_t *) s_copy->data; + if (cur) { + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); + int32_t * data = (int32_t *) cur->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { @@ -3878,6 +3595,7 @@ void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { } } +// F32 [1, n_kv] class llama_graph_input_s_mask : public llama_graph_input_i { public: llama_graph_input_s_mask(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} @@ -3885,8 +3603,6 @@ public: void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * s_mask; // F32 [1, n_kv] - llama_kv_cache_recurrent * kv_self; }; @@ -3895,9 +3611,9 @@ void llama_graph_input_s_mask::set_input(const llama_ubatch * ubatch) { const int64_t n_kv = kv_self->n; - if (s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer)); - float * data = (float *) s_mask->data; + if (cur) { + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); + float * data = (float *) cur->data; // clear unused states for (int i = 0; i < n_kv; ++i) { @@ -4302,36 +4018,30 @@ ggml_cgraph * llama_context_recurrent::graph_init() { return llama_context_base::graph_init(); } -ggml_tensor * llama_context_recurrent::build_inp_s_copy( - llama_graph_result * res, +llama_graph_input_ptr llama_context_recurrent::build_inp_s_copy( ggml_context * ctx0) const { auto inp = std::make_shared(kv_self.get()); const auto n_kv = kv_self->n; - inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); - //cb(inp.s_copy, "inp_s_copy", -1); - ggml_set_input(inp->s_copy); + inp->cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + //cb(inp.cur, "inp_s_copy", -1); + ggml_set_input(inp->cur); - res->add_input(inp); - - return inp->s_copy; + return inp; } -ggml_tensor * llama_context_recurrent::build_inp_s_mask( - llama_graph_result * res, +llama_graph_input_ptr llama_context_recurrent::build_inp_s_mask( ggml_context * ctx0) const { auto inp = std::make_shared(kv_self.get()); const auto n_kv = kv_self->n; - inp->s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - //cb(inp->s_mask, "inp_s_mask", -1); - ggml_set_input(inp->s_mask); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + 
//cb(inp->cur, "inp_s_mask", -1); + ggml_set_input(inp->cur); - res->add_input(inp); - - return inp->s_mask; + return inp; } ggml_tensor * llama_context_recurrent::build_copy_mask_state( @@ -4904,6 +4614,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { // llama_context_dec // +// F32 [n_embd, n_outputs_enc] class llama_graph_input_cross_embd : public llama_graph_input_i { public: llama_graph_input_cross_embd( @@ -4912,26 +4623,24 @@ public: void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] - const llama_cross * cross; }; void llama_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) { GGML_UNUSED(ubatch); - if (cross_embd && cross->t_embd) { - assert(cross_embd->type == GGML_TYPE_F32); + if (cur && cross->t_embd) { + assert(cur->type == GGML_TYPE_F32); - ggml_backend_tensor_set(cross_embd, cross->v_embd, 0, ggml_nbytes(cross_embd)); + ggml_backend_tensor_set(cur, cross->v_embd, 0, ggml_nbytes(cur)); } } class llama_graph_input_attn_dec : public llama_graph_input_attn_i { public: llama_graph_input_attn_dec( - llama_graph_input_attn_i * inp_kv_self, - const llama_cross * cross) : inp_kv_self(inp_kv_self), cross(cross) {} + llama_graph_input_attn_ptr inp_kv_self, + const llama_cross * cross) : inp_kv_self(std::move(inp_kv_self)), cross(cross) {} void set_input(const llama_ubatch * ubatch) override; @@ -4942,11 +4651,14 @@ public: ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch] ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch] - llama_graph_input_attn_i * inp_kv_self = nullptr; + llama_graph_input_attn_ptr inp_kv_self = nullptr; + const llama_cross * cross = nullptr; }; void llama_graph_input_attn_dec::set_input(const llama_ubatch * ubatch) { + inp_kv_self->set_input(ubatch); + if (cross_kq_mask) { const int64_t n_enc = cross_kq_mask->ne[0]; const int64_t n_tokens = ubatch->n_tokens; @@ -4990,17 +4702,16 @@ ggml_cgraph * llama_context_dec::graph_init() { return llama_context_kv_self::graph_init(); } -ggml_tensor * llama_context_dec::build_inp_cross_embd( - llama_graph_result * res, +llama_graph_input_ptr llama_context_dec::build_inp_cross_embd( ggml_context * ctx0) const { auto inp = std::make_shared(cross); // if we have the output embeddings from the encoder, use them directly // TODO: needs more work to be correct, for now just use the tensor shape //if (cross->t_embd) { - // inp->cross_embd = ggml_view_tensor(ctx0, cross->t_embd); + // inp->cur = ggml_view_tensor(ctx0, cross->t_embd); - // return inp->cross_embd; + // return inp->cur; //} const auto & hparams = model.hparams; @@ -5008,23 +4719,20 @@ ggml_tensor * llama_context_dec::build_inp_cross_embd( const auto n_embd = cross->t_embd ? cross->t_embd->ne[0] : hparams.n_embd; const auto n_enc = cross->t_embd ? 
cross->t_embd->ne[1] : hparams.n_ctx_train; - inp->cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); - ggml_set_input(inp->cross_embd); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); + ggml_set_input(inp->cur); - res->add_input(inp); - - return inp->cross_embd; + return inp; } llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, bool swa) const { - auto inp_kv_self = llama_context_kv_self::build_attn_inp(res, ctx0, n_tokens, causal, swa); + auto inp_kv_self = llama_context_kv_self::build_attn_inp(ctx0, n_tokens, causal, swa); - auto inp = std::make_shared(inp_kv_self.get(), cross); + auto inp = std::make_shared(std::move(inp_kv_self), cross); const int32_t n_enc = cross->t_embd ? cross->t_embd->ne[1] : model.hparams.n_ctx_train; @@ -5033,8 +4741,6 @@ llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask; - res->add_input(inp); - return inp; } diff --git a/src/llama-context.h b/src/llama-context.h index 21015e8796..a5159bc5b3 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -242,12 +242,6 @@ public: int decode(llama_batch & inp_batch) override; protected: - // - // input - // - - virtual int64_t n_pos_per_token() const; // vision - // // output // @@ -287,6 +281,8 @@ public: // graph build // + int32_t get_n_outputs() const override; + void build_cb( ggml_tensor * cur, const char * name, @@ -314,45 +310,16 @@ public: ggml_tensor * build_rope_factors(int il) const override; - ggml_tensor * build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - ggml_backend_buffer * bbuf) const override; - - ggml_tensor * build_inp_embd( - llama_graph_result * res, + llama_graph_input_ptr build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) const override; - ggml_tensor * build_inp_pos( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const override; - - ggml_tensor * build_inp_mean( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_cls( - llama_graph_result * res, + llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -370,7 +337,15 @@ public: int il) const override; protected: - virtual ggml_tensor * build_attn_mha( + // note: optionally set the backend to be the same as the bbuf's backend + ggml_tensor * build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf) const; + + ggml_tensor * build_attn_mha( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q, @@ -458,28 +433,9 @@ protected: llama_loras loras; llama_sbatch sbatch; - ggml_threadpool_t threadpool = nullptr; - ggml_threadpool_t threadpool_batch = nullptr; - - ggml_abort_callback abort_callback = nullptr; - void * abort_callback_data = nullptr; - - ggml_backend_t backend_cpu = nullptr; - std::vector backends; - - std::vector> 
set_n_threads_fns; - ggml_backend_sched_ptr sched; - // buffer types used for the compute buffer of each backend - std::vector backend_ptrs; - std::vector backend_buft; - - // memory buffers used to evaluate the model - std::vector buf_compute_meta; - - // host buffer for the model output (logits and embeddings) - ggml_backend_buffer_ptr buf_output; + // TODO: these below likely need some rework in the future, together with the batch-refactoring // TODO: remove bool logits_all = false; @@ -502,6 +458,30 @@ protected: std::vector output_ids; // map batch token positions to ids of the logits and embd buffers +private: + // base functionality - should not leak into derived classes + + ggml_threadpool_t threadpool = nullptr; + ggml_threadpool_t threadpool_batch = nullptr; + + ggml_abort_callback abort_callback = nullptr; + void * abort_callback_data = nullptr; + + ggml_backend_t backend_cpu = nullptr; + std::vector backends; + + std::vector> set_n_threads_fns; + + // buffer types used for the compute buffer of each backend + std::vector backend_ptrs; + std::vector backend_buft; + + // memory buffers used to evaluate the model + std::vector buf_compute_meta; + + // host buffer for the model output (logits and embeddings) + ggml_backend_buffer_ptr buf_output; + bool has_evaluated_once = false; }; @@ -539,13 +519,11 @@ public: // graph build // - ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, + llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -624,12 +602,10 @@ public: // graph build // - ggml_tensor * build_inp_s_copy( - llama_graph_result * res, + llama_graph_input_ptr build_inp_s_copy( ggml_context * ctx0) const override; - ggml_tensor * build_inp_s_mask( - llama_graph_result * res, + llama_graph_input_ptr build_inp_s_mask( ggml_context * ctx0) const override; ggml_tensor * build_copy_mask_state( @@ -694,6 +670,10 @@ private: std::unique_ptr kv_self; }; +// +// enc-dec +// + // TODO: tmp - need something better to pass the data from the encoder to the decoder struct llama_cross { // the output embeddings from the encoder as a ggml tensor @@ -714,7 +694,7 @@ public: int encode(llama_batch & inp_batch) override; - llama_cross * cross = nullptr; + llama_cross * cross = nullptr; // TODO: hacky, rework }; class llama_context_dec : public llama_context_kv_self { @@ -730,12 +710,10 @@ protected: ggml_cgraph * graph_init() override; - ggml_tensor * build_inp_cross_embd( - llama_graph_result * res, + llama_graph_input_ptr build_inp_cross_embd( ggml_context * ctx0) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -753,7 +731,7 @@ protected: int il) const override; public: - llama_cross * cross = nullptr; + llama_cross * cross = nullptr; // TODO: hacky, rework }; class llama_context_enc_dec : public llama_context { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 79b26d1734..89e311a915 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -67,20 +67,16 @@ ggml_tensor * llama_graph_i::build_attn_cross( return nullptr; } -ggml_tensor * llama_graph_i::build_inp_cross_embd( - llama_graph_result * res, +llama_graph_input_ptr llama_graph_i::build_inp_cross_embd( ggml_context * ctx0) const { - GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; } -ggml_tensor 
* llama_graph_i::build_inp_s_copy (
-        llama_graph_result * res,
+llama_graph_input_ptr llama_graph_i::build_inp_s_copy (
         ggml_context * ctx0) const {
-    GGML_UNUSED(res);
     GGML_UNUSED(ctx0);
 
     LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -88,10 +84,8 @@ ggml_tensor * llama_graph_i::build_inp_s_copy (
     return nullptr; // NOLINT
 }
 
-ggml_tensor * llama_graph_i::build_inp_s_mask(
-        llama_graph_result * res,
+llama_graph_input_ptr llama_graph_i::build_inp_s_mask(
         ggml_context * ctx0) const {
-    GGML_UNUSED(res);
     GGML_UNUSED(ctx0);
 
     LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 7ae99becc7..343d4a0772 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -29,6 +29,9 @@ public:
     virtual ~llama_graph_input_i() = default;
 
     virtual void set_input(const llama_ubatch * ubatch) = 0;
+
+    // by default, we produce a single input tensor, but some children could produce more
+    ggml_tensor * cur = nullptr;
 };
 
 using llama_graph_input_ptr = std::shared_ptr<llama_graph_input_i>;
 
@@ -76,7 +79,7 @@ public:
         }
     }
 
-    void add_input(llama_graph_input_ptr && input) {
+    void add_input(llama_graph_input_ptr input) {
         inputs.emplace_back(std::move(input));
     }
 
@@ -92,19 +95,23 @@ public:
 // llama_graph
 //
 
+// note: keep all methods const
 // TODO: can become more granular in the future
-// TODO: move all methods that do not require things from llama_context to llm_build_context
 class llama_graph_i {
 public:
     llama_graph_i(llama_graph_type type);
     virtual ~llama_graph_i() = default;
 
-    llama_graph_type get_type() const { return type; }
+    llama_graph_type get_type() const {
+        return type;
+    }
 
-protected:
+private:
     llama_graph_type type;
 
 public:
+    virtual int32_t get_n_outputs() const = 0;
+
     // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
virtual void build_cb( ggml_tensor * cur, @@ -131,50 +138,27 @@ public: ggml_tensor * cur, // struct ggml_tensor * b ggml_tensor * ids) const = 0; + // rope factors based on the current context size virtual ggml_tensor * build_rope_factors(int il) const = 0; - // note: optionally set the backend to be the same as the bbuf's backend - virtual ggml_tensor * build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - ggml_backend_buffer * bbuf) const = 0; - // graph build API (context-specific) - virtual ggml_tensor * build_inp_embd( - llama_graph_result * res, + // input embeddings with optional lora + virtual llama_graph_input_ptr build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) const = 0; - virtual ggml_tensor * build_inp_pos( - llama_graph_result * res, + // enc-dec pos + virtual llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const = 0; - virtual ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; - - virtual ggml_tensor * build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const = 0; - - virtual ggml_tensor * build_inp_mean( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; - - virtual ggml_tensor * build_inp_cls( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; + // + // attention API + // virtual llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -202,16 +186,17 @@ public: float kq_scale, int il) const; - virtual ggml_tensor * build_inp_cross_embd( - llama_graph_result * res, + virtual llama_graph_input_ptr build_inp_cross_embd( ggml_context * ctx0) const; - virtual ggml_tensor * build_inp_s_copy( - llama_graph_result * res, + // + // recurrent API + // + + virtual llama_graph_input_ptr build_inp_s_copy( ggml_context * ctx0) const; - virtual ggml_tensor * build_inp_s_mask( - llama_graph_result * res, + virtual llama_graph_input_ptr build_inp_s_mask( ggml_context * ctx0) const; virtual ggml_tensor * build_copy_mask_state( diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7fae82c6ec..60a8cc0f8b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3813,6 +3813,212 @@ enum llm_norm_type { LLM_NORM_GROUP, }; +class llama_graph_input_pos : public llama_graph_input_i { +public: + llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + virtual ~llama_graph_input_pos() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos = nullptr; // I32 [n_batch] + + const int64_t n_pos_per_token = 1; +}; + +void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) { + if (ubatch->pos && pos) { + const int64_t n_tokens = ubatch->n_tokens; + + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + } +} + +class llama_graph_input_out_ids : public llama_graph_input_i { +public: + llama_graph_input_out_ids( + const llama_hparams & hparams, + const llama_cparams & cparams, + int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} + virtual ~llama_graph_input_out_ids() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * out_ids; // I32 [n_outputs] + + const llama_hparams & hparams; + const llama_cparams & cparams; + + const int32_t n_outputs; +}; + +void 
llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + //GGML_ASSERT(out_ids && "every model that can must skip unused outputs"); + + if (!out_ids) { + LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__); + } else { + const int64_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); + int32_t * data = (int32_t *) out_ids->data; + + if (n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch->output) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (ubatch->output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(n_outputs == 0); + } + } + } +} + +class llama_graph_input_mean : public llama_graph_input_i { +public: + llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {} + virtual ~llama_graph_input_mean() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * mean; // F32 [n_batch, n_batch] + + const llama_cparams & cparams; +}; + +void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) { + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(mean); + GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer)); + + float * data = (float *) mean->data; + memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean)); + + std::vector sum(n_tokens, 0); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); + + sum[seq_id] += ubatch->n_seq_tokens; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + } + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + for (int i = 0; i < n_seq_tokens; ++i) { + data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; + } + } + } +} + +class llama_graph_input_cls : public llama_graph_input_i { +public: + llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} + virtual ~llama_graph_input_cls() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * cls; // I32 [n_batch] + + const llama_cparams & cparams; +}; + +void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) { + if (cparams.embeddings && ( + cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < 
n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } +} + struct llm_build_context { const llama_model & model; const llama_hparams & hparams; @@ -3895,55 +4101,75 @@ struct llm_build_context { res (std::make_unique()) { } + int64_t n_pos_per_token() const { + return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; + } + // TODO: tmp void cb(struct ggml_tensor * cur, const char * name, int il) { lgf->build_cb(cur, name, ubatch, il); } - // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lgf->build_inp_embd(res.get(), ctx0, tok_embd, ubatch); - cb(inpL, "inp_embd", -1); + auto inp = lgf->build_inp_embd(ctx0, tok_embd, ubatch); - return inpL; + cb(inp->cur, "inp_embd", -1); + + res->add_input(inp); + + return inp->cur; } - // TODO: tmp - struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens); - cb(cur, "inp_pos", -1); + struct ggml_tensor * build_inp_pos() const { + auto inp = std::make_shared(n_pos_per_token()); - return cur; + inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp->pos); + + res->add_input(inp); + + return inp->pos; } - // TODO: tmp struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0); - cb(cur, "inp_out_ids", -1); + const auto n_outputs = lgf->get_n_outputs(); - return cur; + auto inp = std::make_shared(hparams, cparams, n_outputs); + + inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); + ggml_set_input(inp->out_ids); + + res->add_input(inp); + + return inp->out_ids; } - // TODO: tmp struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens); - cb(cur, "inp_mean", -1); + auto inp = std::make_shared(cparams); - return cur; + inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp->mean); + + res->add_input(inp); + + return inp->mean; } - // TODO: tmp struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens); - cb(cur, "inp_cls", -1); + auto inp = std::make_shared(cparams); - return cur; + inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->cls); 
+ + res->add_input(inp); + + return inp->cls; } // TODO: tmp struct ggml_tensor * build_lora_mm( struct ggml_tensor * w, - struct ggml_tensor * cur) { + struct ggml_tensor * cur) const { return lgf->build_lora_mm(ctx0, w, cur); } @@ -3951,24 +4177,42 @@ struct llm_build_context { struct ggml_tensor * build_lora_mm_id( struct ggml_tensor * w, // struct ggml_tensor * as struct ggml_tensor * cur, // struct ggml_tensor * b - struct ggml_tensor * ids) { + struct ggml_tensor * ids) const { return lgf->build_lora_mm_id(ctx0, w, cur, ids); } - // TODO: tmp struct ggml_tensor * build_pos_bucket() { - ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens); - cb(cur, "pos_bucket", -1); + auto inp = lgf->build_inp_pos_bucket(ctx0, n_tokens); + cb(inp->cur, "pos_bucket", -1); - return cur; + res->add_input(inp); + + return inp->cur; } - // TODO: tmp struct ggml_tensor * build_inp_cross_embd() { - ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0); - cb(cur, "embd_enc", -1); + auto inp = lgf->build_inp_cross_embd(ctx0); + cb(inp->cur, "embd_enc", -1); - return cur; + res->add_input(inp); + + return inp->cur; + } + + struct ggml_tensor * build_inp_s_copy() const { + auto inp = lgf->build_inp_s_copy(ctx0); + + res->add_input(inp); + + return inp->cur; + } + + struct ggml_tensor * build_inp_s_mask() const { + auto inp = lgf->build_inp_s_mask(ctx0); + + res->add_input(inp); + + return inp->cur; } struct ggml_tensor * build_norm( @@ -4250,6 +4494,18 @@ struct llm_build_context { return moe_out; } + llama_graph_input_attn_ptr build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) const { + auto inp = lgf->build_attn_inp(ctx0, n_tokens, causal, swa); + + res->add_input(inp); + + return inp; + } + struct ggml_tensor * build_attn( llama_graph_input_attn_i * inp, ggml_cgraph * gf, @@ -4490,7 +4746,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4651,7 +4907,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4807,7 +5063,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4923,7 +5179,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5028,7 +5284,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5151,7 +5407,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5303,7 +5559,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5425,7 +5681,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -5526,7 +5782,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5640,7 +5896,7 @@ struct llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, false, false); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -5785,7 +6041,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); inpL = build_norm(inpL, model.tok_norm, @@ -5888,7 +6144,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); if (model.pos_embd) { // inp_pos - contains the positions @@ -6030,11 +6286,9 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { - - // norm cur = 
build_norm(inpL, model.layers[il].attn_norm, @@ -6181,7 +6435,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6295,7 +6549,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6408,7 +6662,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -6526,7 +6780,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6673,7 +6927,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -6795,8 +7049,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -6940,7 +7193,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -7046,7 +7299,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7152,7 +7405,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -7263,7 +7516,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { 
struct ggml_tensor * inpSA = inpL; @@ -7382,7 +7635,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7510,7 +7763,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7711,7 +7964,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { // norm @@ -7819,7 +8072,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { // norm @@ -7949,7 +8202,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8062,8 +8315,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); for (int il = 0; il < n_layer; ++il) { // norm @@ -8124,7 +8377,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -8272,7 +8525,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -8407,7 +8660,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8527,7 +8780,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8651,7 +8904,7 @@ struct llm_build_context { 
// inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8772,7 +9025,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -8900,7 +9153,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -9044,7 +9297,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9174,7 +9427,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -9337,7 +9590,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9555,7 +9808,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9706,7 +9959,7 @@ struct llm_build_context { struct ggml_tensor * pos_bucket_enc = build_pos_bucket(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, false, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9809,7 +10062,7 @@ struct llm_build_context { const int64_t n_outputs_enc = embd_enc->ne[1]; - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9972,7 +10225,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -10066,7 +10319,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), 
ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10196,7 +10449,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10317,7 +10570,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10435,8 +10688,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10527,8 +10780,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10622,7 +10875,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; diff --git a/src/llama-model.h b/src/llama-model.h index 2d64c0d242..45abce7d53 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -365,7 +365,6 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; - // TODO: add encode/decode graphs llama_graph_result_ptr build_graph( ggml_context * ctx, ggml_cgraph * gf,
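
Editor note (not part of the patch): the refactor above changes the build_inp_* helpers from returning raw ggml_tensor pointers into returning llama_graph_input_ptr objects that expose their created tensor through the shared cur member and are registered on the llama_graph_result via add_input(), so the context can fill them later with set_input(). The self-contained C++ sketch below illustrates that ownership/registration pattern in the abstract; every name in it (graph_input_i, graph_result, build_context, input_pos, the deque standing in for a ggml_context allocation) is a hypothetical stand-in, not llama.cpp API.

#include <deque>
#include <memory>
#include <vector>

struct tensor {};  // stand-in for ggml_tensor
struct ubatch {};  // stand-in for llama_ubatch

// polymorphic graph input: produces one tensor by default and knows how to fill it later
struct graph_input_i {
    virtual ~graph_input_i() = default;
    virtual void set_input(const ubatch * ub) = 0;

    tensor * cur = nullptr;
};

using graph_input_ptr = std::shared_ptr<graph_input_i>;

// the result object keeps the inputs alive until the compute-time set_inputs() pass
struct graph_result {
    void add_input(graph_input_ptr input) { inputs.emplace_back(std::move(input)); }

    void set_inputs(const ubatch * ub) {
        for (auto & inp : inputs) {
            inp->set_input(ub);
        }
    }

    std::vector<graph_input_ptr> inputs;
};

// an input that depends only on the batch, so it can live with the model's build code
struct input_pos : graph_input_i {
    void set_input(const ubatch * ub) override { (void) ub; /* copy positions into cur */ }
};

// the builder constructs the input, registers it on the result and hands the tensor to the graph
struct build_context {
    graph_result * res;
    std::deque<tensor> pool;  // stand-in for tensors allocated in a ggml_context

    tensor * build_inp_pos() {
        auto inp = std::make_shared<input_pos>();
        pool.emplace_back();
        inp->cur = &pool.back();
        res->add_input(inp);
        return inp->cur;
    }
};

int main() {
    graph_result  res;
    build_context ctx{&res, {}};

    tensor * pos = ctx.build_inp_pos();  // graph construction uses the tensor...
    (void) pos;

    ubatch ub;
    res.set_inputs(&ub);                 // ...and the context fills all registered inputs later

    return 0;
}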