graph : move non-context related logic to llm_build_context

ggml-ci
This commit is contained in:
Georgi Gerganov 2025-02-28 19:56:10 +02:00
parent 9cab53c7dd
commit 0f7daa9d1b
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
6 changed files with 579 additions and 664 deletions

View File

@ -71,26 +71,7 @@ void llama_graph_input_embd::set_input(const llama_ubatch * ubatch) {
} }
} }
// Graph input carrying the per-token positions of the current ubatch.
class llama_graph_input_pos : public llama_graph_input_i { // I32 [n_batch*n_pos_per_token]
public:
llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
virtual ~llama_graph_input_pos() = default;

// copies ubatch->pos into 'pos'; no-op when either is null
void set_input(const llama_ubatch * ubatch) override;

ggml_tensor * pos = nullptr; // I32 [n_batch*n_pos_per_token]

// number of position values stored per token
// (4 for LLM_ARCH_QWEN2VL, 1 otherwise - see llama_context_base::n_pos_per_token())
const int64_t n_pos_per_token = 1;
};
void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens;
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
}
}
class llama_graph_input_pos_bucket : public llama_graph_input_i { class llama_graph_input_pos_bucket : public llama_graph_input_i {
public: public:
llama_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} llama_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
@ -98,19 +79,17 @@ public:
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * pos_bucket; // I32 [n_batch, n_batch]
const llama_hparams & hparams; const llama_hparams & hparams;
}; };
void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
if (pos_bucket) { if (cur) {
const int64_t n_tokens = ubatch->n_tokens; const int64_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer));
GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
int32_t * data = (int32_t *) pos_bucket->data; int32_t * data = (int32_t *) cur->data;
for (int h = 0; h < 1; ++h) { for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) { for (int j = 0; j < n_tokens; ++j) {
@ -122,192 +101,6 @@ void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
} }
} }
// Graph input holding the batch indices of the tokens whose outputs are kept.
class llama_graph_input_out_ids : public llama_graph_input_i {
public:
llama_graph_input_out_ids(
const llama_hparams & hparams,
const llama_cparams & cparams,
int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
virtual ~llama_graph_input_out_ids() = default;

// fills 'out_ids' with the indices of the output tokens of the ubatch
void set_input(const llama_ubatch * ubatch) override;

ggml_tensor * out_ids; // I32 [n_outputs]

const llama_hparams & hparams;
const llama_cparams & cparams;

// number of outputs the graph was built for
const int32_t n_outputs;
};
void llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
//GGML_ASSERT(out_ids && "every model that can must skip unused outputs");
if (!out_ids) {
LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__);
} else {
const int64_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
int32_t * data = (int32_t *) out_ids->data;
if (n_outputs == n_tokens) {
for (int i = 0; i < n_tokens; ++i) {
data[i] = i;
}
} else if (ubatch->output) {
int32_t n_outputs = 0;
for (int i = 0; i < n_tokens; ++i) {
if (ubatch->output[i]) {
data[n_outputs++] = i;
}
}
// the graph needs to have been passed the correct number of outputs
GGML_ASSERT(n_outputs == n_outputs);
} else if (n_outputs == 1) {
// only keep last output
data[0] = n_tokens - 1;
} else {
GGML_ASSERT(n_outputs == 0);
}
}
}
}
// Graph input with the mean-pooling matrix used for LLAMA_POOLING_TYPE_MEAN.
class llama_graph_input_mean : public llama_graph_input_i {
public:
llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
virtual ~llama_graph_input_mean() = default;

// builds the per-sequence averaging weights from the ubatch layout
void set_input(const llama_ubatch * ubatch) override;

ggml_tensor * mean; // F32 [n_batch, n_batch]

const llama_cparams & cparams;
};
// Fill the mean-pooling matrix: for each sequence, the entries covering its
// tokens are set to 1/(token count of that sequence); all other entries are 0.
void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) {
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
const int64_t n_tokens = ubatch->n_tokens;
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
const int64_t n_seqs = ubatch->n_seqs;

GGML_ASSERT(mean);
GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer));

float * data = (float *) mean->data;
memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean));

// first pass: total token count per seq_id
std::vector<uint64_t> sum(n_tokens, 0);
for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0];

// TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");

sum[seq_id] += ubatch->n_seq_tokens;
}

// reciprocal of the count for every sequence that has tokens
std::vector<float> div(n_tokens, 0.0f);
for (int i = 0; i < n_tokens; ++i) {
const uint64_t s = sum[i];
if (s > 0) {
div[i] = 1.0f/float(s);
}
}

// second pass: scatter the averaging weights into the matrix
for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0];

for (int i = 0; i < n_seq_tokens; ++i) {
data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
}
}
}
}
// Graph input selecting one representative token index per sequence
// (used for CLS, RANK and LAST pooling).
class llama_graph_input_cls : public llama_graph_input_i {
public:
llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
virtual ~llama_graph_input_cls() = default;

// fills 'cls' with one token index per sequence, depending on pooling type
void set_input(const llama_ubatch * ubatch) override;

ggml_tensor * cls; // I32 [n_batch]

const llama_cparams & cparams;
};
// Select a representative token index per sequence into 'cls':
// - CLS/RANK pooling: the token at position 0 of each sequence
// - LAST pooling:     the token with the highest position of each sequence
void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) {
if (cparams.embeddings && (
cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
const int64_t n_tokens = ubatch->n_tokens;
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
const int64_t n_seqs = ubatch->n_seqs;

GGML_ASSERT(cls);
GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));

uint32_t * data = (uint32_t *) cls->data;
memset(cls->data, 0, n_tokens * ggml_element_size(cls));

for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0];

// TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");

for (int i = 0; i < n_seq_tokens; ++i) {
const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];

// remember the batch index of the position-0 token of this sequence
if (pos == 0) {
data[seq_id] = s*n_seq_tokens + i;
}
}
}
}

if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
const int64_t n_tokens = ubatch->n_tokens;
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
const int64_t n_seqs = ubatch->n_seqs;

GGML_ASSERT(cls);
GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));

uint32_t * data = (uint32_t *) cls->data;
memset(cls->data, 0, n_tokens * ggml_element_size(cls));

// track, per sequence, the highest position seen and the batch row it was at
std::vector<int> last_pos(n_tokens, -1);
std::vector<int> last_row(n_tokens, -1);

for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0];

// TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");

for (int i = 0; i < n_seq_tokens; ++i) {
const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];

// ">=" so that ties resolve to the later batch row
if (pos >= last_pos[seq_id]) {
last_pos[seq_id] = pos;
last_row[seq_id] = s*n_seq_tokens + i;
}
}
}

for (int i = 0; i < n_tokens; ++i) {
if (last_row[i] >= 0) {
data[i] = last_row[i];
}
}
}
}
class llama_graph_input_attn_base : public llama_graph_input_attn_i { class llama_graph_input_attn_base : public llama_graph_input_attn_i {
public: public:
llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) :
@ -1359,14 +1152,6 @@ int llama_context_base::decode(llama_batch & inp_batch) {
return 0; return 0;
} }
//
// input
//
int64_t llama_context_base::n_pos_per_token() const {
return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1;
}
// //
// output // output
// //
@ -1535,6 +1320,10 @@ enum ggml_status llama_context_base::graph_compute(
// graph build API // graph build API
// //
int32_t llama_context_base::get_n_outputs() const {
return n_outputs;
}
void llama_context_base::build_cb( void llama_context_base::build_cb(
ggml_tensor * cur, ggml_tensor * cur,
const char * name, const char * name,
@ -1650,6 +1439,117 @@ ggml_tensor * llama_context_base::build_rope_factors(int il) const {
return model.layers[il].rope_short; return model.layers[il].rope_short;
} }
// Build the embedding input of the graph:
// - token ids:  rows looked up in 'tok_embd', with LoRA deltas applied if any
// - embeddings: a raw F32 input tensor, passed through directly
// The returned input object carries the resulting embedding tensor in 'cur'.
llama_graph_input_ptr llama_context_base::build_inp_embd(
ggml_context * ctx0,
ggml_tensor * tok_embd,
const llama_ubatch & ubatch) const {
const auto & hparams = model.hparams;

const int64_t n_embd = hparams.n_embd;

auto inp = std::make_shared<llama_graph_input_embd>();

auto & cur = inp->cur;

if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
//cb(inp->tokens, "inp_tokens", -1);
ggml_set_input(inp->tokens);

cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);

// apply lora for embedding tokens if needed
for (const auto & lora : loras) {
struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd);
if (lw == nullptr) {
continue;
}

// lora.second is the user-provided scale for this adapter
const float adapter_scale = lora.second;
const float scale = lw->get_scale(lora.first->alpha, adapter_scale);

// delta = scale * (lora_b @ lora_a[token rows])
struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat(
ctx0, lw->b, // non-transposed lora_b
ggml_get_rows(ctx0, lw->a, inp->tokens)
), scale);

cur = ggml_add(ctx0, cur, inpL_delta);
}
} else {
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
cur = inp->embd;
ggml_set_input(inp->embd);
}

// For Granite architecture
if (hparams.f_embedding_scale != 0.0f) {
cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
}

//cb(cur, "inp_embd", -1);

return inp;
}
// Create the position-bucket input tensor (I32 [n_tokens, n_tokens]).
llama_graph_input_ptr llama_context_base::build_inp_pos_bucket(
        ggml_context * ctx0,
        int32_t n_tokens) const {
    auto res = std::make_shared<llama_graph_input_pos_bucket>(model.hparams);

    ggml_tensor * t = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens);
    ggml_set_input(t);

    res->cur = t;

    return res;
}
// Create the attention input (KQ mask) for the base context.
llama_graph_input_attn_ptr llama_context_base::build_attn_inp(
        ggml_context * ctx0,
        int32_t n_tokens,
        bool causal,
        bool swa) const {
    // the base context has no KV cache, so the number of KV values equals the
    // number of tokens in the batch and the causal/swa flags are not used
    GGML_UNUSED(causal);
    GGML_UNUSED(swa);

    auto inp = std::make_shared<llama_graph_input_attn_base>(model.hparams, cparams);

    inp->kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
    ggml_set_input(inp->kq_mask);

    if (cparams.flash_attn) {
        // cast the mask to F16 when flash attention is enabled
        inp->kq_mask_cnv = ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16);
    } else {
        inp->kq_mask_cnv = inp->kq_mask;
    }

    return inp;
}
// Attention without a KV cache: permute Q/K/V and run multi-head attention
// over the full batch using the input's KQ mask.
ggml_tensor * llama_context_base::build_attn(
llama_graph_input_attn_i * inp,
ggml_context * ctx0,
ggml_cgraph * gf,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
float kq_scale,
int il) const {
GGML_UNUSED(il);

const auto & kq_mask = inp->get_kq_mask();

ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
//cb(q, "q", il);

ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
//cb(k, "k", il);

ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
//cb(v, "v", il);

ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, false, kq_scale);

return cur;
}
ggml_tensor * llama_context_base::build_rope_shift( ggml_tensor * llama_context_base::build_rope_shift(
ggml_context * ctx0, ggml_context * ctx0,
ggml_tensor * cur, ggml_tensor * cur,
@ -1699,181 +1599,6 @@ ggml_tensor * llama_context_base::build_rope_shift(
return tmp; return tmp;
} }
// Build the embedding input and register it with the graph result:
// - token ids:  rows looked up in 'tok_embd', with LoRA deltas applied if any
// - embeddings: a raw F32 input tensor, passed through directly
// Returns the resulting embedding tensor.
ggml_tensor * llama_context_base::build_inp_embd(
llama_graph_result * res,
ggml_context * ctx0,
ggml_tensor * tok_embd,
const llama_ubatch & ubatch) const {
const auto & hparams = model.hparams;

const int64_t n_embd = hparams.n_embd;

auto inp = std::make_shared<llama_graph_input_embd>();

struct ggml_tensor * inpL;

if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
//cb(inp->tokens, "inp_tokens", -1);
ggml_set_input(inp->tokens);

inpL = ggml_get_rows(ctx0, tok_embd, inp->tokens);

// apply lora for embedding tokens if needed
for (const auto & lora : loras) {
struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd);
if (lw == nullptr) {
continue;
}

// lora.second is the user-provided scale for this adapter
const float adapter_scale = lora.second;
const float scale = lw->get_scale(lora.first->alpha, adapter_scale);

// delta = scale * (lora_b @ lora_a[token rows])
struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat(
ctx0, lw->b, // non-transposed lora_b
ggml_get_rows(ctx0, lw->a, inp->tokens)
), scale);

inpL = ggml_add(ctx0, inpL, inpL_delta);
}
} else {
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
inpL = inp->embd;
ggml_set_input(inp->embd);
}

// For Granite architecture
if (hparams.f_embedding_scale != 0.0f) {
inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale);
}

res->add_input(std::move(inp));

//cb(inpL, "inp_embd", -1);

return inpL;
}
// Create the token-position input tensor (I32 [n_tokens*n_pos_per_token])
// and register it with the graph result.
ggml_tensor * llama_context_base::build_inp_pos(
        llama_graph_result * res,
        ggml_context * ctx0,
        int32_t n_tokens) const {
    const auto n_pos = n_pos_per_token();

    auto input = std::make_shared<llama_graph_input_pos>(n_pos);

    input->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos);
    ggml_set_input(input->pos);

    res->add_input(input);

    return input->pos;
}
// Create the position-bucket input tensor (I32 [n_tokens, n_tokens])
// and register it with the graph result.
ggml_tensor * llama_context_base::build_inp_pos_bucket(
        llama_graph_result * res,
        ggml_context * ctx0,
        int32_t n_tokens) const {
    auto input = std::make_shared<llama_graph_input_pos_bucket>(model.hparams);

    ggml_tensor * t = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens);
    ggml_set_input(t);

    input->pos_bucket = t;

    res->add_input(input);

    return t;
}
// Create the output-indices input tensor (I32 [n_outputs])
// and register it with the graph result.
ggml_tensor * llama_context_base::build_inp_out_ids(
        llama_graph_result * res,
        ggml_context * ctx0) const {
    auto input = std::make_shared<llama_graph_input_out_ids>(model.hparams, cparams, n_outputs);

    ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
    ggml_set_input(t);

    input->out_ids = t;

    res->add_input(input);

    return t;
}
// Create the mean-pooling matrix input (F32 [n_tokens, n_tokens])
// and register it with the graph result.
ggml_tensor * llama_context_base::build_inp_mean(
        llama_graph_result * res,
        ggml_context * ctx0,
        int32_t n_tokens) const {
    auto input = std::make_shared<llama_graph_input_mean>(cparams);

    ggml_tensor * t = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
    ggml_set_input(t);

    input->mean = t;

    res->add_input(input);

    return t;
}
// Create the per-sequence representative-token input (I32 [n_tokens])
// and register it with the graph result.
ggml_tensor * llama_context_base::build_inp_cls(
        llama_graph_result * res,
        ggml_context * ctx0,
        int32_t n_tokens) const {
    auto input = std::make_shared<llama_graph_input_cls>(cparams);

    ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
    ggml_set_input(t);

    input->cls = t;

    res->add_input(input);

    return t;
}
// Create the attention input (KQ mask) for the base context and register it
// with the graph result.
llama_graph_input_attn_ptr llama_context_base::build_attn_inp(
        llama_graph_result * res,
        ggml_context * ctx0,
        int32_t n_tokens,
        bool causal,
        bool swa) const {
    // the base context has no KV cache, so the number of KV values equals the
    // number of tokens in the batch and the causal/swa flags are not used
    GGML_UNUSED(causal);
    GGML_UNUSED(swa);

    auto inp = std::make_shared<llama_graph_input_attn_base>(model.hparams, cparams);

    inp->kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
    ggml_set_input(inp->kq_mask);

    if (cparams.flash_attn) {
        // cast the mask to F16 when flash attention is enabled
        inp->kq_mask_cnv = ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16);
    } else {
        inp->kq_mask_cnv = inp->kq_mask;
    }

    res->add_input(inp);

    return inp;
}
// Attention without a KV cache: permute Q/K/V and run multi-head attention
// over the full batch using the input's KQ mask.
ggml_tensor * llama_context_base::build_attn(
llama_graph_input_attn_i * inp,
ggml_context * ctx0,
ggml_cgraph * gf,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
float kq_scale,
int il) const {
GGML_UNUSED(il);

const auto & kq_mask = inp->get_kq_mask();

ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
//cb(q, "q", il);

ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
//cb(k, "k", il);

ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
//cb(v, "v", il);

ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, false, kq_scale);

return cur;
}
ggml_tensor * llama_context_base::build_attn_mha( ggml_tensor * llama_context_base::build_attn_mha(
ggml_context * ctx0, ggml_context * ctx0,
ggml_cgraph * gf, ggml_cgraph * gf,
@ -2485,6 +2210,7 @@ size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_i
// llama_context_kv_self // llama_context_kv_self
// //
// I32 [n_kv, n_batch]
class llama_graph_input_pos_bucket_kv : public llama_graph_input_i { class llama_graph_input_pos_bucket_kv : public llama_graph_input_i {
public: public:
llama_graph_input_pos_bucket_kv( llama_graph_input_pos_bucket_kv(
@ -2494,20 +2220,18 @@ public:
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * pos_bucket; // I32 [n_batch, n_batch]
const llama_hparams & hparams; const llama_hparams & hparams;
const llama_kv_cache_unified * kv_self; const llama_kv_cache_unified * kv_self;
}; };
void llama_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) { void llama_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
if (pos_bucket) { if (cur) {
const int64_t n_tokens = ubatch->n_tokens; const int64_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer));
GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
int32_t * data = (int32_t *) pos_bucket->data; int32_t * data = (int32_t *) cur->data;
const int64_t n_kv = kv_self->n; const int64_t n_kv = kv_self->n;
@ -3311,24 +3035,20 @@ ggml_cgraph * llama_context_kv_self::graph_init() {
return llama_context_base::graph_init(); return llama_context_base::graph_init();
} }
ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( llama_graph_input_ptr llama_context_kv_self::build_inp_pos_bucket(
llama_graph_result * res,
ggml_context * ctx0, ggml_context * ctx0,
int32_t n_tokens) const { int32_t n_tokens) const {
auto inp = std::make_shared<llama_graph_input_pos_bucket_kv>(model.hparams, kv_self.get()); auto inp = std::make_shared<llama_graph_input_pos_bucket_kv>(model.hparams, kv_self.get());
const auto n_kv = kv_self->n; const auto n_kv = kv_self->n;
inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
ggml_set_input(inp->pos_bucket); ggml_set_input(inp->cur);
res->inputs.push_back(inp); return inp;
return inp->pos_bucket;
} }
llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp(
llama_graph_result * res,
ggml_context * ctx0, ggml_context * ctx0,
int32_t n_tokens, int32_t n_tokens,
bool causal, bool causal,
@ -3359,8 +3079,6 @@ llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp(
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
} }
res->add_input(inp);
return inp; return inp;
} }
@ -3833,6 +3551,7 @@ size_t llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_se
// llama_context_recurrent // llama_context_recurrent
// //
// I32 [kv_size]
class llama_graph_input_s_copy : public llama_graph_input_i { class llama_graph_input_s_copy : public llama_graph_input_i {
public: public:
llama_graph_input_s_copy(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} llama_graph_input_s_copy(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
@ -3840,8 +3559,6 @@ public:
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * s_copy; // I32 [kv_size]
llama_kv_cache_recurrent * kv_self; llama_kv_cache_recurrent * kv_self;
}; };
@ -3850,9 +3567,9 @@ void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
const int64_t n_kv = kv_self->n; const int64_t n_kv = kv_self->n;
if (s_copy) { if (cur) {
GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer));
int32_t * data = (int32_t *) s_copy->data; int32_t * data = (int32_t *) cur->data;
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for (uint32_t i = 0; i < n_kv; ++i) { for (uint32_t i = 0; i < n_kv; ++i) {
@ -3878,6 +3595,7 @@ void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
} }
} }
// F32 [1, n_kv]
class llama_graph_input_s_mask : public llama_graph_input_i { class llama_graph_input_s_mask : public llama_graph_input_i {
public: public:
llama_graph_input_s_mask(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} llama_graph_input_s_mask(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
@ -3885,8 +3603,6 @@ public:
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * s_mask; // F32 [1, n_kv]
llama_kv_cache_recurrent * kv_self; llama_kv_cache_recurrent * kv_self;
}; };
@ -3895,9 +3611,9 @@ void llama_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
const int64_t n_kv = kv_self->n; const int64_t n_kv = kv_self->n;
if (s_mask) { if (cur) {
GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer)); GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer));
float * data = (float *) s_mask->data; float * data = (float *) cur->data;
// clear unused states // clear unused states
for (int i = 0; i < n_kv; ++i) { for (int i = 0; i < n_kv; ++i) {
@ -4302,36 +4018,30 @@ ggml_cgraph * llama_context_recurrent::graph_init() {
return llama_context_base::graph_init(); return llama_context_base::graph_init();
} }
ggml_tensor * llama_context_recurrent::build_inp_s_copy( llama_graph_input_ptr llama_context_recurrent::build_inp_s_copy(
llama_graph_result * res,
ggml_context * ctx0) const { ggml_context * ctx0) const {
auto inp = std::make_shared<llama_graph_input_s_copy>(kv_self.get()); auto inp = std::make_shared<llama_graph_input_s_copy>(kv_self.get());
const auto n_kv = kv_self->n; const auto n_kv = kv_self->n;
inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); inp->cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv);
//cb(inp.s_copy, "inp_s_copy", -1); //cb(inp.cur, "inp_s_copy", -1);
ggml_set_input(inp->s_copy); ggml_set_input(inp->cur);
res->add_input(inp); return inp;
return inp->s_copy;
} }
ggml_tensor * llama_context_recurrent::build_inp_s_mask( llama_graph_input_ptr llama_context_recurrent::build_inp_s_mask(
llama_graph_result * res,
ggml_context * ctx0) const { ggml_context * ctx0) const {
auto inp = std::make_shared<llama_graph_input_s_mask>(kv_self.get()); auto inp = std::make_shared<llama_graph_input_s_mask>(kv_self.get());
const auto n_kv = kv_self->n; const auto n_kv = kv_self->n;
inp->s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
//cb(inp->s_mask, "inp_s_mask", -1); //cb(inp->cur, "inp_s_mask", -1);
ggml_set_input(inp->s_mask); ggml_set_input(inp->cur);
res->add_input(inp); return inp;
return inp->s_mask;
} }
ggml_tensor * llama_context_recurrent::build_copy_mask_state( ggml_tensor * llama_context_recurrent::build_copy_mask_state(
@ -4904,6 +4614,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) {
// llama_context_dec // llama_context_dec
// //
// F32 [n_embd, n_outputs_enc]
class llama_graph_input_cross_embd : public llama_graph_input_i { class llama_graph_input_cross_embd : public llama_graph_input_i {
public: public:
llama_graph_input_cross_embd( llama_graph_input_cross_embd(
@ -4912,26 +4623,24 @@ public:
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc]
const llama_cross * cross; const llama_cross * cross;
}; };
void llama_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) { void llama_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
GGML_UNUSED(ubatch); GGML_UNUSED(ubatch);
if (cross_embd && cross->t_embd) { if (cur && cross->t_embd) {
assert(cross_embd->type == GGML_TYPE_F32); assert(cur->type == GGML_TYPE_F32);
ggml_backend_tensor_set(cross_embd, cross->v_embd, 0, ggml_nbytes(cross_embd)); ggml_backend_tensor_set(cur, cross->v_embd, 0, ggml_nbytes(cur));
} }
} }
class llama_graph_input_attn_dec : public llama_graph_input_attn_i { class llama_graph_input_attn_dec : public llama_graph_input_attn_i {
public: public:
llama_graph_input_attn_dec( llama_graph_input_attn_dec(
llama_graph_input_attn_i * inp_kv_self, llama_graph_input_attn_ptr inp_kv_self,
const llama_cross * cross) : inp_kv_self(inp_kv_self), cross(cross) {} const llama_cross * cross) : inp_kv_self(std::move(inp_kv_self)), cross(cross) {}
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
@ -4942,11 +4651,14 @@ public:
ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch] ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch]
ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch] ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch]
llama_graph_input_attn_i * inp_kv_self = nullptr; llama_graph_input_attn_ptr inp_kv_self = nullptr;
const llama_cross * cross = nullptr; const llama_cross * cross = nullptr;
}; };
void llama_graph_input_attn_dec::set_input(const llama_ubatch * ubatch) { void llama_graph_input_attn_dec::set_input(const llama_ubatch * ubatch) {
inp_kv_self->set_input(ubatch);
if (cross_kq_mask) { if (cross_kq_mask) {
const int64_t n_enc = cross_kq_mask->ne[0]; const int64_t n_enc = cross_kq_mask->ne[0];
const int64_t n_tokens = ubatch->n_tokens; const int64_t n_tokens = ubatch->n_tokens;
@ -4990,17 +4702,16 @@ ggml_cgraph * llama_context_dec::graph_init() {
return llama_context_kv_self::graph_init(); return llama_context_kv_self::graph_init();
} }
ggml_tensor * llama_context_dec::build_inp_cross_embd( llama_graph_input_ptr llama_context_dec::build_inp_cross_embd(
llama_graph_result * res,
ggml_context * ctx0) const { ggml_context * ctx0) const {
auto inp = std::make_shared<llama_graph_input_cross_embd>(cross); auto inp = std::make_shared<llama_graph_input_cross_embd>(cross);
// if we have the output embeddings from the encoder, use them directly // if we have the output embeddings from the encoder, use them directly
// TODO: needs more work to be correct, for now just use the tensor shape // TODO: needs more work to be correct, for now just use the tensor shape
//if (cross->t_embd) { //if (cross->t_embd) {
// inp->cross_embd = ggml_view_tensor(ctx0, cross->t_embd); // inp->cur = ggml_view_tensor(ctx0, cross->t_embd);
// return inp->cross_embd; // return inp->cur;
//} //}
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
@ -5008,23 +4719,20 @@ ggml_tensor * llama_context_dec::build_inp_cross_embd(
const auto n_embd = cross->t_embd ? cross->t_embd->ne[0] : hparams.n_embd; const auto n_embd = cross->t_embd ? cross->t_embd->ne[0] : hparams.n_embd;
const auto n_enc = cross->t_embd ? cross->t_embd->ne[1] : hparams.n_ctx_train; const auto n_enc = cross->t_embd ? cross->t_embd->ne[1] : hparams.n_ctx_train;
inp->cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
ggml_set_input(inp->cross_embd); ggml_set_input(inp->cur);
res->add_input(inp); return inp;
return inp->cross_embd;
} }
llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( llama_graph_input_attn_ptr llama_context_dec::build_attn_inp(
llama_graph_result * res,
ggml_context * ctx0, ggml_context * ctx0,
int32_t n_tokens, int32_t n_tokens,
bool causal, bool causal,
bool swa) const { bool swa) const {
auto inp_kv_self = llama_context_kv_self::build_attn_inp(res, ctx0, n_tokens, causal, swa); auto inp_kv_self = llama_context_kv_self::build_attn_inp(ctx0, n_tokens, causal, swa);
auto inp = std::make_shared<llama_graph_input_attn_dec>(inp_kv_self.get(), cross); auto inp = std::make_shared<llama_graph_input_attn_dec>(std::move(inp_kv_self), cross);
const int32_t n_enc = cross->t_embd ? cross->t_embd->ne[1] : model.hparams.n_ctx_train; const int32_t n_enc = cross->t_embd ? cross->t_embd->ne[1] : model.hparams.n_ctx_train;
@ -5033,8 +4741,6 @@ llama_graph_input_attn_ptr llama_context_dec::build_attn_inp(
inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask; inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
res->add_input(inp);
return inp; return inp;
} }

View File

@ -242,12 +242,6 @@ public:
int decode(llama_batch & inp_batch) override; int decode(llama_batch & inp_batch) override;
protected: protected:
//
// input
//
virtual int64_t n_pos_per_token() const; // vision
// //
// output // output
// //
@ -287,6 +281,8 @@ public:
// graph build // graph build
// //
int32_t get_n_outputs() const override;
void build_cb( void build_cb(
ggml_tensor * cur, ggml_tensor * cur,
const char * name, const char * name,
@ -314,45 +310,16 @@ public:
ggml_tensor * build_rope_factors(int il) const override; ggml_tensor * build_rope_factors(int il) const override;
ggml_tensor * build_rope_shift( llama_graph_input_ptr build_inp_embd(
ggml_context * ctx0,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * factors,
ggml_backend_buffer * bbuf) const override;
ggml_tensor * build_inp_embd(
llama_graph_result * res,
ggml_context * ctx0, ggml_context * ctx0,
ggml_tensor * tok_embd, ggml_tensor * tok_embd,
const llama_ubatch & ubatch) const override; const llama_ubatch & ubatch) const override;
ggml_tensor * build_inp_pos( llama_graph_input_ptr build_inp_pos_bucket(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) const override;
ggml_tensor * build_inp_pos_bucket(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) const override;
ggml_tensor * build_inp_out_ids(
llama_graph_result * res,
ggml_context * ctx0) const override;
ggml_tensor * build_inp_mean(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) const override;
ggml_tensor * build_inp_cls(
llama_graph_result * res,
ggml_context * ctx0, ggml_context * ctx0,
int32_t n_tokens) const override; int32_t n_tokens) const override;
llama_graph_input_attn_ptr build_attn_inp( llama_graph_input_attn_ptr build_attn_inp(
llama_graph_result * res,
ggml_context * ctx0, ggml_context * ctx0,
int32_t n_tokens, int32_t n_tokens,
bool causal, bool causal,
@ -370,7 +337,15 @@ public:
int il) const override; int il) const override;
protected: protected:
virtual ggml_tensor * build_attn_mha( // note: optionally set the backend to be the same as the bbuf's backend
ggml_tensor * build_rope_shift(
ggml_context * ctx0,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * factors,
ggml_backend_buffer * bbuf) const;
ggml_tensor * build_attn_mha(
ggml_context * ctx0, ggml_context * ctx0,
ggml_cgraph * gf, ggml_cgraph * gf,
ggml_tensor * q, ggml_tensor * q,
@ -458,28 +433,9 @@ protected:
llama_loras loras; llama_loras loras;
llama_sbatch sbatch; llama_sbatch sbatch;
ggml_threadpool_t threadpool = nullptr;
ggml_threadpool_t threadpool_batch = nullptr;
ggml_abort_callback abort_callback = nullptr;
void * abort_callback_data = nullptr;
ggml_backend_t backend_cpu = nullptr;
std::vector<ggml_backend_ptr> backends;
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
ggml_backend_sched_ptr sched; ggml_backend_sched_ptr sched;
// buffer types used for the compute buffer of each backend // TODO: these below likely need some rework in the future, together with the batch-refactoring
std::vector<ggml_backend_t> backend_ptrs;
std::vector<ggml_backend_buffer_type_t> backend_buft;
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
// host buffer for the model output (logits and embeddings)
ggml_backend_buffer_ptr buf_output;
// TODO: remove // TODO: remove
bool logits_all = false; bool logits_all = false;
@ -502,6 +458,30 @@ protected:
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
private:
// base functionality - should not leak into derived classes
ggml_threadpool_t threadpool = nullptr;
ggml_threadpool_t threadpool_batch = nullptr;
ggml_abort_callback abort_callback = nullptr;
void * abort_callback_data = nullptr;
ggml_backend_t backend_cpu = nullptr;
std::vector<ggml_backend_ptr> backends;
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
// buffer types used for the compute buffer of each backend
std::vector<ggml_backend_t> backend_ptrs;
std::vector<ggml_backend_buffer_type_t> backend_buft;
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
// host buffer for the model output (logits and embeddings)
ggml_backend_buffer_ptr buf_output;
bool has_evaluated_once = false; bool has_evaluated_once = false;
}; };
@ -539,13 +519,11 @@ public:
// graph build // graph build
// //
ggml_tensor * build_inp_pos_bucket( llama_graph_input_ptr build_inp_pos_bucket(
llama_graph_result * res,
ggml_context * ctx0, ggml_context * ctx0,
int32_t n_tokens) const override; int32_t n_tokens) const override;
llama_graph_input_attn_ptr build_attn_inp( llama_graph_input_attn_ptr build_attn_inp(
llama_graph_result * res,
ggml_context * ctx0, ggml_context * ctx0,
int32_t n_tokens, int32_t n_tokens,
bool causal, bool causal,
@ -624,12 +602,10 @@ public:
// graph build // graph build
// //
ggml_tensor * build_inp_s_copy( llama_graph_input_ptr build_inp_s_copy(
llama_graph_result * res,
ggml_context * ctx0) const override; ggml_context * ctx0) const override;
ggml_tensor * build_inp_s_mask( llama_graph_input_ptr build_inp_s_mask(
llama_graph_result * res,
ggml_context * ctx0) const override; ggml_context * ctx0) const override;
ggml_tensor * build_copy_mask_state( ggml_tensor * build_copy_mask_state(
@ -694,6 +670,10 @@ private:
std::unique_ptr<llama_kv_cache_recurrent> kv_self; std::unique_ptr<llama_kv_cache_recurrent> kv_self;
}; };
//
// enc-dec
//
// TODO: tmp - need something better to pass the data from the encoder to the decoder // TODO: tmp - need something better to pass the data from the encoder to the decoder
struct llama_cross { struct llama_cross {
// the output embeddings from the encoder as a ggml tensor // the output embeddings from the encoder as a ggml tensor
@ -714,7 +694,7 @@ public:
int encode(llama_batch & inp_batch) override; int encode(llama_batch & inp_batch) override;
llama_cross * cross = nullptr; llama_cross * cross = nullptr; // TODO: hacky, rework
}; };
class llama_context_dec : public llama_context_kv_self { class llama_context_dec : public llama_context_kv_self {
@ -730,12 +710,10 @@ protected:
ggml_cgraph * graph_init() override; ggml_cgraph * graph_init() override;
ggml_tensor * build_inp_cross_embd( llama_graph_input_ptr build_inp_cross_embd(
llama_graph_result * res,
ggml_context * ctx0) const override; ggml_context * ctx0) const override;
llama_graph_input_attn_ptr build_attn_inp( llama_graph_input_attn_ptr build_attn_inp(
llama_graph_result * res,
ggml_context * ctx0, ggml_context * ctx0,
int32_t n_tokens, int32_t n_tokens,
bool causal, bool causal,
@ -753,7 +731,7 @@ protected:
int il) const override; int il) const override;
public: public:
llama_cross * cross = nullptr; llama_cross * cross = nullptr; // TODO: hacky, rework
}; };
class llama_context_enc_dec : public llama_context { class llama_context_enc_dec : public llama_context {

View File

@ -67,20 +67,16 @@ ggml_tensor * llama_graph_i::build_attn_cross(
return nullptr; return nullptr;
} }
ggml_tensor * llama_graph_i::build_inp_cross_embd( llama_graph_input_ptr llama_graph_i::build_inp_cross_embd(
llama_graph_result * res,
ggml_context * ctx0) const { ggml_context * ctx0) const {
GGML_UNUSED(res);
GGML_UNUSED(ctx0); GGML_UNUSED(ctx0);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__); LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
return nullptr; return nullptr;
} }
ggml_tensor * llama_graph_i::build_inp_s_copy ( llama_graph_input_ptr llama_graph_i::build_inp_s_copy (
llama_graph_result * res,
ggml_context * ctx0) const { ggml_context * ctx0) const {
GGML_UNUSED(res);
GGML_UNUSED(ctx0); GGML_UNUSED(ctx0);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__); LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@ -88,10 +84,8 @@ ggml_tensor * llama_graph_i::build_inp_s_copy (
return nullptr; // NOLINT return nullptr; // NOLINT
} }
ggml_tensor * llama_graph_i::build_inp_s_mask( llama_graph_input_ptr llama_graph_i::build_inp_s_mask(
llama_graph_result * res,
ggml_context * ctx0) const { ggml_context * ctx0) const {
GGML_UNUSED(res);
GGML_UNUSED(ctx0); GGML_UNUSED(ctx0);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__); LLAMA_LOG_ERROR("%s: not implemented\n", __func__);

View File

@ -29,6 +29,9 @@ public:
virtual ~llama_graph_input_i() = default; virtual ~llama_graph_input_i() = default;
virtual void set_input(const llama_ubatch * ubatch) = 0; virtual void set_input(const llama_ubatch * ubatch) = 0;
// by default, we produce a single input tensor, but some children could produce more
ggml_tensor * cur = nullptr;
}; };
using llama_graph_input_ptr = std::shared_ptr<llama_graph_input_i>; using llama_graph_input_ptr = std::shared_ptr<llama_graph_input_i>;
@ -76,7 +79,7 @@ public:
} }
} }
void add_input(llama_graph_input_ptr && input) { void add_input(llama_graph_input_ptr input) {
inputs.emplace_back(std::move(input)); inputs.emplace_back(std::move(input));
} }
@ -92,19 +95,23 @@ public:
// llama_graph // llama_graph
// //
// note: keep all methods const
// TODO: can become more granular in the future // TODO: can become more granular in the future
// TODO: move all methods that do not require things from llama_context to llm_build_context
class llama_graph_i { class llama_graph_i {
public: public:
llama_graph_i(llama_graph_type type); llama_graph_i(llama_graph_type type);
virtual ~llama_graph_i() = default; virtual ~llama_graph_i() = default;
llama_graph_type get_type() const { return type; } llama_graph_type get_type() const {
return type;
}
protected: private:
llama_graph_type type; llama_graph_type type;
public: public:
virtual int32_t get_n_outputs() const = 0;
// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
virtual void build_cb( virtual void build_cb(
ggml_tensor * cur, ggml_tensor * cur,
@ -131,50 +138,27 @@ public:
ggml_tensor * cur, // struct ggml_tensor * b ggml_tensor * cur, // struct ggml_tensor * b
ggml_tensor * ids) const = 0; ggml_tensor * ids) const = 0;
// rope factors based on the current context size
virtual ggml_tensor * build_rope_factors(int il) const = 0; virtual ggml_tensor * build_rope_factors(int il) const = 0;
// note: optionally set the backend to be the same as the bbuf's backend
virtual ggml_tensor * build_rope_shift(
ggml_context * ctx0,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * factors,
ggml_backend_buffer * bbuf) const = 0;
// graph build API (context-specific) // graph build API (context-specific)
virtual ggml_tensor * build_inp_embd( // input embeddings with optional lora
llama_graph_result * res, virtual llama_graph_input_ptr build_inp_embd(
ggml_context * ctx0, ggml_context * ctx0,
ggml_tensor * tok_embd, ggml_tensor * tok_embd,
const llama_ubatch & ubatch) const = 0; const llama_ubatch & ubatch) const = 0;
virtual ggml_tensor * build_inp_pos( // enc-dec pos
llama_graph_result * res, virtual llama_graph_input_ptr build_inp_pos_bucket(
ggml_context * ctx0, ggml_context * ctx0,
int32_t n_tokens) const = 0; int32_t n_tokens) const = 0;
virtual ggml_tensor * build_inp_pos_bucket( //
llama_graph_result * res, // attention API
ggml_context * ctx0, //
int32_t n_tokens) const = 0;
virtual ggml_tensor * build_inp_out_ids(
llama_graph_result * res,
ggml_context * ctx0) const = 0;
virtual ggml_tensor * build_inp_mean(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) const = 0;
virtual ggml_tensor * build_inp_cls(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) const = 0;
virtual llama_graph_input_attn_ptr build_attn_inp( virtual llama_graph_input_attn_ptr build_attn_inp(
llama_graph_result * res,
ggml_context * ctx0, ggml_context * ctx0,
int32_t n_tokens, int32_t n_tokens,
bool causal, bool causal,
@ -202,16 +186,17 @@ public:
float kq_scale, float kq_scale,
int il) const; int il) const;
virtual ggml_tensor * build_inp_cross_embd( virtual llama_graph_input_ptr build_inp_cross_embd(
llama_graph_result * res,
ggml_context * ctx0) const; ggml_context * ctx0) const;
virtual ggml_tensor * build_inp_s_copy( //
llama_graph_result * res, // recurrent API
//
virtual llama_graph_input_ptr build_inp_s_copy(
ggml_context * ctx0) const; ggml_context * ctx0) const;
virtual ggml_tensor * build_inp_s_mask( virtual llama_graph_input_ptr build_inp_s_mask(
llama_graph_result * res,
ggml_context * ctx0) const; ggml_context * ctx0) const;
virtual ggml_tensor * build_copy_mask_state( virtual ggml_tensor * build_copy_mask_state(

View File

@ -3813,6 +3813,212 @@ enum llm_norm_type {
LLM_NORM_GROUP, LLM_NORM_GROUP,
}; };
class llama_graph_input_pos : public llama_graph_input_i {
public:
llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
virtual ~llama_graph_input_pos() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * pos = nullptr; // I32 [n_batch]
const int64_t n_pos_per_token = 1;
};
void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens;
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
}
}
class llama_graph_input_out_ids : public llama_graph_input_i {
public:
llama_graph_input_out_ids(
const llama_hparams & hparams,
const llama_cparams & cparams,
int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
virtual ~llama_graph_input_out_ids() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * out_ids; // I32 [n_outputs]
const llama_hparams & hparams;
const llama_cparams & cparams;
const int32_t n_outputs;
};
void llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
//GGML_ASSERT(out_ids && "every model that can must skip unused outputs");
if (!out_ids) {
LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__);
} else {
const int64_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
int32_t * data = (int32_t *) out_ids->data;
if (n_outputs == n_tokens) {
for (int i = 0; i < n_tokens; ++i) {
data[i] = i;
}
} else if (ubatch->output) {
int32_t n_outputs = 0;
for (int i = 0; i < n_tokens; ++i) {
if (ubatch->output[i]) {
data[n_outputs++] = i;
}
}
// the graph needs to have been passed the correct number of outputs
GGML_ASSERT(n_outputs == n_outputs);
} else if (n_outputs == 1) {
// only keep last output
data[0] = n_tokens - 1;
} else {
GGML_ASSERT(n_outputs == 0);
}
}
}
}
class llama_graph_input_mean : public llama_graph_input_i {
public:
llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
virtual ~llama_graph_input_mean() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * mean; // F32 [n_batch, n_batch]
const llama_cparams & cparams;
};
void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) {
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
const int64_t n_tokens = ubatch->n_tokens;
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
const int64_t n_seqs = ubatch->n_seqs;
GGML_ASSERT(mean);
GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer));
float * data = (float *) mean->data;
memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean));
std::vector<uint64_t> sum(n_tokens, 0);
for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0];
// TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
sum[seq_id] += ubatch->n_seq_tokens;
}
std::vector<float> div(n_tokens, 0.0f);
for (int i = 0; i < n_tokens; ++i) {
const uint64_t s = sum[i];
if (s > 0) {
div[i] = 1.0f/float(s);
}
}
for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0];
for (int i = 0; i < n_seq_tokens; ++i) {
data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
}
}
}
}
class llama_graph_input_cls : public llama_graph_input_i {
public:
llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
virtual ~llama_graph_input_cls() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * cls; // I32 [n_batch]
const llama_cparams & cparams;
};
void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) {
if (cparams.embeddings && (
cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
const int64_t n_tokens = ubatch->n_tokens;
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
const int64_t n_seqs = ubatch->n_seqs;
GGML_ASSERT(cls);
GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
uint32_t * data = (uint32_t *) cls->data;
memset(cls->data, 0, n_tokens * ggml_element_size(cls));
for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0];
// TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
for (int i = 0; i < n_seq_tokens; ++i) {
const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];
if (pos == 0) {
data[seq_id] = s*n_seq_tokens + i;
}
}
}
}
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
const int64_t n_tokens = ubatch->n_tokens;
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
const int64_t n_seqs = ubatch->n_seqs;
GGML_ASSERT(cls);
GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
uint32_t * data = (uint32_t *) cls->data;
memset(cls->data, 0, n_tokens * ggml_element_size(cls));
std::vector<int> last_pos(n_tokens, -1);
std::vector<int> last_row(n_tokens, -1);
for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0];
// TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
for (int i = 0; i < n_seq_tokens; ++i) {
const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];
if (pos >= last_pos[seq_id]) {
last_pos[seq_id] = pos;
last_row[seq_id] = s*n_seq_tokens + i;
}
}
}
for (int i = 0; i < n_tokens; ++i) {
if (last_row[i] >= 0) {
data[i] = last_row[i];
}
}
}
}
struct llm_build_context { struct llm_build_context {
const llama_model & model; const llama_model & model;
const llama_hparams & hparams; const llama_hparams & hparams;
@ -3895,55 +4101,75 @@ struct llm_build_context {
res (std::make_unique<llama_graph_result>()) { res (std::make_unique<llama_graph_result>()) {
} }
int64_t n_pos_per_token() const {
return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1;
}
// TODO: tmp // TODO: tmp
void cb(struct ggml_tensor * cur, const char * name, int il) { void cb(struct ggml_tensor * cur, const char * name, int il) {
lgf->build_cb(cur, name, ubatch, il); lgf->build_cb(cur, name, ubatch, il);
} }
// TODO: tmp
struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) {
struct ggml_tensor * inpL = lgf->build_inp_embd(res.get(), ctx0, tok_embd, ubatch); auto inp = lgf->build_inp_embd(ctx0, tok_embd, ubatch);
cb(inpL, "inp_embd", -1);
return inpL; cb(inp->cur, "inp_embd", -1);
res->add_input(inp);
return inp->cur;
} }
// TODO: tmp struct ggml_tensor * build_inp_pos() const {
struct ggml_tensor * build_inp_pos() { auto inp = std::make_shared<llama_graph_input_pos>(n_pos_per_token());
ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens);
cb(cur, "inp_pos", -1);
return cur; inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
ggml_set_input(inp->pos);
res->add_input(inp);
return inp->pos;
} }
// TODO: tmp
struct ggml_tensor * build_inp_out_ids() { struct ggml_tensor * build_inp_out_ids() {
ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0); const auto n_outputs = lgf->get_n_outputs();
cb(cur, "inp_out_ids", -1);
return cur; auto inp = std::make_shared<llama_graph_input_out_ids>(hparams, cparams, n_outputs);
inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
ggml_set_input(inp->out_ids);
res->add_input(inp);
return inp->out_ids;
} }
// TODO: tmp
struct ggml_tensor * build_inp_mean() { struct ggml_tensor * build_inp_mean() {
ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens); auto inp = std::make_shared<llama_graph_input_mean>(cparams);
cb(cur, "inp_mean", -1);
return cur; inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
ggml_set_input(inp->mean);
res->add_input(inp);
return inp->mean;
} }
// TODO: tmp
struct ggml_tensor * build_inp_cls() { struct ggml_tensor * build_inp_cls() {
ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens); auto inp = std::make_shared<llama_graph_input_cls>(cparams);
cb(cur, "inp_cls", -1);
return cur; inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
ggml_set_input(inp->cls);
res->add_input(inp);
return inp->cls;
} }
// TODO: tmp // TODO: tmp
struct ggml_tensor * build_lora_mm( struct ggml_tensor * build_lora_mm(
struct ggml_tensor * w, struct ggml_tensor * w,
struct ggml_tensor * cur) { struct ggml_tensor * cur) const {
return lgf->build_lora_mm(ctx0, w, cur); return lgf->build_lora_mm(ctx0, w, cur);
} }
@ -3951,24 +4177,42 @@ struct llm_build_context {
struct ggml_tensor * build_lora_mm_id( struct ggml_tensor * build_lora_mm_id(
struct ggml_tensor * w, // struct ggml_tensor * as struct ggml_tensor * w, // struct ggml_tensor * as
struct ggml_tensor * cur, // struct ggml_tensor * b struct ggml_tensor * cur, // struct ggml_tensor * b
struct ggml_tensor * ids) { struct ggml_tensor * ids) const {
return lgf->build_lora_mm_id(ctx0, w, cur, ids); return lgf->build_lora_mm_id(ctx0, w, cur, ids);
} }
// TODO: tmp
struct ggml_tensor * build_pos_bucket() { struct ggml_tensor * build_pos_bucket() {
ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens); auto inp = lgf->build_inp_pos_bucket(ctx0, n_tokens);
cb(cur, "pos_bucket", -1); cb(inp->cur, "pos_bucket", -1);
return cur; res->add_input(inp);
return inp->cur;
} }
// TODO: tmp
struct ggml_tensor * build_inp_cross_embd() { struct ggml_tensor * build_inp_cross_embd() {
ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0); auto inp = lgf->build_inp_cross_embd(ctx0);
cb(cur, "embd_enc", -1); cb(inp->cur, "embd_enc", -1);
return cur; res->add_input(inp);
return inp->cur;
}
struct ggml_tensor * build_inp_s_copy() const {
auto inp = lgf->build_inp_s_copy(ctx0);
res->add_input(inp);
return inp->cur;
}
struct ggml_tensor * build_inp_s_mask() const {
auto inp = lgf->build_inp_s_mask(ctx0);
res->add_input(inp);
return inp->cur;
} }
struct ggml_tensor * build_norm( struct ggml_tensor * build_norm(
@ -4250,6 +4494,18 @@ struct llm_build_context {
return moe_out; return moe_out;
} }
llama_graph_input_attn_ptr build_attn_inp(
ggml_context * ctx0,
int32_t n_tokens,
bool causal,
bool swa) const {
auto inp = lgf->build_attn_inp(ctx0, n_tokens, causal, swa);
res->add_input(inp);
return inp;
}
struct ggml_tensor * build_attn( struct ggml_tensor * build_attn(
llama_graph_input_attn_i * inp, llama_graph_input_attn_i * inp,
ggml_cgraph * gf, ggml_cgraph * gf,
@ -4490,7 +4746,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
@ -4651,7 +4907,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
@ -4807,7 +5063,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -4923,7 +5179,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -5028,7 +5284,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * attn_norm; struct ggml_tensor * attn_norm;
@ -5151,7 +5407,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -5303,7 +5559,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -5425,7 +5681,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
cb(pos, "pos_embd", -1); cb(pos, "pos_embd", -1);
@ -5526,7 +5782,7 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -5640,7 +5896,7 @@ struct llm_build_context {
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
cb(inpL, "inp_norm", -1); cb(inpL, "inp_norm", -1);
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, false, false);
// iterate layers // iterate layers
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
@ -5785,7 +6041,7 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
inpL = build_norm(inpL, inpL = build_norm(inpL,
model.tok_norm, model.tok_norm,
@ -5888,7 +6144,7 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
if (model.pos_embd) { if (model.pos_embd) {
// inp_pos - contains the positions // inp_pos - contains the positions
@ -6030,11 +6286,9 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
// norm // norm
cur = build_norm(inpL, cur = build_norm(inpL,
model.layers[il].attn_norm, model.layers[il].attn_norm,
@ -6181,7 +6435,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -6295,7 +6549,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -6408,7 +6662,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
int sections[4]; int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@ -6526,7 +6780,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -6673,7 +6927,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
attn_norm_output = build_norm(inpL, attn_norm_output = build_norm(inpL,
@ -6795,8 +7049,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads) auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true);
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
auto * residual = inpL; auto * residual = inpL;
@ -6940,7 +7193,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
@ -7046,7 +7299,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
cb(pos, "pos_embd", -1); cb(pos, "pos_embd", -1);
@ -7152,7 +7405,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL, cur = build_norm(inpL,
@ -7263,7 +7516,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -7382,7 +7635,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -7510,7 +7763,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -7711,7 +7964,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
// norm // norm
@ -7819,7 +8072,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
// norm // norm
@ -7949,7 +8202,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -8062,8 +8315,8 @@ struct llm_build_context {
// {n_embd, n_tokens} // {n_embd, n_tokens}
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); struct ggml_tensor * state_copy = build_inp_s_copy();
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); struct ggml_tensor * state_mask = build_inp_s_mask();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
// norm // norm
@ -8124,7 +8377,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
@ -8272,7 +8525,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true);
// sliding window switch pattern // sliding window switch pattern
const int32_t sliding_window_pattern = 4; const int32_t sliding_window_pattern = 4;
@ -8407,7 +8660,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -8527,7 +8780,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -8651,7 +8904,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -8772,7 +9025,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const int64_t n_head = hparams.n_head(il); const int64_t n_head = hparams.n_head(il);
@ -8900,7 +9153,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL, cur = build_norm(inpL,
@ -9044,7 +9297,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -9174,7 +9427,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@ -9337,7 +9590,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -9555,7 +9808,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -9706,7 +9959,7 @@ struct llm_build_context {
struct ggml_tensor * pos_bucket_enc = build_pos_bucket(); struct ggml_tensor * pos_bucket_enc = build_pos_bucket();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, false, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -9809,7 +10062,7 @@ struct llm_build_context {
const int64_t n_outputs_enc = embd_enc->ne[1]; const int64_t n_outputs_enc = embd_enc->ne[1];
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -9972,7 +10225,7 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL, cur = build_norm(inpL,
@ -10066,7 +10319,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -10196,7 +10449,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -10317,7 +10570,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -10435,8 +10688,8 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); struct ggml_tensor * state_copy = build_inp_s_copy();
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); struct ggml_tensor * state_mask = build_inp_s_mask();
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
const auto n_seq_tokens = ubatch.n_seq_tokens; const auto n_seq_tokens = ubatch.n_seq_tokens;
@ -10527,8 +10780,8 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); struct ggml_tensor * state_copy = build_inp_s_copy();
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); struct ggml_tensor * state_mask = build_inp_s_mask();
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
const auto n_seq_tokens = ubatch.n_seq_tokens; const auto n_seq_tokens = ubatch.n_seq_tokens;
@ -10622,7 +10875,7 @@ struct llm_build_context {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;

View File

@ -365,7 +365,6 @@ struct llama_model {
const struct ggml_tensor * get_tensor(const char * name) const; const struct ggml_tensor * get_tensor(const char * name) const;
// TODO: add encode/decode graphs
llama_graph_result_ptr build_graph( llama_graph_result_ptr build_graph(
ggml_context * ctx, ggml_context * ctx,
ggml_cgraph * gf, ggml_cgraph * gf,