This commit is contained in:
Xuan-Son Nguyen 2026-02-01 12:33:06 +02:00 committed by GitHub
commit 6e81dbec53
6 changed files with 118 additions and 2 deletions

View File

@ -40,6 +40,33 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
return res;
}
void llm_graph_input_ngram_ids::set_input(const llama_ubatch * ubatch) {
GGML_ASSERT(!ubatch->embd);
GGML_ASSERT(ubatch->token);
const int64_t n_tokens = ubatch->n_tokens;
// each token has a context of ngram_k ids
std::vector<std::vector<llama_token>> ngrams;
ngrams.reserve(ubatch->n_tokens);
for (size_t i = 0; i < (size_t) n_tokens; ++i) {
auto ngram = mctx->get_last_n_tokens(ngram_n,
ubatch->pos[i],
ubatch->seq_id[i][0] /* FIXME: support multiple seq ids */);
printf("token[%zu] = %d : ngram =", i, ubatch->token[i]);
for (size_t j = 0; j < ngram.size(); ++j) {
printf(" %d", ngram[j]);
}
printf("\n");
ngrams.push_back(std::move(ngram));
}
if (ubatch->pos) { exit(1); } // TEST ONLY
if (ubatch->pos && pos_ngram) {
}
}
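Editor's note: the branch guarded by pos_ngram is left empty in this draft. A minimal sketch of how the gathered ids could be uploaded, assuming ngram_k ids per token laid out as in the header comment (I32 [n_batch, ngram_k]) and 0-padding for missing ids; the draft gathers ngram_n ids per token, both values are 4 here, so the sketch clamps to ngram_k. Illustrative only, not part of this commit:
if (ubatch->pos && pos_ngram) {
    // assumed layout: element (i, j) = j-th context id of token i, stored at index i + j*n_tokens
    std::vector<int32_t> ids_flat((size_t) n_tokens * ngram_k, 0); // 0-pad missing ids (assumption)
    for (size_t i = 0; i < (size_t) n_tokens; ++i) {
        for (size_t j = 0; j < ngrams[i].size() && j < (size_t) ngram_k; ++j) {
            ids_flat[j*(size_t) n_tokens + i] = ngrams[i][j];
        }
    }
    ggml_backend_tensor_set(pos_ngram, ids_flat.data(), 0, ids_flat.size()*sizeof(int32_t));
}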
void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens;
@ -1471,6 +1498,15 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
return cur;
}
ggml_tensor * llm_graph_context::build_inp_ngram_ids() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
auto inp = std::make_unique<llm_graph_input_ngram_ids>(4, 4, mctx_cur);
res->add_input(std::move(inp));
return nullptr; // TODO
}
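Editor's note: build_inp_ngram_ids currently only registers the input object and returns nullptr. A hypothetical completed version, modeled on build_inp_pos and keeping the draft's hard-coded 4/4 parameters, could allocate and return the I32 tensor; the shape follows the header comment and is an assumption, not part of this commit:
ggml_tensor * llm_graph_context::build_inp_ngram_ids() const {
    const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
    auto inp = std::make_unique<llm_graph_input_ngram_ids>(4, 4, mctx_cur);
    auto & cur = inp->pos_ngram;
    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, inp->ngram_k); // I32 [n_batch, ngram_k]
    ggml_set_input(cur);
    res->add_input(std::move(inp));
    return cur;
}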
ggml_tensor * llm_graph_context::build_inp_out_ids() const {
// note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
// but this would make the graph topology depend on the number of output tokens, which can interfere with

View File

@ -119,6 +119,21 @@ public:
const int64_t n_embd = 0;
};
class llm_graph_input_ngram_ids : public llm_graph_input_i {
public:
llm_graph_input_ngram_ids(uint32_t ngram_n, uint32_t ngram_k, const llama_kv_cache_context * mctx)
: ngram_n(ngram_n), ngram_k(ngram_k), mctx(mctx) {}
virtual ~llm_graph_input_ngram_ids() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * pos_ngram = nullptr; // I32 [n_batch, ngram_k]
uint32_t ngram_n = 0;
uint32_t ngram_k = 0;
const llama_kv_cache_context * mctx;
};
class llm_graph_input_pos : public llm_graph_input_i {
public:
llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
@ -816,6 +831,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
ggml_tensor * build_inp_pos() const;
ggml_tensor * build_inp_attn_scale() const;
ggml_tensor * build_inp_ngram_ids() const;
ggml_tensor * build_inp_out_ids() const;
ggml_tensor * build_inp_mean() const;
ggml_tensor * build_inp_cls() const;

View File

@ -933,8 +933,20 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch &
if (ubatch.is_pos_2d()) {
llama_kv_cell_ext ext {
/*.x =*/ ubatch.pos[i + ubatch.n_tokens*2],
/*.y =*/ ubatch.pos[i + ubatch.n_tokens],
/*.id =*/ 0, // unused
};
cells.ext_set(idx, ext);
}
if (ubatch.token) {
// save token id for ngram embeddings
GGML_ASSERT(!ubatch.embd);
llama_kv_cell_ext ext {
/*.x =*/ 0, // unused
/*.y =*/ 0, // unused
/*.id =*/ ubatch.token[i],
};
cells.ext_set(idx, ext);
}
@ -1500,6 +1512,40 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
}
}
std::vector<llama_token> llama_kv_cache::get_last_n_tokens(size_t n, llama_pos pos, llama_seq_id seq_id) const {
std::vector<llama_token> result;
result.resize(n, 0);
for (uint32_t s = 0; s < n_stream; ++s) {
const auto & cell = v_cells[s];
// TODO: linear scan is inefficient, optimize this later
for (uint32_t i = 0; i < cell.size(); ++i) {
if (!cell.seq_has(i, seq_id)) {
continue;
}
const llama_pos p = cell.pos_get(i);
const llama_token tok = cell.ext_get(i).id;
// check distance: (pos - n) <= p < pos
if (pos - (llama_pos) n <= p && p < pos) {
// make sure last token goes last
size_t insert_pos = n - (size_t)(pos - p);
// this assert should always hold mathematically; it is here only for clarity
GGML_ASSERT(insert_pos < n);
result[insert_pos] = tok;
}
}
if (result.size() >= n) {
break;
}
}
return result;
}
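Editor's note: as an illustration of the ordering (kv below stands for a hypothetical llama_kv_cache instance), with a single stream whose cells hold tokens t6..t9 at positions 6..9 for sequence 0, a call with n = 4 and pos = 10 maps p = 6,7,8,9 to insert_pos = 0,1,2,3, so the most recent token ends up last:
// insert_pos = n - (pos - p): p=6 -> 0, p=7 -> 1, p=8 -> 2, p=9 -> 3
std::vector<llama_token> last = kv.get_last_n_tokens(/*n =*/ 4, /*pos =*/ 10, /*seq_id =*/ 0);
// expected: { t6, t7, t8, t9 }; entries without a matching cell remain 0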
size_t llama_kv_cache::total_size() const {
size_t size = 0;
@ -2262,3 +2308,7 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub
void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
kv->set_input_pos_bucket(dst, ubatch);
}
std::vector<llama_token> llama_kv_cache_context::get_last_n_tokens(size_t n, llama_pos pos, llama_seq_id seq_id) const {
return kv->get_last_n_tokens(n, pos, seq_id);
}

View File

@ -199,6 +199,10 @@ public:
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
// used by ngram embeddings
// output order: the token with the highest pos comes last
std::vector<llama_token> get_last_n_tokens(size_t n, llama_pos pos, llama_seq_id seq_id) const;
private:
const llama_model & model;
const llama_hparams & hparams;
@ -353,6 +357,9 @@ public:
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
// used by ngram embeddings
std::vector<llama_token> get_last_n_tokens(size_t n, llama_pos pos, llama_seq_id seq_id) const;
private:
llama_memory_status status;

View File

@ -15,6 +15,10 @@ struct llama_kv_cell_ext {
llama_pos x = 0;
llama_pos y = 0;
// token ID, used by ngram embeddings
// currently defaults to 0, following the longcat-ngram implementation
llama_token id = 0;
// return true if the current 2D spatial position is greater than other
bool is_2d_gt(llama_pos ox, llama_pos oy) const {
return (y > oy) || (y == oy && x > ox);

View File

@ -24,6 +24,9 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
inp_attn = build_attn_inp_kv();
}
// TEST ONLY
build_inp_ngram_ids();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
ggml_tensor * inp_out_ids = build_inp_out_ids();
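Editor's note: the build_inp_ngram_ids() call above is a placeholder and its result is not used yet. Purely as an illustration of where this could go (not part of this commit, and assuming ngram_ids is the I32 tensor eventually returned once the TODO is resolved), the ids could be turned into ngram embeddings by gathering rows of the token embedding matrix:
// hypothetical follow-up: one embedding row per gathered ngram id
ggml_tensor * ngram_ids  = build_inp_ngram_ids(); // assumes the function returns the tensor instead of nullptr
ggml_tensor * ngram_embd = ggml_get_rows(ctx0, model.tok_embd, ngram_ids);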