Avoid re-compilation in llama-bench

Yu, Zijun 2026-02-04 16:58:39 +08:00
parent 18ab0f562b
commit b6c0697d10
6 changed files with 78 additions and 39 deletions


@@ -24,6 +24,8 @@ GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t b
GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);
GGML_BACKEND_API size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer);
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);


@@ -79,6 +79,17 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
add_extra_inputs();
}
void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
m_cgraph = cgraph;
m_model_inputs.clear();
m_model_outputs.clear();
m_node_info_list.clear();
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
auto * cur_node = cgraph->nodes[node_n];
set_input_output(cur_node);
}
}
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
m_cgraph = cgraph;
m_model_weights = model_weights;
@@ -330,6 +341,7 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
auto * mask = node->src[3];
std::string mask_name(mask->name);
model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
if (mask_name.find("swa") != std::string::npos) {
model_params.swa_layers.push_back(layer);
model_params.ctx_per_seq_swa = cache_k->ne[1];
@@ -358,7 +370,7 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
break;
}
if (node->op == GGML_OP_ROPE) {
model_params.rope_params = node->op_params;
memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
}
}
auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
@@ -405,7 +417,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
// kvcache
input_shape = ov::PartialShape{get_shape(input)};
if (!m_is_static) {
// do not fix ctx size to make llama-bench work
// do not fix ctx size to make llama-bench work across test params
input_shape[2] = -1;
}
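A note on the rope_params handling above: the decoder now memcpy's the 15 RoPE op-params into ModelParams instead of storing the node->op_params pointer. A minimal standalone sketch of that ownership pattern follows; RopeParamsCopy is a hypothetical stand-in, not a type from this commit, and the presumed motivation is that an aliased pointer goes stale once llama-bench rebuilds the graph between test cases, while pointer identity never compared equal across rebuilds anyway.

#include <cstdint>
#include <cstring>

struct RopeParamsCopy {
    int32_t rope_params[15] = {};  // owned copy, stays valid after the source graph is freed

    void capture(const int32_t * op_params) {
        // copy the parameter block out of the ggml node instead of aliasing it
        std::memcpy(rope_params, op_params, sizeof(rope_params));
    }

    bool same(const RopeParamsCopy & other) const {
        // value comparison, mirroring ModelParams::same_rope_params in this commit
        return std::memcmp(rope_params, other.rope_params, sizeof(rope_params)) == 0;
    }
};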


@@ -5,6 +5,7 @@
#include "openvino/decoder.hpp"
#include <cstdint>
#include <cstring>
#include <map>
#include <memory>
#include <openvino/core/partial_shape.hpp>
@@ -20,20 +21,21 @@ struct ModelParams {
int n_heads = -1;
int n_heads_kv = -1;
int head_size = -1;
int32_t * rope_params = nullptr;
int32_t rope_params[15];
std::vector<int> swa_layers;
std::vector<std::string> kv_names;
size_t kv_buffer_ctx_id = 0;
bool operator==(const ModelParams & other) const {
return n_seq == other.n_seq && n_heads == other.n_heads && n_heads_kv == other.n_heads_kv &&
head_size == other.head_size && rope_params == other.rope_params && swa_layers == other.swa_layers &&
ctx_per_seq == other.ctx_per_seq && ctx_per_seq_swa == other.ctx_per_seq_swa;
bool same_rope_params(const ModelParams & other) const {
return memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0;
}
bool can_reuse_dynamically(const ModelParams & other) const { return *this == other; }
bool can_reuse_dynamically(const ModelParams & other) const { return same_rope_params(other); }
bool can_reuse_statically(const ModelParams & other) const { return *this == other; }
bool can_reuse_statically(const ModelParams & other) const { return same_rope_params(other) && ctx == other.ctx; }
bool kv_buffer_changed(const ModelParams & other) const { return kv_buffer_ctx_id != other.kv_buffer_ctx_id; }
};
struct ComputeParams {
@@ -170,7 +172,7 @@ public:
int get_input_len() const { return m_compute_params.input_len; }
virtual int32_t * get_rope_params() const override { return m_model_params.rope_params; }
virtual int32_t * get_rope_params() const override { return const_cast<int32_t *>(m_model_params.rope_params); }
virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
@@ -213,6 +215,8 @@ public:
static std::string compute_op_type(const ggml_tensor * node);
void add_extra_inputs();
void update_io(ggml_cgraph * cgraph);
inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE;
}
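To make the relaxed reuse rules concrete: dynamic-shape models are reused whenever the RoPE parameters match, static-shape models additionally require the same ctx, and a changed KV-buffer context id only forces an input/output refresh rather than a recompile. A minimal decision sketch, using a hypothetical driver function on top of the ModelParams API above:

// Hypothetical helper; 'cached' comes from the stored decoder, 'incoming' from the new cgraph.
bool reuse_compiled_model(const ModelParams & cached, const ModelParams & incoming, bool is_static) {
    return is_static ? cached.can_reuse_statically(incoming)
                     : cached.can_reuse_dynamically(incoming);
}
// Even on reuse, a re-allocated KV cache invalidates the cached tensor bindings:
//   if (cached.kv_buffer_changed(incoming)) { ggml_decoder->update_io(cgraph); }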


@@ -8,6 +8,7 @@
#include "ggml-quants.hpp"
#include "ggml.h"
#include <atomic>
#include <cstdint>
#include <cstring>
#include <memory>
@@ -53,6 +54,7 @@
struct ggml_backend_openvino_buffer_context {
int device;
std::string name;
size_t id;
// For non-weight buffers (KV cache, compute), we still use contiguous allocation
void * data;
@@ -71,6 +73,10 @@ struct ggml_backend_openvino_buffer_context {
ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) :
device(device),
name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
id([]() {
static std::atomic<size_t> next_id{1};
return next_id.fetch_add(1);
}()),
data(nullptr),
size(size),
is_remote(is_remote) {
@@ -107,6 +113,8 @@ struct ggml_backend_openvino_buffer_context {
~ggml_backend_openvino_buffer_context() {
// Clean up all tensor extras
GGML_LOG_DEBUG("Deleting OpenVINO buffer context #%zu for device %d, size %zu MB\n", id, device,
size / 1024 / 1024);
for (auto & pair : tensor_extras) {
delete pair.second;
}
@@ -587,6 +595,14 @@ bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer;
}
size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
if (!ggml_backend_buffer_is_openvino(buffer)) {
return 0;
}
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
return ctx->id;
}
bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
}
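The per-context id above exists because comparing raw buffer pointers cannot distinguish a re-allocated KV-cache buffer from the original if the allocator hands back the same address, whereas a monotonically increasing id can. A minimal sketch of that pattern, with hypothetical names rather than the backend's actual types:

#include <atomic>
#include <cstddef>

struct buffer_ctx {
    std::size_t id;
    buffer_ctx() : id(next_id()) {}  // every context draws a fresh, never-reused id

    static std::size_t next_id() {
        static std::atomic<std::size_t> counter{1};
        return counter.fetch_add(1);
    }
};

// Two allocations are told apart even if the second lands on the first one's address:
//   auto * a = new buffer_ctx();  std::size_t old_id = a->id;  delete a;
//   auto * b = new buffer_ctx();  // possibly the same address as a, but b->id != old_id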


@@ -76,7 +76,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
ComputeParams c_params;
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);
const auto key = compute_graph_key(cgraph);
graph_key key(cgraph);
bool cache_hit;
int64_t decoder_end_time;
@@ -90,19 +90,22 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
auto it = decoder_cache.find(key);
cache_hit = it != decoder_cache.end();
ModelParams old_m_params;
if (cache_hit) {
ggml_decoder = it->second;
cache_hit = ggml_decoder->get_model_params().can_reuse_dynamically(m_params);
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_dynamically(m_params);
}
if (cache_hit) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
ggml_decoder = decoder_cache[key];
ggml_decoder->set_compute_params(c_params);
ggml_decoder->set_model_params(m_params);
if (old_m_params.kv_buffer_changed(m_params)) {
ggml_decoder->update_io(cgraph);
}
ggml_decoder->add_extra_inputs();
infer_request = infer_request_cache[key];
infer_request = infer_request_cache.at(key);
if (stateful) {
const auto * inp_pos = get_inp_pos_tensor(cgraph);
int32_t * pos_data = (int32_t *) inp_pos->data;
@@ -240,7 +243,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
const auto * inp_pos = get_inp_pos_tensor(cgraph);
const auto is_prefill = get_is_prefill(inp_pos);
const auto key = compute_graph_key(cgraph);
graph_key key(cgraph);
bool cache_hit;
int64_t decoder_end_time;
@@ -254,19 +257,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
auto it = decoder_cache.find(key);
cache_hit = it != decoder_cache.end();
ModelParams old_m_params;
if (cache_hit) {
ggml_decoder = it->second;
cache_hit = ggml_decoder->get_model_params().can_reuse_statically(m_params);
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_statically(m_params);
}
if (cache_hit) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
ggml_decoder = decoder_cache[key];
ggml_decoder->m_is_prefill = is_prefill;
ggml_decoder->set_model_params(m_params);
ggml_decoder->set_compute_params(c_params);
if (old_m_params.kv_buffer_changed(m_params)) {
ggml_decoder->update_io(cgraph);
}
ggml_decoder->add_extra_inputs();
infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key];
infer_request = is_prefill ? infer_request_cache_prefill.at(key) : infer_request_cache.at(key);
decoder_end_time = ggml_time_us();
conversion_end_time = decoder_end_time;
@@ -761,17 +768,4 @@ bool get_is_prefill(const ggml_tensor * inp_pos) {
return inp_pos->ne[0] > 1;
}
graph_key compute_graph_key(ggml_cgraph * cgraph) {
graph_key key;
key.n_nodes = cgraph->n_nodes;
for (int i = 0; i < cgraph->n_nodes; ++i) {
const auto * node = cgraph->nodes[i];
if (node->op == GGML_OP_SET_ROWS && strncmp(node->src[2]->name, "cache_k_l0", 10) == 0) {
key.cache_k_l0 = node->src[2];
}
}
return key;
}
#pragma GCC diagnostic pop


@@ -5,20 +5,33 @@
#include <algorithm>
#include <cstddef>
#include <openvino/runtime/core.hpp>
#include <string>
struct graph_key {
size_t n_nodes;
void * cache_k_l0;
int n_nodes;
std::string first_node_name;
std::string last_node_name;
graph_key(const ggml_cgraph * cgraph) : n_nodes(cgraph->n_nodes) {
if (n_nodes > 0) {
first_node_name = cgraph->nodes[0]->name;
last_node_name = cgraph->nodes[n_nodes - 1]->name;
}
}
bool operator==(const graph_key & other) const {
return n_nodes == other.n_nodes && cache_k_l0 == other.cache_k_l0;
return n_nodes == other.n_nodes && first_node_name == other.first_node_name &&
last_node_name == other.last_node_name;
}
};
struct graph_key_hash {
size_t operator()(const graph_key & key) const {
size_t h = std::hash<size_t>{}(key.n_nodes);
h ^= std::hash<void *>{}(key.cache_k_l0) + 0x9e3779b9 + (h << 6) + (h >> 2);
size_t h = std::hash<int>{}(key.n_nodes);
if (key.n_nodes > 0) {
h ^= std::hash<std::string>{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
h ^= std::hash<std::string>{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
}
return h;
}
};
@@ -66,8 +79,6 @@ const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph);
bool get_is_prefill(const ggml_tensor * inp_pos);
graph_key compute_graph_key(struct ggml_cgraph * cgraph);
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
const std::string & param_name);
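
For reference, the name-based graph_key and graph_key_hash are meant to slot into the existing hash-map caches used in ov_graph_compute_dynamic/_static. The declarations below are an assumed sketch: the cache names appear in the .cpp diff, but their exact element types are not shown there.

#include <memory>
#include <unordered_map>

// Assumed cache shapes; graph_key_hash supplies the custom hash, graph_key::operator== the equality.
std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
std::unordered_map<graph_key, ov::InferRequest, graph_key_hash> infer_request_cache;

// Lookup as used by this commit:
//   graph_key key(cgraph);              // node count + first/last node names,
//                                       // stable across KV-cache re-allocations
//   auto it = decoder_cache.find(key);
//   bool cache_hit = (it != decoder_cache.end());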