Avoid re-compilation in llama-bench
This commit is contained in:
parent
18ab0f562b
commit
b6c0697d10
|
|
@ -24,6 +24,8 @@ GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t b
|
|||
|
||||
GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);
|
||||
|
||||
GGML_BACKEND_API size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer);
|
||||
|
||||
// device buffer
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);
|
||||
|
||||
|
|
|
|||
|
|
@ -79,6 +79,17 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
|
|||
add_extra_inputs();
|
||||
}
|
||||
|
||||
// Rebinds this decoder to `cgraph` and rebuilds its per-graph I/O bookkeeping.
//
// m_model_inputs, m_model_outputs and m_node_info_list are emptied first, then
// set_input_output() is called once for every node of `cgraph`, in graph order.
void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
    m_cgraph = cgraph;

    // Discard everything that was collected for the previously bound graph.
    m_model_inputs.clear();
    m_model_outputs.clear();
    m_node_info_list.clear();

    int idx = 0;
    while (idx < cgraph->n_nodes) {
        set_input_output(cgraph->nodes[idx]);
        ++idx;
    }
}
|
||||
|
||||
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
|
||||
m_cgraph = cgraph;
|
||||
m_model_weights = model_weights;
|
||||
|
|
@ -330,6 +341,7 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
|
|||
auto * mask = node->src[3];
|
||||
std::string mask_name(mask->name);
|
||||
|
||||
model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
|
||||
if (mask_name.find("swa") != std::string::npos) {
|
||||
model_params.swa_layers.push_back(layer);
|
||||
model_params.ctx_per_seq_swa = cache_k->ne[1];
|
||||
|
|
@ -358,7 +370,7 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
|
|||
break;
|
||||
}
|
||||
if (node->op == GGML_OP_ROPE) {
|
||||
model_params.rope_params = node->op_params;
|
||||
memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
|
||||
}
|
||||
}
|
||||
auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
|
||||
|
|
@ -405,7 +417,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
|
|||
// kvcache
|
||||
input_shape = ov::PartialShape{get_shape(input)};
|
||||
if (!m_is_static) {
|
||||
// do not fix ctx size to make llama-bench work
|
||||
// do not fix ctx size to make llama-bench work across test params
|
||||
input_shape[2] = -1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
#include "openvino/decoder.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <openvino/core/partial_shape.hpp>
|
||||
|
|
@ -20,20 +21,21 @@ struct ModelParams {
|
|||
int n_heads = -1;
|
||||
int n_heads_kv = -1;
|
||||
int head_size = -1;
|
||||
int32_t * rope_params = nullptr;
|
||||
// Copy of the GGML_OP_ROPE node's op_params (15 int32 slots). Value-initialized to
// zeros so that same_rope_params() never memcmp()s indeterminate bytes when no ROPE
// node has filled the array in (e.g. on a default-constructed ModelParams).
int32_t rope_params[15] = {};
|
||||
std::vector<int> swa_layers;
|
||||
|
||||
std::vector<std::string> kv_names;
|
||||
size_t kv_buffer_ctx_id = 0;
|
||||
|
||||
bool operator==(const ModelParams & other) const {
|
||||
return n_seq == other.n_seq && n_heads == other.n_heads && n_heads_kv == other.n_heads_kv &&
|
||||
head_size == other.head_size && rope_params == other.rope_params && swa_layers == other.swa_layers &&
|
||||
ctx_per_seq == other.ctx_per_seq && ctx_per_seq_swa == other.ctx_per_seq_swa;
|
||||
bool same_rope_params(const ModelParams & other) const {
|
||||
return memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0;
|
||||
}
|
||||
|
||||
bool can_reuse_dynamically(const ModelParams & other) const { return *this == other; }
|
||||
bool can_reuse_dynamically(const ModelParams & other) const { return same_rope_params(other); }
|
||||
|
||||
bool can_reuse_statically(const ModelParams & other) const { return *this == other; }
|
||||
bool can_reuse_statically(const ModelParams & other) const { return same_rope_params(other) && ctx == other.ctx; }
|
||||
|
||||
bool kv_buffer_changed(const ModelParams & other) const { return kv_buffer_ctx_id != other.kv_buffer_ctx_id; }
|
||||
};
|
||||
|
||||
struct ComputeParams {
|
||||
|
|
@ -170,7 +172,7 @@ public:
|
|||
|
||||
int get_input_len() const { return m_compute_params.input_len; }
|
||||
|
||||
virtual int32_t * get_rope_params() const override { return m_model_params.rope_params; }
|
||||
virtual int32_t * get_rope_params() const override { return const_cast<int32_t *>(m_model_params.rope_params); }
|
||||
|
||||
virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
|
||||
|
||||
|
|
@ -213,6 +215,8 @@ public:
|
|||
static std::string compute_op_type(const ggml_tensor * node);
|
||||
void add_extra_inputs();
|
||||
|
||||
void update_io(ggml_cgraph * cgraph);
|
||||
|
||||
inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@
|
|||
#include "ggml-quants.hpp"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
|
|
@ -53,6 +54,7 @@
|
|||
struct ggml_backend_openvino_buffer_context {
|
||||
int device;
|
||||
std::string name;
|
||||
size_t id;
|
||||
|
||||
// For non-weight buffers (KV cache, compute), we still use contiguous allocation
|
||||
void * data;
|
||||
|
|
@ -71,6 +73,10 @@ struct ggml_backend_openvino_buffer_context {
|
|||
ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) :
|
||||
device(device),
|
||||
name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
|
||||
id([]() {
|
||||
static std::atomic<size_t> next_id{1};
|
||||
return next_id.fetch_add(1);
|
||||
}()),
|
||||
data(nullptr),
|
||||
size(size),
|
||||
is_remote(is_remote) {
|
||||
|
|
@ -107,6 +113,8 @@ struct ggml_backend_openvino_buffer_context {
|
|||
|
||||
~ggml_backend_openvino_buffer_context() {
|
||||
// Clean up all tensor extras
|
||||
GGML_LOG_DEBUG("Deleting OpenVINO buffer context #%zu for device %d, size %zu MB\n", id, device,
|
||||
size / 1024 / 1024);
|
||||
for (auto & pair : tensor_extras) {
|
||||
delete pair.second;
|
||||
}
|
||||
|
|
@ -587,6 +595,14 @@ bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
|
|||
return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer;
|
||||
}
|
||||
|
||||
// Returns the id stored in the OpenVINO buffer context that backs `buffer`.
//
// Returns 0 when `buffer` is null or is not an OpenVINO buffer. Context ids are
// handed out starting at 1, so 0 never matches a real OpenVINO buffer context.
size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
    // ggml_backend_buffer_is_openvino() dereferences the handle, so reject null first.
    if (buffer == nullptr || !ggml_backend_buffer_is_openvino(buffer)) {
        return 0;
    }
    const auto * ctx = static_cast<const ggml_backend_openvino_buffer_context *>(buffer->context);
    return ctx->id;
}
|
||||
|
||||
bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -76,7 +76,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
|
|||
ComputeParams c_params;
|
||||
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);
|
||||
|
||||
const auto key = compute_graph_key(cgraph);
|
||||
graph_key key(cgraph);
|
||||
bool cache_hit;
|
||||
|
||||
int64_t decoder_end_time;
|
||||
|
|
@ -90,19 +90,22 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
|
|||
auto it = decoder_cache.find(key);
|
||||
|
||||
cache_hit = it != decoder_cache.end();
|
||||
ModelParams old_m_params;
|
||||
if (cache_hit) {
|
||||
ggml_decoder = it->second;
|
||||
cache_hit = ggml_decoder->get_model_params().can_reuse_dynamically(m_params);
|
||||
old_m_params = ggml_decoder->get_model_params();
|
||||
cache_hit = old_m_params.can_reuse_dynamically(m_params);
|
||||
}
|
||||
|
||||
if (cache_hit) {
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
ggml_decoder = decoder_cache[key];
|
||||
ggml_decoder->set_compute_params(c_params);
|
||||
ggml_decoder->set_model_params(m_params);
|
||||
if (old_m_params.kv_buffer_changed(m_params)) {
|
||||
ggml_decoder->update_io(cgraph);
|
||||
}
|
||||
ggml_decoder->add_extra_inputs();
|
||||
infer_request = infer_request_cache[key];
|
||||
|
||||
infer_request = infer_request_cache.at(key);
|
||||
if (stateful) {
|
||||
const auto * inp_pos = get_inp_pos_tensor(cgraph);
|
||||
int32_t * pos_data = (int32_t *) inp_pos->data;
|
||||
|
|
@ -240,7 +243,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
|
|||
|
||||
const auto * inp_pos = get_inp_pos_tensor(cgraph);
|
||||
const auto is_prefill = get_is_prefill(inp_pos);
|
||||
const auto key = compute_graph_key(cgraph);
|
||||
graph_key key(cgraph);
|
||||
bool cache_hit;
|
||||
|
||||
int64_t decoder_end_time;
|
||||
|
|
@ -254,19 +257,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
|
|||
auto it = decoder_cache.find(key);
|
||||
|
||||
cache_hit = it != decoder_cache.end();
|
||||
ModelParams old_m_params;
|
||||
if (cache_hit) {
|
||||
ggml_decoder = it->second;
|
||||
cache_hit = ggml_decoder->get_model_params().can_reuse_statically(m_params);
|
||||
old_m_params = ggml_decoder->get_model_params();
|
||||
cache_hit = old_m_params.can_reuse_statically(m_params);
|
||||
}
|
||||
|
||||
if (cache_hit) {
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
ggml_decoder = decoder_cache[key];
|
||||
ggml_decoder->m_is_prefill = is_prefill;
|
||||
ggml_decoder->set_model_params(m_params);
|
||||
ggml_decoder->set_compute_params(c_params);
|
||||
if (old_m_params.kv_buffer_changed(m_params)) {
|
||||
ggml_decoder->update_io(cgraph);
|
||||
}
|
||||
ggml_decoder->add_extra_inputs();
|
||||
infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key];
|
||||
infer_request = is_prefill ? infer_request_cache_prefill.at(key) : infer_request_cache.at(key);
|
||||
|
||||
decoder_end_time = ggml_time_us();
|
||||
conversion_end_time = decoder_end_time;
|
||||
|
|
@ -761,17 +768,4 @@ bool get_is_prefill(const ggml_tensor * inp_pos) {
|
|||
return inp_pos->ne[0] > 1;
|
||||
}
|
||||
|
||||
graph_key compute_graph_key(ggml_cgraph * cgraph) {
|
||||
graph_key key;
|
||||
key.n_nodes = cgraph->n_nodes;
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
||||
const auto * node = cgraph->nodes[i];
|
||||
if (node->op == GGML_OP_SET_ROWS && strncmp(node->src[2]->name, "cache_k_l0", 10) == 0) {
|
||||
key.cache_k_l0 = node->src[2];
|
||||
}
|
||||
}
|
||||
return key;
|
||||
}
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
|
|
|
|||
|
|
@ -5,20 +5,33 @@
|
|||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <openvino/runtime/core.hpp>
|
||||
#include <string>
|
||||
|
||||
struct graph_key {
|
||||
size_t n_nodes;
|
||||
void * cache_k_l0;
|
||||
int n_nodes;
|
||||
std::string first_node_name;
|
||||
std::string last_node_name;
|
||||
|
||||
graph_key(const ggml_cgraph * cgraph) : n_nodes(cgraph->n_nodes) {
|
||||
if (n_nodes > 0) {
|
||||
first_node_name = cgraph->nodes[0]->name;
|
||||
last_node_name = cgraph->nodes[n_nodes - 1]->name;
|
||||
}
|
||||
}
|
||||
|
||||
bool operator==(const graph_key & other) const {
|
||||
return n_nodes == other.n_nodes && cache_k_l0 == other.cache_k_l0;
|
||||
return n_nodes == other.n_nodes && first_node_name == other.first_node_name &&
|
||||
last_node_name == other.last_node_name;
|
||||
}
|
||||
};
|
||||
|
||||
struct graph_key_hash {
|
||||
size_t operator()(const graph_key & key) const {
|
||||
size_t h = std::hash<size_t>{}(key.n_nodes);
|
||||
h ^= std::hash<void *>{}(key.cache_k_l0) + 0x9e3779b9 + (h << 6) + (h >> 2);
|
||||
size_t h = std::hash<int>{}(key.n_nodes);
|
||||
if (key.n_nodes > 0) {
|
||||
h ^= std::hash<std::string>{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
|
||||
h ^= std::hash<std::string>{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
|
||||
}
|
||||
return h;
|
||||
}
|
||||
};
|
||||
|
|
@ -66,8 +79,6 @@ const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph);
|
|||
|
||||
bool get_is_prefill(const ggml_tensor * inp_pos);
|
||||
|
||||
graph_key compute_graph_key(struct ggml_cgraph * cgraph);
|
||||
|
||||
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
|
||||
ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
|
||||
const std::string & param_name);
|
||||
|
|
|
|||
Loading…
Reference in New Issue