Remove hardcode names

Author: Yu, Zijun
Date: 2026-02-03 17:39:21 +08:00
parent 3259921309
commit 18ab0f562b
5 changed files with 56 additions and 30 deletions


@@ -161,7 +161,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
ov::PartialShape stateful_kv_shape;
// GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0);
if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name);
it == m_model_params.kv_names.end()) {
m_model_params.kv_names.push_back(src_name);
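The kv-cache inputs are now recognised by buffer usage rather than by asserting on the "cache_k"/"cache_v" prefixes, and each name is recorded only once. A minimal stand-alone sketch of that dedup-append step (hypothetical helper, not part of the patch):

#include <algorithm>
#include <string>
#include <vector>

// Hypothetical helper: remember each kv-cache tensor name the first time it is seen.
static void remember_kv_name(std::vector<std::string> & kv_names, const std::string & src_name) {
    if (std::find(kv_names.begin(), kv_names.end(), src_name) == kv_names.end()) {
        kv_names.push_back(src_name);
    }
}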
@@ -242,18 +241,18 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
case GGML_OP_PERMUTE: {
if (node->src[0]->op != GGML_OP_VIEW) {
op_case = 1;
} else if (ggml_is_contiguous(node->src[0])) {
} else if (node->src[0]->src[0]->op == GGML_OP_NONE) {
// kv cache tensor
std::string src_name(node->view_src->name);
if (src_name.find("cache") == std::string::npos) {
op_case = 4;
int layer = extract_layer_from_name(src_name);
if (!is_swa_layer(layer)) {
op_case = 2;
} else {
int layer = extract_layer_from_name(src_name);
if (!is_swa_layer(layer)) {
op_case = 2;
} else {
op_case = 3;
}
op_case = 3;
}
} else if (node->src[0]->src[0]->op == GGML_OP_ROPE || node->src[0]->src[0]->src[0]->op == GGML_OP_ROPE) {
// rope'ed query tensor
op_case = 4;
}
break;
}
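After this hunk the PERMUTE case is chosen from the op that produced the viewed tensor instead of a "cache" substring in its name. A stand-alone sketch of how the branch logic reads post-patch, with placeholder booleans in place of the decoder's helpers (the numeric cases follow the hunk above):

// Sketch only: op_case selection driven by producer ops, not tensor names.
int permute_op_case(bool src_is_view, bool viewed_from_none, bool viewed_from_rope, bool swa_layer) {
    if (!src_is_view)     return 1;                  // plain permute
    if (viewed_from_none) return swa_layer ? 3 : 2;  // kv cache view (SWA vs full attention)
    if (viewed_from_rope) return 4;                  // rope'ed query tensor
    return 0;                                        // not handled here
}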
@@ -383,16 +382,16 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
auto name = std::string(input->name);
ov::PartialShape input_shape;
if ((op->op == GGML_OP_GET_ROWS && op->src[0]->op == GGML_OP_NONE) || op->op == GGML_OP_ROPE) {
if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
// tokens or positions
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
input_shape = ov::PartialShape{1, 1, 1, len};
} else if (op->op == GGML_OP_GET_ROWS) {
} else if (is_output_idx(input, op)) {
// output index
input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};
} else if (op->op == GGML_OP_CPY || op->op == GGML_OP_FLASH_ATTN_EXT) {
} else if (is_inp_mask(input, op)) {
// mask
if (m_is_static) {
input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
@@ -402,7 +401,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
input_shape = ov::PartialShape{-1, 1, -1, -1};
}
} else if (op && op->op == GGML_OP_SET_ROWS && op->src[2] == input) {
} else if (is_kvcache(input, op)) {
// kvcache
input_shape = ov::PartialShape{get_shape(input)};
if (!m_is_static) {
@@ -410,7 +409,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
input_shape[2] = -1;
}
} else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) {
} else if (is_kv_idx(input, op)) {
// kv update index
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
input_shape = ov::PartialShape{1, 1, 1, len};
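Each of these branches builds an ov::PartialShape in the layout [1, 1, 1, N], where N is fixed only when a static model is compiled and -1 (dynamic) otherwise. A small illustrative sketch of that pattern, with assumed parameter names:

#include <cstdint>
#include <openvino/core/partial_shape.hpp>

// Illustrative only: token/position/kv-index inputs use [1, 1, 1, N].
ov::PartialShape token_input_shape(bool is_static, bool is_prefill, int64_t prefill_chunk_size) {
    const int64_t len = is_static ? (is_prefill ? prefill_chunk_size : 1) : -1;
    return ov::PartialShape{1, 1, 1, len};
}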
@@ -490,9 +489,7 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name
std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
std::map<std::string, std::string> kv_param_res_names;
for (const auto & name : m_model_params.kv_names) {
if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
kv_param_res_names[name] = name;
}
kv_param_res_names[name] = name;
}
return kv_param_res_names;
}


@@ -213,6 +213,34 @@ public:
static std::string compute_op_type(const ggml_tensor * node);
void add_extra_inputs();
inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE;
}
inline static bool is_inp_pos(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_ROPE && tensor == op->src[1];
}
inline static bool is_inp_emb(const ggml_tensor * tensor, const ggml_tensor * op) {
return tensor->op == GGML_OP_GET_ROWS && op->op == GGML_OP_RMS_NORM;
}
inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]);
}
inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
}
inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_SET_ROWS && op->src[1] == tensor;
}
inline static bool is_output_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE;
}
private:
void set_input_output(ggml_tensor * node, bool naive = false);
int compute_op_case(const ggml_tensor * node) const;
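These predicates replace string comparisons against hardcoded names such as "inp_tokens", "inp_out_ids" and "self_kq_mask" elsewhere in the backend. A hedged usage sketch; the classifying function itself is hypothetical:

// Hypothetical caller: classify a graph input by the op that consumes it.
const char * classify_input(const ggml_tensor * t, const ggml_tensor * op) {
    if (GgmlOvDecoder::is_inp_tok(t, op))    return "tokens";
    if (GgmlOvDecoder::is_inp_pos(t, op))    return "positions";
    if (GgmlOvDecoder::is_inp_mask(t, op))   return "attention mask";
    if (GgmlOvDecoder::is_kvcache(t, op))    return "kv cache";
    if (GgmlOvDecoder::is_kv_idx(t, op))     return "kv update index";
    if (GgmlOvDecoder::is_output_idx(t, op)) return "output index";
    return "weight/other";
}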


@@ -85,7 +85,8 @@ void ggml_openvino_device_config::init() {
// Release the context (queue keeps a reference)
clReleaseContext(cl_ctx);
} else if (device_name == "NPU") {
remote_context = ov_singleton_core().get_default_context(device_name);
// remote tensor is not used for NPU yet
// remote_context = ov_singleton_core().get_default_context(device_name);
}
initialized = true;


@@ -139,8 +139,8 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
// Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote &&
ggml_openvino_get_device_name() == "GPU" && !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
!getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
GGML_ASSERT(ctx->tensor_extras.empty());
auto device = ctx->device;
auto size = ctx->size;
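The GPU offload guard now keys only on the "cache_" name prefix, the remote flag, the device name and the env var. Restated as a predicate for readability (sketch only, not part of the patch):

#include <cstdlib>
#include <cstring>
#include <string>

// Sketch: mirrors the condition in the hunk above.
static bool offload_kv_to_gpu(const char * tensor_name, bool is_remote, const std::string & device) {
    return std::strncmp(tensor_name, "cache_", 6) == 0 && !is_remote && device == "GPU" &&
           std::getenv("GGML_OPENVINO_STATEFUL_EXECUTION") == nullptr;
}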


@@ -508,8 +508,8 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml
const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
if (param_name == "inp_pos" || param_name == "inp_tokens" ||
(op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) ||
GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) {
assert(ggml_tensor->ne[0] == 1);
ov::Shape input_shape = {1, 1, 1, 1};
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
@@ -523,7 +523,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml
return input_tensor;
}
if (param_name == "inp_out_ids") {
if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) {
ov::Shape input_shape = {1, 1, 1, 1};
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
int32_t inp_out_id = *((int32_t *) ggml_tensor->data);
@@ -533,7 +533,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml
return input_tensor;
}
if (param_name.find("self_kq_mask") == 0) {
if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
size_t context_size = ggml_decoder->get_ctx_size();
std::vector<float> padded_data = pad_input<float>(ggml_tensor, 1, context_size, -INFINITY);
ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size});
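For single-token decode the mask row is padded out to the full context length with -INFINITY, so masked positions contribute nothing after softmax. A stand-alone sketch of that padding step (the pad_input helper in the patch is more general):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Sketch: pad one mask row of 'valid' scores up to the context length.
static std::vector<float> pad_mask_row(const float * src, size_t valid, size_t context_size) {
    std::vector<float> out(context_size, -INFINITY);
    std::copy(src, src + valid, out.begin());
    return out;
}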
@@ -557,8 +557,8 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
const size_t chunk_valid_size = std::min(chunk_size, input_len - chunk_index * chunk_size);
const size_t chunk_pad_size = chunk_size - chunk_valid_size;
if (param_name == "inp_pos" || param_name == "inp_tokens" ||
(op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) ||
GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) {
ov::Shape input_shape = {1, 1, 1, chunk_size};
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
// copy the chunk_index-th chunk from ggml_tensor
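Static prefill runs in fixed-size chunks, so the last chunk can be only partially valid and is padded up to chunk_size. The arithmetic from the hunk above, isolated as a sketch:

#include <algorithm>
#include <cstddef>

struct chunk_extent { size_t valid; size_t pad; };

// Sketch: extent of the chunk_index-th chunk of an input of input_len tokens.
static chunk_extent chunk_sizes(size_t chunk_size, size_t input_len, size_t chunk_index) {
    const size_t valid = std::min(chunk_size, input_len - chunk_index * chunk_size);
    return {valid, chunk_size - valid};
}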
@@ -585,7 +585,7 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
return input_tensor;
}
if (param_name == "inp_out_ids") {
if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) {
size_t output_len = ggml_decoder->get_compute_params().output_len;
ov::Shape input_shape = {1, 1, 1, output_len};
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
@@ -600,7 +600,7 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
return input_tensor;
}
if (param_name.find("self_kq_mask") == 0) {
if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
size_t cols = ggml_tensor->ne[0];
size_t rows = ggml_tensor->ne[1];
float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols;
@@ -748,7 +748,7 @@ const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) {
if (src == nullptr) {
break;
}
if (std::string(src->name) == "inp_pos") {
if (GgmlOvDecoder::is_inp_pos(src, op)) {
return src;
}
}
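The positions tensor is likewise located by walking the graph and applying is_inp_pos to each (source, consumer) pair instead of matching the name "inp_pos". A hedged sketch of that search loop using the public ggml graph accessors (the decoder header include is assumed; the function name here is illustrative):

#include "ggml.h"

// Sketch: return the first source tensor recognised as the positions input.
static const ggml_tensor * find_inp_pos(ggml_cgraph * cgraph) {
    for (int i = 0; i < ggml_graph_n_nodes(cgraph); ++i) {
        ggml_tensor * op = ggml_graph_node(cgraph, i);
        for (int j = 0; j < GGML_MAX_SRC; ++j) {
            ggml_tensor * src = op->src[j];
            if (src == nullptr) {
                break;
            }
            if (GgmlOvDecoder::is_inp_pos(src, op)) {
                return src;
            }
        }
    }
    return nullptr;
}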