Remove hardcoded names
This commit is contained in:
parent
3259921309
commit
18ab0f562b
|
|
@ -161,7 +161,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
|
|||
ov::PartialShape stateful_kv_shape;
|
||||
// GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
|
||||
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
|
||||
assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0);
|
||||
if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name);
|
||||
it == m_model_params.kv_names.end()) {
|
||||
m_model_params.kv_names.push_back(src_name);
|
||||
|
|
@ -242,18 +241,18 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
|
|||
case GGML_OP_PERMUTE: {
|
||||
if (node->src[0]->op != GGML_OP_VIEW) {
|
||||
op_case = 1;
|
||||
} else if (ggml_is_contiguous(node->src[0])) {
|
||||
} else if (node->src[0]->src[0]->op == GGML_OP_NONE) {
|
||||
// kv cache tensor
|
||||
std::string src_name(node->view_src->name);
|
||||
if (src_name.find("cache") == std::string::npos) {
|
||||
op_case = 4;
|
||||
int layer = extract_layer_from_name(src_name);
|
||||
if (!is_swa_layer(layer)) {
|
||||
op_case = 2;
|
||||
} else {
|
||||
int layer = extract_layer_from_name(src_name);
|
||||
if (!is_swa_layer(layer)) {
|
||||
op_case = 2;
|
||||
} else {
|
||||
op_case = 3;
|
||||
}
|
||||
op_case = 3;
|
||||
}
|
||||
} else if (node->src[0]->src[0]->op == GGML_OP_ROPE || node->src[0]->src[0]->src[0]->op == GGML_OP_ROPE) {
|
||||
// rope'ed query tensor
|
||||
op_case = 4;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
@ -383,16 +382,16 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
|
|||
auto name = std::string(input->name);
|
||||
ov::PartialShape input_shape;
|
||||
|
||||
if ((op->op == GGML_OP_GET_ROWS && op->src[0]->op == GGML_OP_NONE) || op->op == GGML_OP_ROPE) {
|
||||
if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
|
||||
// tokens or positions
|
||||
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
|
||||
input_shape = ov::PartialShape{1, 1, 1, len};
|
||||
|
||||
} else if (op->op == GGML_OP_GET_ROWS) {
|
||||
} else if (is_output_idx(input, op)) {
|
||||
// output index
|
||||
input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};
|
||||
|
||||
} else if (op->op == GGML_OP_CPY || op->op == GGML_OP_FLASH_ATTN_EXT) {
|
||||
} else if (is_inp_mask(input, op)) {
|
||||
// mask
|
||||
if (m_is_static) {
|
||||
input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
|
||||
|
|
@ -402,7 +401,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
|
|||
input_shape = ov::PartialShape{-1, 1, -1, -1};
|
||||
}
|
||||
|
||||
} else if (op && op->op == GGML_OP_SET_ROWS && op->src[2] == input) {
|
||||
} else if (is_kvcache(input, op)) {
|
||||
// kvcache
|
||||
input_shape = ov::PartialShape{get_shape(input)};
|
||||
if (!m_is_static) {
|
||||
|
|
@ -410,7 +409,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
|
|||
input_shape[2] = -1;
|
||||
}
|
||||
|
||||
} else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) {
|
||||
} else if (is_kv_idx(input, op)) {
|
||||
// kv update index
|
||||
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
|
||||
input_shape = ov::PartialShape{1, 1, 1, len};
|
||||
|
|
@ -490,9 +489,7 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name
|
|||
std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
|
||||
std::map<std::string, std::string> kv_param_res_names;
|
||||
for (const auto & name : m_model_params.kv_names) {
|
||||
if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
|
||||
kv_param_res_names[name] = name;
|
||||
}
|
||||
kv_param_res_names[name] = name;
|
||||
}
|
||||
return kv_param_res_names;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -213,6 +213,34 @@ public:
|
|||
static std::string compute_op_type(const ggml_tensor * node);
|
||||
void add_extra_inputs();
|
||||
|
||||
inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE;
|
||||
}
|
||||
|
||||
inline static bool is_inp_pos(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_ROPE && tensor == op->src[1];
|
||||
}
|
||||
|
||||
inline static bool is_inp_emb(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return tensor->op == GGML_OP_GET_ROWS && op->op == GGML_OP_RMS_NORM;
|
||||
}
|
||||
|
||||
inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]);
|
||||
}
|
||||
|
||||
inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
|
||||
}
|
||||
|
||||
inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_SET_ROWS && op->src[1] == tensor;
|
||||
}
|
||||
|
||||
inline static bool is_output_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
|
||||
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE;
|
||||
}
|
||||
|
||||
private:
|
||||
void set_input_output(ggml_tensor * node, bool naive = false);
|
||||
int compute_op_case(const ggml_tensor * node) const;
|
||||
|
|
|
|||
|
|
@ -85,7 +85,8 @@ void ggml_openvino_device_config::init() {
|
|||
// Release the context (queue keeps a reference)
|
||||
clReleaseContext(cl_ctx);
|
||||
} else if (device_name == "NPU") {
|
||||
remote_context = ov_singleton_core().get_default_context(device_name);
|
||||
// remote tensor is not used for NPU yet
|
||||
// remote_context = ov_singleton_core().get_default_context(device_name);
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
|
|
|
|||
|
|
@ -139,8 +139,8 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu
|
|||
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
|
||||
|
||||
// Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
|
||||
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote &&
|
||||
ggml_openvino_get_device_name() == "GPU" && !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
|
||||
if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
|
||||
!getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
|
||||
GGML_ASSERT(ctx->tensor_extras.empty());
|
||||
auto device = ctx->device;
|
||||
auto size = ctx->size;
|
||||
|
|
|
|||
|
|
@ -508,8 +508,8 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml
|
|||
const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
|
||||
const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
|
||||
|
||||
if (param_name == "inp_pos" || param_name == "inp_tokens" ||
|
||||
(op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
|
||||
if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) ||
|
||||
GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) {
|
||||
assert(ggml_tensor->ne[0] == 1);
|
||||
ov::Shape input_shape = {1, 1, 1, 1};
|
||||
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
|
||||
|
|
@ -523,7 +523,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml
|
|||
return input_tensor;
|
||||
}
|
||||
|
||||
if (param_name == "inp_out_ids") {
|
||||
if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) {
|
||||
ov::Shape input_shape = {1, 1, 1, 1};
|
||||
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
|
||||
int32_t inp_out_id = *((int32_t *) ggml_tensor->data);
|
||||
|
|
@ -533,7 +533,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml
|
|||
return input_tensor;
|
||||
}
|
||||
|
||||
if (param_name.find("self_kq_mask") == 0) {
|
||||
if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
|
||||
size_t context_size = ggml_decoder->get_ctx_size();
|
||||
std::vector<float> padded_data = pad_input<float>(ggml_tensor, 1, context_size, -INFINITY);
|
||||
ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size});
|
||||
|
|
@ -557,8 +557,8 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
|
|||
const size_t chunk_valid_size = std::min(chunk_size, input_len - chunk_index * chunk_size);
|
||||
const size_t chunk_pad_size = chunk_size - chunk_valid_size;
|
||||
|
||||
if (param_name == "inp_pos" || param_name == "inp_tokens" ||
|
||||
(op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
|
||||
if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) ||
|
||||
GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) {
|
||||
ov::Shape input_shape = {1, 1, 1, chunk_size};
|
||||
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
|
||||
// copy the chunk_index-th chunk from ggml_tensor
|
||||
|
|
@ -585,7 +585,7 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
|
|||
return input_tensor;
|
||||
}
|
||||
|
||||
if (param_name == "inp_out_ids") {
|
||||
if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) {
|
||||
size_t output_len = ggml_decoder->get_compute_params().output_len;
|
||||
ov::Shape input_shape = {1, 1, 1, output_len};
|
||||
ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
|
||||
|
|
@ -600,7 +600,7 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
|
|||
return input_tensor;
|
||||
}
|
||||
|
||||
if (param_name.find("self_kq_mask") == 0) {
|
||||
if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
|
||||
size_t cols = ggml_tensor->ne[0];
|
||||
size_t rows = ggml_tensor->ne[1];
|
||||
float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols;
|
||||
|
|
@ -748,7 +748,7 @@ const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) {
|
|||
if (src == nullptr) {
|
||||
break;
|
||||
}
|
||||
if (std::string(src->name) == "inp_pos") {
|
||||
if (GgmlOvDecoder::is_inp_pos(src, op)) {
|
||||
return src;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue