diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index 364d064c21..4a079c87bf 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -421,7 +421,8 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer ggml_backend_buffer_t simple_buf = buf_ctx->buf_configs[j].buf; if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) { - GGML_ASSERT(ggml_is_contiguously_allocated(tensor)); + // TODO: the following assert fails for llama-parallel even though the results are correct: + // GGML_ASSERT(ggml_is_contiguously_allocated(tensor)); ne[split_dim] = split_state.ne[j]; for (int i = 0; i < GGML_MAX_DIMS; i++) { if (tensor->nb[i] > tensor->nb[split_dim]) { @@ -440,6 +441,9 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer ggml_set_name(t_ij, tensor->name); t_ij->buffer = simple_buf; t_ij->view_offs = tensor->view_offs; + if (t_ij->view_offs > tensor->nb[split_dim]) { + t_ij->view_offs = t_ij->view_offs * ne[split_dim]/tensor->ne[split_dim]; + } t_ij->view_src = tensor->view_src; if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) { t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);