CANN: handle in-place ROPE on non-contiguous f32 tensors (#20274)
RotaryPositionEmbedding on CANN fails when src and dst share the same non-contiguous buffer (inplace + view), because the operator overwrites source data before it is fully read. Add a branch that detects this case and uses contiguous temporary buffers: copy src to temp, run ROPE into another temp, then copy back to the non-contiguous dst. Fixes 20 failing ROPE tests (f32, v=1, inplace=1). Signed-off-by: noemotiovon <757486878@qq.com>
This commit is contained in:
parent
509a31d00f
commit
7f2cbd9a4d
|
|
@ -2943,6 +2943,27 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|||
// Rotate full tensor (no tail), using trans tensors
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(),
|
||||
acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get());
|
||||
} else if (src0->data == dst->data && !ggml_is_contiguous(src0)) {
|
||||
// In-place on non-contiguous tensor: RotaryPositionEmbedding cannot safely
|
||||
// read and write the same non-contiguous buffer. Use contiguous temporaries.
|
||||
size_t contiguous_nb[GGML_MAX_DIMS];
|
||||
contiguous_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
contiguous_nb[i] = contiguous_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
int64_t total_elements = ggml_nelements(src0);
|
||||
ggml_cann_pool_alloc inplace_src_alloc(ctx.pool(), total_elements * sizeof(float));
|
||||
ggml_cann_pool_alloc inplace_dst_alloc(ctx.pool(), total_elements * sizeof(float));
|
||||
|
||||
acl_tensor_ptr acl_src_contig = ggml_cann_create_tensor(inplace_src_alloc.get(), ACL_FLOAT, sizeof(float),
|
||||
src0->ne, contiguous_nb, GGML_MAX_DIMS);
|
||||
acl_tensor_ptr acl_dst_contig = ggml_cann_create_tensor(inplace_dst_alloc.get(), ACL_FLOAT, sizeof(float),
|
||||
dst->ne, contiguous_nb, GGML_MAX_DIMS);
|
||||
|
||||
cann_copy(ctx, acl_src.get(), acl_src_contig.get());
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_contig.get(), acl_cos_reshape_tensor.get(),
|
||||
acl_sin_reshape_tensor.get(), acl_mode, acl_dst_contig.get());
|
||||
cann_copy(ctx, acl_dst_contig.get(), acl_dst.get());
|
||||
} else {
|
||||
// Rotate full tensor (no tail), using original tensors
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
|
||||
|
|
|
|||
Loading…
Reference in New Issue