diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index e3fc45a622..9a0ba7a54b 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -46,16 +46,16 @@ extern "C" {
         // (optional) initialize a tensor in the buffer (eg. add tensor extras)
         enum ggml_status (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
         // tensor data access
-        void (*memset_tensor)            (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
-        void (*set_tensor)               (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*set_tensor_sync_optional) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, bool sync);
-        void (*get_tensor)               (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void (*memset_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
+        void (*set_tensor)       (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*set_tensor_async) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor)       (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
-        bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        bool (*cpy_tensor)       (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
         // clear the entire buffer
-        void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
+        void (*clear)            (ggml_backend_buffer_t buffer, uint8_t value);
         // (optional) reset any internal state due to tensor initialization, such as tensor extras
-        void (*reset) (ggml_backend_buffer_t buffer);
+        void (*reset)            (ggml_backend_buffer_t buffer);
     };

     struct ggml_backend_buffer {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index fc713d6f80..088bdcee1d 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -285,10 +285,11 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

+    // do not synchronize directly after dispatching async tensor copies
     static bool disable_sync_optimization = (getenv("GGML_CUDA_DISABLE_SYNC_OPTIMIZATION") != nullptr);
-    if (!disable_sync_optimization && buf->iface.set_tensor_sync_optional != NULL) {
-        buf->iface.set_tensor_sync_optional(buf, tensor, data, offset, size, false);
+    if (!disable_sync_optimization && buf->iface.set_tensor_async != NULL) {
+        buf->iface.set_tensor_async(buf, tensor, data, offset, size);
     } else {
         buf->iface.set_tensor(buf, tensor, data, offset, size);
     }
 }
@@ -605,16 +606,16 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_
 }

 static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
-    /* .free_buffer    = */ ggml_backend_multi_buffer_free_buffer,
-    /* .get_base       = */ NULL,
-    /* .init_tensor    = */ NULL,
-    /* .memset_tensor  = */ NULL,
-    /* .set_tensor     = */ NULL,
-    /* .set_tensor_s_o = */ NULL,
-    /* .get_tensor     = */ NULL,
-    /* .cpy_tensor     = */ NULL,
-    /* .clear          = */ ggml_backend_multi_buffer_clear,
-    /* .reset          = */ NULL,
+    /* .free_buffer      = */ ggml_backend_multi_buffer_free_buffer,
+    /* .get_base         = */ NULL,
+    /* .init_tensor      = */ NULL,
+    /* .memset_tensor    = */ NULL,
+    /* .set_tensor       = */ NULL,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor       = */ NULL,
+    /* .cpy_tensor       = */ NULL,
+    /* .clear            = */ ggml_backend_multi_buffer_clear,
+    /* .reset            = */ NULL,
 };

 ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
@@ -2128,29 +2129,29 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }

 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
-    /* .free_buffer    = */ ggml_backend_cpu_buffer_free_buffer,
-    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor    = */ NULL, // no initialization required
-    /* .memset_tensor  = */ ggml_backend_cpu_buffer_memset_tensor,
-    /* .set_tensor     = */ ggml_backend_cpu_buffer_set_tensor,
-    /* .set_tensor_s_o = */ NULL,
-    /* .get_tensor     = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor     = */ ggml_backend_cpu_buffer_cpy_tensor,
-    /* .clear          = */ ggml_backend_cpu_buffer_clear,
-    /* .reset          = */ NULL,
+    /* .free_buffer      = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base         = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor      = */ NULL, // no initialization required
+    /* .memset_tensor    = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor       = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor       = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor       = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear            = */ ggml_backend_cpu_buffer_clear,
+    /* .reset            = */ NULL,
 };

 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
-    /* .free_buffer    = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
-    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor    = */ NULL, // no initialization required
-    /* .memset_tensor  = */ ggml_backend_cpu_buffer_memset_tensor,
-    /* .set_tensor     = */ ggml_backend_cpu_buffer_set_tensor,
-    /* .set_tensor_s_o = */ NULL,
-    /* .get_tensor     = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor     = */ ggml_backend_cpu_buffer_cpy_tensor,
-    /* .clear          = */ ggml_backend_cpu_buffer_clear,
-    /* .reset          = */ NULL,
+    /* .free_buffer      = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base         = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor      = */ NULL, // no initialization required
+    /* .memset_tensor    = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor       = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor       = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor       = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear            = */ ggml_backend_cpu_buffer_clear,
+    /* .reset            = */ NULL,
 };

 // CPU backend buffer type
diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp
index 07c82bac93..c6333d1005 100644
--- a/ggml/src/ggml-cpu/amx/amx.cpp
+++ b/ggml/src/ggml-cpu/amx/amx.cpp
@@ -105,16 +105,16 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }

 static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
-    /* .free_buffer    = */ ggml_backend_amx_buffer_free_buffer,
-    /* .get_base       = */ ggml_backend_amx_buffer_get_base,
-    /* .init_tensor    = */ ggml_backend_amx_buffer_init_tensor,
-    /* .memset_tensor  = */ ggml_backend_amx_buffer_memset_tensor,
-    /* .set_tensor     = */ ggml_backend_amx_buffer_set_tensor,
-    /* .set_tensor_s_o = */ nullptr,
-    /* .get_tensor     = */ nullptr,
-    /* .cpy_tensor     = */ nullptr,
-    /* .clear          = */ ggml_backend_amx_buffer_clear,
-    /* .reset          = */ nullptr,
+    /* .free_buffer      = */ ggml_backend_amx_buffer_free_buffer,
+    /* .get_base         = */ ggml_backend_amx_buffer_get_base,
+    /* .init_tensor      = */ ggml_backend_amx_buffer_init_tensor,
+    /* .memset_tensor    = */ ggml_backend_amx_buffer_memset_tensor,
+    /* .set_tensor       = */ ggml_backend_amx_buffer_set_tensor,
+    /* .set_tensor_async = */ nullptr,
+    /* .get_tensor       = */ nullptr,
+    /* .cpy_tensor       = */ nullptr,
+    /* .clear            = */ ggml_backend_amx_buffer_clear,
+    /* .reset            = */ nullptr,
 };

 static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 5db937339e..313f86f746 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -617,20 +617,16 @@ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer,
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }

-static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_cuda_buffer_set_tensor_async(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

     ggml_cuda_set_device(ctx->device);
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
-    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }

-static void ggml_backend_cuda_buffer_set_tensor_sync_optional(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size, bool sync) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-    ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
-    if (sync) CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_cuda_buffer_set_tensor_async(buffer, tensor, data, offset, size);
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }

 static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -671,16 +667,16 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }

 static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
-    /* .free_buffer    = */ ggml_backend_cuda_buffer_free_buffer,
-    /* .get_base       = */ ggml_backend_cuda_buffer_get_base,
-    /* .init_tensor    = */ ggml_backend_cuda_buffer_init_tensor,
-    /* .memset_tensor  = */ ggml_backend_cuda_buffer_memset_tensor,
-    /* .set_tensor     = */ ggml_backend_cuda_buffer_set_tensor,
-    /* .set_tensor_s_o = */ ggml_backend_cuda_buffer_set_tensor_sync_optional,
-    /* .get_tensor     = */ ggml_backend_cuda_buffer_get_tensor,
-    /* .cpy_tensor     = */ ggml_backend_cuda_buffer_cpy_tensor,
-    /* .clear          = */ ggml_backend_cuda_buffer_clear,
-    /* .reset          = */ NULL,
+    /* .free_buffer      = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base         = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor      = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .memset_tensor    = */ ggml_backend_cuda_buffer_memset_tensor,
+    /* .set_tensor       = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .set_tensor_async = */ ggml_backend_cuda_buffer_set_tensor_async,
+    /* .get_tensor       = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor       = */ ggml_backend_cuda_buffer_cpy_tensor,
+    /* .clear            = */ ggml_backend_cuda_buffer_clear,
+    /* .reset            = */ NULL,
 };

 // cuda buffer type
@@ -984,16 +980,16 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u
 }

 static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
-    /* .free_buffer    = */ ggml_backend_cuda_split_buffer_free_buffer,
-    /* .get_base       = */ ggml_backend_cuda_split_buffer_get_base,
-    /* .init_tensor    = */ ggml_backend_cuda_split_buffer_init_tensor,
-    /* .memset_tensor  = */ NULL,
-    /* .set_tensor     = */ ggml_backend_cuda_split_buffer_set_tensor,
-    /* .set_tensor_s_o = */ NULL,
-    /* .get_tensor     = */ ggml_backend_cuda_split_buffer_get_tensor,
-    /* .cpy_tensor     = */ NULL,
-    /* .clear          = */ ggml_backend_cuda_split_buffer_clear,
-    /* .reset          = */ NULL,
+    /* .free_buffer      = */ ggml_backend_cuda_split_buffer_free_buffer,
+    /* .get_base         = */ ggml_backend_cuda_split_buffer_get_base,
+    /* .init_tensor      = */ ggml_backend_cuda_split_buffer_init_tensor,
+    /* .memset_tensor    = */ NULL,
+    /* .set_tensor       = */ ggml_backend_cuda_split_buffer_set_tensor,
+    /* .set_tensor_async = */ NULL,
+    /* .get_tensor       = */ ggml_backend_cuda_split_buffer_get_tensor,
+    /* .cpy_tensor       = */ NULL,
+    /* .clear            = */ ggml_backend_cuda_split_buffer_clear,
+    /* .reset            = */ NULL,
 };

 // cuda split buffer type
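
Note for backend implementers: with this change, ggml_backend_tensor_set() prefers the optional set_tensor_async callback whenever a buffer provides one (unless GGML_CUDA_DISABLE_SYNC_OPTIMIZATION is set in the environment) and no longer synchronizes right after dispatching the copy; the blocking set_tensor remains the fallback. Below is a minimal sketch of how an out-of-tree backend could opt in, mirroring the CUDA wiring above. The my_backend_* names are hypothetical placeholders, not part of this patch.

// Sketch only (not part of the patch), assuming the post-patch
// ggml_backend_buffer_i layout from ggml-backend-impl.h.
#include <string.h>
#include "ggml-backend-impl.h"

// hypothetical async upload: enqueue the host->device copy and return;
// a real device backend would not block here
static void my_backend_buffer_set_tensor_async(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                               const void * data, size_t offset, size_t size) {
    (void) buffer;
    memcpy((char *)tensor->data + offset, data, size); // stand-in for a real enqueue
}

// blocking variant: reuse the async path, then wait for completion,
// exactly as ggml_backend_cuda_buffer_set_tensor does above
static void my_backend_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                         const void * data, size_t offset, size_t size) {
    my_backend_buffer_set_tensor_async(buffer, tensor, data, offset, size);
    // a real backend would synchronize its copy queue/stream here
}

static const struct ggml_backend_buffer_i my_backend_buffer_i = {
    /* .free_buffer      = */ NULL,
    /* .get_base         = */ NULL,
    /* .init_tensor      = */ NULL,
    /* .memset_tensor    = */ NULL,
    /* .set_tensor       = */ my_backend_buffer_set_tensor,
    /* .set_tensor_async = */ my_backend_buffer_set_tensor_async, // optional
    /* .get_tensor       = */ NULL,
    /* .cpy_tensor       = */ NULL,
    /* .clear            = */ NULL,
    /* .reset            = */ NULL,
};

Leaving set_tensor_async NULL, as the CPU, AMX, multi-buffer, and CUDA split-buffer tables above do, keeps the previous synchronous behavior.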