Put kvcache on GPU
This commit is contained in:
parent
3fdcb6ab72
commit
d757849741
|
|
@ -46,13 +46,56 @@ void ggml_openvino_device_config::init() {
|
||||||
ov_singleton_core().set_property(ov::cache_dir(cache_dir));
|
ov_singleton_core().set_property(ov::cache_dir(cache_dir));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (device_name != "CPU") {
|
// Initialize remote context with queue sharing for GPU
|
||||||
|
if (device_name == "GPU") {
|
||||||
|
// Create OpenCL context and queue
|
||||||
|
cl_int err;
|
||||||
|
cl_platform_id platform;
|
||||||
|
err = clGetPlatformIDs(1, &platform, nullptr);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
GGML_LOG_ERROR("Failed to get OpenCL platform: %d\n", err);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
cl_device_id cl_device;
|
||||||
|
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &cl_device, nullptr);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
GGML_LOG_ERROR("Failed to get OpenCL device: %d\n", err);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
cl_context cl_ctx = clCreateContext(nullptr, 1, &cl_device, nullptr, nullptr, &err);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
GGML_LOG_ERROR("Failed to create OpenCL context: %d\n", err);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
cl_queue = clCreateCommandQueueWithProperties(cl_ctx, cl_device, nullptr, &err);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
GGML_LOG_ERROR("Failed to create OpenCL command queue: %d\n", err);
|
||||||
|
clReleaseContext(cl_ctx);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create OpenVINO remote context with queue sharing
|
||||||
|
remote_context = ov::intel_gpu::ocl::ClContext(ov_singleton_core(), cl_queue);
|
||||||
|
|
||||||
|
// Release the context (queue keeps a reference)
|
||||||
|
clReleaseContext(cl_ctx);
|
||||||
|
} else if (device_name == "NPU") {
|
||||||
remote_context = ov_singleton_core().get_default_context(device_name);
|
remote_context = ov_singleton_core().get_default_context(device_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
initialized = true;
|
initialized = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Destructor: drops the reference to the OpenCL command queue held in cl_queue,
// if any. The member is reset to nullptr afterwards so the handle is never
// released twice.
ggml_openvino_device_config::~ggml_openvino_device_config() {
    if (cl_queue == nullptr) {
        return;  // nothing was created (cl_queue keeps its nullptr default)
    }

    clReleaseCommandQueue(cl_queue);
    cl_queue = nullptr;
}
|
||||||
|
|
||||||
// Get the global device config singleton
|
// Get the global device config singleton
|
||||||
ggml_openvino_device_config & ggml_openvino_get_device_config() {
|
ggml_openvino_device_config & ggml_openvino_get_device_config() {
|
||||||
static ggml_openvino_device_config config;
|
static ggml_openvino_device_config config;
|
||||||
|
|
@ -84,6 +127,39 @@ const ov::AnyMap & ggml_openvino_get_compile_config() {
|
||||||
return ggml_openvino_get_device_config().compile_config;
|
return ggml_openvino_get_device_config().compile_config;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get the OpenCL command queue for GPU operations
|
||||||
|
cl_command_queue ggml_openvino_get_cl_queue() {
|
||||||
|
return ggml_openvino_get_device_config().cl_queue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the clEnqueueMemFillINTEL function pointer (lazy load)
|
||||||
|
clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL() {
|
||||||
|
static clEnqueueMemFillINTEL_fn fn = nullptr;
|
||||||
|
static bool loaded = false;
|
||||||
|
if (!loaded) {
|
||||||
|
loaded = true;
|
||||||
|
cl_platform_id platform;
|
||||||
|
if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) {
|
||||||
|
fn = (clEnqueueMemFillINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemFillINTEL");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fn;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the clEnqueueMemcpyINTEL function pointer (lazy load)
|
||||||
|
clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
|
||||||
|
static clEnqueueMemcpyINTEL_fn fn = nullptr;
|
||||||
|
static bool loaded = false;
|
||||||
|
if (!loaded) {
|
||||||
|
loaded = true;
|
||||||
|
cl_platform_id platform;
|
||||||
|
if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) {
|
||||||
|
fn = (clEnqueueMemcpyINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fn;
|
||||||
|
}
|
||||||
|
|
||||||
// Get requantization type for a tensor type (returns nullopt if no requant needed)
|
// Get requantization type for a tensor type (returns nullopt if no requant needed)
|
||||||
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
|
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
|
||||||
if (!ggml_openvino_is_npu()) {
|
if (!ggml_openvino_is_npu()) {
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,9 @@
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "openvino/runtime/core.hpp"
|
#include "openvino/runtime/core.hpp"
|
||||||
|
|
||||||
|
#define CL_TARGET_OPENCL_VERSION 300
|
||||||
|
#include <CL/cl.h>
|
||||||
|
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <openvino/core/node.hpp>
|
#include <openvino/core/node.hpp>
|
||||||
|
|
@ -22,6 +25,34 @@ std::optional<ov::RemoteContext> ggml_openvino_get_remote_context();
|
||||||
// Get the compile config for the current device
|
// Get the compile config for the current device
|
||||||
const ov::AnyMap & ggml_openvino_get_compile_config();
|
const ov::AnyMap & ggml_openvino_get_compile_config();
|
||||||
|
|
||||||
|
// Get the OpenCL command queue for GPU operations (returns nullptr for CPU/NPU)
|
||||||
|
cl_command_queue ggml_openvino_get_cl_queue();
|
||||||
|
|
||||||
|
// Intel USM extension function type
|
||||||
|
typedef cl_int(CL_API_CALL * clEnqueueMemFillINTEL_fn)(cl_command_queue queue,
|
||||||
|
void * dst_ptr,
|
||||||
|
const void * pattern,
|
||||||
|
size_t pattern_size,
|
||||||
|
size_t size,
|
||||||
|
cl_uint num_events_in_wait_list,
|
||||||
|
const cl_event * event_wait_list,
|
||||||
|
cl_event * event);
|
||||||
|
|
||||||
|
typedef cl_int(CL_API_CALL * clEnqueueMemcpyINTEL_fn)(cl_command_queue queue,
|
||||||
|
cl_bool blocking,
|
||||||
|
void * dst_ptr,
|
||||||
|
const void * src_ptr,
|
||||||
|
size_t size,
|
||||||
|
cl_uint num_events_in_wait_list,
|
||||||
|
const cl_event * event_wait_list,
|
||||||
|
cl_event * event);
|
||||||
|
|
||||||
|
// Get the clEnqueueMemFillINTEL function pointer (returns nullptr if not available)
|
||||||
|
clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL();
|
||||||
|
|
||||||
|
// Get the clEnqueueMemcpyINTEL function pointer (returns nullptr if not available)
|
||||||
|
clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL();
|
||||||
|
|
||||||
// =====================================================
|
// =====================================================
|
||||||
// Global Device Configuration (singleton)
|
// Global Device Configuration (singleton)
|
||||||
// =====================================================
|
// =====================================================
|
||||||
|
|
@ -33,8 +64,10 @@ struct ggml_openvino_device_config {
|
||||||
bool initialized = false;
|
bool initialized = false;
|
||||||
std::optional<ov::RemoteContext> remote_context;
|
std::optional<ov::RemoteContext> remote_context;
|
||||||
ov::AnyMap compile_config;
|
ov::AnyMap compile_config;
|
||||||
|
cl_command_queue cl_queue = nullptr;
|
||||||
|
|
||||||
void init();
|
void init();
|
||||||
|
~ggml_openvino_device_config();
|
||||||
};
|
};
|
||||||
|
|
||||||
// Get the global device config singleton
|
// Get the global device config singleton
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,8 @@
|
||||||
#include "ggml-quants.hpp"
|
#include "ggml-quants.hpp"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#include <CL/cl_ext.h>
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
@ -52,17 +54,23 @@ struct ggml_backend_openvino_buffer_context {
|
||||||
// For non-weight buffers (KV cache, compute), we still use contiguous allocation
|
// For non-weight buffers (KV cache, compute), we still use contiguous allocation
|
||||||
void * data;
|
void * data;
|
||||||
size_t size;
|
size_t size;
|
||||||
|
bool is_remote;
|
||||||
|
|
||||||
std::shared_ptr<ov::Tensor> ov_tensor;
|
// Wrapping of the buffer
|
||||||
|
std::shared_ptr<ov::Tensor> ov_buffer;
|
||||||
|
|
||||||
// Track all extras for cleanup
|
// Track all extras for cleanup
|
||||||
std::map<ggml_tensor *, ggml_openvino_extra_base *> tensor_extras;
|
std::map<ggml_tensor *, ggml_openvino_extra_base *> tensor_extras;
|
||||||
|
|
||||||
ggml_backend_openvino_buffer_context(int device, size_t size) :
|
// Used for re-allocation on device for kvcache
|
||||||
|
void * data_prev;
|
||||||
|
|
||||||
|
ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) :
|
||||||
device(device),
|
device(device),
|
||||||
name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
|
name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
|
||||||
data(nullptr),
|
data(nullptr),
|
||||||
size(size) {
|
size(size),
|
||||||
|
is_remote(is_remote) {
|
||||||
if (size == 0) {
|
if (size == 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
@ -76,17 +84,22 @@ struct ggml_backend_openvino_buffer_context {
|
||||||
#else
|
#else
|
||||||
data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size);
|
data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size);
|
||||||
#endif
|
#endif
|
||||||
ov_tensor = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
|
ov_buffer = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
|
||||||
} else if (device_name == "GPU") {
|
} else if (device_name == "GPU") {
|
||||||
auto gpu_context = core.get_default_context("GPU").as<ov::intel_gpu::ocl::ClContext>();
|
auto gpu_context = core.get_default_context("GPU").as<ov::intel_gpu::ocl::ClContext>();
|
||||||
auto usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
|
ov::intel_gpu::ocl::USMTensor usm_tensor;
|
||||||
|
if (is_remote) {
|
||||||
|
usm_tensor = gpu_context.create_usm_device_tensor(ov::element::u8, ov::Shape{size});
|
||||||
|
} else {
|
||||||
|
usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
|
||||||
|
}
|
||||||
data = usm_tensor.get();
|
data = usm_tensor.get();
|
||||||
ov_tensor = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
|
ov_buffer = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
|
||||||
} else {
|
} else {
|
||||||
auto npu_context = core.get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>();
|
auto npu_context = core.get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>();
|
||||||
auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size});
|
auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size});
|
||||||
data = l0_tensor.get();
|
data = l0_tensor.get();
|
||||||
ov_tensor = std::make_shared<ov::intel_npu::level_zero::ZeroBufferTensor>(std::move(l0_tensor));
|
ov_buffer = std::make_shared<ov::intel_npu::level_zero::ZeroBufferTensor>(std::move(l0_tensor));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (data == nullptr) {
|
if (data == nullptr) {
|
||||||
|
|
@ -135,6 +148,22 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
|
||||||
}
|
}
|
||||||
|
|
||||||
static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||||
|
GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
|
||||||
|
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
|
||||||
|
|
||||||
|
// Put kvcache on device memory for GPU
|
||||||
|
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote &&
|
||||||
|
ggml_openvino_get_device_name() == "GPU") {
|
||||||
|
GGML_ASSERT(ctx->tensor_extras.empty());
|
||||||
|
auto device = ctx->device;
|
||||||
|
auto size = ctx->size;
|
||||||
|
auto * data_prev = ctx->data;
|
||||||
|
delete ctx;
|
||||||
|
ctx = new ggml_backend_openvino_buffer_context(device, size, true);
|
||||||
|
buffer->context = ctx;
|
||||||
|
tensor->data = (char *) ctx->data + ((char *) tensor->data - (char *) data_prev);
|
||||||
|
}
|
||||||
|
|
||||||
// Views share the extra from view_src
|
// Views share the extra from view_src
|
||||||
if (tensor->view_src != nullptr) {
|
if (tensor->view_src != nullptr) {
|
||||||
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
||||||
|
|
@ -144,7 +173,7 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu
|
||||||
return GGML_STATUS_SUCCESS;
|
return GGML_STATUS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
|
ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
|
||||||
|
|
||||||
if (tensor->data != nullptr) {
|
if (tensor->data != nullptr) {
|
||||||
ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor);
|
ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor);
|
||||||
|
|
@ -166,9 +195,28 @@ static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buffer_t buf
|
||||||
uint8_t value,
|
uint8_t value,
|
||||||
size_t offset,
|
size_t offset,
|
||||||
size_t size) {
|
size_t size) {
|
||||||
|
GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
|
||||||
GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
|
GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
|
||||||
memset((char *) tensor->data + offset, value, size);
|
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
|
if (ctx->is_remote) {
|
||||||
|
// For remote (device) buffers, use OpenCL USM memfill
|
||||||
|
cl_command_queue queue = ggml_openvino_get_cl_queue();
|
||||||
|
auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL();
|
||||||
|
if (queue != nullptr && mem_fill_fn != nullptr) {
|
||||||
|
uint8_t pattern = value;
|
||||||
|
cl_int err = mem_fill_fn(queue, (char *) tensor->data + offset, &pattern, sizeof(pattern), size, 0, nullptr,
|
||||||
|
nullptr);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
GGML_LOG_ERROR("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, err);
|
||||||
|
}
|
||||||
|
clFinish(queue);
|
||||||
|
} else {
|
||||||
|
GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer\n", __func__);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
memset((char *) tensor->data + offset, value, size);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||||
|
|
@ -176,6 +224,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
|
||||||
const void * data,
|
const void * data,
|
||||||
size_t offset,
|
size_t offset,
|
||||||
size_t size) {
|
size_t size) {
|
||||||
|
// GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
|
||||||
GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
|
GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
|
||||||
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
|
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
|
||||||
|
|
||||||
|
|
@ -260,8 +309,23 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
|
||||||
e.what());
|
e.what());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Non-weight tensor (KV cache, activations, etc.) - just copy data
|
// Non-weight tensor (KV cache, activations, etc.) - copy data
|
||||||
memcpy((char *) tensor->data + offset, data, size);
|
if (ctx->is_remote) {
|
||||||
|
// For remote (device) buffers, use OpenCL USM memcpy (host-to-device)
|
||||||
|
cl_command_queue queue = ggml_openvino_get_cl_queue();
|
||||||
|
auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
|
||||||
|
if (queue != nullptr && mem_cpy_fn != nullptr) {
|
||||||
|
cl_int err =
|
||||||
|
mem_cpy_fn(queue, CL_TRUE, (char *) tensor->data + offset, data, size, 0, nullptr, nullptr);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
memcpy((char *) tensor->data + offset, data, size);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor);
|
ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor);
|
||||||
if (extra == nullptr) {
|
if (extra == nullptr) {
|
||||||
|
|
@ -283,28 +347,99 @@ static void ggml_backend_openvino_buffer_get_tensor(ggml_backend_buffer_t buffer
|
||||||
void * data,
|
void * data,
|
||||||
size_t offset,
|
size_t offset,
|
||||||
size_t size) {
|
size_t size) {
|
||||||
|
// GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
|
||||||
GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
|
GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
|
||||||
memcpy(data, (const char *) tensor->data + offset, size);
|
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
|
if (ctx->is_remote) {
|
||||||
|
// For remote (device) buffers, use OpenCL USM memcpy (device-to-host)
|
||||||
|
cl_command_queue queue = ggml_openvino_get_cl_queue();
|
||||||
|
auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
|
||||||
|
if (queue != nullptr && mem_cpy_fn != nullptr) {
|
||||||
|
cl_int err =
|
||||||
|
mem_cpy_fn(queue, CL_TRUE, data, (const char *) tensor->data + offset, size, 0, nullptr, nullptr);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
memcpy(data, (const char *) tensor->data + offset, size);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
||||||
const ggml_tensor * src,
|
const ggml_tensor * src,
|
||||||
ggml_tensor * dst) {
|
ggml_tensor * dst) {
|
||||||
|
// GGML_LOG_DEBUG("%s: src tensor name=%s, dst tensor name=%s\n", __func__, src->name, dst->name);
|
||||||
GGML_ASSERT(src != nullptr && dst != nullptr);
|
GGML_ASSERT(src != nullptr && dst != nullptr);
|
||||||
// Can copy from any host buffer (including other OpenVINO buffers)
|
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
|
||||||
|
|
||||||
|
if (ctx->is_remote) {
|
||||||
|
// For remote (device) buffers, use OpenCL USM memcpy
|
||||||
|
cl_command_queue queue = ggml_openvino_get_cl_queue();
|
||||||
|
auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
|
||||||
|
if (queue == nullptr || mem_cpy_fn == nullptr) {
|
||||||
|
GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Can copy from host to device
|
||||||
|
if (ggml_backend_buffer_is_host(src->buffer)) {
|
||||||
|
cl_int err = mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (host-to-device) failed with error %d\n", __func__, err);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// Can also copy from device to device if both are OpenVINO remote buffers
|
||||||
|
if (ggml_backend_buffer_is_openvino(src->buffer)) {
|
||||||
|
ggml_backend_openvino_buffer_context * src_ctx =
|
||||||
|
(ggml_backend_openvino_buffer_context *) src->buffer->context;
|
||||||
|
if (src_ctx->is_remote) {
|
||||||
|
cl_int err =
|
||||||
|
mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__,
|
||||||
|
err);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Host buffer - can copy from any host buffer
|
||||||
if (ggml_backend_buffer_is_host(src->buffer)) {
|
if (ggml_backend_buffer_is_host(src->buffer)) {
|
||||||
memcpy(dst->data, src->data, ggml_nbytes(src));
|
memcpy(dst->data, src->data, ggml_nbytes(src));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fills the whole buffer (ctx->size bytes starting at ctx->data) with `value`.
//
// Host buffers are cleared with memset. Remote (device) buffers are cleared
// with clEnqueueMemFillINTEL on the shared OpenCL queue, followed by clFinish
// so the fill has completed before returning. Problems on the remote path are
// only logged as warnings.
static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    auto * buf_ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
    GGML_ASSERT(buf_ctx->data != nullptr);

    if (!buf_ctx->is_remote) {
        memset(buf_ctx->data, value, buf_ctx->size);
        return;
    }

    // For remote (device) buffers, use OpenCL command queue
    GGML_ASSERT(ggml_openvino_get_device_name() == "GPU");
    cl_command_queue cl_q     = ggml_openvino_get_cl_queue();
    auto             usm_fill = ggml_openvino_get_clEnqueueMemFillINTEL();
    if (cl_q == nullptr || usm_fill == nullptr) {
        GGML_LOG_WARN("%s: no OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer clear\n",
                      __func__);
        return;
    }

    // One-byte fill pattern, repeated across the whole allocation.
    const cl_int status = usm_fill(cl_q, buf_ctx->data, &value, sizeof(value), buf_ctx->size, 0, nullptr, nullptr);
    if (status != CL_SUCCESS) {
        GGML_LOG_WARN("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, status);
    }
    clFinish(cl_q);
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue