From d7578497415cd660a0856f87823cfa76db5cdc64 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Mon, 22 Dec 2025 16:45:17 +0800
Subject: [PATCH] Put kvcache on GPU
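
The KV cache previously lived in USM host memory on GPU. Create an OpenCL
context and command queue up front and build a queue-sharing
ov::intel_gpu::ocl::ClContext on top of them, so OpenVINO inference and the
backend's own copies are submitted to the same queue. Buffers that hold
cache_* tensors are re-allocated as USM device tensors, and
memset/set/get/cpy on such buffers go through the Intel USM extension entry
points (clEnqueueMemFillINTEL / clEnqueueMemcpyINTEL), which are resolved
lazily at runtime.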
---
 .../src/ggml-openvino/ggml-openvino-extra.cpp |  78 +++++++-
 ggml/src/ggml-openvino/ggml-openvino-extra.h  |  33 ++++
 ggml/src/ggml-openvino/ggml-openvino.cpp      | 169 ++++++++++++++++--
 3 files changed, 262 insertions(+), 18 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
index 085ae1ece4..aa50d46c03 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -46,13 +46,56 @@ void ggml_openvino_device_config::init() {
         ov_singleton_core().set_property(ov::cache_dir(cache_dir));
     }
 
-    if (device_name != "CPU") {
+    // Initialize remote context with queue sharing for GPU
+    if (device_name == "GPU") {
+        // Create OpenCL context and queue
+        cl_int err;
+        cl_platform_id platform;
+        err = clGetPlatformIDs(1, &platform, nullptr);
+        if (err != CL_SUCCESS) {
+            GGML_LOG_ERROR("Failed to get OpenCL platform: %d\n", err);
+            return;
+        }
+
+        cl_device_id cl_device;
+        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &cl_device, nullptr);
+        if (err != CL_SUCCESS) {
+            GGML_LOG_ERROR("Failed to get OpenCL device: %d\n", err);
+            return;
+        }
+
+        cl_context cl_ctx = clCreateContext(nullptr, 1, &cl_device, nullptr, nullptr, &err);
+        if (err != CL_SUCCESS) {
+            GGML_LOG_ERROR("Failed to create OpenCL context: %d\n", err);
+            return;
+        }
+
+        cl_queue = clCreateCommandQueueWithProperties(cl_ctx, cl_device, nullptr, &err);
+        if (err != CL_SUCCESS) {
+            GGML_LOG_ERROR("Failed to create OpenCL command queue: %d\n", err);
+            clReleaseContext(cl_ctx);
+            return;
+        }
+
+        // Create OpenVINO remote context with queue sharing
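+        // The queue is created in-order (no CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE), so
+        // OpenVINO inference and the backend's USM fills/copies stay ordered on it.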
+        remote_context = ov::intel_gpu::ocl::ClContext(ov_singleton_core(), cl_queue);
+
+        // Release the context (queue keeps a reference)
+        clReleaseContext(cl_ctx);
+    } else if (device_name == "NPU") {
         remote_context = ov_singleton_core().get_default_context(device_name);
     }
 
     initialized = true;
 }
 
+ggml_openvino_device_config::~ggml_openvino_device_config() {
+    if (cl_queue != nullptr) {
+        clReleaseCommandQueue(cl_queue);
+        cl_queue = nullptr;
+    }
+}
+
 // Get the global device config singleton
 ggml_openvino_device_config & ggml_openvino_get_device_config() {
     static ggml_openvino_device_config config;
@@ -84,6 +127,39 @@ const ov::AnyMap & ggml_openvino_get_compile_config() {
     return ggml_openvino_get_device_config().compile_config;
 }
 
+// Get the OpenCL command queue for GPU operations
+cl_command_queue ggml_openvino_get_cl_queue() {
+    return ggml_openvino_get_device_config().cl_queue;
+}
+
+// Get the clEnqueueMemFillINTEL function pointer (lazy load)
+clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL() {
+    static clEnqueueMemFillINTEL_fn fn = nullptr;
+    static bool loaded = false;
+    if (!loaded) {
+        loaded = true;
+        cl_platform_id platform;
+        if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) {
+            fn = (clEnqueueMemFillINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemFillINTEL");
+        }
+    }
+    return fn;
+}
+
+// Get the clEnqueueMemcpyINTEL function pointer (lazy load)
+clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
+    static clEnqueueMemcpyINTEL_fn fn = nullptr;
+    static bool loaded = false;
+    if (!loaded) {
+        loaded = true;
+        cl_platform_id platform;
+        if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) {
+            fn = (clEnqueueMemcpyINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL");
+        }
+    }
+    return fn;
+}
+
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
 std::optional<ggml_type> ggml_openvino_get_requant_type(ggml_type type) {
     if (!ggml_openvino_is_npu()) {
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h
index fdd8312dff..a1a8514190 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.h
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h
@@ -3,6 +3,9 @@
 #include "ggml.h"
 #include "openvino/runtime/core.hpp"
 
+#define CL_TARGET_OPENCL_VERSION 300
+#include <CL/cl.h>
+
 #include <cstdint>
 #include <optional>
 #include <string>
@@ -22,6 +25,34 @@ std::optional<ov::RemoteContext> ggml_openvino_get_remote_context();
 // Get the compile config for the current device
 const ov::AnyMap & ggml_openvino_get_compile_config();
 
+// Get the OpenCL command queue for GPU operations (returns nullptr for CPU/NPU)
+cl_command_queue ggml_openvino_get_cl_queue();
+
+// Intel USM extension function type
+typedef cl_int(CL_API_CALL * clEnqueueMemFillINTEL_fn)(cl_command_queue queue,
+                                                       void * dst_ptr,
+                                                       const void * pattern,
+                                                       size_t pattern_size,
+                                                       size_t size,
+                                                       cl_uint num_events_in_wait_list,
+                                                       const cl_event * event_wait_list,
+                                                       cl_event * event);
+
+typedef cl_int(CL_API_CALL * clEnqueueMemcpyINTEL_fn)(cl_command_queue queue,
+                                                      cl_bool blocking,
+                                                      void * dst_ptr,
+                                                      const void * src_ptr,
+                                                      size_t size,
+                                                      cl_uint num_events_in_wait_list,
+                                                      const cl_event * event_wait_list,
+                                                      cl_event * event);
+
+// Get the clEnqueueMemFillINTEL function pointer (returns nullptr if not available)
+clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL();
+
+// Get the clEnqueueMemcpyINTEL function pointer (returns nullptr if not available)
+clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL();
+
 // =====================================================
 // Global Device Configuration (singleton)
 // =====================================================
@@ -33,8 +64,10 @@ struct ggml_openvino_device_config {
     bool initialized = false;
     std::optional<ov::RemoteContext> remote_context;
     ov::AnyMap compile_config;
+    cl_command_queue cl_queue = nullptr;
 
     void init();
+    ~ggml_openvino_device_config();
 };
 
 // Get the global device config singleton
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index c5c25fb6c1..e139c2d662 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -8,6 +8,8 @@
 #include "ggml-quants.hpp"
 #include "ggml.h"
 
+#include <CL/cl.h>
+
 #include <map>
 #include <memory>
 #include <string>
@@ -52,17 +54,23 @@ struct ggml_backend_openvino_buffer_context {
     // For non-weight buffers (KV cache, compute), we still use contiguous allocation
     void * data;
     size_t size;
+    bool is_remote;
 
-    std::shared_ptr<ov::Tensor> ov_tensor;
+    // Wrapping of the buffer
+    std::shared_ptr<ov::Tensor> ov_buffer;
 
     // Track all extras for cleanup
    std::map<ggml_tensor *, ggml_openvino_tensor_extra *> tensor_extras;
 
-    ggml_backend_openvino_buffer_context(int device, size_t size) :
+    // Used for re-allocation on device for kvcache
+    void * data_prev;
+
+    ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) :
         device(device),
         name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
         data(nullptr),
-        size(size) {
+        size(size),
+        is_remote(is_remote) {
         if (size == 0) {
             return;
         }
@@ -76,17 +84,22 @@ struct ggml_backend_openvino_buffer_context {
 #else
             data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size);
 #endif
-            ov_tensor = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
+            ov_buffer = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
         } else if (device_name == "GPU") {
             auto gpu_context = core.get_default_context("GPU").as<ov::intel_gpu::ocl::ClContext>();
-            auto usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
+            ov::intel_gpu::ocl::USMTensor usm_tensor;
+            if (is_remote) {
+                usm_tensor = gpu_context.create_usm_device_tensor(ov::element::u8, ov::Shape{size});
+            } else {
+                usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
+            }
             data = usm_tensor.get();
-            ov_tensor = std::make_shared<ov::Tensor>(std::move(usm_tensor));
+            ov_buffer = std::make_shared<ov::Tensor>(std::move(usm_tensor));
         } else {
             auto npu_context = core.get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>();
             auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size});
             data = l0_tensor.get();
-            ov_tensor = std::make_shared<ov::Tensor>(std::move(l0_tensor));
+            ov_buffer = std::make_shared<ov::Tensor>(std::move(l0_tensor));
         }
 
         if (data == nullptr) {
@@ -135,6 +148,22 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
 }
 
 static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
+    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
+
+    // Put kvcache on device memory for GPU
+    if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote &&
+        ggml_openvino_get_device_name() == "GPU") {
+        GGML_ASSERT(ctx->tensor_extras.empty());
+        auto device = ctx->device;
+        auto size = ctx->size;
+        auto * data_prev = ctx->data;
+        delete ctx;
+        ctx = new ggml_backend_openvino_buffer_context(device, size, true);
+        buffer->context = ctx;
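+        // Rebase tensor->data into the new device allocation, preserving its byte offset within the buffer.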
+        tensor->data = (char *) ctx->data + ((char *) tensor->data - (char *) data_prev);
+    }
+
     // Views share the extra from view_src
     if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
@@ -144,7 +173,7 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu
         return GGML_STATUS_SUCCESS;
     }
 
-    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
+    ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
 
     if (tensor->data != nullptr) {
         ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor);
@@ -166,9 +195,28 @@ static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buf
                                                        uint8_t value,
                                                        size_t offset,
                                                        size_t size) {
+    GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
     GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
-    memset((char *) tensor->data + offset, value, size);
-    GGML_UNUSED(buffer);
+    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
+
+    if (ctx->is_remote) {
+        // For remote (device) buffers, use OpenCL USM memfill
+        cl_command_queue queue = ggml_openvino_get_cl_queue();
+        auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL();
+        if (queue != nullptr && mem_fill_fn != nullptr) {
+            uint8_t pattern = value;
+            cl_int err = mem_fill_fn(queue, (char *) tensor->data + offset, &pattern, sizeof(pattern), size, 0, nullptr,
+                                     nullptr);
+            if (err != CL_SUCCESS) {
+                GGML_LOG_ERROR("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, err);
+            }
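+            // clEnqueueMemFillINTEL is asynchronous; block until the fill has completed.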
+            clFinish(queue);
+        } else {
+            GGML_LOG_ERROR("%s: OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer\n", __func__);
+        }
+    } else {
+        memset((char *) tensor->data + offset, value, size);
+    }
 }
 
 static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer,
@@ -176,6 +224,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer
                                                     const void * data,
                                                     size_t offset,
                                                     size_t size) {
+    // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
     GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
 
     ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
@@ -260,8 +309,23 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer
                 e.what());
         }
     } else {
-        // Non-weight tensor (KV cache, activations, etc.) - just copy data
-        memcpy((char *) tensor->data + offset, data, size);
+        // Non-weight tensor (KV cache, activations, etc.) - copy data
+        if (ctx->is_remote) {
+            // For remote (device) buffers, use OpenCL USM memcpy (host-to-device)
+            cl_command_queue queue = ggml_openvino_get_cl_queue();
+            auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
+            if (queue != nullptr && mem_cpy_fn != nullptr) {
+                cl_int err =
+                    mem_cpy_fn(queue, CL_TRUE, (char *) tensor->data + offset, data, size, 0, nullptr, nullptr);
+                if (err != CL_SUCCESS) {
+                    GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err);
+                }
+            } else {
+                GGML_LOG_ERROR("%s: OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
+            }
+        } else {
+            memcpy((char *) tensor->data + offset, data, size);
+        }
 
         ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor);
         if (extra == nullptr) {
@@ -283,28 +347,99 @@ static void ggml_backend_openvino_buffer_get_tensor(ggml_backend_buffer
                                                     void * data,
                                                     size_t offset,
                                                     size_t size) {
+    // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
     GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
-    memcpy(data, (const char *) tensor->data + offset, size);
-    GGML_UNUSED(buffer);
+    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
+
+    if (ctx->is_remote) {
+        // For remote (device) buffers, use OpenCL USM memcpy (device-to-host)
+        cl_command_queue queue = ggml_openvino_get_cl_queue();
+        auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
+        if (queue != nullptr && mem_cpy_fn != nullptr) {
+            cl_int err =
+                mem_cpy_fn(queue, CL_TRUE, data, (const char *) tensor->data + offset, size, 0, nullptr, nullptr);
+            if (err != CL_SUCCESS) {
+                GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err);
+            }
+        } else {
+            GGML_LOG_ERROR("%s: OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
+        }
+    } else {
+        memcpy(data, (const char *) tensor->data + offset, size);
+    }
 }
 
 static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
                                                     const ggml_tensor * src,
                                                     ggml_tensor * dst) {
+    // GGML_LOG_DEBUG("%s: src tensor name=%s, dst tensor name=%s\n", __func__, src->name, dst->name);
     GGML_ASSERT(src != nullptr && dst != nullptr);
 
-    // Can copy from any host buffer (including other OpenVINO buffers)
+    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
+
+    if (ctx->is_remote) {
+        // For remote (device) buffers, use OpenCL USM memcpy
+        cl_command_queue queue = ggml_openvino_get_cl_queue();
+        auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
+        if (queue == nullptr || mem_cpy_fn == nullptr) {
+            GGML_LOG_ERROR("%s: OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
+            return false;
+        }
+        // Can copy from host to device
+        if (ggml_backend_buffer_is_host(src->buffer)) {
+            cl_int err = mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
+            if (err != CL_SUCCESS) {
+                GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (host-to-device) failed with error %d\n", __func__, err);
+                return false;
+            }
+            return true;
+        }
+        // Can also copy from device to device if both are OpenVINO remote buffers
+        if (ggml_backend_buffer_is_openvino(src->buffer)) {
+            ggml_backend_openvino_buffer_context * src_ctx =
+                (ggml_backend_openvino_buffer_context *) src->buffer->context;
+            if (src_ctx->is_remote) {
+                cl_int err =
+                    mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
+                if (err != CL_SUCCESS) {
+                    GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__,
                                   err);
+                    return false;
+                }
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Host buffer - can copy from any host buffer
     if (ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
     }
 
     return false;
-
-    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
-    if (ctx->data != nullptr) {
+    GGML_ASSERT(ctx->data != nullptr);
+    if (!ctx->is_remote) {
         memset(ctx->data, value, ctx->size);
+    } else {
+        // For remote (device) buffers, use OpenCL command queue
+        GGML_ASSERT(ggml_openvino_get_device_name() == "GPU");
+        cl_command_queue queue = ggml_openvino_get_cl_queue();
+        auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL();
+        if (queue != nullptr && mem_fill_fn != nullptr) {
+            uint8_t pattern = value;
+            cl_int err = mem_fill_fn(queue, ctx->data, &pattern, sizeof(pattern), ctx->size, 0, nullptr, nullptr);
+            if (err != CL_SUCCESS) {
+                GGML_LOG_WARN("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, err);
+            }
+            clFinish(queue);
+        } else {
+            GGML_LOG_WARN("%s: OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer clear\n",
                          __func__);
+        }
     }
 }
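
Reviewer note (not part of the patch): below is a minimal, self-contained
sketch of the USM flow this change relies on -- a queue-sharing ClContext
plus a USM device allocation written through the lazily resolved Intel
extension entry point. It only uses calls that already appear in the diff;
error handling is trimmed, and it assumes an Intel GPU exposing the
cl_intel_unified_shared_memory extension.

#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>

#include <cstdint>
#include <cstdio>
#include <vector>

#include "openvino/runtime/core.hpp"
#include "openvino/runtime/intel_gpu/ocl/ocl.hpp"

typedef cl_int(CL_API_CALL * clEnqueueMemcpyINTEL_fn)(cl_command_queue, cl_bool, void *, const void *, size_t,
                                                      cl_uint, const cl_event *, cl_event *);

int main() {
    // Plain OpenCL setup, mirroring ggml_openvino_device_config::init().
    cl_platform_id platform;
    cl_device_id device;
    cl_int err = clGetPlatformIDs(1, &platform, nullptr);
    err |= clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr);
    cl_context cl_ctx = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
    cl_command_queue queue = clCreateCommandQueueWithProperties(cl_ctx, device, nullptr, &err);

    // Queue-sharing remote context: OpenVINO submits its work to the same queue
    // that the backend uses for USM fills/copies.
    ov::Core core;
    ov::intel_gpu::ocl::ClContext remote_ctx(core, queue);

    // USM device tensor: get() returns a device pointer usable with the USM extension.
    auto usm = remote_ctx.create_usm_device_tensor(ov::element::u8, ov::Shape{1024});
    void * dev_ptr = usm.get();

    // The extension entry point is not core OpenCL; resolve it at runtime.
    auto memcpy_fn = (clEnqueueMemcpyINTEL_fn) clGetExtensionFunctionAddressForPlatform(
        platform, "clEnqueueMemcpyINTEL");

    // Blocking host-to-device copy into the USM allocation.
    std::vector<uint8_t> host(1024, 0x2a);
    if (memcpy_fn != nullptr) {
        err = memcpy_fn(queue, CL_TRUE, dev_ptr, host.data(), host.size(), 0, nullptr, nullptr);
        printf("USM host-to-device copy: %s\n", err == CL_SUCCESS ? "ok" : "failed");
    }

    clReleaseCommandQueue(queue);
    clReleaseContext(cl_ctx);
    return 0;
}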