From d7578497415cd660a0856f87823cfa76db5cdc64 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Mon, 22 Dec 2025 16:45:17 +0800
Subject: [PATCH] Put kvcache on GPU
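
The KV cache previously lived in USM host memory on GPU. Create an OpenCL
context and command queue up front and build a queue-sharing
ov::intel_gpu::ocl::ClContext on top of them, so OpenVINO inference and the
backend's own copies are submitted to the same queue. Buffers that hold
cache_* tensors are re-allocated as USM device tensors, and
memset/set/get/cpy on such buffers go through the Intel USM extension entry
points (clEnqueueMemFillINTEL / clEnqueueMemcpyINTEL), which are resolved
lazily at runtime.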
---
 .../src/ggml-openvino/ggml-openvino-extra.cpp |  78 +++++++-
 ggml/src/ggml-openvino/ggml-openvino-extra.h  |  33 ++++
 ggml/src/ggml-openvino/ggml-openvino.cpp      | 169 ++++++++++++++++--
 3 files changed, 262 insertions(+), 18 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
index 085ae1ece4..aa50d46c03 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -46,13 +46,56 @@ void ggml_openvino_device_config::init() {
         ov_singleton_core().set_property(ov::cache_dir(cache_dir));
     }
 
-    if (device_name != "CPU") {
+    // Initialize remote context with queue sharing for GPU
+    if (device_name == "GPU") {
+        // Create OpenCL context and queue
+        cl_int err;
+        cl_platform_id platform;
+        err = clGetPlatformIDs(1, &platform, nullptr);
+        if (err != CL_SUCCESS) {
+            GGML_LOG_ERROR("Failed to get OpenCL platform: %d\n", err);
+            return;
+        }
+
+        cl_device_id cl_device;
+        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &cl_device, nullptr);
+        if (err != CL_SUCCESS) {
+            GGML_LOG_ERROR("Failed to get OpenCL device: %d\n", err);
+            return;
+        }
+
+        cl_context cl_ctx = clCreateContext(nullptr, 1, &cl_device, nullptr, nullptr, &err);
+        if (err != CL_SUCCESS) {
+            GGML_LOG_ERROR("Failed to create OpenCL context: %d\n", err);
+            return;
+        }
+
+        cl_queue = clCreateCommandQueueWithProperties(cl_ctx, cl_device, nullptr, &err);
+        if (err != CL_SUCCESS) {
+            GGML_LOG_ERROR("Failed to create OpenCL command queue: %d\n", err);
+            clReleaseContext(cl_ctx);
+            return;
+        }
+
+        // Create OpenVINO remote context with queue sharing
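+        // The queue is created in-order (no CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE), so
+        // OpenVINO inference and the backend's USM fills/copies stay ordered on it.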
+        remote_context = ov::intel_gpu::ocl::ClContext(ov_singleton_core(), cl_queue);
+
+        // Release the context (queue keeps a reference)
+        clReleaseContext(cl_ctx);
+    } else if (device_name == "NPU") {
         remote_context = ov_singleton_core().get_default_context(device_name);
     }
 
     initialized = true;
 }
 
+ggml_openvino_device_config::~ggml_openvino_device_config() {
+    if (cl_queue != nullptr) {
+        clReleaseCommandQueue(cl_queue);
+        cl_queue = nullptr;
+    }
+}
+
 // Get the global device config singleton
 ggml_openvino_device_config & ggml_openvino_get_device_config() {
     static ggml_openvino_device_config config;
@@ -84,6 +127,39 @@ const ov::AnyMap & ggml_openvino_get_compile_config() {
     return ggml_openvino_get_device_config().compile_config;
 }
 
+// Get the OpenCL command queue for GPU operations
+cl_command_queue ggml_openvino_get_cl_queue() {
+    return ggml_openvino_get_device_config().cl_queue;
+}
+
+// Get the clEnqueueMemFillINTEL function pointer (lazy load)
+clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL() {
+    static clEnqueueMemFillINTEL_fn fn = nullptr;
+    static bool loaded = false;
+    if (!loaded) {
+        loaded = true;
+        cl_platform_id platform;
+        if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) {
+            fn = (clEnqueueMemFillINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemFillINTEL");
+        }
+    }
+    return fn;
+}
+
+// Get the clEnqueueMemcpyINTEL function pointer (lazy load)
+clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
+    static clEnqueueMemcpyINTEL_fn fn = nullptr;
+    static bool loaded = false;
+    if (!loaded) {
+        loaded = true;
+        cl_platform_id platform;
+        if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) {
+            fn = (clEnqueueMemcpyINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL");
+        }
+    }
+    return fn;
+}
+
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
 std::optional<ggml_type> ggml_openvino_get_requant_type(ggml_type type) {
     if (!ggml_openvino_is_npu()) {
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h
index fdd8312dff..a1a8514190 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.h
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h
@@ -3,6 +3,9 @@
 #include "ggml.h"
 #include "openvino/runtime/core.hpp"
 
+#define CL_TARGET_OPENCL_VERSION 300
+#include <CL/cl.h>
+
 #include <cstdint>
 #include <optional>
 #include <string>
@@ -22,6 +25,34 @@ std::optional<ov::RemoteContext> ggml_openvino_get_remote_context();
 // Get the compile config for the current device
 const ov::AnyMap & ggml_openvino_get_compile_config();
 
+// Get the OpenCL command queue for GPU operations (returns nullptr for CPU/NPU)
+cl_command_queue ggml_openvino_get_cl_queue();
+
+// Intel USM extension function type
+typedef cl_int(CL_API_CALL * clEnqueueMemFillINTEL_fn)(cl_command_queue queue,
+                                                       void * dst_ptr,
+                                                       const void * pattern,
+                                                       size_t pattern_size,
+                                                       size_t size,
+                                                       cl_uint num_events_in_wait_list,
+                                                       const cl_event * event_wait_list,
+                                                       cl_event * event);
+
+typedef cl_int(CL_API_CALL * clEnqueueMemcpyINTEL_fn)(cl_command_queue queue,
+                                                      cl_bool blocking,
+                                                      void * dst_ptr,
+                                                      const void * src_ptr,
+                                                      size_t size,
+                                                      cl_uint num_events_in_wait_list,
+                                                      const cl_event * event_wait_list,
+                                                      cl_event * event);
+
+// Get the clEnqueueMemFillINTEL function pointer (returns nullptr if not available)
+clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL();
+
+// Get the clEnqueueMemcpyINTEL function pointer (returns nullptr if not available)
+clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL();
+
 // =====================================================
 // Global Device Configuration (singleton)
 // =====================================================
@@ -33,8 +64,10 @@ struct ggml_openvino_device_config {
     bool initialized = false;
     std::optional<ov::RemoteContext> remote_context;
     ov::AnyMap compile_config;
+    cl_command_queue cl_queue = nullptr;
 
     void init();
+    ~ggml_openvino_device_config();
 };
 
 // Get the global device config singleton
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index c5c25fb6c1..e139c2d662 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -8,6 +8,8 @@
 #include "ggml-quants.hpp"
 #include "ggml.h"
 
+#include <CL/cl.h>
+
 #include <map>
 #include <memory>
 #include <string>
@@ -52,17 +54,23 @@ struct ggml_backend_openvino_buffer_context {
     // For non-weight buffers (KV cache, compute), we still use contiguous allocation
     void * data;
     size_t size;
+    bool is_remote;
 
-    std::shared_ptr<ov::Tensor> ov_tensor;
+    // Wrapping of the buffer
+    std::shared_ptr<ov::Tensor> ov_buffer;
 
     // Track all extras for cleanup
    std::map<ggml_tensor *, ggml_openvino_tensor_extra *> tensor_extras;
 
-    ggml_backend_openvino_buffer_context(int device, size_t size) :
+    // Used for re-allocation on device for kvcache
+    void * data_prev;
+
+    ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) :
         device(device),
         name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
         data(nullptr),
-        size(size) {
+        size(size),
+        is_remote(is_remote) {
         if (size == 0) {
             return;
         }
@@ -76,17 +84,22 @@ struct ggml_backend_openvino_buffer_context {
 #else
             data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size);
 #endif
-            ov_tensor = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
+            ov_buffer = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
         } else if (device_name == "GPU") {
             auto gpu_context = core.get_default_context("GPU").as<ov::intel_gpu::ocl::ClContext>();
-            auto usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
+            ov::intel_gpu::ocl::USMTensor usm_tensor;
+            if (is_remote) {
+                usm_tensor = gpu_context.create_usm_device_tensor(ov::element::u8, ov::Shape{size});
+            } else {
+                usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
+            }
             data = usm_tensor.get();
-            ov_tensor = std::make_shared<ov::Tensor>(std::move(usm_tensor));
+            ov_buffer = std::make_shared<ov::Tensor>(std::move(usm_tensor));
         } else {
             auto npu_context = core.get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>();
             auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size});
             data = l0_tensor.get();
-            ov_tensor = std::make_shared<ov::Tensor>(std::move(l0_tensor));
+            ov_buffer = std::make_shared<ov::Tensor>(std::move(l0_tensor));
         }
 
         if (data == nullptr) {
@@ -135,6 +148,22 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
 }
 
 static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
+    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
+
+    // Put kvcache on device memory for GPU
+    if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote &&
+        ggml_openvino_get_device_name() == "GPU") {
+        GGML_ASSERT(ctx->tensor_extras.empty());
+        auto device = ctx->device;
+        auto size = ctx->size;
+        auto * data_prev = ctx->data;
+        delete ctx;
+        ctx = new ggml_backend_openvino_buffer_context(device, size, true);
+        buffer->context = ctx;
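+        // Rebase tensor->data into the new device allocation, preserving its byte offset within the buffer.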
+        tensor->data = (char *) ctx->data + ((char *) tensor->data - (char *) data_prev);
+    }
+
     // Views share the extra from view_src
     if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
@@ -144,7 +173,7 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu
         return GGML_STATUS_SUCCESS;
     }
 
-    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
+    ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
 
     if (tensor->data != nullptr) {
         ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor);
@@ -166,9 +195,28 @@ static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buf
                                                        uint8_t value,
                                                        size_t offset,
                                                        size_t size) {
+    GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
     GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
-    memset((char *) tensor->data + offset, value, size);
-    GGML_UNUSED(buffer);
+    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
+
+    if (ctx->is_remote) {
+        // For remote (device) buffers, use OpenCL USM memfill
+        cl_command_queue queue = ggml_openvino_get_cl_queue();
+        auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL();
+        if (queue != nullptr && mem_fill_fn != nullptr) {
+            uint8_t pattern = value;
+            cl_int err = mem_fill_fn(queue, (char *) tensor->data + offset, &pattern, sizeof(pattern), size, 0, nullptr,
+                                     nullptr);
+            if (err != CL_SUCCESS) {
+                GGML_LOG_ERROR("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, err);
+            }
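+            // clEnqueueMemFillINTEL is asynchronous; block until the fill has completed.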
+            clFinish(queue);
+        } else {
+            GGML_LOG_ERROR("%s: OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer\n", __func__);
+        }
+    } else {
+        memset((char *) tensor->data + offset, value, size);
+    }
 }
 
 static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer,
@@ -176,6 +224,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer
                                                     const void * data,
                                                     size_t offset,
                                                     size_t size) {
+    // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
     GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
 
     ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
@@ -260,8 +309,23 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer
                 e.what());
         }
     } else {
-        // Non-weight tensor (KV cache, activations, etc.) - just copy data
-        memcpy((char *) tensor->data + offset, data, size);
+        // Non-weight tensor (KV cache, activations, etc.) - copy data
+        if (ctx->is_remote) {
+            // For remote (device) buffers, use OpenCL USM memcpy (host-to-device)
+            cl_command_queue queue = ggml_openvino_get_cl_queue();
+            auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
+            if (queue != nullptr && mem_cpy_fn != nullptr) {
+                cl_int err =
+                    mem_cpy_fn(queue, CL_TRUE, (char *) tensor->data + offset, data, size, 0, nullptr, nullptr);
+                if (err != CL_SUCCESS) {
+                    GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err);
+                }
+            } else {
+                GGML_LOG_ERROR("%s: OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
+            }
+        } else {
+            memcpy((char *) tensor->data + offset, data, size);
+        }
 
         ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor);
         if (extra == nullptr) {
@@ -283,28 +347,99 @@ static void ggml_backend_openvino_buffer_get_tensor(ggml_backend_buffer
                                                     void * data,
                                                     size_t offset,
                                                     size_t size) {
+    // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
     GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
-    memcpy(data, (const char *) tensor->data + offset, size);
-    GGML_UNUSED(buffer);
+    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
+
+    if (ctx->is_remote) {
+        // For remote (device) buffers, use OpenCL USM memcpy (device-to-host)
+        cl_command_queue queue = ggml_openvino_get_cl_queue();
+        auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
+        if (queue != nullptr && mem_cpy_fn != nullptr) {
+            cl_int err =
+                mem_cpy_fn(queue, CL_TRUE, data, (const char *) tensor->data + offset, size, 0, nullptr, nullptr);
+            if (err != CL_SUCCESS) {
+                GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err);
+            }
+        } else {
+            GGML_LOG_ERROR("%s: OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
+        }
+    } else {
+        memcpy(data, (const char *) tensor->data + offset, size);
+    }
 }
 
 static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
                                                     const ggml_tensor * src,
                                                     ggml_tensor * dst) {
+    // GGML_LOG_DEBUG("%s: src tensor name=%s, dst tensor name=%s\n", __func__, src->name, dst->name);
     GGML_ASSERT(src != nullptr && dst != nullptr);
 
-    // Can copy from any host buffer (including other OpenVINO buffers)
+    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
+
+    if (ctx->is_remote) {
+        // For remote (device) buffers, use OpenCL USM memcpy
+        cl_command_queue queue = ggml_openvino_get_cl_queue();
+        auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
+        if (queue == nullptr || mem_cpy_fn == nullptr) {
+            GGML_LOG_ERROR("%s: OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
+            return false;
+        }
+        // Can copy from host to device
+        if (ggml_backend_buffer_is_host(src->buffer)) {
+            cl_int err = mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
+            if (err != CL_SUCCESS) {
+                GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (host-to-device) failed with error %d\n", __func__, err);
+                return false;
+            }
+            return true;
+        }
+        // Can also copy from device to device if both are OpenVINO remote buffers
+        if (ggml_backend_buffer_is_openvino(src->buffer)) {
+            ggml_backend_openvino_buffer_context * src_ctx =
+                (ggml_backend_openvino_buffer_context *) src->buffer->context;
+            if (src_ctx->is_remote) {
+                cl_int err =
+                    mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
+                if (err != CL_SUCCESS) {
+                    GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__,
                                   err);
+                    return false;
+                }
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Host buffer - can copy from any host buffer
     if (ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
     }
 
     return false;
-
-    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
-    if (ctx->data != nullptr) {
+    GGML_ASSERT(ctx->data != nullptr);
+    if (!ctx->is_remote) {
         memset(ctx->data, value, ctx->size);
+    } else {
+        // For remote (device) buffers, use OpenCL command queue
+        GGML_ASSERT(ggml_openvino_get_device_name() == "GPU");
+        cl_command_queue queue = ggml_openvino_get_cl_queue();
+        auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL();
+        if (queue != nullptr && mem_fill_fn != nullptr) {
+            uint8_t pattern = value;
+            cl_int err = mem_fill_fn(queue, ctx->data, &pattern, sizeof(pattern), ctx->size, 0, nullptr, nullptr);
+            if (err != CL_SUCCESS) {
+                GGML_LOG_WARN("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, err);
+            }
+            clFinish(queue);
+        } else {
+            GGML_LOG_WARN("%s: OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer clear\n",
                          __func__);
+        }
     }
 }
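
Reviewer note (not part of the patch): below is a minimal, self-contained
sketch of the USM flow this change relies on -- a queue-sharing ClContext
plus a USM device allocation written through the lazily resolved Intel
extension entry point. It only uses calls that already appear in the diff;
error handling is trimmed, and it assumes an Intel GPU exposing the
cl_intel_unified_shared_memory extension.

#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>

#include <cstdint>
#include <cstdio>
#include <vector>

#include "openvino/runtime/core.hpp"
#include "openvino/runtime/intel_gpu/ocl/ocl.hpp"

typedef cl_int(CL_API_CALL * clEnqueueMemcpyINTEL_fn)(cl_command_queue, cl_bool, void *, const void *, size_t,
                                                      cl_uint, const cl_event *, cl_event *);

int main() {
    // Plain OpenCL setup, mirroring ggml_openvino_device_config::init().
    cl_platform_id platform;
    cl_device_id device;
    cl_int err = clGetPlatformIDs(1, &platform, nullptr);
    err |= clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr);
    cl_context cl_ctx = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
    cl_command_queue queue = clCreateCommandQueueWithProperties(cl_ctx, device, nullptr, &err);

    // Queue-sharing remote context: OpenVINO submits its work to the same queue
    // that the backend uses for USM fills/copies.
    ov::Core core;
    ov::intel_gpu::ocl::ClContext remote_ctx(core, queue);

    // USM device tensor: get() returns a device pointer usable with the USM extension.
    auto usm = remote_ctx.create_usm_device_tensor(ov::element::u8, ov::Shape{1024});
    void * dev_ptr = usm.get();

    // The extension entry point is not core OpenCL; resolve it at runtime.
    auto memcpy_fn = (clEnqueueMemcpyINTEL_fn) clGetExtensionFunctionAddressForPlatform(
        platform, "clEnqueueMemcpyINTEL");

    // Blocking host-to-device copy into the USM allocation.
    std::vector<uint8_t> host(1024, 0x2a);
    if (memcpy_fn != nullptr) {
        err = memcpy_fn(queue, CL_TRUE, dev_ptr, host.data(), host.size(), 0, nullptr, nullptr);
        printf("USM host-to-device copy: %s\n", err == CL_SUCCESS ? "ok" : "failed");
    }

    clReleaseCommandQueue(queue);
    clReleaseContext(cl_ctx);
    return 0;
}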