From b7feacf7f39d79518e6ca48a9fcf697c5244b585 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 28 Jan 2026 10:49:40 +0100 Subject: [PATCH] ggml: new backend for Virglrenderer API Remoting acceleration (v2) (#18718) --- CODEOWNERS | 1 + ggml/CMakeLists.txt | 3 + ggml/include/ggml-virtgpu.h | 16 + ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 14 + ggml/src/ggml-virtgpu/CMakeLists.txt | 70 +++ .../ggml-virtgpu/apir_cs_ggml-rpc-front.cpp | 87 +++ ggml/src/ggml-virtgpu/backend/CMakeLists.txt | 21 + .../backend/apir_cs_ggml-rpc-back.cpp | 115 ++++ .../ggml-virtgpu/backend/backend-convert.h | 13 + .../backend/backend-dispatched-backend.cpp | 65 +++ .../backend-dispatched-buffer-type.cpp | 89 ++++ .../backend/backend-dispatched-buffer.cpp | 131 +++++ .../backend/backend-dispatched-device.cpp | 148 ++++++ .../backend/backend-dispatched.cpp | 46 ++ .../backend/backend-dispatched.gen.h | 130 +++++ .../ggml-virtgpu/backend/backend-dispatched.h | 23 + .../ggml-virtgpu/backend/backend-virgl-apir.h | 32 ++ ggml/src/ggml-virtgpu/backend/backend.cpp | 148 ++++++ .../backend/shared/api_remoting.h | 90 ++++ .../backend/shared/apir_backend.gen.h | 36 ++ .../backend/shared/apir_backend.h | 46 ++ .../src/ggml-virtgpu/backend/shared/apir_cs.h | 383 ++++++++++++++ .../backend/shared/apir_cs_ggml.h | 211 ++++++++ .../ggml-virtgpu/backend/shared/apir_cs_rpc.h | 54 ++ .../ggml-virtgpu/ggml-backend-buffer-type.cpp | 98 ++++ ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp | 119 +++++ ggml/src/ggml-virtgpu/ggml-backend-device.cpp | 144 +++++ ggml/src/ggml-virtgpu/ggml-backend-reg.cpp | 137 +++++ ggml/src/ggml-virtgpu/ggml-backend.cpp | 69 +++ ggml/src/ggml-virtgpu/ggml-remoting.h | 68 +++ .../ggml-virtgpu/ggmlremoting_functions.yaml | 168 ++++++ ggml/src/ggml-virtgpu/include/apir_hw.h | 9 + ggml/src/ggml-virtgpu/regenerate_remoting.py | 322 +++++++++++ ggml/src/ggml-virtgpu/virtgpu-apir.h | 15 + .../ggml-virtgpu/virtgpu-forward-backend.cpp | 50 ++ .../virtgpu-forward-buffer-type.cpp | 125 +++++ .../ggml-virtgpu/virtgpu-forward-buffer.cpp | 157 ++++++ .../ggml-virtgpu/virtgpu-forward-device.cpp | 200 +++++++ ggml/src/ggml-virtgpu/virtgpu-forward-impl.h | 29 + ggml/src/ggml-virtgpu/virtgpu-forward.gen.h | 51 ++ ggml/src/ggml-virtgpu/virtgpu-shm.cpp | 99 ++++ ggml/src/ggml-virtgpu/virtgpu-shm.h | 23 + ggml/src/ggml-virtgpu/virtgpu-utils.cpp | 179 +++++++ ggml/src/ggml-virtgpu/virtgpu-utils.h | 86 +++ ggml/src/ggml-virtgpu/virtgpu.cpp | 498 ++++++++++++++++++ ggml/src/ggml-virtgpu/virtgpu.h | 92 ++++ 47 files changed, 4711 insertions(+) create mode 100644 ggml/include/ggml-virtgpu.h create mode 100644 ggml/src/ggml-virtgpu/CMakeLists.txt create mode 100644 ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp create mode 100644 ggml/src/ggml-virtgpu/backend/CMakeLists.txt create mode 100644 ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp create mode 100644 ggml/src/ggml-virtgpu/backend/backend-convert.h create mode 100644 ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp create mode 100644 ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp create mode 100644 ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp create mode 100644 ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp create mode 100644 ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp create mode 100644 ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h create mode 100644 ggml/src/ggml-virtgpu/backend/backend-dispatched.h create mode 100644 ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h 
create mode 100644 ggml/src/ggml-virtgpu/backend/backend.cpp create mode 100644 ggml/src/ggml-virtgpu/backend/shared/api_remoting.h create mode 100644 ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h create mode 100644 ggml/src/ggml-virtgpu/backend/shared/apir_backend.h create mode 100644 ggml/src/ggml-virtgpu/backend/shared/apir_cs.h create mode 100644 ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h create mode 100644 ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h create mode 100644 ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp create mode 100644 ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp create mode 100644 ggml/src/ggml-virtgpu/ggml-backend-device.cpp create mode 100644 ggml/src/ggml-virtgpu/ggml-backend-reg.cpp create mode 100644 ggml/src/ggml-virtgpu/ggml-backend.cpp create mode 100644 ggml/src/ggml-virtgpu/ggml-remoting.h create mode 100644 ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml create mode 100644 ggml/src/ggml-virtgpu/include/apir_hw.h create mode 100755 ggml/src/ggml-virtgpu/regenerate_remoting.py create mode 100644 ggml/src/ggml-virtgpu/virtgpu-apir.h create mode 100644 ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp create mode 100644 ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp create mode 100644 ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp create mode 100644 ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp create mode 100644 ggml/src/ggml-virtgpu/virtgpu-forward-impl.h create mode 100644 ggml/src/ggml-virtgpu/virtgpu-forward.gen.h create mode 100644 ggml/src/ggml-virtgpu/virtgpu-shm.cpp create mode 100644 ggml/src/ggml-virtgpu/virtgpu-shm.h create mode 100644 ggml/src/ggml-virtgpu/virtgpu-utils.cpp create mode 100644 ggml/src/ggml-virtgpu/virtgpu-utils.h create mode 100644 ggml/src/ggml-virtgpu/virtgpu.cpp create mode 100644 ggml/src/ggml-virtgpu/virtgpu.h diff --git a/CODEOWNERS b/CODEOWNERS index 55f5011dfa..6086abb564 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -67,6 +67,7 @@ /ggml/src/ggml-rpc/ @rgerganov /ggml/src/ggml-threading.* @ggerganov /ggml/src/ggml-vulkan/ @0cc4m +/ggml/src/ggml-virtgpu/ @kpouget /ggml/src/ggml-webgpu/ @reeselevine /ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM /ggml/src/ggml.c @ggerganov diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 0176ca1ce9..b0b8e57898 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -228,6 +228,8 @@ option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU) option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF) option(GGML_WEBGPU_JSPI "ggml: use JSPI for WebGPU" ON) option(GGML_ZDNN "ggml: use zDNN" OFF) +option(GGML_VIRTGPU "ggml: use the VirtGPU/Virglrenderer API Remoting frontend" OFF) +option(GGML_VIRTGPU_BACKEND "ggml: build the VirtGPU/Virglrenderer API Remoting backend" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF) @@ -320,6 +322,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-opt.h include/ggml-metal.h include/ggml-rpc.h + include/ggml-virtgpu.h include/ggml-sycl.h include/ggml-vulkan.h include/ggml-webgpu.h diff --git a/ggml/include/ggml-virtgpu.h b/ggml/include/ggml-virtgpu.h new file mode 100644 index 0000000000..1cb4bd7a03 --- /dev/null +++ b/ggml/include/ggml-virtgpu.h @@ -0,0 +1,16 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_REMOTING_FRONTEND_NAME 
"RemotingFrontend" + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_virtgpu_reg(); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 6192a87046..260ad48f0e 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -451,6 +451,7 @@ ggml_add_backend(HIP) ggml_add_backend(METAL) ggml_add_backend(MUSA) ggml_add_backend(RPC) +ggml_add_backend(VirtGPU) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) ggml_add_backend(WebGPU) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 6bee1bc4b4..dd991f262e 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -69,6 +69,10 @@ #include "ggml-rpc.h" #endif +#ifdef GGML_USE_VIRTGPU_FRONTEND +#include "ggml-virtgpu.h" +#endif + #ifdef GGML_USE_CANN #include "ggml-cann.h" #endif @@ -180,7 +184,12 @@ struct ggml_backend_registry { register_backend(ggml_backend_sycl_reg()); #endif #ifdef GGML_USE_VULKAN + // Add runtime disable check + if (getenv("GGML_DISABLE_VULKAN") == nullptr) { register_backend(ggml_backend_vk_reg()); + } else { + GGML_LOG_DEBUG("Vulkan backend disabled by GGML_DISABLE_VULKAN environment variable\n"); + } #endif #ifdef GGML_USE_WEBGPU register_backend(ggml_backend_webgpu_reg()); @@ -188,6 +197,10 @@ struct ggml_backend_registry { #ifdef GGML_USE_ZDNN register_backend(ggml_backend_zdnn_reg()); #endif +#ifdef GGML_USE_VIRTGPU_FRONTEND + register_backend(ggml_backend_virtgpu_reg()); +#endif + #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif @@ -604,6 +617,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("rpc", silent, dir_path); ggml_backend_load_best("sycl", silent, dir_path); ggml_backend_load_best("vulkan", silent, dir_path); + ggml_backend_load_best("virtgpu", silent, dir_path); ggml_backend_load_best("opencl", silent, dir_path); ggml_backend_load_best("hexagon", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); diff --git a/ggml/src/ggml-virtgpu/CMakeLists.txt b/ggml/src/ggml-virtgpu/CMakeLists.txt new file mode 100644 index 0000000000..e6b020beb5 --- /dev/null +++ b/ggml/src/ggml-virtgpu/CMakeLists.txt @@ -0,0 +1,70 @@ +cmake_minimum_required(VERSION 3.19) +cmake_policy(SET CMP0114 NEW) + +include(ExternalProject) + +message(STATUS "Including the VirtGPU/Virglrenderer API Remoting") + +# Download venus_hw.h from virglrenderer repository +ExternalProject_Add( + venus_hw_header + URL https://gitlab.freedesktop.org/virgl/virglrenderer/-/raw/virglrenderer-1.2.0/src/venus_hw.h + DOWNLOAD_NO_EXTRACT YES + DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include + DOWNLOAD_NAME venus_hw.h + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + LOG_DOWNLOAD ON +) + +if (NOT GGML_VIRTGPU_BACKEND STREQUAL "ONLY") + message(STATUS "Enable the VirtGPU/Virglrenderer API Remoting frontend library") + + find_package(PkgConfig REQUIRED) + pkg_check_modules(DRM REQUIRED libdrm) + if (NOT GGML_BACKEND_DL) + # cannot simply use USE_VIRTGPU, as in the 'else()' case the + # frontend isn't compiled + target_compile_definitions(ggml PUBLIC "GGML_USE_VIRTGPU_FRONTEND") + endif() + + ggml_add_backend_library(ggml-virtgpu + ggml-backend-buffer.cpp + ggml-backend.cpp + ggml-backend-device.cpp + ggml-backend-reg.cpp + ggml-backend-buffer-type.cpp + virtgpu-apir.h + virtgpu-forward.gen.h + virtgpu.cpp + virtgpu-shm.cpp + virtgpu-utils.cpp + virtgpu-forward-device.cpp + virtgpu-forward-buffer-type.cpp + virtgpu-forward-buffer.cpp + virtgpu-forward-backend.cpp 
+ virtgpu-forward-impl.h + apir_cs_ggml-rpc-front.cpp + ../../include/ggml-virtgpu.h) + + target_include_directories(ggml-virtgpu PUBLIC /usr/include/libdrm/) + + target_link_libraries(ggml-virtgpu PUBLIC ${DRM_LIBRARIES}) + target_include_directories(ggml-virtgpu PUBLIC ${DRM_INCLUDE_DIRS}) + target_compile_options(ggml-virtgpu PUBLIC ${DRM_CFLAGS_OTHER}) + + target_include_directories(ggml-virtgpu PUBLIC ./include) + target_include_directories(ggml-virtgpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + + # Ensure venus_hw.h is downloaded before building ggml-virtgpu + add_dependencies(ggml-virtgpu venus_hw_header) + + target_compile_options(ggml-virtgpu PRIVATE -std=c++20) +else() + message(STATUS "Not building the VirtGPU/Virglrenderer API Remoting frontend library") +endif() + +if (NOT GGML_VIRTGPU_BACKEND STREQUAL "OFF") + add_subdirectory("backend") +endif() diff --git a/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp b/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp new file mode 100644 index 0000000000..f60ae3556c --- /dev/null +++ b/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp @@ -0,0 +1,87 @@ +#include "backend/shared/apir_cs_rpc.h" +#include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include "ggml-remoting.h" + +#include +#include +#include +#include + +apir_rpc_tensor apir_serialize_tensor(const ggml_tensor * tensor) { + apir_rpc_tensor result; + result.id = reinterpret_cast(tensor); + result.type = tensor->type; + if (tensor->buffer) { + ggml_backend_buffer_t buffer = tensor->buffer; + + result.buffer = BUFFER_TO_HOST_HANDLE(buffer); + } else { + result.buffer = 0; + } + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result.ne[i] = tensor->ne[i]; + result.nb[i] = tensor->nb[i]; + } + result.op = tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result.op_params[i] = tensor->op_params[i]; + } + result.flags = tensor->flags; + for (uint32_t i = 0; i < GGML_MAX_SRC; i++) { + result.src[i] = reinterpret_cast(tensor->src[i]); + } + result.view_src = reinterpret_cast(tensor->view_src); + result.view_offs = tensor->view_offs; + result.data = reinterpret_cast(tensor->data); + if (tensor->data) { + if (!tensor->buffer) { + GGML_ABORT("tensor has data but not buffer"); + } + // tensor->data is serialized as an offset to the buffer base address + result.data -= reinterpret_cast(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base); + } + snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); + return result; +} + +void apir_add_tensor(ggml_tensor * tensor, + std::vector & tensors, + std::unordered_set & visited) { + if (tensor == nullptr) { + return; + } + if (visited.find(tensor) != visited.end()) { + return; + } + visited.insert(tensor); + for (int i = 0; i < GGML_MAX_SRC; i++) { + apir_add_tensor(tensor->src[i], tensors, visited); + } + apir_add_tensor(tensor->view_src, tensors, visited); + tensors.push_back(apir_serialize_tensor(tensor)); +} + +void apir_serialize_graph(const ggml_cgraph * cgraph, std::vector & output) { + uint32_t n_nodes = cgraph->n_nodes; + std::vector tensors; + std::unordered_set visited; + for (uint32_t i = 0; i < n_nodes; i++) { + apir_add_tensor(cgraph->nodes[i], tensors, visited); + } + // serialization format: + // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(apir_rpc_tensor)) | + uint32_t n_tensors = tensors.size(); + int output_size = + sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(apir_rpc_tensor); + 
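+    // note: each node entry below is the guest-side ggml_tensor pointer, reused as a
+    // stable identifier; the host resolves it against the serialized tensor table when
+    // rebuilding the graph (see apir_deserialize_graph / apir_create_node in the backend).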
output.resize(output_size, 0); + memcpy(output.data(), &n_nodes, sizeof(n_nodes)); + for (uint32_t i = 0; i < n_nodes; i++) { + memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t)); + } + uint32_t * out_ntensors = (uint32_t *) (output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t)); + *out_ntensors = n_tensors; + apir_rpc_tensor * out_tensors = + (apir_rpc_tensor *) (output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t)); + memcpy(out_tensors, tensors.data(), n_tensors * sizeof(apir_rpc_tensor)); +} diff --git a/ggml/src/ggml-virtgpu/backend/CMakeLists.txt b/ggml/src/ggml-virtgpu/backend/CMakeLists.txt new file mode 100644 index 0000000000..0b49c403b9 --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 3.19) +cmake_policy(SET CMP0114 NEW) + +message(STATUS "Enable the VirtGPU/Virglrenderer backend library") + +ggml_add_backend_library(ggml-virtgpu-backend + backend.cpp + backend-dispatched.cpp + backend-dispatched-backend.cpp + backend-dispatched-device.cpp + backend-dispatched-buffer.cpp + backend-dispatched-buffer-type.cpp + shared/api_remoting.h + shared/apir_backend.h + shared/apir_cs.h + apir_cs_ggml-rpc-back.cpp) + +target_compile_options(ggml-virtgpu-backend PRIVATE -std=c++20) + +# Add include directory for ggml-backend-impl.h and other core headers +target_include_directories(ggml-virtgpu-backend PRIVATE ../..) diff --git a/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp b/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp new file mode 100644 index 0000000000..60a8a93bfb --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp @@ -0,0 +1,115 @@ +#include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include "shared/apir_cs_rpc.h" + +#include +#include +#include +#include + +std::unordered_set backend_buffers; + +void apir_track_backend_buffer(ggml_backend_buffer_t buffer) { + backend_buffers.insert(buffer); +} + +bool apir_untrack_backend_buffer(ggml_backend_buffer_t buffer) { + auto it = backend_buffers.find(buffer); + if (it == backend_buffers.end()) { + return false; + } + + backend_buffers.erase(it); + return true; +} + +std::unordered_set apir_get_track_backend_buffers() { + return backend_buffers; +} + +ggml_tensor * apir_deserialize_tensor(ggml_context * ctx, const apir_rpc_tensor * tensor) { + ggml_tensor * result = + ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = tensor->nb[i]; + } + result->buffer = reinterpret_cast(tensor->buffer); + if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) { + printf("WARNING: HOST BUFFER NOT FOUND | %p\n", (void *) result->buffer); + result->buffer = nullptr; + } + + uint64_t tensor_data = tensor->data; + if (result->buffer) { + // require that the tensor data does not go beyond the buffer end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + + // tensor->data is serialized as an offset to the buffer base address + tensor_data += buffer_start; + + GGML_ASSERT(tensor_data + tensor_size >= tensor_data); // check for overflow + GGML_ASSERT(tensor_data >= buffer_start && tensor_data + tensor_size <= buffer_start + buffer_size); + } + + 
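+    // when the tensor references a tracked buffer, tensor_data has been rebased above from
+    // the guest-provided offset to a host address and bounds-checked against that buffer;
+    // otherwise it is kept as received.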
result->op = (ggml_op) tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result->op_params[i] = tensor->op_params[i]; + } + result->flags = tensor->flags; + result->data = reinterpret_cast(tensor_data); + ggml_set_name(result, tensor->name); + return result; +} + +ggml_tensor * apir_create_node(uint64_t id, + ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map) { + if (id == 0) { + return nullptr; + } + if (tensor_map.find(id) != tensor_map.end()) { + return tensor_map[id]; + } + const apir_rpc_tensor * tensor = tensor_ptrs.at(id); + ggml_tensor * result = apir_deserialize_tensor(ctx, tensor); + if (result == nullptr) { + return nullptr; + } + tensor_map[id] = result; + for (int i = 0; i < GGML_MAX_SRC; i++) { + result->src[i] = apir_create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + } + result->view_src = apir_create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map); + result->view_offs = tensor->view_offs; + return result; +} + +ggml_cgraph * apir_deserialize_graph(uint32_t n_nodes, + uint32_t n_tensors, + const apir_rpc_tensor * tensors, + const uint64_t * nodes) { + size_t buf_size = ggml_tensor_overhead() * (n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false); + ggml_init_params params = { + /*.mem_size =*/buf_size, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/true, + }; + ggml_context * ctx = ggml_init(params); + ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false); + graph->n_nodes = n_nodes; + std::unordered_map tensor_ptrs; + for (uint32_t i = 0; i < n_tensors; i++) { + tensor_ptrs[tensors[i].id] = &tensors[i]; + } + std::unordered_map tensor_map; + for (uint32_t i = 0; i < n_nodes; i++) { + int64_t id; + memcpy(&id, &nodes[i], sizeof(id)); + graph->nodes[i] = apir_create_node(id, ctx, tensor_ptrs, tensor_map); + } + + return graph; +} diff --git a/ggml/src/ggml-virtgpu/backend/backend-convert.h b/ggml/src/ggml-virtgpu/backend/backend-convert.h new file mode 100644 index 0000000000..1978d21f7e --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/backend-convert.h @@ -0,0 +1,13 @@ +#include "shared/apir_backend.h" + +#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name) + +static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_host_handle_t) buffer; +} + +static inline apir_buffer_type_host_handle_t ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_type_host_handle_t) buft; +} diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp new file mode 100644 index 0000000000..77b4ee71e1 --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp @@ -0,0 +1,65 @@ +#include "backend-dispatched.h" +#include "backend-virgl-apir.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" +#include "shared/apir_backend.h" + +#include + +uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(enc); + + static bool async_backend_initialized = false; + static bool async_backend; + + if (!async_backend_initialized) { + ggml_backend_dev_props props; + + dev->iface.get_props(dev, &props); + async_backend = props.caps.async; + 
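+        // cache the device's async capability: when the backend is asynchronous,
+        // graph_compute below is followed by an explicit synchronize(), so the status
+        // encoded back to the guest reflects a finished graph.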
async_backend_initialized = true; + } + + uint32_t shmem_res_id; + apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + const void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); + if (!shmem_data) { + GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n"); + apir_decoder_set_fatal(dec); + return 1; + } + size_t cgraph_size; + apir_decode_size_t(dec, &cgraph_size); + + apir_decoder secondary_dec = apir_new_decoder((const char *) shmem_data, cgraph_size); + + ggml_cgraph * cgraph = apir_decode_ggml_cgraph(&secondary_dec, cgraph_size); + + ggml_status status; +#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1 + for (int idx = 0; idx < cgraph->n_nodes; idx++) { + ggml_tensor * op = ggml_graph_node(cgraph, idx); + if (dev->iface.supports_op(dev, op)) { + continue; + } + GGML_LOG_ERROR("Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op)); + + status = GGML_STATUS_ABORTED; + apir_encode_ggml_status(enc, &status); + + return 0; + } +#endif + status = bck->iface.graph_compute(bck, cgraph); + + if (async_backend) { + bck->iface.synchronize(bck); + } + + apir_encode_ggml_status(enc, &status); + + return 0; +} diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp new file mode 100644 index 0000000000..8ea1bb4fb4 --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp @@ -0,0 +1,89 @@ +#include "backend-dispatched.h" +#include "backend-virgl-apir.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" + +#include + +uint32_t backend_buffer_type_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = apir_decode_ggml_buffer_type(dec); + + const char * string = buft->iface.get_name(buft); + + const size_t string_size = strlen(string) + 1; + apir_encode_array_size(enc, string_size); + apir_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t backend_buffer_type_get_alignment(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = apir_decode_ggml_buffer_type(dec); + + size_t value = buft->iface.get_alignment(buft); + apir_encode_size_t(enc, &value); + + return 0; +} + +uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = apir_decode_ggml_buffer_type(dec); + + size_t value = buft->iface.get_max_size(buft); + apir_encode_size_t(enc, &value); + + return 0; +} + +uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = apir_decode_ggml_buffer_type(dec); + + bool is_host = buft->iface.is_host(buft); + apir_encode_bool_t(enc, &is_host); + + return 0; +} + +uint32_t backend_buffer_type_alloc_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = apir_decode_ggml_buffer_type(dec); + + size_t size; + apir_decode_size_t(dec, &size); + + ggml_backend_buffer_t buffer; + + buffer = buft->iface.alloc_buffer(buft, size); + + apir_encode_ggml_buffer(enc, buffer); + + if (buffer) { + apir_track_backend_buffer(buffer); + } + + return 0; +} + +uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context 
* ctx) { + GGML_UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = apir_decode_ggml_buffer_type(dec); + + const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec); + + size_t value = buft->iface.get_alloc_size(buft, op); + + apir_encode_size_t(enc, &value); + + return 0; +} diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp new file mode 100644 index 0000000000..cf81888e98 --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp @@ -0,0 +1,131 @@ +#include "backend-dispatched.h" +#include "backend-virgl-apir.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" + +#include + +uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + ggml_backend_buffer_t buffer; + buffer = apir_decode_ggml_buffer(dec); + + uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); + apir_encode_uintptr_t(enc, &base); + + return 0; +} + +uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = apir_decode_ggml_buffer(dec); + + ggml_tensor * tensor; + // safe to remove the const qualifier here + tensor = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec); + + uint32_t shmem_res_id; + apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + size_t offset; + apir_decode_size_t(dec, &offset); + + size_t size; + apir_decode_size_t(dec, &size); + + void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); + + if (!shmem_data) { + GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n"); + return 1; + } + + buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size); + + return 0; +} + +uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = apir_decode_ggml_buffer(dec); + + const ggml_tensor * tensor; + // safe to remove the const qualifier here + tensor = apir_decode_ggml_tensor(dec); + + uint32_t shmem_res_id; + apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + size_t offset; + apir_decode_size_t(dec, &offset); + + size_t size; + apir_decode_size_t(dec, &size); + + void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); + if (!shmem_data) { + GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n"); + return 1; + } + + buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size); + + return 0; +} + +uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + + ggml_backend_buffer_t buffer; + buffer = apir_decode_ggml_buffer(dec); + + const ggml_tensor * src; + // safe to remove the const qualifier here + src = apir_decode_ggml_tensor(dec); + ggml_tensor * dst = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec); + + bool ret = buffer->iface.cpy_tensor(buffer, src, (ggml_tensor *) dst); + + apir_encode_bool_t(enc, &ret); + + return 0; +} + +uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = apir_decode_ggml_buffer(dec); + + uint8_t value; + apir_decode_uint8_t(dec, &value); + + buffer->iface.clear(buffer, value); + + return 0; +} + +uint32_t 
backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = apir_decode_ggml_buffer(dec); + + if (!apir_untrack_backend_buffer(buffer)) { + GGML_LOG_WARN("%s: unknown buffer %p\n", __func__, (void *) buffer); + return 1; + } + + buffer->iface.free_buffer(buffer); + + return 0; +} diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp new file mode 100644 index 0000000000..497f737a88 --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp @@ -0,0 +1,148 @@ +#include "backend-dispatched.h" +#include "backend-virgl-apir.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" + +#include + +uint32_t backend_device_get_device_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(ctx); + GGML_UNUSED(dec); + + int32_t dev_count = reg->iface.get_device_count(reg); + apir_encode_int32_t(enc, &dev_count); + + return 0; +} + +uint32_t backend_device_get_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(ctx); + GGML_UNUSED(dec); + + int32_t dev_count = reg->iface.get_device_count(reg); + apir_encode_int32_t(enc, &dev_count); + + return 0; +} + +uint32_t backend_device_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(dec); + + const char * string = dev->iface.get_name(dev); + + const size_t string_size = strlen(string) + 1; + apir_encode_array_size(enc, string_size); + apir_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t backend_device_get_description(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(dec); + + const char * string = dev->iface.get_description(dev); + + const size_t string_size = strlen(string) + 1; + apir_encode_array_size(enc, string_size); + apir_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t backend_device_get_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(dec); + + uint32_t type = dev->iface.get_type(dev); + apir_encode_uint32_t(enc, &type); + + return 0; +} + +uint32_t backend_device_get_memory(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(dec); + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + + apir_encode_size_t(enc, &free); + apir_encode_size_t(enc, &total); + + return 0; +} + +uint32_t backend_device_supports_op(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + + const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec); + + bool supports_op = dev->iface.supports_op(dev, op); + + apir_encode_bool_t(enc, &supports_op); + + return 0; +} + +uint32_t backend_device_get_buffer_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(dec); + + ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev); + + apir_encode_ggml_buffer_type(enc, bufft); + + return 0; +} + +uint32_t backend_device_get_props(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(dec); + + ggml_backend_dev_props props; + dev->iface.get_props(dev, &props); + + apir_encode_bool_t(enc, &props.caps.async); + 
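+    // the capability flags are encoded one bool at a time; the frontend is expected to
+    // decode them in the same order (async, host_buffer, buffer_from_host_ptr, events).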
apir_encode_bool_t(enc, &props.caps.host_buffer); + apir_encode_bool_t(enc, &props.caps.buffer_from_host_ptr); + apir_encode_bool_t(enc, &props.caps.events); + + return 0; +} + +uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { + GGML_UNUSED(ctx); + GGML_UNUSED(dec); + + uint32_t shmem_res_id; + apir_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + void * shmem_ptr = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); + if (!shmem_ptr) { + GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n"); + apir_decoder_set_fatal(dec); + return 1; + } + + size_t size; + apir_decode_size_t(dec, &size); + size_t max_tensor_size; + apir_decode_size_t(dec, &max_tensor_size); + + ggml_backend_buffer_t buffer; + buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size); + + apir_encode_ggml_buffer(enc, buffer); + apir_encode_ggml_buffer_type(enc, buffer->buft); + + if (buffer) { + apir_track_backend_buffer(buffer); + } + + return 0; +} diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp new file mode 100644 index 0000000000..51d445725f --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp @@ -0,0 +1,46 @@ +#include "backend-dispatched.h" +#include "backend-virgl-apir.h" + +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" + +#include + +ggml_backend_reg_t reg = NULL; +ggml_backend_dev_t dev = NULL; +ggml_backend_t bck = NULL; + +uint64_t timer_start = 0; +uint64_t timer_total = 0; +uint64_t timer_count = 0; + +uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) { + if (reg != NULL) { + GGML_LOG_WARN("%s: already initialized\n", __func__); + return APIR_BACKEND_INITIALIZE_ALREADY_INITED; + } + ggml_backend_reg_t (*ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p; + + reg = ggml_backend_reg_fct(); + if (reg == NULL) { + GGML_LOG_ERROR("%s: backend registration failed\n", __func__); + return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED; + } + + if (!reg->iface.get_device_count(reg)) { + GGML_LOG_ERROR("%s: backend initialization failed: no device found\n", __func__); + return APIR_BACKEND_INITIALIZE_NO_DEVICE; + } + + dev = reg->iface.get_device(reg, 0); + + if (!dev) { + GGML_LOG_ERROR("%s: backend initialization failed: no device received\n", __func__); + return APIR_BACKEND_INITIALIZE_NO_DEVICE; + } + + bck = dev->iface.init_backend(dev, NULL); + + return APIR_BACKEND_INITIALIZE_SUCCESS; +} diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h b/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h new file mode 100644 index 0000000000..b81fd5039b --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h @@ -0,0 +1,130 @@ +#pragma once + +/* device */ +uint32_t backend_device_get_device_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_device_get_count(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_device_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_device_get_description(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_device_get_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_device_get_memory(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_device_supports_op(apir_encoder * enc, 
apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_device_get_buffer_type(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_device_get_props(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); + +/* buffer-type */ +uint32_t backend_buffer_type_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_buffer_type_get_alignment(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_buffer_type_alloc_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); + +/* buffer */ +uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); + +/* backend */ +uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); + +static inline const char * backend_dispatch_command_name(ApirBackendCommandType type) { + switch (type) { + /* device */ + case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT: + return "backend_device_get_device_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: + return "backend_device_get_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_NAME: + return "backend_device_get_name"; + case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: + return "backend_device_get_description"; + case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: + return "backend_device_get_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: + return "backend_device_get_memory"; + case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: + return "backend_device_supports_op"; + case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: + return "backend_device_get_buffer_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: + return "backend_device_get_props"; + case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: + return "backend_device_buffer_from_ptr"; + /* buffer-type */ + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: + return "backend_buffer_type_get_name"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: + return "backend_buffer_type_get_alignment"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: + return "backend_buffer_type_get_max_size"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: + return "backend_buffer_type_is_host"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: + return "backend_buffer_type_alloc_buffer"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE: + return "backend_buffer_type_get_alloc_size"; + /* buffer */ + case APIR_COMMAND_TYPE_BUFFER_GET_BASE: + return "backend_buffer_get_base"; + case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: + return "backend_buffer_set_tensor"; + case 
APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: + return "backend_buffer_get_tensor"; + case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR: + return "backend_buffer_cpy_tensor"; + case APIR_COMMAND_TYPE_BUFFER_CLEAR: + return "backend_buffer_clear"; + case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER: + return "backend_buffer_free_buffer"; + /* backend */ + case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: + return "backend_backend_graph_compute"; + + default: + return "unknown"; + } +} + +extern "C" { +static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { + + /* device */ + + /* APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT = */ backend_device_get_device_count, + /* APIR_COMMAND_TYPE_DEVICE_GET_COUNT = */ backend_device_get_count, + /* APIR_COMMAND_TYPE_DEVICE_GET_NAME = */ backend_device_get_name, + /* APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = */ backend_device_get_description, + /* APIR_COMMAND_TYPE_DEVICE_GET_TYPE = */ backend_device_get_type, + /* APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = */ backend_device_get_memory, + /* APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = */ backend_device_supports_op, + /* APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = */ backend_device_get_buffer_type, + /* APIR_COMMAND_TYPE_DEVICE_GET_PROPS = */ backend_device_get_props, + /* APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = */ backend_device_buffer_from_ptr, + + /* buffer-type */ + + /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = */ backend_buffer_type_get_name, + /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = */ backend_buffer_type_get_alignment, + /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = */ backend_buffer_type_get_max_size, + /* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = */ backend_buffer_type_is_host, + /* APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = */ backend_buffer_type_alloc_buffer, + /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE = */ backend_buffer_type_get_alloc_size, + + /* buffer */ + + /* APIR_COMMAND_TYPE_BUFFER_GET_BASE = */ backend_buffer_get_base, + /* APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = */ backend_buffer_set_tensor, + /* APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = */ backend_buffer_get_tensor, + /* APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR = */ backend_buffer_cpy_tensor, + /* APIR_COMMAND_TYPE_BUFFER_CLEAR = */ backend_buffer_clear, + /* APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = */ backend_buffer_free_buffer, + + /* backend */ + + /* APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = */ backend_backend_graph_compute, +}; +} diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.h b/ggml/src/ggml-virtgpu/backend/backend-dispatched.h new file mode 100644 index 0000000000..6ccbecf078 --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.h @@ -0,0 +1,23 @@ +#pragma once + +#include +#include + +#include + +#include "backend-convert.h" +#include "backend-virgl-apir.h" +#include "shared/apir_backend.h" +#include "shared/apir_cs.h" +#include "shared/apir_cs_ggml.h" + +struct virgl_apir_context { + uint32_t ctx_id; + virgl_apir_callbacks * iface; +}; + +typedef uint32_t (*backend_dispatch_t)(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); + +#include "backend-dispatched.gen.h" + +uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p); diff --git a/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h b/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h new file mode 100644 index 0000000000..44b347f853 --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h @@ -0,0 +1,32 @@ +#pragma once + +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include 
"ggml-impl.h" +#include "shared/api_remoting.h" + +#include +#include +#include + +extern ggml_backend_reg_t reg; +extern ggml_backend_dev_t dev; +extern ggml_backend_t bck; + +struct virgl_apir_callbacks { + const char * (*get_config)(uint32_t virgl_ctx_id, const char * key); + void * (*get_shmem_ptr)(uint32_t virgl_ctx_id, uint32_t res_id); +}; + +extern "C" { +ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs); +void apir_backend_deinit(uint32_t virgl_ctx_id); +uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id, + virgl_apir_callbacks * virgl_cbs, + uint32_t cmd_type, + char * dec_cur, + const char * dec_end, + char * enc_cur, + const char * enc_end, + char ** enc_cur_after); +} diff --git a/ggml/src/ggml-virtgpu/backend/backend.cpp b/ggml/src/ggml-virtgpu/backend/backend.cpp new file mode 100644 index 0000000000..95d602ed60 --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/backend.cpp @@ -0,0 +1,148 @@ +#include "backend-dispatched.h" +#include "backend-virgl-apir.h" + +#include "shared/api_remoting.h" +#include "shared/apir_backend.h" +#include "shared/apir_cs.h" + +#include +#include + +#include + +#define APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_PATH" +#define APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_REG" +#define APIR_LLAMA_CPP_LOG_TO_FILE_ENV "APIR_LLAMA_CPP_LOG_TO_FILE" + +#define GGML_DEFAULT_BACKEND_REG "ggml_backend_init" + +static void * backend_library_handle = NULL; +static FILE * apir_logfile = NULL; + +static void log_to_file_callback(enum ggml_log_level level, const char * text, void * user_data) { + FILE * logfile = (FILE *)user_data; + fprintf(logfile, "[%d] %s", level, text); + fflush(logfile); +} + +extern "C" { +void apir_backend_deinit(uint32_t virgl_ctx_id) { + GGML_UNUSED(virgl_ctx_id); + + auto buffers = apir_get_track_backend_buffers(); + for (const auto & buffer : buffers) { + apir_untrack_backend_buffer(buffer); + buffer->iface.free_buffer(buffer); + } + + if (dev) { + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + GGML_LOG_INFO("%s: free memory: %ld MB\n", __func__, (size_t) free / 1024 / 1024); + } + + if (backend_library_handle) { + GGML_LOG_INFO("%s: The GGML backend library was loaded. Unloading it.\n", __func__); + dlclose(backend_library_handle); + backend_library_handle = NULL; + } + + if (apir_logfile) { + fclose(apir_logfile); + apir_logfile = NULL; + } +} + +#define APIR_GGML_LIBRARY_PATH_KEY "ggml.library.path" +#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg" + +ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs) { + const char * dlsym_error; + + const char * apir_log_to_file = getenv(APIR_LLAMA_CPP_LOG_TO_FILE_ENV); + if (apir_log_to_file) { + apir_logfile = fopen(apir_log_to_file, "w"); + if (apir_logfile) { + ggml_log_set(log_to_file_callback, apir_logfile); + } else { + GGML_LOG_INFO("Could not open the log file at '%s'\n", apir_log_to_file); + } + } + + const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY); + const char * virgl_library_reg = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_REG_KEY); + const char * library_reg = virgl_library_reg ? 
virgl_library_reg : GGML_DEFAULT_BACKEND_REG; + + if (!library_name) { + GGML_LOG_ERROR("cannot open the GGML library: env var '%s' not defined\n", APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV); + + return APIR_LOAD_LIBRARY_ENV_VAR_MISSING; + } + + backend_library_handle = dlopen(library_name, RTLD_LAZY); + + if (!backend_library_handle) { + GGML_LOG_ERROR("cannot open the GGML library: %s\n", dlerror()); + + return APIR_LOAD_LIBRARY_CANNOT_OPEN; + } + + if (!library_reg) { + GGML_LOG_ERROR("cannot register the GGML library: env var '%s' not defined\n", APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV); + + return APIR_LOAD_LIBRARY_ENV_VAR_MISSING; + } + + void * ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg); + dlsym_error = dlerror(); + if (dlsym_error) { + GGML_LOG_ERROR("cannot find the GGML backend registration symbol '%s' (from %s): %s\n", library_reg, + APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error); + + return APIR_LOAD_LIBRARY_SYMBOL_MISSING; + } + + uint32_t ret = backend_dispatch_initialize(ggml_backend_reg_fct); + + return (ApirLoadLibraryReturnCode) (APIR_LOAD_LIBRARY_INIT_BASE_INDEX + ret); +} + +uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id, + virgl_apir_callbacks * virgl_cbs, + uint32_t cmd_type, + char * dec_cur, + const char * dec_end, + char * enc_cur, + const char * enc_end, + char ** enc_cur_after) { + apir_encoder enc = { + .cur = enc_cur, + .start = enc_cur, + .end = enc_end, + .fatal = false, + }; + + apir_decoder dec = { + .cur = dec_cur, + .end = dec_end, + .fatal = false, + }; + + virgl_apir_context ctx = { + .ctx_id = virgl_ctx_id, + .iface = virgl_cbs, + }; + + if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) { + GGML_LOG_ERROR("Received an invalid dispatch index (%d >= %d)\n", cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT); + return APIR_BACKEND_FORWARD_INDEX_INVALID; + } + + backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type]; + uint32_t ret = forward_fct(&enc, &dec, &ctx); + + *enc_cur_after = enc.cur; + + return ret; +} +} diff --git a/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h b/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h new file mode 100644 index 0000000000..f19a5d12d1 --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h @@ -0,0 +1,90 @@ +#pragma once + +/* the rest of this file must match virglrenderer/src/apir-protocol.h */ + +#include + +#include + +#define APIR_PROTOCOL_MAJOR 0 +#define APIR_PROTOCOL_MINOR 1 + +#define APIR_HANDSHAKE_MAGIC 0xab1e + +enum ApirCommandType { + APIR_COMMAND_TYPE_HANDSHAKE = 0, + APIR_COMMAND_TYPE_LOADLIBRARY = 1, + APIR_COMMAND_TYPE_FORWARD = 2, + + APIR_COMMAND_TYPE_LENGTH = 3, +}; + +typedef uint64_t ApirCommandFlags; + +enum ApirLoadLibraryReturnCode { + APIR_LOAD_LIBRARY_SUCCESS = 0, + APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR = 1, + APIR_LOAD_LIBRARY_ALREADY_LOADED = 2, + APIR_LOAD_LIBRARY_ENV_VAR_MISSING = 3, + APIR_LOAD_LIBRARY_CANNOT_OPEN = 4, + APIR_LOAD_LIBRARY_SYMBOL_MISSING = 5, + APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, // anything above this is a APIR backend library initialization return code +}; + +enum ApirForwardReturnCode { + APIR_FORWARD_SUCCESS = 0, + APIR_FORWARD_NO_DISPATCH_FCT = 1, + APIR_FORWARD_TIMEOUT = 2, + + APIR_FORWARD_BASE_INDEX = 3, // anything above this is a APIR backend library forward return code +} ; + +__attribute__((unused)) static inline const char * apir_command_name(ApirCommandType type) { + switch (type) { + case APIR_COMMAND_TYPE_HANDSHAKE: + return "HandShake"; + case APIR_COMMAND_TYPE_LOADLIBRARY: + return 
"LoadLibrary"; + case APIR_COMMAND_TYPE_FORWARD: + return "Forward"; + default: + return "unknown"; + } +} + +__attribute__((unused)) static const char * apir_load_library_error(ApirLoadLibraryReturnCode code) { +#define APIR_LOAD_LIBRARY_ERROR(code_name) \ + do { \ + if (code == code_name) \ + return #code_name; \ + } while (0) + + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_SUCCESS); + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR); + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_ALREADY_LOADED); + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_ENV_VAR_MISSING); + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_CANNOT_OPEN); + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_SYMBOL_MISSING); + APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_INIT_BASE_INDEX); + + return "Unknown APIR_COMMAND_TYPE_LoadLibrary error"; + +#undef APIR_LOAD_LIBRARY_ERROR +} + +__attribute__((unused)) static const char * apir_forward_error(ApirForwardReturnCode code) { +#define APIR_FORWARD_ERROR(code_name) \ + do { \ + if (code == code_name) \ + return #code_name; \ + } while (0) + + APIR_FORWARD_ERROR(APIR_FORWARD_SUCCESS); + APIR_FORWARD_ERROR(APIR_FORWARD_NO_DISPATCH_FCT); + APIR_FORWARD_ERROR(APIR_FORWARD_TIMEOUT); + APIR_FORWARD_ERROR(APIR_FORWARD_BASE_INDEX); + + return "Unknown APIR_COMMAND_TYPE_FORWARD error"; + +#undef APIR_FORWARD_ERROR +} diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h new file mode 100644 index 0000000000..d214b6f2a9 --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h @@ -0,0 +1,36 @@ +typedef enum ApirBackendCommandType { + + /* device */ + APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT = 0, + APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 1, + APIR_COMMAND_TYPE_DEVICE_GET_NAME = 2, + APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 3, + APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 4, + APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 5, + APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 6, + APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 7, + APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 8, + APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = 9, + + /* buffer-type */ + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 10, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 11, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 12, + APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 13, + APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 14, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE = 15, + + /* buffer */ + APIR_COMMAND_TYPE_BUFFER_GET_BASE = 16, + APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 17, + APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 18, + APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR = 19, + APIR_COMMAND_TYPE_BUFFER_CLEAR = 20, + APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 21, + + /* backend */ + APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 22, + + // last command_type index + 1 + APIR_BACKEND_DISPATCH_TABLE_COUNT = 23, +} ApirBackendCommandType; diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h new file mode 100644 index 0000000000..f3efa52c72 --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h @@ -0,0 +1,46 @@ +#pragma once + +#include "apir_backend.gen.h" + +#include // for uintptr_t +#include // for timespec, clock_gettime + +#define APIR_BACKEND_INITIALIZE_SUCCESS 0 +#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1 +#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2 +#define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3 +#define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4 +#define 
APIR_BACKEND_INITIALIZE_BACKEND_FAILED 5 +#define APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED 6 +#define APIR_BACKEND_INITIALIZE_ALREADY_INITED 7 +#define APIR_BACKEND_INITIALIZE_NO_DEVICE 8 + + +// new entries here need to be added to the apir_backend_initialize_error function below + +#define APIR_BACKEND_FORWARD_INDEX_INVALID 6 + +// 0 is fast, 1 avoids the backend to crash if an unsupported tensor is received +#define APIR_BACKEND_CHECK_SUPPORTS_OP 0 + +typedef uintptr_t apir_buffer_type_host_handle_t; +typedef uintptr_t apir_buffer_host_handle_t; + +static const char * apir_backend_initialize_error(int code) { +#define APIR_BACKEND_INITIALIZE_ERROR(code_name) \ + do { \ + if (code == code_name) \ + return #code_name; \ + } while (0) + + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_SUCCESS); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_FAILED); + + return "Unknown APIR_BACKEND_INITIALIZE error:/"; + +#undef APIR_BACKEND_INITIALIZE_ERROR +} diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h new file mode 100644 index 0000000000..27a61091ff --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h @@ -0,0 +1,383 @@ +#pragma once + +#include "ggml-impl.h" + +#include +#include + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +struct apir_encoder { + char * cur; + const char * start; + const char * end; + bool fatal; + +}; + +struct apir_decoder { + const char * cur; + const char * end; + bool fatal; +}; + +/* + * new encoder and decoder + */ + +static apir_decoder apir_new_decoder(const char * ptr, size_t size) { + apir_decoder dec = { + .cur = ptr, + .end = ptr + size, + .fatal = false, + }; + + return dec; +} + +static apir_encoder apir_new_encoder(char * ptr, size_t size) { + apir_encoder enc = { + .cur = ptr, + .start = ptr, + .end = ptr + size, + .fatal = false, + }; + + return enc; +} + +/* + * fatal flag handling + */ + +static inline void apir_encoder_reset_fatal(apir_encoder * enc) { + enc->fatal = false; +} + +static inline void apir_encoder_set_fatal(apir_encoder * enc) { + enc->fatal = true; +} + +static inline bool apir_encoder_get_fatal(const apir_encoder * enc) { + return enc->fatal; +} + +static inline void apir_decoder_reset_fatal(apir_decoder * dec) { + dec->fatal = false; +} + +static inline void apir_decoder_set_fatal(apir_decoder * dec) { + dec->fatal = true; +} + +static inline bool apir_decoder_get_fatal(const apir_decoder * dec) { + return dec->fatal; +} + +/* + * encode peek + */ + +static inline bool apir_decoder_peek_internal(apir_decoder * dec, + size_t size, + void * val, + size_t val_size) { + assert(val_size <= size); + + if (unlikely(size > (size_t) (dec->end - dec->cur))) { + GGML_LOG_ERROR("reading too much from the decoder ...\n"); + apir_decoder_set_fatal(dec); + memset(val, 0, val_size); + return false; + } + + /* we should not rely on the compiler to optimize away memcpy... 
*/ + memcpy(val, dec->cur, val_size); + return true; +} + +static inline void apir_decoder_peek(apir_decoder * dec, size_t size, void * val, size_t val_size) { + apir_decoder_peek_internal(dec, size, val, val_size); +} + +static inline const void * apir_decoder_use_inplace(apir_decoder * dec, size_t size) { + if (unlikely(size > (size_t) (dec->end - dec->cur))) { + GGML_LOG_ERROR("reading too much from the decoder ...\n"); + apir_decoder_set_fatal(dec); + return NULL; + } + const void * addr = dec->cur; + dec->cur += size; + + return addr; +} + +/* + * read/write + */ + +static inline void apir_decoder_read(apir_decoder * dec, size_t size, void * val, size_t val_size) { + if (apir_decoder_peek_internal(dec, size, val, val_size)) { + dec->cur += size; + } +} + +static inline char * apir_encoder_write(apir_encoder * enc, size_t size, const void * val, size_t val_size) { + assert(val_size <= size); + assert(size <= ((size_t) (enc->end - enc->cur))); + + char * write_addr = enc->cur; + /* we should not rely on the compiler to optimize away memcpy... */ + memcpy(write_addr, val, val_size); + enc->cur += size; + + return write_addr; +} + +/* + * encode/decode + */ + +static inline void apir_decode(apir_decoder * dec, size_t size, void * data, size_t data_size) { + assert(size % 4 == 0); + apir_decoder_read(dec, size, data, data_size); +} + +static inline void apir_encode(apir_encoder * enc, size_t size, const void * data, size_t data_size) { + assert(size % 4 == 0); + apir_encoder_write(enc, size, data, data_size); +} + +/* + * typed encode/decode + */ + +/* uint8_t */ + +static inline void apir_encode_uint8_t(apir_encoder * enc, const uint8_t * val) { + apir_encode(enc, sizeof(int), val, sizeof(*val)); +} + +static inline void apir_decode_uint8_t(apir_decoder * dec, uint8_t * val) { + apir_decode(dec, sizeof(int), val, sizeof(*val)); +} + +/* uint64_t */ + +static inline void apir_encode_uint64_t(apir_encoder * enc, const uint64_t * val) { + apir_encode(enc, 8, val, sizeof(*val)); +} + +static inline void apir_decode_uint64_t(apir_decoder * dec, uint64_t * val) { + apir_decode(dec, 8, val, sizeof(*val)); +} + +static inline void apir_encode_uint64_t_array(apir_encoder * enc, const uint64_t * val, uint32_t count) { + const size_t size = sizeof(*val) * count; + assert(size >= count); + apir_encode(enc, size, val, size); +} + +static inline void apir_decode_uint64_t_array(apir_decoder * dec, uint64_t * val, uint32_t count) { + const size_t size = sizeof(*val) * count; + assert(size >= count); + apir_decode(dec, size, val, size); +} + +static inline const uint64_t * apir_decode_uint64_t_array_inplace(apir_decoder * dec, uint32_t count) { + return (uint64_t *) (uintptr_t) apir_decoder_use_inplace(dec, count * sizeof(uint64_t)); +} + +/* int32_t */ + +static inline void apir_encode_int32_t(apir_encoder * enc, const int32_t * val) { + apir_encode(enc, 4, val, sizeof(*val)); +} + +static inline void apir_decode_int32_t(apir_decoder * dec, int32_t * val) { + apir_decode(dec, 4, val, sizeof(*val)); +} + +static inline void apir_encode_int32_t_array(apir_encoder * enc, const int32_t * val, uint32_t count) { + const size_t size = sizeof(*val) * count; + assert(size >= count); + apir_encode(enc, size, val, size); +} + +static inline void apir_decode_int32_t_array(apir_decoder * dec, int32_t * val, uint32_t count) { + const size_t size = sizeof(*val) * count; + assert(size >= count); + apir_decode(dec, size, val, size); +} + +/* array size (uint64_t) */ + +static inline void 
apir_encode_array_size(apir_encoder * enc, uint64_t size) { + apir_encode_uint64_t(enc, &size); +} + +static inline uint64_t apir_decode_array_size(apir_decoder * dec, uint64_t expected_size) { + uint64_t size; + apir_decode_uint64_t(dec, &size); + if (size != expected_size) { + GGML_LOG_ERROR("Couldn't decode array from the decoder\n"); + apir_decoder_set_fatal(dec); + size = 0; + } + return size; +} + +static inline uint64_t apir_decode_array_size_unchecked(apir_decoder * dec) { + uint64_t size; + apir_decode_uint64_t(dec, &size); + return size; +} + +/* non-array pointer */ + +static inline bool apir_encode_simple_pointer(apir_encoder * enc, const void * val) { + apir_encode_array_size(enc, val ? 1 : 0); + return val; +} + +static inline bool apir_decode_simple_pointer(apir_decoder * dec) { + return apir_decode_array_size_unchecked(dec); +} + +/* uint32_t */ + +static inline void apir_encode_uint32_t(apir_encoder * enc, const uint32_t * val) { + apir_encode(enc, 4, val, sizeof(*val)); +} + +static inline void apir_decode_uint32_t(apir_decoder * dec, uint32_t * val) { + apir_decode(dec, 4, val, sizeof(*val)); +} + +static inline void apir_encode_uint32_t_array(apir_encoder * enc, const uint32_t * val, uint32_t count) { + const size_t size = sizeof(*val) * count; + assert(size >= count); + apir_encode(enc, size, val, size); +} + +static inline void apir_decode_uint32_t_array(apir_decoder * dec, uint32_t * val, uint32_t count) { + const size_t size = sizeof(*val) * count; + assert(size >= count); + apir_decode(dec, size, val, size); +} + +/* size_t */ + +static inline void apir_encode_size_t(apir_encoder * enc, const size_t * val) { + const uint64_t tmp = *val; + apir_encode_uint64_t(enc, &tmp); +} + +static inline void apir_decode_size_t(apir_decoder * dec, size_t * val) { + uint64_t tmp; + apir_decode_uint64_t(dec, &tmp); + *val = tmp; +} + +static inline void apir_encode_size_t_array(apir_encoder * enc, const size_t * val, uint32_t count) { + if (sizeof(size_t) == sizeof(uint64_t)) { + apir_encode_uint64_t_array(enc, (const uint64_t *) val, count); + } else { + for (uint32_t i = 0; i < count; i++) { + apir_encode_size_t(enc, &val[i]); + } + } +} + +static inline void apir_decode_size_t_array(apir_decoder * dec, size_t * val, uint32_t count) { + if (sizeof(size_t) == sizeof(uint64_t)) { + apir_decode_uint64_t_array(dec, (uint64_t *) val, count); + } else { + for (uint32_t i = 0; i < count; i++) { + apir_decode_size_t(dec, &val[i]); + } + } +} + +/* opaque blob */ + +static inline void apir_encode_blob_array(apir_encoder * enc, const void * val, size_t size) { + apir_encode(enc, (size + 3) & ~3, val, size); +} + +static inline void apir_decode_blob_array(apir_decoder * dec, void * val, size_t size) { + apir_decode(dec, (size + 3) & ~3, val, size); +} + +/* string */ + +static inline void apir_encode_char_array(apir_encoder * enc, const char * val, size_t size) { + assert(size && strlen(val) < size); + apir_encode_blob_array(enc, val, size); +} + +static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t size) { + apir_decode_blob_array(dec, val, size); + if (size) { + val[size - 1] = '\0'; + } else { + GGML_LOG_ERROR("Couldn't decode the blog array\n"); + apir_decoder_set_fatal(dec); + } +} + +/* (temp) buffer allocation */ + +static inline void * apir_decoder_alloc_array(size_t size, size_t count) { + size_t alloc_size; + if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) { + GGML_LOG_ERROR("overflow in array allocation of %zu * %zu bytes\n", size, 
count); + return NULL; + } + + return malloc(alloc_size); +} + +/* bool */ + +static inline void apir_encode_bool_t(apir_encoder * enc, const bool * val) { + apir_encode(enc, sizeof(int), val, sizeof(bool)); +} + +static inline void apir_decode_bool_t(apir_decoder * dec, bool * val) { + apir_decode(dec, sizeof(int), val, sizeof(bool)); +} + +/* apir_buffer_type_host_handle_t */ + +static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc, + const apir_buffer_type_host_handle_t * val) { + apir_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); +} + +static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec, + apir_buffer_type_host_handle_t * val) { + apir_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); +} + +/* apir_buffer_host_handle_t */ + +static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc, + const apir_buffer_host_handle_t * val) { + apir_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t)); +} + +static inline void apir_decode_apir_buffer_host_handle_t(apir_decoder * dec, apir_buffer_host_handle_t * val) { + apir_decode(dec, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t)); +} + +/* uintptr_t */ + +static inline void apir_encode_uintptr_t(apir_encoder * enc, const uintptr_t * val) { + apir_encode(enc, sizeof(*val), val, sizeof(*val)); +} + +static inline void apir_decode_uintptr_t(apir_decoder * dec, uintptr_t * val) { + apir_decode(dec, sizeof(*val), val, sizeof(*val)); +} diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h new file mode 100644 index 0000000000..070c3b25fb --- /dev/null +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h @@ -0,0 +1,211 @@ +#include "ggml-impl.h" +#include "apir_cs.h" +#include "apir_cs_rpc.h" + +// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer); + +static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc, + const apir_buffer_host_handle_t * handle); + +static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec); + +/* apir_rpc_tensor */ + +static inline void apir_encode_rcp_tensor(apir_encoder * enc, const apir_rpc_tensor * apir_rpc_tensor) { + size_t apir_rpc_tensor_size = sizeof(*apir_rpc_tensor); + apir_encode(enc, apir_rpc_tensor_size, apir_rpc_tensor, apir_rpc_tensor_size); +} + +static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder * dec) { + size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor); + + return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size); +} + +static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec, + uint32_t n_tensors) { + size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors; + + return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size); +} + +/* ggml_tensor */ + +static inline void apir_encode_ggml_tensor(apir_encoder * enc, const ggml_tensor * tensor) { + apir_rpc_tensor serialized = apir_serialize_tensor(tensor); + + apir_encode_rcp_tensor(enc, &serialized); +} + +static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) { + const apir_rpc_tensor * apir_rpc_tensor = apir_decode_apir_rpc_tensor_inplace(dec); + ggml_init_params params{ + /*.mem_size =*/ ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc 
=*/ true, + }; + ggml_context * ctx = ggml_init(params); + + const ggml_tensor * tensor = apir_deserialize_tensor(ctx, apir_rpc_tensor); + + return tensor; +} + +/* *** ggml_backend_buffer_type_t *** */ + +// ggml_backend_buffer_type_t is a POINTER (to a struct). +// Only the host pointer is shared between the host and guest. +// The guest stores it in `buft->context`. +// The host simply writes the pointer address in the buffer variable. + +static inline void apir_encode_ggml_buffer_type(apir_encoder * enc, ggml_backend_buffer_type_t buft) { + apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft); + apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); +} + +static inline ggml_backend_buffer_type_t apir_decode_ggml_buffer_type(apir_decoder * dec) { + apir_buffer_type_host_handle_t handle; + + apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); + + return (ggml_backend_buffer_type_t) handle; +} + +static inline apir_buffer_type_host_handle_t apir_decode_apir_buffer_type_host_handle(apir_decoder * dec) { + apir_buffer_type_host_handle_t handle; + + apir_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); + + return handle; +} + +/* *** ggml_backend_type_t *** */ + +// ggml_backend_buffer_t is a POINTER. +// same logic as for ggml_backend_buffer_type_t + +static inline void apir_encode_ggml_buffer(apir_encoder * enc, const ggml_backend_buffer_t buffer) { + apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer); + apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); +} + +static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec) { + ggml_backend_buffer_t buffer; + size_t buffer_ptr_size = sizeof(buffer); + + apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size); + + return buffer; +} + +/* enum ggml_status */ + +static inline void apir_encode_ggml_status(apir_encoder * enc, const ggml_status * status) { + apir_encoder_write(enc, sizeof(*status), status, sizeof(*status)); +} + +static inline void apir_decode_ggml_status(apir_decoder * dec, ggml_status * status) { + apir_decoder_read(dec, sizeof(*status), status, sizeof(*status)); +} + +/* virtgpu_shmem */ + +static inline void apir_encode_virtgpu_shmem_res_id(apir_encoder * enc, uint32_t shmem_res_id) { + apir_encode_uint32_t(enc, &shmem_res_id); +} + +static inline void apir_decode_virtgpu_shmem_res_id(apir_decoder * dec, uint32_t * shmem_res_id) { + apir_decode_uint32_t(dec, shmem_res_id); +} + +/* ggml_cgraph */ + +static inline size_t apir_serialize_ggml_cgraph(ggml_cgraph * cgraph, std::vector & cgraph_data) { + apir_serialize_graph(cgraph, cgraph_data); + + return cgraph_data.size(); +} + +static inline void apir_encode_cgraph_data(apir_encoder * enc, std::vector & cgraph_data) { + size_t cgraph_size = cgraph_data.size(); + + apir_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size); +} + +static inline ggml_cgraph * apir_decode_ggml_cgraph(apir_decoder * dec, size_t cgraph_size) { + GGML_UNUSED(cgraph_size); + + uint32_t n_nodes; + apir_decode_uint32_t(dec, &n_nodes); + const uint64_t * nodes = apir_decode_uint64_t_array_inplace(dec, n_nodes); + + uint32_t n_tensors; + apir_decode_uint32_t(dec, &n_tensors); + const apir_rpc_tensor * tensors = apir_decode_apir_rpc_tensor_array_inplace(dec, n_tensors); + + return apir_deserialize_graph(n_nodes, n_tensors, tensors, nodes); +} + +static inline void apir_encode_ggml_buffer_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle) { + apir_encoder_write(enc, 
sizeof(*handle), handle, sizeof(*handle));
+}
+
+static inline void apir_encode_ggml_tensor_inline(apir_encoder * enc, const ggml_tensor * tensor) {
+    size_t tensor_size = sizeof(*tensor);
+
+    if (tensor->extra) {
+        GGML_ABORT("Cannot pass tensors with extra");
+    }
+
+    if (tensor->src[0] && tensor->buffer) {
+        static int first = 1;
+        if (first) {
+            GGML_LOG_WARN("Cannot pass tensors with src and buffer\n");
+            first = 0;
+        }
+    }
+
+    apir_encoder_write(enc, tensor_size, tensor, tensor_size);
+
+    // tensor->data is a pointer inside the device buffer. No need to touch it.
+    // tensor->buffer is a pointer to a buffer. Encode the buffer handle in sequence.
+    // (could also make a copy of the tensor, and update it locally.)
+
+    if (tensor->buffer) {
+        apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer);
+        apir_encode_ggml_buffer_handle(enc, &buffer_handle);
+    }
+
+    if (tensor->view_src) {
+        apir_encoder_write(enc, tensor_size, tensor->view_src, tensor_size);
+    }
+
+    for (int i = 0; tensor->src[i]; i++) {
+        const ggml_tensor * tensor_src = tensor->src[i];
+        apir_encoder_write(enc, tensor_size, tensor_src, tensor_size);
+    }
+}
+
+static inline const ggml_tensor * apir_decode_ggml_tensor_inplace(apir_decoder * dec) {
+    // it is safe to remove the `const` qualifier here, we *do* want to
+    // modify the shared memory data to fix the `src` pointers.
+    ggml_tensor * tensor = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
+
+    // tensor->data is a pointer inside the device buffer. No need to touch it.
+    // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence.
+    if (tensor->buffer) {
+        tensor->buffer = apir_decode_ggml_buffer(dec);
+    }
+
+    if (tensor->view_src) {
+        ggml_tensor * tensor_view_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
+        tensor->view_src = tensor_view_src;
+    }
+
+    for (int i = 0; tensor->src[i]; i++) {
+        ggml_tensor * tensor_src = (ggml_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, sizeof(ggml_tensor));
+        tensor->src[i] = tensor_src;  // overwrite op->src[i] pointer with the actual location of the src tensor
+    }
+
+    return tensor;
+}
diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h
new file mode 100644
index 0000000000..f681798952
--- /dev/null
+++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h
@@ -0,0 +1,54 @@
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
+#include <cstdint>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+// ggml_tensor is serialized into apir_rpc_tensor
+struct apir_rpc_tensor {
+    uint64_t id;
+    uint32_t type;
+    uint64_t buffer;
+    uint32_t ne[GGML_MAX_DIMS];
+    uint32_t nb[GGML_MAX_DIMS];
+    uint32_t op;
+    int32_t  op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+    int32_t  flags;
+    uint64_t src[GGML_MAX_SRC];
+    uint64_t view_src;
+    uint64_t view_offs;
+    uint64_t data;
+    char     name[GGML_MAX_NAME];
+
+    char padding[4];
+};
+
+/* frontend */
+
+apir_rpc_tensor apir_serialize_tensor(const ggml_tensor * tensor);
+
+void apir_serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output);
+
+/* backend */
+
+void apir_track_backend_buffer(ggml_backend_buffer_t buffer);
+bool apir_untrack_backend_buffer(ggml_backend_buffer_t buffer);
+std::unordered_set<ggml_backend_buffer_t> apir_get_track_backend_buffers();
+
+void apir_add_tensor(ggml_tensor * tensor,
+                     std::vector<apir_rpc_tensor> & tensors,
+                     std::unordered_set<ggml_tensor *> & visited);
+
+ggml_tensor * apir_deserialize_tensor(ggml_context * ctx, const apir_rpc_tensor * tensor);
+
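+// Note: the helpers below mirror the ggml-rpc (de)serialization path. apir_create_node
+// presumably rebuilds a single graph node (resolving its src tensors through the
+// id -> apir_rpc_tensor table), and apir_deserialize_graph reassembles the whole
+// cgraph on the host side from the node id list and the serialized tensor array.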
+ggml_tensor * apir_create_node(uint64_t id, + ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map); + +ggml_cgraph * apir_deserialize_graph(uint32_t n_nodes, + uint32_t n_tensors, + const apir_rpc_tensor * tensors, + const uint64_t * nodes); diff --git a/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp new file mode 100644 index 0000000000..7f650659b8 --- /dev/null +++ b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp @@ -0,0 +1,98 @@ +#include "ggml-remoting.h" + +static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + virtgpu * gpu = BUFT_TO_GPU(buft); + + ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + GGML_ABORT("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + + bool async__unused, host_buffer__unused, events__unused; + bool buffer_from_host_ptr; + apir_device_get_props(gpu, &async__unused, &host_buffer__unused, &buffer_from_host_ptr, &events__unused); + + if (buffer_from_host_ptr) { + context->apir_context = apir_device_buffer_from_ptr(gpu, size, size); + context->base = context->apir_context.shmem.mmap_ptr; + context->is_from_ptr = true; + } else { + context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); + context->is_from_ptr = false; + context->base = NULL; + } + + ggml_backend_buffer_t buffer = + ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + + return buffer; +} + +static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + virtgpu * gpu = BUFT_TO_GPU(buft); + + return apir_buffer_type_get_name(gpu, buft); +} + +static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + virtgpu * gpu = BUFT_TO_GPU(buft); + + static size_t align = 0; + + if (align == 0) { + align = apir_buffer_type_get_alignment(gpu, buft); + } + + return align; +} + +static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + virtgpu * gpu = BUFT_TO_GPU(buft); + + static size_t max_size = 0; + if (max_size == 0) { + max_size = apir_buffer_type_get_max_size(gpu, buft); + } + + return max_size; +} + +static bool ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + virtgpu * gpu = BUFT_TO_GPU(buft); + + return apir_buffer_type_is_host(gpu, buft); +} + +static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const ggml_tensor * tensor) { + virtgpu * gpu = BUFT_TO_GPU(buft); + + if (tensor->buffer == NULL + || !tensor->buffer->context + || !buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) { + return ggml_nbytes(tensor); + } + + return apir_buffer_type_get_alloc_size(gpu, buft, tensor); +} + +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, + /* .is_host = */ NULL, +}; + +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = { + /* .get_name = */ 
ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ NULL, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, + /* .is_host = */ NULL, +}; diff --git a/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp b/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp new file mode 100644 index 0000000000..6b95362dd8 --- /dev/null +++ b/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp @@ -0,0 +1,119 @@ +#include "ggml-remoting.h" + +#define BUFFER_TO_GPU(name) ((ggml_backend_remoting_buffer_context *) (name)->context)->gpu + +static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) buffer->context; + if (context->base) { + return context->base; + } + + context->base = apir_buffer_get_base(BUFFER_TO_GPU(buffer), BUFFER_TO_APIR_CONTEXT(buffer)); + + return context->base; +} + +static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + virtgpu * gpu = BUFFER_TO_GPU(buffer); + + ggml_backend_remoting_buffer_context * context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy((char *) tensor->data + offset, data, size); + } else { + apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { + virtgpu * gpu = BUFFER_TO_GPU(buffer); + ggml_backend_remoting_buffer_context * context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy(data, (const char *) tensor->data + offset, size); + } else { + apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } +} + +static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + UNUSED(buffer); + + memcpy((char *) tensor->data + offset, data, size); + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { + UNUSED(buffer); + + memcpy(data, (const char *) tensor->data + offset, size); +} + +static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * src, + ggml_tensor * dst) { + virtgpu * gpu = BUFFER_TO_GPU(buffer); + + bool ret = apir_buffer_cpy_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), src, dst); + + return ret; +} + +static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + virtgpu * gpu = BUFFER_TO_GPU(buffer); + + apir_buffer_clear(gpu, BUFFER_TO_APIR_CONTEXT(buffer), value); + + return; +} + +static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { + virtgpu * gpu = BUFFER_TO_GPU(buffer); + + apir_buffer_free_buffer(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); + + ggml_backend_remoting_buffer_context * context = BUFFER_TO_GGML_CONTEXT(buffer); + free(context); + buffer->context = NULL; +} + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ 
ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ NULL, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ NULL, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-virtgpu/ggml-backend-device.cpp b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp new file mode 100644 index 0000000000..579eb99078 --- /dev/null +++ b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp @@ -0,0 +1,144 @@ +#include "ggml-remoting.h" + +static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { + virtgpu * gpu = DEV_TO_GPU(dev); + + return apir_device_get_name(gpu); +} + +static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { + virtgpu * gpu = DEV_TO_GPU(dev); + + return apir_device_get_description(gpu); +} + +static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { + virtgpu * gpu = DEV_TO_GPU(dev); + + static enum ggml_backend_dev_type type; + static bool has_type = false; + if (!has_type) { + has_type = true; + type = (enum ggml_backend_dev_type) apir_device_get_type(gpu); + } + + return type; +} + +static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + virtgpu * gpu = DEV_TO_GPU(dev); + + return apir_device_get_memory(gpu, free, total); +} + +static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { +#if USE_ALWAYS_TRUE_SUPPORTS_OP == 1 + /* ggml-rpc cheats it like this */ + /* with the current implementation of serialize_tensor, the src/view aren't properly passed */ + UNUSED(dev); + UNUSED(op); + + return true; +#else + virtgpu * gpu = DEV_TO_GPU(dev); + + return apir_device_supports_op(gpu, op); +#endif +} + +static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + bool supported = buft->device == dev; + + return supported; +} + +static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + UNUSED(dev); + UNUSED(op); + + return false; +} + +static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_remoting_device_get_name(dev); + props->description = ggml_backend_remoting_device_get_description(dev); + props->type = ggml_backend_remoting_device_get_type(dev); + ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); + + virtgpu * gpu = DEV_TO_GPU(dev); + apir_device_get_props(gpu, &props->caps.async, &props->caps.host_buffer, &props->caps.buffer_from_host_ptr, + &props->caps.events); + + props->caps.buffer_from_host_ptr = false; + props->caps.async = false; + props->caps.events = false; +} + +ggml_backend_buffer_type_t 
ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + virtgpu * gpu = DEV_TO_GPU(dev); + + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static ggml_backend_buffer_type buft{ + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; +} + +static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { + virtgpu * gpu = DEV_TO_GPU(dev); + + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static ggml_backend_buffer_type buft{ + /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; +} + +static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, + void * ptr, + size_t size, + size_t max_tensor_size) { + virtgpu * gpu = DEV_TO_GPU(dev); + + ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + GGML_ABORT("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size); + context->base = ptr; + context->is_from_ptr = true; + + ggml_backend_buffer_t buffer = + ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev), + ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size); + + return buffer; +} + +const ggml_backend_device_i ggml_backend_remoting_device_interface = { + /* .get_name = */ ggml_backend_remoting_device_get_name, + /* .get_description = */ ggml_backend_remoting_device_get_description, + /* .get_memory = */ ggml_backend_remoting_device_get_memory, + /* .get_type = */ ggml_backend_remoting_device_get_type, + /* .get_props = */ ggml_backend_remoting_device_get_props, + /* .init_backend = */ ggml_backend_remoting_device_init, + /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_remoting_device_supports_op, + /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, + /* .offload_op = */ ggml_backend_remoting_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; diff --git a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp new file mode 100644 index 0000000000..c46cf51c02 --- /dev/null +++ b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp @@ -0,0 +1,137 @@ +#include "ggml-remoting.h" +#include "ggml-virtgpu.h" + +#include +#include + +static virtgpu * apir_initialize() { + static virtgpu * apir_gpu_instance = NULL; + static bool apir_initialized = false; + + { + static std::mutex mutex; + std::lock_guard lock(mutex); + + if (apir_initialized) { + return apir_gpu_instance; + } + + apir_gpu_instance = create_virtgpu(); + if (!apir_gpu_instance) { + GGML_ABORT("failed to initialize the virtgpu"); + } + + apir_initialized = true; + } + + return apir_gpu_instance; +} + +static int ggml_backend_remoting_get_device_count() { + virtgpu * gpu = apir_initialize(); + if (!gpu) { + GGML_LOG_WARN("apir_initialize failed\n"); + return 0; + } + + return apir_device_get_count(gpu); +} + +static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + + 
return ggml_backend_remoting_get_device_count();
+}
+
+static std::vector<ggml_backend_dev_t> devices;
+
+ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) {
+    GGML_ASSERT(device < devices.size());
+    return devices[device];
+}
+
+static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
+    if (devices.size() > 0) {
+        GGML_LOG_INFO("%s: already initialized\n", __func__);
+        return;
+    }
+
+    virtgpu * gpu = apir_initialize();
+    if (!gpu) {
+        GGML_LOG_ERROR("apir_initialize failed\n");
+        return;
+    }
+
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            for (int i = 0; i < ggml_backend_remoting_get_device_count(); i++) {
+                ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context;
+                char desc[256] = "API Remoting device";
+
+                ctx->device = i;
+                ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i);
+                ctx->description = desc;
+                ctx->gpu = gpu;
+
+                ggml_backend_dev_t dev = new ggml_backend_device{
+                    /* .iface   = */ ggml_backend_remoting_device_interface,
+                    /* .reg     = */ reg,
+                    /* .context = */ ctx,
+                };
+                devices.push_back(dev);
+            }
+            initialized = true;
+        }
+    }
+}
+
+static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+    UNUSED(reg);
+
+    return ggml_backend_remoting_get_device(device);
+}
+
+static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) {
+    UNUSED(reg);
+
+    return GGML_REMOTING_FRONTEND_NAME;
+}
+
+static const ggml_backend_reg_i ggml_backend_remoting_reg_i = {
+    /* .get_name         = */ ggml_backend_remoting_reg_get_name,
+    /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_remoting_reg_get_device,
+    /* .get_proc_address = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_virtgpu_reg() {
+    virtgpu * gpu = apir_initialize();
+    if (!gpu) {
+        GGML_LOG_ERROR("virtgpu_apir_initialize failed\n");
+        return NULL;
+    }
+
+    static ggml_backend_reg reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_remoting_reg_i,
+        /* .context     = */ gpu,
+    };
+
+    static bool initialized = false;
+    if (initialized) {
+        return &reg;
+    }
+    initialized = true;
+
+    ggml_backend_remoting_reg_init_devices(&reg);
+
+    GGML_LOG_INFO("%s: initialized\n", __func__);
+
+    return &reg;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_virtgpu_reg)
diff --git a/ggml/src/ggml-virtgpu/ggml-backend.cpp b/ggml/src/ggml-virtgpu/ggml-backend.cpp
new file mode 100644
index 0000000000..5cd6c0c060
--- /dev/null
+++ b/ggml/src/ggml-virtgpu/ggml-backend.cpp
@@ -0,0 +1,69 @@
+#include "ggml-remoting.h"
+#include "../../include/ggml-virtgpu.h"
+
+static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) {
+    UNUSED(backend);
+
+    return "API Remoting backend";
+}
+
+static void ggml_backend_remoting_free(ggml_backend_t backend) {
+    delete backend;
+}
+
+static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    virtgpu * gpu = DEV_TO_GPU(backend->device);
+
+    return apir_backend_graph_compute(gpu, cgraph);
+}
+
+static void ggml_backend_remoting_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    virtgpu * gpu = DEV_TO_GPU(backend->device);
+#if true
+    UNUSED(gpu);
+    UNUSED(cgraph);
+#else
+    // not working yet
+
+    apir_backend_graph_optimize(gpu, cgraph);
+#endif
+}
+
+static ggml_backend_i ggml_backend_remoting_interface = {
+    /* .get_name                = */ ggml_backend_remoting_get_name,
+    /* .free                    = */ ggml_backend_remoting_free,
+    
/* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, + /* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async, + /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, + /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_remoting_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .graph_optimize = */ ggml_backend_remoting_graph_optimize, +}; + +static ggml_guid_t ggml_backend_remoting_guid() { + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x14, 0x03, 0x86, 0x02, + 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + + return &guid; +} + +ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { + UNUSED(params); + + ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *) dev->context; + + ggml_backend_t remoting_backend = new ggml_backend{ + /* .guid = */ ggml_backend_remoting_guid(), + /* .interface = */ ggml_backend_remoting_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_virtgpu_reg(), ctx->device), + /* .context = */ ctx, + }; + + return remoting_backend; +} diff --git a/ggml/src/ggml-virtgpu/ggml-remoting.h b/ggml/src/ggml-virtgpu/ggml-remoting.h new file mode 100644 index 0000000000..36fc6b2a7b --- /dev/null +++ b/ggml/src/ggml-virtgpu/ggml-remoting.h @@ -0,0 +1,68 @@ +#pragma once + +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" +#include "virtgpu.h" + +#include +#include + +// USE_ALWAYS_TRUE_SUPPORTS_OP: 1 is fast, 0 avoid micro-benchmark crashes + +#define USE_ALWAYS_TRUE_SUPPORTS_OP 1 +#define USE_METAL_GUEST_SUPPORTS_OP 0 + +#define DEV_TO_GPU(name) ((ggml_backend_remoting_device_context *) (name)->context)->gpu + +#define BUFFER_TO_GGML_CONTEXT(name) ((ggml_backend_remoting_buffer_context *) (name)->context) + +#define BUFFER_TO_APIR_CONTEXT(name) &((ggml_backend_remoting_buffer_context *) (name)->context)->apir_context + +#define BUFFER_TO_HOST_HANDLE(name) ((ggml_backend_remoting_buffer_context *) (name)->context)->apir_context.host_handle + +#define GET_DEVICE_CONTEXT() (ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context + +#define BUFT_TO_GPU(name) ((ggml_backend_remoting_device_context *) (name)->device->context)->gpu + +struct ggml_backend_remoting_device_context { + size_t device; + std::string name; + std::string description; + + std::vector> shared_memory; + + virtgpu * gpu; +}; + +struct ggml_backend_remoting_buffer_context { + apir_buffer_context_t apir_context; + + virtgpu * gpu; + + void * base; + + bool is_from_ptr; +}; + +extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; +extern const ggml_backend_device_i ggml_backend_remoting_device_interface; +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; +extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface; +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface; + +ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device); +ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); +ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev); + +static inline 
apir_buffer_type_host_handle_t ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_type_host_handle_t) buft->context; +} + +static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + if (!buffer->context) { + GGML_ABORT("%s: no context available :/", __func__); + } + return BUFFER_TO_HOST_HANDLE(buffer); +} diff --git a/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml b/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml new file mode 100644 index 0000000000..0b7cccfe9c --- /dev/null +++ b/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml @@ -0,0 +1,168 @@ +# YAML schema for GGML remoting API functions +# This defines the structure for generating the remoting layer code + +# Configuration for the generated files +config: + # Base path for the generated files + base_path: "ggml/src" + + # Header files to update + files: + apir_backend_header: "ggml-virtgpu-apir/backend/shared/apir_backend.gen.h" + backend_dispatched_header: "ggml-virtgpu-apir/backend/backend-dispatched.gen.h" + virtgpu_forward_header: "ggml-virtgpu-apir/virtgpu-forward.gen.h" + +# Simplified function definitions with grouping and metadata combined +functions: + device: + group_description: "device" + functions: + get_device_count: + # No specific metadata - uses default void return and base params + + get_count: + frontend_return: "int" + + get_name: + frontend_return: "const char *" + + get_description: + frontend_return: "const char *" + + get_type: + frontend_return: "uint32_t" + + get_memory: + frontend_return: "void" + frontend_extra_params: + - "size_t *free" + - "size_t *total" + + supports_op: + frontend_return: "bool" + frontend_extra_params: + - "const ggml_tensor *op" + + get_buffer_type: + frontend_return: "apir_buffer_type_host_handle_t" + + get_props: + frontend_return: "void" + frontend_extra_params: + - "bool *async" + - "bool *host_buffer" + - "bool *buffer_from_host_ptr" + - "bool *events" + + buffer_from_ptr: + frontend_return: "apir_buffer_context_t" + frontend_extra_params: + - "size_t size" + - "size_t max_tensor_size" + + buffer_type: + group_description: "buffer-type" + functions: + get_name: + frontend_return: "const char *" + frontend_extra_params: + - "ggml_backend_buffer_type_t buft" + + get_alignment: + frontend_return: "size_t" + frontend_extra_params: + - "ggml_backend_buffer_type_t buft" + + get_max_size: + frontend_return: "size_t" + frontend_extra_params: + - "ggml_backend_buffer_type_t buft" + + is_host: + frontend_return: "bool" + frontend_extra_params: + - "ggml_backend_buffer_type_t buft" + + alloc_buffer: + frontend_return: "apir_buffer_context_t" + frontend_extra_params: + - "ggml_backend_buffer_type_t buffer_buft" + - "size_t size" + + get_alloc_size: + frontend_return: "size_t" + frontend_extra_params: + - "ggml_backend_buffer_type_t buft" + - "const ggml_tensor *op" + + buffer: + group_description: "buffer" + functions: + get_base: + frontend_return: "void *" + frontend_extra_params: + - "apir_buffer_context_t *buffer_context" + + set_tensor: + frontend_return: "void" + frontend_extra_params: + - "apir_buffer_context_t *buffer_context" + - "ggml_tensor *tensor" + - "const void *data" + - "size_t offset" + - "size_t size" + + get_tensor: + frontend_return: "void" + frontend_extra_params: + - "apir_buffer_context_t *buffer_context" + - "const ggml_tensor *tensor" + - "void *data" + - "size_t offset" + - "size_t size" + + cpy_tensor: + 
frontend_return: "bool" + frontend_extra_params: + - "apir_buffer_context_t *buffer_context" + - "const ggml_tensor *src" + - "const ggml_tensor *dst" + + clear: + frontend_return: "void" + frontend_extra_params: + - "apir_buffer_context_t *buffer_context" + - "uint8_t value" + + free_buffer: + frontend_return: "void" + frontend_extra_params: + - "apir_buffer_context_t *buffer_context" + + backend: + group_description: "backend" + functions: + graph_compute: + frontend_return: "ggml_status" + frontend_extra_params: + - "ggml_cgraph *cgraph" + + graph_optimize: + frontend_return: "ggml_cgraph *" + frontend_extra_params: + - "ggml_cgraph *cgraph" + enabled: false + +# Naming patterns used for code generation +naming_patterns: + # How to generate enum names + enum_prefix: "APIR_COMMAND_TYPE_" + + # How to generate backend function names + backend_function_prefix: "backend_" + + # How to generate frontend function names + frontend_function_prefix: "apir_" + + # Standard frontend first parameter + frontend_base_param: "struct virtgpu *gpu" diff --git a/ggml/src/ggml-virtgpu/include/apir_hw.h b/ggml/src/ggml-virtgpu/include/apir_hw.h new file mode 100644 index 0000000000..33af045ca2 --- /dev/null +++ b/ggml/src/ggml-virtgpu/include/apir_hw.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +struct virgl_renderer_capset_apir { + uint32_t apir_version; + uint32_t supports_blob_resources; + uint32_t reserved[4]; // For future expansion +}; diff --git a/ggml/src/ggml-virtgpu/regenerate_remoting.py b/ggml/src/ggml-virtgpu/regenerate_remoting.py new file mode 100755 index 0000000000..4174a24327 --- /dev/null +++ b/ggml/src/ggml-virtgpu/regenerate_remoting.py @@ -0,0 +1,322 @@ +#!/usr/bin/env python3 +""" +# Generated by Claude AI + +Script to completely regenerate the GGML remoting codebase from YAML configuration. + +This script reads api_functions.yaml and regenerates all the header files and +implementation templates for the GGML remoting layer. + +Usage: + python regenerate_remoting.py + +The script will: +1. Read ggmlremoting_functions.yaml configuration +2. Generate updated header files +3. Generate implementation templates in dedicated files +4. Show a summary of what was generated +""" + +import yaml +from typing import Dict, List, Any +from pathlib import Path +import os +import subprocess +import shutil +import logging + +NL = '\n' # can't have f"{'\n'}" in f-strings + + +class RemotingCodebaseGenerator: + def __init__(self, yaml_path: str = "ggmlremoting_functions.yaml"): + """Initialize the generator with the YAML configuration.""" + self.yaml_path = yaml_path + + if not Path(yaml_path).exists(): + raise FileNotFoundError(f"Configuration file {yaml_path} not found") + + with open(yaml_path, 'r') as f: + self.config = yaml.safe_load(f) + + self.functions = self.config['functions'] + self.naming_patterns = self.config['naming_patterns'] + self.config_data = self.config['config'] + + # Check if clang-format is available + self.clang_format_available = self._check_clang_format_available() + + def _check_clang_format_available(self) -> bool: + """Check if clang-format is available in the system PATH.""" + return shutil.which("clang-format") is not None + + def _format_file_with_clang_format(self, file_path: Path) -> bool: + """Format a file with clang-format -i. 
Returns True if successful, False otherwise.""" + if not self.clang_format_available: + return False + + try: + subprocess.run( + ["clang-format", "-i", str(file_path)], + check=True, + capture_output=True, + text=True + ) + return True + except subprocess.CalledProcessError: + logging.exception(f" ⚠️ clang-format failed for {file_path}") + return False + except Exception as e: + logging.exception(f" ⚠️ Unexpected error formatting {file_path}: {e}") + return False + + def generate_enum_name(self, group_name: str, function_name: str) -> str: + """Generate the APIR_COMMAND_TYPE enum name for a function.""" + prefix = self.naming_patterns['enum_prefix'] + return f"{prefix}{group_name.upper()}_{function_name.upper()}" + + def generate_backend_function_name(self, group_name: str, function_name: str) -> str: + """Generate the backend function name.""" + function_key = f"{group_name}_{function_name}" + overrides = self.naming_patterns.get('backend_function_overrides', {}) + + if function_key in overrides: + return overrides[function_key] + + prefix = self.naming_patterns['backend_function_prefix'] + return f"{prefix}{group_name}_{function_name}" + + def generate_frontend_function_name(self, group_name: str, function_name: str) -> str: + """Generate the frontend function name.""" + prefix = self.naming_patterns['frontend_function_prefix'] + return f"{prefix}{group_name}_{function_name}" + + def get_enabled_functions(self) -> List[Dict[str, Any]]: + """Get all enabled functions with their metadata.""" + functions = [] + enum_value = 0 + + for group_name, group_data in self.functions.items(): + group_description = group_data['group_description'] + + for function_name, func_metadata in group_data['functions'].items(): + # Handle case where func_metadata is None or empty (functions with only comments) + if func_metadata is None: + func_metadata = {} + + # Functions are enabled by default unless explicitly disabled + if func_metadata.get('enabled', True): + functions.append({ + 'group_name': group_name, + 'function_name': function_name, + 'enum_name': self.generate_enum_name(group_name, function_name), + 'enum_value': enum_value, + 'backend_function': self.generate_backend_function_name(group_name, function_name), + 'frontend_function': self.generate_frontend_function_name(group_name, function_name), + 'frontend_return': func_metadata.get('frontend_return', 'void'), + 'frontend_extra_params': func_metadata.get('frontend_extra_params', []), + 'group_description': group_description, + 'newly_added': func_metadata.get('newly_added', False) + }) + enum_value += 1 + + return functions + + def generate_apir_backend_header(self) -> str: + """Generate the complete apir_backend.h file.""" + functions = self.get_enabled_functions() + + # Generate the enum section + enum_lines = ["typedef enum ApirBackendCommandType {"] + current_group = None + + for func in functions: + # Add comment for new group + if func['group_name'] != current_group: + enum_lines.append("") + enum_lines.append(f" /* {func['group_description']} */") + current_group = func['group_name'] + + enum_lines.append(f" {func['enum_name']} = {func['enum_value']},") + + # Add the count + total_count = len(functions) + enum_lines.append("\n // last command_type index + 1") + enum_lines.append(f" APIR_BACKEND_DISPATCH_TABLE_COUNT = {total_count},") + enum_lines.append("} ApirBackendCommandType;") + + # Full header template + header_content = NL.join(enum_lines) + "\n" + + return header_content + + def generate_backend_dispatched_header(self) -> str: + 
"""Generate the complete backend-dispatched.h file.""" + functions = self.get_enabled_functions() + + # Function declarations + decl_lines = [] + current_group = None + + for func in functions: + if func['group_name'] != current_group: + decl_lines.append(f"\n/* {func['group_description']} */") + current_group = func['group_name'] + + signature = "uint32_t" + params = "apir_encoder *enc, apir_decoder *dec, virgl_apir_context *ctx" + decl_lines.append(f"{signature} {func['backend_function']}({params});") + + # Switch cases + switch_lines = [] + current_group = None + + for func in functions: + if func['group_name'] != current_group: + switch_lines.append(f" /* {func['group_description']} */") + current_group = func['group_name'] + + switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}\";") + + # Dispatch table + table_lines = [] + current_group = None + + for func in functions: + if func['group_name'] != current_group: + table_lines.append(f"\n /* {func['group_description']} */") + table_lines.append("") + current_group = func['group_name'] + + table_lines.append(f" /* {func['enum_name']} = */ {func['backend_function']},") + + header_content = f'''\ +#pragma once + +{NL.join(decl_lines)} + +static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) +{{ + switch (type) {{ +{NL.join(switch_lines)} + + default: return "unknown"; + }} +}} + +extern "C" {{ +static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {{ + {NL.join(table_lines)} +}}; +}} +''' + return header_content + + def generate_virtgpu_forward_header(self) -> str: + """Generate the complete virtgpu-forward.gen.h file.""" + functions = self.get_enabled_functions() + + decl_lines = [] + current_group = None + + for func in functions: + if func['group_name'] != current_group: + decl_lines.append("") + decl_lines.append(f"/* {func['group_description']} */") + current_group = func['group_name'] + + # Build parameter list + params = [self.naming_patterns['frontend_base_param']] + params.extend(func['frontend_extra_params']) + param_str = ', '.join(params) + + decl_lines.append(f"{func['frontend_return']} {func['frontend_function']}({param_str});") + + header_content = f'''\ +#pragma once +{NL.join(decl_lines)} +''' + return header_content + + def regenerate_codebase(self) -> None: + """Regenerate the entire remoting codebase.""" + logging.info("πŸ”„ Regenerating GGML Remoting Codebase...") + logging.info("=" * 50) + + # Detect if we're running from frontend directory + current_dir = os.getcwd() + is_frontend_dir = current_dir.endswith('ggml-virtgpu') + + if is_frontend_dir: + # Running from ggml/src/ggml-virtgpu-apir + logging.info("πŸ“ Detected frontend directory execution") + frontend_base = Path(".") + else: + # Running from project root (fallback to original behavior) + logging.info("πŸ“ Detected project root execution") + base_path = self.config_data.get('base_path', 'ggml/src') + frontend_base = Path(base_path) / "ggml-virtgpu" + + # Compute final file paths + backend_base = frontend_base / "backend" + apir_backend_path = backend_base / "shared" / "apir_backend.gen.h" + backend_dispatched_path = backend_base / "backend-dispatched.gen.h" + virtgpu_forward_path = frontend_base / "virtgpu-forward.gen.h" + + # Create output directories for each file + apir_backend_path.parent.mkdir(parents=True, exist_ok=True) + backend_dispatched_path.parent.mkdir(parents=True, exist_ok=True) + virtgpu_forward_path.parent.mkdir(parents=True, 
exist_ok=True) + + # Generate header files + logging.info("πŸ“ Generating header files...") + + apir_backend_content = self.generate_apir_backend_header() + apir_backend_path.write_text(apir_backend_content) + logging.info(f" βœ… {apir_backend_path.resolve()}") + + backend_dispatched_content = self.generate_backend_dispatched_header() + backend_dispatched_path.write_text(backend_dispatched_content) + logging.info(f" βœ… {backend_dispatched_path.resolve()}") + + virtgpu_forward_content = self.generate_virtgpu_forward_header() + virtgpu_forward_path.write_text(virtgpu_forward_content) + logging.info(f" βœ… {virtgpu_forward_path.resolve()}") + + # Format generated files with clang-format + generated_files = [apir_backend_path, backend_dispatched_path, virtgpu_forward_path] + + if not self.clang_format_available: + logging.warning("\n⚠️clang-format not found in PATH. Generated files will not be formatted." + " Install clang-format to enable automatic code formatting.") + else: + logging.info("\n🎨 Formatting files with clang-format...") + for file_path in generated_files: + if self._format_file_with_clang_format(file_path): + logging.info(f" βœ… Formatted {file_path.name}") + else: + logging.warning(f" ❌ Failed to format {file_path.name}") + + # Generate summary + functions = self.get_enabled_functions() + total_functions = len(functions) + + logging.info("\nπŸ“Š Generation Summary:") + logging.info("=" * 50) + logging.info(f" Total functions: {total_functions}") + logging.info(f" Function groups: {len(self.functions)}") + logging.info(" Header files: 3") + logging.info(f" Working directory: {current_dir}") + + +def main(): + try: + generator = RemotingCodebaseGenerator() + generator.regenerate_codebase() + except Exception as e: + logging.exception(f"❌ Error: {e}") + exit(1) + + +if __name__ == "__main__": + main() diff --git a/ggml/src/ggml-virtgpu/virtgpu-apir.h b/ggml/src/ggml-virtgpu/virtgpu-apir.h new file mode 100644 index 0000000000..238f960acd --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu-apir.h @@ -0,0 +1,15 @@ +#include "backend/shared/apir_backend.h" +#include "ggml-alloc.h" +#include "ggml-impl.h" +#include "ggml.h" +#include "virtgpu-shm.h" +#include "virtgpu-utils.h" + +struct apir_buffer_context_t { + apir_buffer_host_handle_t host_handle; + + struct virtgpu_shmem shmem; + apir_buffer_type_host_handle_t buft_host_handle; +}; + +#include "virtgpu-forward.gen.h" diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp new file mode 100644 index 0000000000..bf3c41011a --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp @@ -0,0 +1,50 @@ +#include "virtgpu-forward-impl.h" + +static long long current_time_ms() { + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + return (long long) ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE); + + std::vector cgraph_data; + size_t cgraph_size = apir_serialize_ggml_cgraph(cgraph, cgraph_data); + + virtgpu_shmem temp_shmem; // Local storage for large buffers + virtgpu_shmem * shmem = &temp_shmem; + + if (cgraph_size <= gpu->data_shmem.mmap_size) { + // prefer the init-time allocated page, if large enough + shmem = &gpu->data_shmem; + } else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) { + 
GGML_ABORT("Couldn't allocate the guest-host shared buffer"); + } + + apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + + apir_encode_size_t(encoder, &cgraph_size); + + char * shmem_data = (char *) shmem->mmap_ptr; + apir_encoder secondary_enc = apir_new_encoder(shmem_data, cgraph_size); + + apir_encode_cgraph_data(&secondary_enc, cgraph_data); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + ggml_status status = GGML_STATUS_ABORTED; + apir_decode_ggml_status(decoder, &status); + + remote_call_finish(gpu, encoder, decoder); + + if (shmem != &gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem); + } + + return status; +} diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp new file mode 100644 index 0000000000..03cb09e064 --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp @@ -0,0 +1,125 @@ +#include "virtgpu-forward-impl.h" + +const char * apir_buffer_type_get_name(virtgpu * gpu, ggml_backend_buffer_type_t buft) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); + + apir_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + const size_t string_size = apir_decode_array_size_unchecked(decoder); + char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size); + if (!string) { + GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__); + apir_decoder_set_fatal(decoder); + } + apir_decode_char_array(decoder, string, string_size); + + remote_call_finish(gpu, encoder, decoder); + + return string; +} + +size_t apir_buffer_type_get_alignment(virtgpu * gpu, ggml_backend_buffer_type_t buft) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); + + apir_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + size_t alignment; + apir_decode_size_t(decoder, &alignment); + + remote_call_finish(gpu, encoder, decoder); + + return alignment; +} + +size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t buft) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); + + apir_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + size_t max_size; + apir_decode_size_t(decoder, &max_size); + + remote_call_finish(gpu, encoder, decoder); + + return max_size; +} + +bool apir_buffer_type_is_host(virtgpu * gpu, ggml_backend_buffer_type_t buft) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); + + apir_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + bool is_host; + apir_decode_bool_t(decoder, &is_host); + + remote_call_finish(gpu, encoder, decoder); + + return is_host; +} + +apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_buffer_type_t buft, size_t size) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + apir_buffer_context_t buffer_context; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); + + apir_encode_ggml_buffer_type(encoder, buft); + + apir_encode_size_t(encoder, &size); + + 
REMOTE_CALL(gpu, encoder, decoder, ret); + + apir_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); + + remote_call_finish(gpu, encoder, decoder); + + return buffer_context; +} + +size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE); + + apir_encode_ggml_buffer_type(encoder, buft); + + apir_encode_ggml_tensor_inline(encoder, op); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + size_t alloc_size; + apir_decode_size_t(decoder, &alloc_size); + + remote_call_finish(gpu, encoder, decoder); + + return alloc_size; +} diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp new file mode 100644 index 0000000000..3181e39440 --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp @@ -0,0 +1,157 @@ +#include "virtgpu-forward-impl.h" + +void * apir_buffer_get_base(virtgpu * gpu, apir_buffer_context_t * buffer_context) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); + + apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + uintptr_t base; + apir_decode_uintptr_t(decoder, &base); + + remote_call_finish(gpu, encoder, decoder); + + return (void *) base; +} + +void apir_buffer_set_tensor(virtgpu * gpu, + apir_buffer_context_t * buffer_context, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); + + apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + apir_encode_ggml_tensor(encoder, tensor); + + virtgpu_shmem temp_shmem; // Local storage for large buffers + virtgpu_shmem * shmem = &temp_shmem; + + if (size <= gpu->data_shmem.mmap_size) { + // prefer the init-time allocated page, if large enough + shmem = &gpu->data_shmem; + + } else if (virtgpu_shmem_create(gpu, size, shmem)) { + GGML_ABORT("Couldn't allocate the guest-host shared buffer"); + } + + memcpy(shmem->mmap_ptr, data, size); + apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + + apir_encode_size_t(encoder, &offset); + apir_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + remote_call_finish(gpu, encoder, decoder); + + if (shmem != &gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem); + } + + return; +} + +void apir_buffer_get_tensor(virtgpu * gpu, + apir_buffer_context_t * buffer_context, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); + + apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + apir_encode_ggml_tensor(encoder, tensor); + + virtgpu_shmem temp_shmem; // Local storage for large buffers + virtgpu_shmem * shmem = &temp_shmem; + + if (size <= gpu->data_shmem.mmap_size) { + // prefer the init-time allocated page, if large enough + shmem = &gpu->data_shmem; + + } else if (virtgpu_shmem_create(gpu, size, shmem)) { + GGML_ABORT("Couldn't allocate the guest-host shared buffer"); + } + + 
apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + apir_encode_size_t(encoder, &offset); + apir_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + memcpy(data, shmem->mmap_ptr, size); + + remote_call_finish(gpu, encoder, decoder); + + if (shmem != &gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem); + } +} + +bool apir_buffer_cpy_tensor(virtgpu * gpu, + apir_buffer_context_t * buffer_context, + const ggml_tensor * src, + const ggml_tensor * dst) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR); + + apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + apir_encode_ggml_tensor(encoder, src); + apir_encode_ggml_tensor(encoder, dst); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + bool ret_val; + apir_decode_bool_t(decoder, &ret_val); + + remote_call_finish(gpu, encoder, decoder); + + return ret_val; +} + +void apir_buffer_clear(virtgpu * gpu, apir_buffer_context_t * buffer_context, uint8_t value) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_CLEAR); + + apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + apir_encode_uint8_t(encoder, &value); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + remote_call_finish(gpu, encoder, decoder); +} + +void apir_buffer_free_buffer(virtgpu * gpu, apir_buffer_context_t * buffer_context) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER); + + apir_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + remote_call_finish(gpu, encoder, decoder); +} diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp new file mode 100644 index 0000000000..3e45e55bdc --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp @@ -0,0 +1,200 @@ +#include "virtgpu-forward-impl.h" +#include "virtgpu-shm.h" + +int apir_device_get_count(virtgpu * gpu) { + static int32_t dev_count = -1; + if (dev_count != -1) { + return dev_count; + } + + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_COUNT); + REMOTE_CALL(gpu, encoder, decoder, ret); + + apir_decode_int32_t(decoder, &dev_count); + + remote_call_finish(gpu, encoder, decoder); + + return dev_count; +} + +const char * apir_device_get_name(virtgpu * gpu) { + static char * string = nullptr; + if (string) { + return string; + } + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_NAME); + REMOTE_CALL(gpu, encoder, decoder, ret); + + const size_t string_size = apir_decode_array_size_unchecked(decoder); + string = (char *) apir_decoder_alloc_array(sizeof(char), string_size); + if (!string) { + GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__); + return NULL; + } + apir_decode_char_array(decoder, string, string_size); + + remote_call_finish(gpu, encoder, decoder); + + return string; +} + +const char * apir_device_get_description(virtgpu * gpu) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, 
APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + const size_t string_size = apir_decode_array_size_unchecked(decoder); + char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size); + if (!string) { + GGML_LOG_ERROR("%s: Could not allocate the device description buffer\n", __func__); + + return NULL; + } + apir_decode_char_array(decoder, string, string_size); + + remote_call_finish(gpu, encoder, decoder); + + return string; +} + +uint32_t apir_device_get_type(virtgpu * gpu) { + static uint32_t dev_type = 255; + if (dev_type != 255) { + return dev_type; + } + + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_TYPE); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + apir_decode_uint32_t(decoder, &dev_type); + + remote_call_finish(gpu, encoder, decoder); + + return dev_type; +} + +void apir_device_get_memory(virtgpu * gpu, size_t * free, size_t * total) { + static size_t dev_free = 0; + static size_t dev_total = 0; + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_MEMORY); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + apir_decode_size_t(decoder, &dev_free); + apir_decode_size_t(decoder, &dev_total); + + *free = dev_free; + *total = dev_total; + + remote_call_finish(gpu, encoder, decoder); + + return; +} + +bool apir_device_supports_op(virtgpu * gpu, const ggml_tensor * op) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); + + apir_encode_ggml_tensor_inline(encoder, op); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + bool supports_op; + apir_decode_bool_t(decoder, &supports_op); + + remote_call_finish(gpu, encoder, decoder); + + return supports_op; +} + +apir_buffer_type_host_handle_t apir_device_get_buffer_type(virtgpu * gpu) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + apir_buffer_type_host_handle_t buft_handle; + apir_decode_apir_buffer_type_host_handle_t(decoder, &buft_handle); + + remote_call_finish(gpu, encoder, decoder); + + return buft_handle; +} + +void apir_device_get_props(virtgpu * gpu, + bool * async, + bool * host_buffer, + bool * buffer_from_host_ptr, + bool * events) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_PROPS); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + apir_decode_bool_t(decoder, async); + apir_decode_bool_t(decoder, host_buffer); + apir_decode_bool_t(decoder, buffer_from_host_ptr); + apir_decode_bool_t(decoder, events); + + remote_call_finish(gpu, encoder, decoder); + + return; +} + +apir_buffer_context_t apir_device_buffer_from_ptr(virtgpu * gpu, size_t size, size_t max_tensor_size) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirForwardReturnCode ret; + + apir_buffer_context_t buffer_context; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR); + + if (virtgpu_shmem_create(gpu, size, &buffer_context.shmem)) { + GGML_ABORT("Couldn't allocate the guest-host shared buffer"); + } + + apir_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem.res_id); + + apir_encode_size_t(encoder, &size); + 
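+    // note: unlike the set/get_tensor staging path, the shared pages created
+    // above are kept in the returned buffer context rather than being torn
+    // down at the end of this call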
apir_encode_size_t(encoder, &max_tensor_size); + + REMOTE_CALL(gpu, encoder, decoder, ret); + + apir_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); + buffer_context.buft_host_handle = apir_decode_apir_buffer_type_host_handle(decoder); + + remote_call_finish(gpu, encoder, decoder); + + return buffer_context; +} diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h b/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h new file mode 100644 index 0000000000..eea3e7e5a9 --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h @@ -0,0 +1,29 @@ +#include "virtgpu.h" + +#include "ggml-remoting.h" +#include "backend/shared/apir_backend.h" +#include "backend/shared/apir_cs_ggml.h" + +#include "ggml-backend-impl.h" + +#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \ + do { \ + int32_t forward_flag = (int32_t) apir_command_type__; \ + encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, forward_flag); \ + if (!encoder_name) { \ + GGML_ABORT("%s: failed to prepare the remote call encoder", __func__); \ + } \ + } while (0) + +#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \ + do { \ + ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \ + if (!decoder_name) { \ + GGML_ABORT("%s: failed to kick the remote call", __func__); \ + } \ + if (ret_name < APIR_FORWARD_BASE_INDEX) { \ + GGML_ABORT("%s: failed to forward the API call: %s: code %d", __func__, \ + apir_forward_error(ret_name), ret_name); \ + } \ + ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \ + } while (0) diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h new file mode 100644 index 0000000000..c27c07f086 --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h @@ -0,0 +1,51 @@ +#pragma once + +/* device */ +void apir_device_get_device_count(struct virtgpu * gpu); +int apir_device_get_count(struct virtgpu * gpu); +const char * apir_device_get_name(struct virtgpu * gpu); +const char * apir_device_get_description(struct virtgpu * gpu); +uint32_t apir_device_get_type(struct virtgpu * gpu); +void apir_device_get_memory(struct virtgpu * gpu, size_t * free, size_t * total); +bool apir_device_supports_op(struct virtgpu * gpu, const ggml_tensor * op); +apir_buffer_type_host_handle_t apir_device_get_buffer_type(struct virtgpu * gpu); +void apir_device_get_props(struct virtgpu * gpu, + bool * async, + bool * host_buffer, + bool * buffer_from_host_ptr, + bool * events); +apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu * gpu, size_t size, size_t max_tensor_size); + +/* buffer-type */ +const char * apir_buffer_type_get_name(struct virtgpu * gpu, ggml_backend_buffer_type_t buft); +size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, ggml_backend_buffer_type_t buft); +size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft); +bool apir_buffer_type_is_host(struct virtgpu * gpu, ggml_backend_buffer_type_t buft); +apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu, + ggml_backend_buffer_type_t buffer_buft, + size_t size); +size_t apir_buffer_type_get_alloc_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op); + +/* buffer */ +void * apir_buffer_get_base(struct virtgpu * gpu, apir_buffer_context_t * buffer_context); +void apir_buffer_set_tensor(struct virtgpu * gpu, + apir_buffer_context_t * buffer_context, + 
ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size); +void apir_buffer_get_tensor(struct virtgpu * gpu, + apir_buffer_context_t * buffer_context, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size); +bool apir_buffer_cpy_tensor(struct virtgpu * gpu, + apir_buffer_context_t * buffer_context, + const ggml_tensor * src, + const ggml_tensor * dst); +void apir_buffer_clear(struct virtgpu * gpu, apir_buffer_context_t * buffer_context, uint8_t value); +void apir_buffer_free_buffer(struct virtgpu * gpu, apir_buffer_context_t * buffer_context); + +/* backend */ +ggml_status apir_backend_graph_compute(struct virtgpu * gpu, ggml_cgraph * cgraph); diff --git a/ggml/src/ggml-virtgpu/virtgpu-shm.cpp b/ggml/src/ggml-virtgpu/virtgpu-shm.cpp new file mode 100644 index 0000000000..4def405a62 --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu-shm.cpp @@ -0,0 +1,99 @@ +#include "virtgpu-shm.h" + +#include "virtgpu.h" + +#include + +static uint32_t virtgpu_ioctl_resource_create_blob(virtgpu * gpu, + uint32_t blob_mem, + uint32_t blob_flags, + size_t blob_size, + uint64_t blob_id, + uint32_t * res_id) { +#ifdef SIMULATE_BO_SIZE_FIX + blob_size = align64(blob_size, 4096); +#endif + + drm_virtgpu_resource_create_blob args = { + .blob_mem = blob_mem, + .blob_flags = blob_flags, + .bo_handle = 0, + .res_handle = 0, + .size = blob_size, + .pad = 0, + .cmd_size = 0, + .cmd = 0, + .blob_id = blob_id, + }; + + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB, &args)) { + return 0; + } + + *res_id = args.res_handle; + return args.bo_handle; +} + +static void virtgpu_ioctl_gem_close(virtgpu * gpu, uint32_t gem_handle) { + drm_gem_close args = { + .handle = gem_handle, + .pad = 0, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &args); + assert(!ret); +#ifdef NDEBUG + UNUSED(ret); +#endif +} + +static void * virtgpu_ioctl_map(virtgpu * gpu, uint32_t gem_handle, size_t size) { + drm_virtgpu_map args = { + .offset = 0, + .handle = gem_handle, + .pad = 0, + }; + + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args)) { + return NULL; + } + + void * ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gpu->fd, args.offset); + if (ptr == MAP_FAILED) { + return NULL; + } + + return ptr; +} + +void virtgpu_shmem_destroy(virtgpu * gpu, virtgpu_shmem * shmem) { + munmap(shmem->mmap_ptr, shmem->mmap_size); + virtgpu_ioctl_gem_close(gpu, shmem->gem_handle); +} + +int virtgpu_shmem_create(virtgpu * gpu, size_t size, virtgpu_shmem * shmem) { + size = align64(size, 16384); + + uint32_t res_id; + uint32_t gem_handle = virtgpu_ioctl_resource_create_blob(gpu, VIRTGPU_BLOB_MEM_HOST3D, + VIRTGPU_BLOB_FLAG_USE_MAPPABLE, size, 0, &res_id); + + if (!gem_handle) { + return 1; + } + + void * ptr = virtgpu_ioctl_map(gpu, gem_handle, size); + if (!ptr) { + virtgpu_ioctl_gem_close(gpu, gem_handle); + GGML_LOG_ERROR("virtgpu_ioctl_map FAILED\n"); + exit(1); + return 1; + } + + shmem->res_id = res_id; + shmem->mmap_size = size; + shmem->mmap_ptr = ptr; + shmem->gem_handle = gem_handle; + + return 0; +} diff --git a/ggml/src/ggml-virtgpu/virtgpu-shm.h b/ggml/src/ggml-virtgpu/virtgpu-shm.h new file mode 100644 index 0000000000..606860a094 --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu-shm.h @@ -0,0 +1,23 @@ +#pragma once + +#include "virtgpu-utils.h" + +#include + +#include +#include +#include +#include + +struct virtgpu; + +struct virtgpu_shmem { + uint32_t res_id; + size_t mmap_size; + void * mmap_ptr; + + uint32_t gem_handle; +}; + +int virtgpu_shmem_create(virtgpu 
* gpu, size_t size, virtgpu_shmem * shmem); +void virtgpu_shmem_destroy(virtgpu * gpu, virtgpu_shmem * shmem); diff --git a/ggml/src/ggml-virtgpu/virtgpu-utils.cpp b/ggml/src/ggml-virtgpu/virtgpu-utils.cpp new file mode 100644 index 0000000000..8a2805e990 --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu-utils.cpp @@ -0,0 +1,179 @@ +#include "virtgpu-utils.h" + +#include +#include + +#include + +#define NODE_ALLOC_ALIGN 64 +#define NODE_PTR_MASK (~((uintptr_t) NODE_ALLOC_ALIGN - 1)) +#define NODE_LEVEL_MASK ((uintptr_t) NODE_ALLOC_ALIGN - 1) +#define NULL_NODE 0 + +#define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align) +#define os_free_aligned(_ptr) free(_ptr) +#define p_atomic_cmpxchg(v, old, _new) __sync_val_compare_and_swap((v), (old), (_new)) + +static inline uint64_t util_logbase2_64(uint64_t n) { +#if defined(HAVE___BUILTIN_CLZLL) + return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1)); +#else + uint64_t pos = 0ull; + if (n >= 1ull << 32) { + n >>= 32; + pos += 32; + } + if (n >= 1ull << 16) { + n >>= 16; + pos += 16; + } + if (n >= 1ull << 8) { + n >>= 8; + pos += 8; + } + if (n >= 1ull << 4) { + n >>= 4; + pos += 4; + } + if (n >= 1ull << 2) { + n >>= 2; + pos += 2; + } + if (n >= 1ull << 1) { + pos += 1; + } + return pos; +#endif +} + +void util_sparse_array_init(util_sparse_array * arr, size_t elem_size, size_t node_size) { + memset(arr, 0, sizeof(*arr)); + arr->elem_size = elem_size; + arr->node_size_log2 = util_logbase2_64(node_size); + assert(node_size >= 2 && node_size == (1ull << arr->node_size_log2)); +} + +static inline void * os_malloc_aligned(size_t size, size_t alignment) { + void * ptr; + alignment = (alignment + sizeof(void *) - 1) & ~(sizeof(void *) - 1); + if (posix_memalign(&ptr, alignment, size) != 0) { + return NULL; + } + return ptr; +} + +static inline void * _util_sparse_array_node_data(uintptr_t handle) { + return (void *) (handle & NODE_PTR_MASK); +} + +static inline unsigned _util_sparse_array_node_level(uintptr_t handle) { + return handle & NODE_LEVEL_MASK; +} + +static inline void _util_sparse_array_node_finish(util_sparse_array * arr, uintptr_t node) { + if (_util_sparse_array_node_level(node) > 0) { + uintptr_t * children = (uintptr_t *) _util_sparse_array_node_data(node); + size_t node_size = 1ull << arr->node_size_log2; + for (size_t i = 0; i < node_size; i++) { + if (children[i]) { + _util_sparse_array_node_finish(arr, children[i]); + } + } + } + + os_free_aligned(_util_sparse_array_node_data(node)); +} + +static inline uintptr_t _util_sparse_array_node(void * data, unsigned level) { + assert(data != NULL); + assert(((uintptr_t) data & NODE_LEVEL_MASK) == 0); + assert((level & NODE_PTR_MASK) == 0); + return (uintptr_t) data | level; +} + +inline uintptr_t _util_sparse_array_node_alloc(util_sparse_array * arr, unsigned level) { + size_t size; + if (level == 0) { + size = arr->elem_size << arr->node_size_log2; + } else { + size = sizeof(uintptr_t) << arr->node_size_log2; + } + + void * data = os_malloc_aligned(size, NODE_ALLOC_ALIGN); + memset(data, 0, size); + + return _util_sparse_array_node(data, level); +} + +static inline uintptr_t _util_sparse_array_set_or_free_node(uintptr_t * node_ptr, uintptr_t cmp_node, uintptr_t node) { + uintptr_t prev_node = p_atomic_cmpxchg(node_ptr, cmp_node, node); + + if (prev_node != cmp_node) { + /* We lost the race. Free this one and return the one that was already + * allocated. 
+ */ + os_free_aligned(_util_sparse_array_node_data(node)); + return prev_node; + } else { + return node; + } +} + +void * util_sparse_array_get(util_sparse_array * arr, uint64_t idx) { + const unsigned node_size_log2 = arr->node_size_log2; + uintptr_t root = p_atomic_read(&arr->root); + if (unlikely(!root)) { + unsigned root_level = 0; + uint64_t idx_iter = idx >> node_size_log2; + while (idx_iter) { + idx_iter >>= node_size_log2; + root_level++; + } + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level); + root = _util_sparse_array_set_or_free_node(&arr->root, NULL_NODE, new_root); + } + + while (1) { + unsigned root_level = _util_sparse_array_node_level(root); + uint64_t root_idx = idx >> (root_level * node_size_log2); + if (likely(root_idx < (1ull << node_size_log2))) { + break; + } + + /* In this case, we have a root but its level is low enough that the + * requested index is out-of-bounds. + */ + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level + 1); + + uintptr_t * new_root_children = (uintptr_t *) _util_sparse_array_node_data(new_root); + new_root_children[0] = root; + + /* We only add one at a time instead of the whole tree because it's + * easier to ensure correctness of both the tree building and the + * clean-up path. Because we're only adding one node we never have to + * worry about trying to free multiple things without freeing the old + * things. + */ + root = _util_sparse_array_set_or_free_node(&arr->root, root, new_root); + } + + void * node_data = _util_sparse_array_node_data(root); + unsigned node_level = _util_sparse_array_node_level(root); + while (node_level > 0) { + uint64_t child_idx = (idx >> (node_level * node_size_log2)) & ((1ull << node_size_log2) - 1); + + uintptr_t * children = (uintptr_t *) node_data; + uintptr_t child = p_atomic_read(&children[child_idx]); + + if (unlikely(!child)) { + child = _util_sparse_array_node_alloc(arr, node_level - 1); + child = _util_sparse_array_set_or_free_node(&children[child_idx], NULL_NODE, child); + } + + node_data = _util_sparse_array_node_data(child); + node_level = _util_sparse_array_node_level(child); + } + + uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1); + return (void *) ((char *) node_data + (elem_idx * arr->elem_size)); +} diff --git a/ggml/src/ggml-virtgpu/virtgpu-utils.h b/ggml/src/ggml-virtgpu/virtgpu-utils.h new file mode 100644 index 0000000000..a0036b4e2b --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu-utils.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect(!!(x), 1) + +#ifndef UNUSED +# define UNUSED(x) (void) (x) +#endif + +/** Checks is a value is a power of two. Does not handle zero. */ +#define IS_POT(v) (((v) & ((v) - 1)) == 0) + +/** Checks is a value is a power of two. Zero handled. 
*/ +#define IS_POT_NONZERO(v) ((v) != 0 && IS_POT(v)) + +/** Align a value to a power of two */ +#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1)) + +#define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) + +static inline bool util_is_power_of_two_nonzero64(uint64_t v) { + return IS_POT_NONZERO(v); +} + +static inline uint64_t align64(uint64_t value, uint64_t alignment) { + assert(util_is_power_of_two_nonzero64(alignment)); + return ALIGN_POT(value, alignment); +} + +struct list_head { + list_head * prev; + list_head * next; +}; + +struct util_sparse_array { + size_t elem_size; + unsigned node_size_log2; + + uintptr_t root; +}; + +void * util_sparse_array_get(util_sparse_array * arr, uint64_t idx); +void util_sparse_array_init(util_sparse_array * arr, size_t elem_size, size_t node_size); + +inline void os_time_sleep(int64_t usecs) { + timespec time; + time.tv_sec = usecs / 1000000; + time.tv_nsec = (usecs % 1000000) * 1000; + while (clock_nanosleep(CLOCK_MONOTONIC, 0, &time, &time) == EINTR) + ; +} + +struct timer_data { + long long start; + long long total; + long long count; +}; + +static inline void start_timer(timer_data * timer) { + timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + timer->start = (long long) ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +// returns the duration in ns +static inline long long stop_timer(timer_data * timer) { + timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + long long timer_end = (long long) ts.tv_sec * 1000000000LL + ts.tv_nsec; + + long long duration = (timer_end - timer->start); + timer->total += duration; + timer->count += 1; + + return duration; +} diff --git a/ggml/src/ggml-virtgpu/virtgpu.cpp b/ggml/src/ggml-virtgpu/virtgpu.cpp new file mode 100644 index 0000000000..005c8e21db --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu.cpp @@ -0,0 +1,498 @@ +#include "virtgpu.h" + +#include +#include + +#include +#include +#include + +static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr dev); +static virt_gpu_result_t virtgpu_open(virtgpu * gpu); + +static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu); +static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu); + +static int virtgpu_ioctl_context_init(virtgpu * gpu, virgl_renderer_capset capset_id); +static int virtgpu_ioctl_get_caps(virtgpu * gpu, + virgl_renderer_capset id, + uint32_t version, + void * capset, + size_t capset_size); +static uint64_t virtgpu_ioctl_getparam(virtgpu * gpu, uint64_t param); +static void virtgpu_init_renderer_info(virtgpu * gpu); + +static void log_call_duration(long long call_duration_ns, const char * name); + +const uint64_t APIR_HANDSHAKE_MAX_WAIT_MS = 2 * 1000; // 2s +const uint64_t APIR_LOADLIBRARY_MAX_WAIT_MS = 60 * 1000; // 60s + +static int virtgpu_handshake(virtgpu * gpu) { + apir_encoder * encoder; + apir_decoder * decoder; + + encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_HANDSHAKE, 0); + if (!encoder) { + GGML_ABORT("%s: failed to prepare the remote call encoder", __func__); + return 1; + } + + /* write handshake props */ + + uint32_t guest_major = APIR_PROTOCOL_MAJOR; + uint32_t guest_minor = APIR_PROTOCOL_MINOR; + apir_encode_uint32_t(encoder, &guest_major); + apir_encode_uint32_t(encoder, &guest_minor); + + /* *** */ + + uint32_t ret_magic; + long long call_duration_ns; + ret_magic = remote_call(gpu, encoder, &decoder, APIR_HANDSHAKE_MAX_WAIT_MS, &call_duration_ns); + log_call_duration(call_duration_ns, "API Remoting handshake"); + + if (!decoder) { + GGML_ABORT( + "%s: 
failed to initiate the communication with the virglrenderer library. " + "Most likely, the wrong virglrenderer library was loaded in the hypervisor.", + __func__); + return 1; + } + + /* read handshake return values */ + + uint32_t host_major; + uint32_t host_minor; + + if (ret_magic != APIR_HANDSHAKE_MAGIC) { + GGML_ABORT("%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic, + apir_backend_initialize_error(ret_magic)); + } else { + apir_decode_uint32_t(decoder, &host_major); + apir_decode_uint32_t(decoder, &host_minor); + } + + remote_call_finish(gpu, encoder, decoder); + + if (ret_magic != APIR_HANDSHAKE_MAGIC) { + return 1; + } + + GGML_LOG_INFO("%s: Guest is running with %u.%u\n", __func__, guest_major, guest_minor); + GGML_LOG_INFO("%s: Host is running with %u.%u\n", __func__, host_major, host_minor); + + if (guest_major != host_major) { + GGML_LOG_ERROR("Host major (%d) and guest major (%d) version differ\n", host_major, guest_major); + } else if (guest_minor != host_minor) { + GGML_LOG_WARN("Host minor (%d) and guest minor (%d) version differ\n", host_minor, guest_minor); + } + + return 0; +} + +static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) { + apir_encoder * encoder; + apir_decoder * decoder; + ApirLoadLibraryReturnCode ret; + + encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_LOADLIBRARY, 0); + if (!encoder) { + GGML_ABORT("%s: hypercall error: failed to prepare the remote call encoder", __func__); + return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR; + } + + long long call_duration_ns; + + ret = (ApirLoadLibraryReturnCode) remote_call(gpu, encoder, &decoder, APIR_LOADLIBRARY_MAX_WAIT_MS, + &call_duration_ns); + log_call_duration(call_duration_ns, "API Remoting LoadLibrary"); + + if (!decoder) { + GGML_ABORT("%s: hypercall error: failed to kick the API remoting hypercall.\n", __func__); + return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR; + } + + remote_call_finish(gpu, encoder, decoder); + + if (ret == APIR_LOAD_LIBRARY_SUCCESS) { + GGML_LOG_INFO("%s: The API Remoting backend was successfully loaded and initialized\n", __func__); + + return ret; + } + + // something wrong happened, find out what. 
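+    // the return code is banded around APIR_LOAD_LIBRARY_INIT_BASE_INDEX:
+    // values below it mean virglrenderer could not load the backend library;
+    // higher values are offset by the base and carry the backend library's
+    // own initialization status (decoded below)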
+ + if (ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) { + GGML_ABORT("%s: virglrenderer could not load the API Remoting backend library: %s (code %d)", __func__, + apir_load_library_error(ret), ret); + return ret; + } + + GGML_LOG_INFO("%s: virglrenderer successfully loaded the API Remoting backend library", __func__); + + ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX); + + if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) { + GGML_ABORT("%s: the API Remoting backend library couldn't load the backend library: apir code=%d | %s)", + __func__, apir_ret, apir_load_library_error(apir_ret)); + } else { + uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX; + GGML_ABORT("%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__, + lib_ret); + } + return ret; +} + +virtgpu * create_virtgpu() { + virtgpu * gpu = new virtgpu(); + + gpu->use_apir_capset = getenv("GGML_REMOTING_USE_APIR_CAPSET") != nullptr; + util_sparse_array_init(&gpu->shmem_array, sizeof(virtgpu_shmem), 1024); + + if (virtgpu_open(gpu) != APIR_SUCCESS) { + GGML_ABORT("%s: failed to open the virtgpu device", __func__); + return NULL; + } + + if (virtgpu_init_capset(gpu) != APIR_SUCCESS) { + GGML_ABORT("%s: failed to initialize the GPU capset", __func__); + return NULL; + } + + if (virtgpu_init_context(gpu) != APIR_SUCCESS) { + GGML_ABORT("%s: failed to initialize the GPU context", __func__); + return NULL; + } + + if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) { + GGML_ABORT("%s: failed to create the shared reply memory pages", __func__); + return NULL; + } + + if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) { + GGML_ABORT("%s: failed to create the shared data memory pages", __func__); + return NULL; + } + + if (virtgpu_handshake(gpu)) { + GGML_ABORT("%s: failed to handshake with the virglrenderer library", __func__); + return NULL; + } + + if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) { + GGML_ABORT("%s: failed to load the backend library", __func__); + return NULL; + } + + return gpu; +} + +static virt_gpu_result_t virtgpu_open(virtgpu * gpu) { + drmDevicePtr devs[8]; + int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs)); + if (count < 0) { + GGML_LOG_ERROR("%s: failed to enumerate DRM devices\n", __func__); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + virt_gpu_result_t result = APIR_ERROR_INITIALIZATION_FAILED; + for (int i = 0; i < count; i++) { + result = virtgpu_open_device(gpu, devs[i]); + if (result == APIR_SUCCESS) { + break; + } + } + + drmFreeDevices(devs, count); + + return result; +} + +static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr dev) { + const char * node_path = dev->nodes[DRM_NODE_RENDER]; + + int fd = open(node_path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + GGML_ABORT("failed to open %s", node_path); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + drmVersionPtr version = drmGetVersion(fd); + if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) { + if (version) { + GGML_ABORT("unknown DRM driver %s version %d", version->name, version->version_major); + } else { + GGML_ABORT("failed to get DRM driver version"); + } + + if (version) { + drmFreeVersion(version); + } + close(fd); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + gpu->fd = fd; + + drmFreeVersion(version); + + GGML_LOG_INFO("using DRM device %s\n", node_path); + + return APIR_SUCCESS; +} + +static virt_gpu_result_t 
virtgpu_init_context(virtgpu * gpu) { + assert(!gpu->capset.version); + const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id); + if (ret) { + GGML_LOG_INFO("failed to initialize context: %s\n", strerror(errno)); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + return APIR_SUCCESS; +} + +static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) { + if (gpu->use_apir_capset) { + GGML_LOG_INFO("Using the APIR capset\n"); + gpu->capset.id = VIRTGPU_DRM_CAPSET_APIR; + } else { + GGML_LOG_INFO("Using the Venus capset\n"); + gpu->capset.id = VIRTGPU_DRM_CAPSET_VENUS; + } + gpu->capset.version = 0; + + int ret = + virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data)); + + if (ret) { + GGML_LOG_INFO("failed to get APIR v%d capset: %s\n", gpu->capset.version, strerror(errno)); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + assert(gpu->capset.data.supports_blob_resources); + + return APIR_SUCCESS; +} + +static int virtgpu_ioctl_context_init(virtgpu * gpu, virgl_renderer_capset capset_id) { + drm_virtgpu_context_set_param ctx_set_params[3] = { + { + .param = VIRTGPU_CONTEXT_PARAM_CAPSET_ID, + .value = capset_id, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_NUM_RINGS, + .value = 1, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK, + .value = 0, /* don't generate drm_events on fence signaling */ + }, + }; + + drm_virtgpu_context_init args = { + .num_params = ARRAY_SIZE(ctx_set_params), + .pad = 0, + .ctx_set_params = (uintptr_t) &ctx_set_params, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_CONTEXT_INIT, &args); +} + +static int virtgpu_ioctl_get_caps(virtgpu * gpu, + virgl_renderer_capset id, + uint32_t version, + void * capset, + size_t capset_size) { + drm_virtgpu_get_caps args = { + .cap_set_id = id, + .cap_set_ver = version, + .addr = (uintptr_t) capset, + .size = (__u32) capset_size, + .pad = 0, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GET_CAPS, &args); +} + +static uint64_t virtgpu_ioctl_getparam(virtgpu * gpu, uint64_t param) { + /* val must be zeroed because kernel only writes the lower 32 bits */ + uint64_t val = 0; + drm_virtgpu_getparam args = { + .param = param, + .value = (uintptr_t) &val, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args); + return ret ? 
0 : val; +} + +apir_encoder * remote_call_prepare(virtgpu * gpu, ApirCommandType apir_cmd_type, int32_t cmd_flags) { + /* + * Prepare the command encoder and its buffer + */ + + static char encoder_buffer[4096]; + + static apir_encoder enc; + enc = { + .cur = encoder_buffer, + .start = encoder_buffer, + .end = encoder_buffer + sizeof(encoder_buffer), + .fatal = false, + }; + + /* + * Fill the command encoder with the common args: + * - cmd_type (int32_t) + * - cmd_flags (int32_t) + * - reply res id (uint32_t) + */ + + int32_t cmd_type = apir_cmd_type; + + // for testing during the hypervisor transition + if (!gpu->use_apir_capset) { + cmd_type += VENUS_COMMAND_TYPE_LENGTH; + } + apir_encode_int32_t(&enc, &cmd_type); + apir_encode_int32_t(&enc, &cmd_flags); + + uint32_t reply_res_id = gpu->reply_shmem.res_id; + apir_encode_uint32_t(&enc, &reply_res_id); + + return &enc; +} + +void remote_call_finish(virtgpu * gpu, apir_encoder * enc, apir_decoder * dec) { + UNUSED(gpu); + + if (!enc) { + GGML_LOG_ERROR("Invalid (null) encoder\n"); + } + + if (!dec) { + GGML_LOG_ERROR("Invalid (null) decoder\n"); + } + + if (apir_encoder_get_fatal(enc)) { + GGML_LOG_ERROR("Failed to encode the output parameters.\n"); + } + + if (apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR("Failed to decode the input parameters.\n"); + } +} + +uint32_t remote_call(virtgpu * gpu, + apir_encoder * encoder, + apir_decoder ** decoder, + float max_wait_ms, + long long * call_duration_ns) { + /* + * Prepare the reply notification pointer + */ + + volatile std::atomic_uint * atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem.mmap_ptr; + *atomic_reply_notif = 0; + + /* + * Trigger the execbuf ioctl + */ + + drm_virtgpu_execbuffer args = { + .flags = VIRTGPU_EXECBUF_RING_IDX, + .size = (uint32_t) (encoder->cur - encoder->start), + .command = (uintptr_t) encoder->start, + + .bo_handles = 0, + .num_bo_handles = 0, + + .fence_fd = 0, + .ring_idx = 0, + .syncobj_stride = 0, + .num_in_syncobjs = 0, + .num_out_syncobjs = 0, + .in_syncobjs = 0, + .out_syncobjs = 0, + }; + + *decoder = NULL; + + int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args); + + if (ret != 0) { + GGML_ABORT("%s: the virtgpu EXECBUFFER ioctl failed (%d)", __func__, ret); + } + + /* + * Wait for the response notification + */ + timer_data wait_host_reply_timer = { 0, 0, 0 }; + + start_timer(&wait_host_reply_timer); + + timespec ts_start, ts_end; + clock_gettime(CLOCK_MONOTONIC, &ts_start); + long long start_time = (long long) ts_start.tv_sec * 1000000000LL + ts_start.tv_nsec; + + bool timedout = false; + uint32_t notif_value = 0; + while (true) { + notif_value = std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire); + + if (notif_value != 0) { + break; + } + + int64_t base_sleep_us = 15; + + os_time_sleep(base_sleep_us); + + if (max_wait_ms) { + clock_gettime(CLOCK_MONOTONIC, &ts_end); + long long end_time = (long long) ts_end.tv_sec * 1000000000LL + ts_end.tv_nsec; + float duration_ms = (end_time - start_time) / 1000000; + + if (duration_ms > max_wait_ms) { + timedout = true; + break; + } + } + } + + if (call_duration_ns) { + *call_duration_ns = stop_timer(&wait_host_reply_timer); + } + + if (max_wait_ms && timedout) { + GGML_LOG_ERROR("timed out waiting for the host answer...\n"); + return APIR_FORWARD_TIMEOUT; + } + + /* + * Prepare the decoder + */ + static apir_decoder response_dec; + response_dec.cur = (char *) gpu->reply_shmem.mmap_ptr + sizeof(*atomic_reply_notif); + response_dec.end = (char *) 
gpu->reply_shmem.mmap_ptr + gpu->reply_shmem.mmap_size; + *decoder = &response_dec; + + // extract the actual return value from the notif flag + uint32_t returned_value = notif_value - 1; + return returned_value; +} + +static void log_call_duration(long long call_duration_ns, const char * name) { + double call_duration_ms = (double) call_duration_ns / 1e6; // 1 millisecond = 1e6 nanoseconds + double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds + + if (call_duration_s > 1) { + GGML_LOG_INFO("%s: waited %.2fs for the %s host reply...\n", __func__, call_duration_s, name); + } else if (call_duration_ms > 1) { + GGML_LOG_INFO("%s: waited %.2fms for the %s host reply...\n", __func__, call_duration_ms, name); + } else { + GGML_LOG_INFO("%s: waited %lldns for the %s host reply...\n", __func__, call_duration_ns, name); + } +} diff --git a/ggml/src/ggml-virtgpu/virtgpu.h b/ggml/src/ggml-virtgpu/virtgpu.h new file mode 100644 index 0000000000..d4bb42e20b --- /dev/null +++ b/ggml/src/ggml-virtgpu/virtgpu.h @@ -0,0 +1,92 @@ +#pragma once + +#include "virtgpu-utils.h" +#include "virtgpu-shm.h" +#include "virtgpu-apir.h" + +#include "backend/shared/api_remoting.h" +#include "backend/shared/apir_cs.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define VIRGL_RENDERER_UNSTABLE_APIS 1 +#include "apir_hw.h" +#include +#include "venus_hw.h" + +#ifndef VIRTGPU_DRM_CAPSET_APIR +// Will be defined include/drm/virtgpu_drm.h when +// https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590/diffs +// is merged +#define VIRTGPU_DRM_CAPSET_APIR 10 +#endif + +// Mesa/Virlgrenderer Venus internal. Only necessary during the +// Venus->APIR transition in Virglrenderer +#define VENUS_COMMAND_TYPE_LENGTH 331 + +#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16 +#define VIRTGPU_DRM_CAPSET_VENUS 4 +#endif + +typedef uint32_t virgl_renderer_capset; + +/* from src/virtio/vulkan/vn_renderer_virtgpu.c */ +#define VIRTGPU_PCI_VENDOR_ID 0x1af4 +#define VIRTGPU_PCI_DEVICE_ID 0x1050 +#define VIRTGPU_BLOB_MEM_GUEST_VRAM 0x0004 +#define VIRTGPU_PARAM_GUEST_VRAM 9 + +#define SHMEM_DATA_SIZE 0x1830000 // 24MiB +#define SHMEM_REPLY_SIZE 0x4000 + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +enum virt_gpu_result_t { + APIR_SUCCESS = 0, + APIR_ERROR_INITIALIZATION_FAILED = -1, +}; + +#define PRINTFLIKE(f, a) __attribute__((format(__printf__, f, a))) + +struct virtgpu { + bool use_apir_capset; + + int fd; + + struct { + virgl_renderer_capset id; + uint32_t version; + virgl_renderer_capset_apir data; + } capset; + + util_sparse_array shmem_array; + + /* APIR communication pages */ + virtgpu_shmem reply_shmem; + virtgpu_shmem data_shmem; +}; + +static inline int virtgpu_ioctl(virtgpu * gpu, unsigned long request, void * args) { + return drmIoctl(gpu->fd, request, args); +} + +virtgpu * create_virtgpu(); + +apir_encoder * remote_call_prepare(virtgpu * gpu, ApirCommandType apir_cmd_type, int32_t cmd_flags); + +uint32_t remote_call(virtgpu * gpu, + apir_encoder * enc, + apir_decoder ** dec, + float max_wait_ms, + long long * call_duration_ns); + +void remote_call_finish(virtgpu * gpu, apir_encoder * enc, apir_decoder * dec);