llama.cpp/ggml/src/ggml-virtgpu/virtgpu.cpp


#include "virtgpu.h"
#include <stdio.h>
#include <unistd.h>
#include <cassert>
#include <cerrno>
#include <cstdlib>
static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr dev);
static virt_gpu_result_t virtgpu_open(virtgpu * gpu);
static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu);
static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu);
static int virtgpu_ioctl_context_init(virtgpu * gpu, virgl_renderer_capset capset_id);
static int virtgpu_ioctl_get_caps(virtgpu * gpu,
virgl_renderer_capset id,
uint32_t version,
void * capset,
size_t capset_size);
static uint64_t virtgpu_ioctl_getparam(virtgpu * gpu, uint64_t param);
static void virtgpu_init_renderer_info(virtgpu * gpu);
static void log_call_duration(long long call_duration_ns, const char * name);
const uint64_t APIR_HANDSHAKE_MAX_WAIT_MS = 2 * 1000; // 2s
const uint64_t APIR_LOADLIBRARY_MAX_WAIT_MS = 60 * 1000; // 60s
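/*
 * Exchange the API Remoting protocol versions with the host.
 *
 * The guest encodes its APIR_PROTOCOL_MAJOR/MINOR pair; the host replies with
 * APIR_HANDSHAKE_MAGIC followed by its own major/minor pair. A major-version
 * mismatch is logged as an error, a minor-version mismatch as a warning.
 */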
static int virtgpu_handshake(virtgpu * gpu) {
apir_encoder * encoder;
apir_decoder * decoder;
encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_HANDSHAKE, 0);
if (!encoder) {
GGML_ABORT("%s: failed to prepare the remote call encoder", __func__);
return 1;
}
/* write handshake props */
uint32_t guest_major = APIR_PROTOCOL_MAJOR;
uint32_t guest_minor = APIR_PROTOCOL_MINOR;
apir_encode_uint32_t(encoder, &guest_major);
apir_encode_uint32_t(encoder, &guest_minor);
/* *** */
uint32_t ret_magic;
long long call_duration_ns;
ret_magic = remote_call(gpu, encoder, &decoder, APIR_HANDSHAKE_MAX_WAIT_MS, &call_duration_ns);
log_call_duration(call_duration_ns, "API Remoting handshake");
if (!decoder) {
GGML_ABORT(
"%s: failed to initiate the communication with the virglrenderer library. "
"Most likely, the wrong virglrenderer library was loaded in the hypervisor.",
__func__);
return 1;
}
/* read handshake return values */
uint32_t host_major;
uint32_t host_minor;
    if (ret_magic != APIR_HANDSHAKE_MAGIC) {
        GGML_ABORT("%s: handshake with the virglrenderer library failed (code=%u | %s)", __func__, ret_magic,
                   apir_backend_initialize_error(ret_magic));
} else {
apir_decode_uint32_t(decoder, &host_major);
apir_decode_uint32_t(decoder, &host_minor);
}
remote_call_finish(gpu, encoder, decoder);
if (ret_magic != APIR_HANDSHAKE_MAGIC) {
return 1;
}
GGML_LOG_INFO("%s: Guest is running with %u.%u\n", __func__, guest_major, guest_minor);
GGML_LOG_INFO("%s: Host is running with %u.%u\n", __func__, host_major, host_minor);
if (guest_major != host_major) {
GGML_LOG_ERROR("Host major (%d) and guest major (%d) version differ\n", host_major, guest_major);
} else if (guest_minor != host_minor) {
GGML_LOG_WARN("Host minor (%d) and guest minor (%d) version differ\n", host_minor, guest_minor);
}
return 0;
}
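/*
 * Ask the host's virglrenderer to load and initialize the API Remoting
 * backend library.
 *
 * The return code is layered: values below APIR_LOAD_LIBRARY_INIT_BASE_INDEX
 * come from virglrenderer itself (the library could not be loaded);
 * subtracting the base index once yields the backend library's own loading
 * status, and subtracting it twice yields the status reported by the backend
 * library's initialization.
 */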
static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirLoadLibraryReturnCode ret;
encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_LOADLIBRARY, 0);
if (!encoder) {
GGML_ABORT("%s: hypercall error: failed to prepare the remote call encoder", __func__);
return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
}
long long call_duration_ns;
ret = (ApirLoadLibraryReturnCode) remote_call(gpu, encoder, &decoder, APIR_LOADLIBRARY_MAX_WAIT_MS,
&call_duration_ns);
log_call_duration(call_duration_ns, "API Remoting LoadLibrary");
if (!decoder) {
GGML_ABORT("%s: hypercall error: failed to kick the API remoting hypercall.\n", __func__);
return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
}
remote_call_finish(gpu, encoder, decoder);
if (ret == APIR_LOAD_LIBRARY_SUCCESS) {
GGML_LOG_INFO("%s: The API Remoting backend was successfully loaded and initialized\n", __func__);
return ret;
}
    // Something went wrong; figure out what.
if (ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
GGML_ABORT("%s: virglrenderer could not load the API Remoting backend library: %s (code %d)", __func__,
apir_load_library_error(ret), ret);
return ret;
}
GGML_LOG_INFO("%s: virglrenderer successfully loaded the API Remoting backend library", __func__);
ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
    if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
        GGML_ABORT("%s: the API Remoting backend library couldn't load its backend library (apir code=%d | %s)",
                   __func__, apir_ret, apir_load_library_error(apir_ret));
    } else {
        uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX;
        GGML_ABORT("%s: the API Remoting backend library failed to initialize its backend library (backend code=%u)",
                   __func__, lib_ret);
    }
return ret;
}
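/*
 * Create and fully initialize the virtgpu connection:
 *   1. open the virtio_gpu DRM render node,
 *   2. query the capset and create the virtgpu context,
 *   3. allocate the shared reply and data memory pages,
 *   4. handshake with the host virglrenderer,
 *   5. make it load the API Remoting backend library.
 *
 * Setting the GGML_REMOTING_USE_APIR_CAPSET environment variable selects the
 * dedicated APIR capset instead of the Venus one.
 */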
virtgpu * create_virtgpu() {
virtgpu * gpu = new virtgpu();
gpu->use_apir_capset = getenv("GGML_REMOTING_USE_APIR_CAPSET") != nullptr;
util_sparse_array_init(&gpu->shmem_array, sizeof(virtgpu_shmem), 1024);
// Initialize mutex to protect shared data_shmem buffer
if (mtx_init(&gpu->data_shmem_mutex, mtx_plain) != thrd_success) {
delete gpu;
GGML_ABORT("%s: failed to initialize data_shmem mutex", __func__);
return NULL;
}
if (virtgpu_open(gpu) != APIR_SUCCESS) {
GGML_ABORT("%s: failed to open the virtgpu device", __func__);
return NULL;
}
if (virtgpu_init_capset(gpu) != APIR_SUCCESS) {
GGML_ABORT("%s: failed to initialize the GPU capset", __func__);
return NULL;
}
if (virtgpu_init_context(gpu) != APIR_SUCCESS) {
GGML_ABORT("%s: failed to initialize the GPU context", __func__);
return NULL;
}
if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) {
GGML_ABORT("%s: failed to create the shared reply memory pages", __func__);
return NULL;
}
if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) {
GGML_ABORT("%s: failed to create the shared data memory pages", __func__);
return NULL;
}
if (virtgpu_handshake(gpu)) {
GGML_ABORT("%s: failed to handshake with the virglrenderer library", __func__);
return NULL;
}
if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) {
GGML_ABORT("%s: failed to load the backend library", __func__);
return NULL;
}
return gpu;
}
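/*
 * Enumerate the DRM devices and open the first one backed by the virtio_gpu
 * driver. virtgpu_open_device() fails (without aborting) on unsuitable
 * devices so that the next candidate can be tried.
 */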
static virt_gpu_result_t virtgpu_open(virtgpu * gpu) {
drmDevicePtr devs[8];
int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs));
if (count < 0) {
GGML_LOG_ERROR("%s: failed to enumerate DRM devices\n", __func__);
return APIR_ERROR_INITIALIZATION_FAILED;
}
virt_gpu_result_t result = APIR_ERROR_INITIALIZATION_FAILED;
for (int i = 0; i < count; i++) {
result = virtgpu_open_device(gpu, devs[i]);
if (result == APIR_SUCCESS) {
break;
}
}
drmFreeDevices(devs, count);
return result;
}
static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr dev) {
    // Log and return (rather than abort) on failure, so that virtgpu_open()
    // can try the next enumerated DRM device.
    if (!(dev->available_nodes & (1 << DRM_NODE_RENDER))) {
        return APIR_ERROR_INITIALIZATION_FAILED;
    }
    const char * node_path = dev->nodes[DRM_NODE_RENDER];
    int fd = open(node_path, O_RDWR | O_CLOEXEC);
    if (fd < 0) {
        GGML_LOG_ERROR("failed to open %s\n", node_path);
        return APIR_ERROR_INITIALIZATION_FAILED;
    }
    drmVersionPtr version = drmGetVersion(fd);
    if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) {
        if (version) {
            GGML_LOG_ERROR("skipping DRM driver %s version %d\n", version->name, version->version_major);
            drmFreeVersion(version);
        } else {
            GGML_LOG_ERROR("failed to get the DRM driver version\n");
        }
        close(fd);
        return APIR_ERROR_INITIALIZATION_FAILED;
    }
    gpu->fd = fd;
    drmFreeVersion(version);
    GGML_LOG_INFO("using DRM device %s\n", node_path);
    return APIR_SUCCESS;
}
static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu) {
assert(!gpu->capset.version);
const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id);
    if (ret) {
        GGML_LOG_ERROR("failed to initialize the virtgpu context: %s\n", strerror(errno));
        return APIR_ERROR_INITIALIZATION_FAILED;
    }
return APIR_SUCCESS;
}
static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) {
if (gpu->use_apir_capset) {
GGML_LOG_INFO("Using the APIR capset\n");
gpu->capset.id = VIRTGPU_DRM_CAPSET_APIR;
} else {
GGML_LOG_INFO("Using the Venus capset\n");
gpu->capset.id = VIRTGPU_DRM_CAPSET_VENUS;
}
gpu->capset.version = 0;
int ret =
virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data));
    if (ret) {
        GGML_LOG_ERROR("failed to get the v%u capset: %s\n", gpu->capset.version, strerror(errno));
        return APIR_ERROR_INITIALIZATION_FAILED;
    }
assert(gpu->capset.data.supports_blob_resources);
return APIR_SUCCESS;
}
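/*
 * Create the virtgpu context with three parameters:
 *   - CAPSET_ID binds the context to the selected protocol (APIR or Venus),
 *   - NUM_RINGS exposes a single command ring,
 *   - POLL_RINGS_MASK=0 disables DRM events on fence signaling, since replies
 *     are detected by polling the shared reply page instead.
 */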
static int virtgpu_ioctl_context_init(virtgpu * gpu, virgl_renderer_capset capset_id) {
drm_virtgpu_context_set_param ctx_set_params[3] = {
{
.param = VIRTGPU_CONTEXT_PARAM_CAPSET_ID,
.value = capset_id,
},
{
.param = VIRTGPU_CONTEXT_PARAM_NUM_RINGS,
.value = 1,
},
{
.param = VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK,
.value = 0, /* don't generate drm_events on fence signaling */
},
};
    drm_virtgpu_context_init args = {
        .num_params = ARRAY_SIZE(ctx_set_params),
        .pad = 0,
        .ctx_set_params = (uintptr_t) ctx_set_params,
    };
return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_CONTEXT_INIT, &args);
}
static int virtgpu_ioctl_get_caps(virtgpu * gpu,
virgl_renderer_capset id,
uint32_t version,
void * capset,
size_t capset_size) {
drm_virtgpu_get_caps args = {
.cap_set_id = id,
.cap_set_ver = version,
.addr = (uintptr_t) capset,
.size = (__u32) capset_size,
.pad = 0,
};
return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GET_CAPS, &args);
}
static uint64_t virtgpu_ioctl_getparam(virtgpu * gpu, uint64_t param) {
/* val must be zeroed because kernel only writes the lower 32 bits */
uint64_t val = 0;
drm_virtgpu_getparam args = {
.param = param,
.value = (uintptr_t) &val,
};
const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args);
return ret ? 0 : val;
}
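/*
 * Set up the encoder for a remote call. Every command starts with the same
 * header: the command type (int32), the command flags (int32) and the
 * resource id of the shared reply page (uint32). The encoder and its 4 KiB
 * buffer are thread_local, so each thread can have one call in flight.
 *
 * Typical call sequence (a sketch following virtgpu_handshake() above; `arg`
 * and `reply` are placeholders):
 *
 *   apir_encoder * enc = remote_call_prepare(gpu, APIR_COMMAND_TYPE_HANDSHAKE, 0);
 *   apir_encode_uint32_t(enc, &arg);                     // encode the call args
 *   apir_decoder * dec;
 *   uint32_t ret = remote_call(gpu, enc, &dec, 0, NULL); // 0 --> wait forever
 *   apir_decode_uint32_t(dec, &reply);                   // decode the reply
 *   remote_call_finish(gpu, enc, dec);
 */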
apir_encoder * remote_call_prepare(virtgpu * gpu, ApirCommandType apir_cmd_type, int32_t cmd_flags) {
/*
* Prepare the command encoder and its buffer
*/
thread_local char encoder_buffer[4096];
thread_local apir_encoder enc;
enc = {
.cur = encoder_buffer,
.start = encoder_buffer,
.end = encoder_buffer + sizeof(encoder_buffer),
.fatal = false,
};
/*
* Fill the command encoder with the common args:
* - cmd_type (int32_t)
* - cmd_flags (int32_t)
* - reply res id (uint32_t)
*/
int32_t cmd_type = apir_cmd_type;
// for testing during the hypervisor transition
if (!gpu->use_apir_capset) {
cmd_type += VENUS_COMMAND_TYPE_LENGTH;
}
apir_encode_int32_t(&enc, &cmd_type);
apir_encode_int32_t(&enc, &cmd_flags);
uint32_t reply_res_id = gpu->reply_shmem.res_id;
apir_encode_uint32_t(&enc, &reply_res_id);
return &enc;
}
void remote_call_finish(virtgpu * gpu, apir_encoder * enc, apir_decoder * dec) {
    UNUSED(gpu);
    // Guard against null handles before checking their fatal flags.
    if (!enc) {
        GGML_LOG_ERROR("Invalid (null) encoder\n");
    } else if (apir_encoder_get_fatal(enc)) {
        GGML_LOG_ERROR("Failed to encode the output parameters.\n");
    }
    if (!dec) {
        GGML_LOG_ERROR("Invalid (null) decoder\n");
    } else if (apir_decoder_get_fatal(dec)) {
        GGML_LOG_ERROR("Failed to decode the input parameters.\n");
    }
}
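/*
 * Submit the encoded command to the host and wait for its reply.
 *
 * The first 32-bit word of the reply page is used as a notification flag:
 * it is cleared before the EXECBUFFER ioctl, then polled (with a short sleep
 * between iterations) until the host stores a non-zero value there. The host
 * writes (return value + 1), so zero unambiguously means "no reply yet".
 */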
uint32_t remote_call(virtgpu * gpu,
apir_encoder * encoder,
apir_decoder ** decoder,
float max_wait_ms,
long long * call_duration_ns) {
/*
* Prepare the reply notification pointer
*/
volatile std::atomic_uint * atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem.mmap_ptr;
*atomic_reply_notif = 0;
/*
* Trigger the execbuf ioctl
*/
drm_virtgpu_execbuffer args = {
.flags = VIRTGPU_EXECBUF_RING_IDX,
.size = (uint32_t) (encoder->cur - encoder->start),
.command = (uintptr_t) encoder->start,
.bo_handles = 0,
.num_bo_handles = 0,
.fence_fd = 0,
.ring_idx = 0,
.syncobj_stride = 0,
.num_in_syncobjs = 0,
.num_out_syncobjs = 0,
.in_syncobjs = 0,
.out_syncobjs = 0,
};
*decoder = NULL;
int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args);
if (ret != 0) {
GGML_ABORT("%s: the virtgpu EXECBUFFER ioctl failed (%d)", __func__, ret);
}
/*
* Wait for the response notification
*/
timer_data wait_host_reply_timer = { 0, 0, 0 };
start_timer(&wait_host_reply_timer);
timespec ts_start, ts_end;
clock_gettime(CLOCK_MONOTONIC, &ts_start);
long long start_time = (long long) ts_start.tv_sec * 1000000000LL + ts_start.tv_nsec;
bool timedout = false;
uint32_t notif_value = 0;
while (true) {
notif_value = std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire);
if (notif_value != 0) {
break;
}
int64_t base_sleep_us = 15;
os_time_sleep(base_sleep_us);
if (max_wait_ms) {
clock_gettime(CLOCK_MONOTONIC, &ts_end);
long long end_time = (long long) ts_end.tv_sec * 1000000000LL + ts_end.tv_nsec;
            float duration_ms = (float) (end_time - start_time) / 1e6f;
if (duration_ms > max_wait_ms) {
timedout = true;
break;
}
}
}
if (call_duration_ns) {
*call_duration_ns = stop_timer(&wait_host_reply_timer);
}
if (max_wait_ms && timedout) {
GGML_LOG_ERROR("timed out waiting for the host answer...\n");
return APIR_FORWARD_TIMEOUT;
}
/*
* Prepare the decoder
*/
    // thread_local (like the encoder in remote_call_prepare) so each thread decodes its own reply
    thread_local apir_decoder response_dec;
response_dec.cur = (char *) gpu->reply_shmem.mmap_ptr + sizeof(*atomic_reply_notif);
response_dec.end = (char *) gpu->reply_shmem.mmap_ptr + gpu->reply_shmem.mmap_size;
*decoder = &response_dec;
    // The host writes (return value + 1) into the notification word, so any
    // non-zero value doubles as the completion flag; subtract 1 to recover it.
uint32_t returned_value = notif_value - 1;
return returned_value;
}
static void log_call_duration(long long call_duration_ns, const char * name) {
double call_duration_ms = (double) call_duration_ns / 1e6; // 1 millisecond = 1e6 nanoseconds
double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds
if (call_duration_s > 1) {
GGML_LOG_INFO("%s: waited %.2fs for the %s host reply...\n", __func__, call_duration_s, name);
} else if (call_duration_ms > 1) {
GGML_LOG_INFO("%s: waited %.2fms for the %s host reply...\n", __func__, call_duration_ms, name);
} else {
GGML_LOG_INFO("%s: waited %lldns for the %s host reply...\n", __func__, call_duration_ns, name);
}
}