Merge eb81f41889 into 7fbd36c50c
This commit is contained in:
commit
f02c3ce692
|
|
@ -120,6 +120,10 @@ if (Vulkan_FOUND)
|
|||
add_compile_definitions(GGML_VULKAN_RUN_TESTS)
|
||||
endif()
|
||||
|
||||
if (ANDROID)
|
||||
target_link_libraries(ggml-vulkan PRIVATE android)
|
||||
endif()
|
||||
|
||||
# Set up toolchain for host compilation whether cross-compiling or not
|
||||
if (CMAKE_CROSSCOMPILING)
|
||||
if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
|
||||
|
|
|
|||
|
|
@ -19,6 +19,12 @@ using vk::DispatchLoaderDynamic;
|
|||
DispatchLoaderDynamic & ggml_vk_default_dispatcher();
|
||||
#define VULKAN_HPP_DEFAULT_DISPATCHER ggml_vk_default_dispatcher()
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
#ifndef VK_USE_PLATFORM_ANDROID_KHR
|
||||
#define VK_USE_PLATFORM_ANDROID_KHR
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <vulkan/vulkan.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
|
|
@ -39,6 +45,29 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
|
|||
#include <future>
|
||||
#include <thread>
|
||||
|
||||
#if defined(__ANDROID__)
#include <android/hardware_buffer.h>
#include <vulkan/vulkan_android.h>
#include <dlfcn.h>

// Function-pointer types matching the libandroid.so AHardwareBuffer entry
// points resolved at runtime. Resolving via dlopen/dlsym avoids a hard
// link-time dependency, so the binary still loads on devices where these
// symbols are unavailable.
typedef int (*pfn_AHardwareBuffer_allocate)(const AHardwareBuffer_Desc*, AHardwareBuffer**);
typedef void (*pfn_AHardwareBuffer_release)(AHardwareBuffer*);

// Resolved lazily by ggml_vk_load_ahb_funcs(); remain nullptr when the
// symbols cannot be resolved, which callers must check.
static pfn_AHardwareBuffer_allocate ggml_vk_AHardwareBuffer_allocate = nullptr;
static pfn_AHardwareBuffer_release ggml_vk_AHardwareBuffer_release = nullptr;

// Resolve the AHardwareBuffer allocate/release symbols from libandroid.so.
// Idempotent; callers must still null-check the pointers afterwards.
// A C++11 magic static makes the one-time initialization race-free — the
// previous `static bool loaded` flag was unsynchronized and racy when two
// threads entered this function concurrently.
static void ggml_vk_load_ahb_funcs() {
    static const bool loaded = []() {
        // NOTE: the dlopen handle is intentionally never closed; the resolved
        // function pointers must stay valid for the process lifetime.
        void * handle = dlopen("libandroid.so", RTLD_NOW);
        if (handle) {
            ggml_vk_AHardwareBuffer_allocate = (pfn_AHardwareBuffer_allocate) dlsym(handle, "AHardwareBuffer_allocate");
            ggml_vk_AHardwareBuffer_release  = (pfn_AHardwareBuffer_release)  dlsym(handle, "AHardwareBuffer_release");
        }
        return true;
    }();
    (void) loaded;
}
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
# define NOMINMAX 1
|
||||
# include <windows.h>
|
||||
|
|
@ -90,8 +119,10 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
|
|||
|
||||
// PCI vendor IDs, used for per-vendor tuning and workarounds.
#define VK_VENDOR_ID_AMD      0x1002
#define VK_VENDOR_ID_APPLE    0x106b
#define VK_VENDOR_ID_ARM      0x13B5
#define VK_VENDOR_ID_INTEL    0x8086
#define VK_VENDOR_ID_NVIDIA   0x10de
#define VK_VENDOR_ID_QUALCOMM 0x5143

// Number of descriptor sets reserved per descriptor pool.
#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256
|
||||
|
||||
|
|
@ -610,6 +641,9 @@ struct vk_device_struct {
|
|||
|
||||
bool shader_64b_indexing;
|
||||
|
||||
bool ahb_buffer_support;
|
||||
uint64_t non_coherent_atom_size;
|
||||
|
||||
bool integer_dot_product;
|
||||
// 0: default, 1: force mmvq, -1: disable mmvq
|
||||
int32_t mmvq_mode;
|
||||
|
|
@ -2641,6 +2675,154 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
|
|||
return buf;
|
||||
}
|
||||
|
||||
#if defined(__ANDROID__)
// Create a vk_buffer backed by an AHardwareBuffer imported through
// VK_ANDROID_external_memory_android_hardware_buffer. The AHB path gives a
// CPU-mappable allocation that the GPU can also use as a storage buffer.
// Returns nullptr on any failure (missing symbols, allocation, property
// query, memory-type mismatch, or vkAllocateMemory failure); callers fall
// back to the regular allocation path in that case.
static vk_buffer ggml_vk_create_buffer_ahb(vk_device& device, size_t size) {
    ggml_vk_load_ahb_funcs();
    if (!ggml_vk_AHardwareBuffer_allocate || !ggml_vk_AHardwareBuffer_release) {
        return nullptr;
    }

    VK_LOG_DEBUG("ggml_vk_create_buffer_ahb(" << device->name << ", " << size << ")");
    // AHB blob allocations are page-granular; round up to 4 KiB.
    size_t aligned_size = (size + 4095) & ~(size_t)4095;
    // AHardwareBuffer_Desc::width is uint32_t — reject sizes that would
    // silently truncate (previously a >4 GiB request allocated a wrong-sized
    // buffer instead of failing cleanly).
    if (aligned_size > UINT32_MAX) {
        return nullptr;
    }

    AHardwareBuffer_Desc desc = {};
    desc.width  = (uint32_t) aligned_size;
    desc.height = 1;
    desc.layers = 1;
    desc.format = AHARDWAREBUFFER_FORMAT_BLOB;
    desc.usage  = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN | AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER;

    AHardwareBuffer* ahb = nullptr;
    if (ggml_vk_AHardwareBuffer_allocate(&desc, &ahb) != 0) {
        return nullptr;
    }

    // The buffer must be flagged as external-memory so the AHB import below
    // is legal.
    vk::ExternalMemoryBufferCreateInfo external_memory_bci;
    external_memory_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eAndroidHardwareBufferANDROID;

    vk::BufferCreateInfo buffer_create_info{
        vk::BufferCreateFlags(),
        size,
        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
        vk::SharingMode::eExclusive,
        0,
        nullptr,
    };
    if (device->buffer_device_address) {
        buffer_create_info.usage |= vk::BufferUsageFlagBits::eShaderDeviceAddress;
    }
    buffer_create_info.setPNext(&external_memory_bci);

    vk::Buffer vk_buf = device->device.createBuffer(buffer_create_info);

    VkAndroidHardwareBufferPropertiesANDROID ahb_props = {};
    ahb_props.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID;

    // Extension entry points must go through the dynamic dispatcher.
    auto& d = VULKAN_HPP_DEFAULT_DISPATCHER;
    if (d.vkGetAndroidHardwareBufferPropertiesANDROID(device->device, ahb, &ahb_props) != VK_SUCCESS) {
        device->device.destroyBuffer(vk_buf);
        ggml_vk_AHardwareBuffer_release(ahb);
        return nullptr;
    }

    vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();

    // Select a memory type compatible with the imported AHB. Preference
    // ladder: HostVisible+HostCoherent, then HostVisible, then any
    // compatible type. (Replaces three copy-pasted scan loops.)
    auto find_mem_type = [&](vk::MemoryPropertyFlags wanted) -> uint32_t {
        for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
            if ((ahb_props.memoryTypeBits & (1u << i)) &&
                (mem_props.memoryTypes[i].propertyFlags & wanted) == wanted) {
                return i;
            }
        }
        return (uint32_t)-1;
    };
    uint32_t memory_type_idx = find_mem_type(vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
    if (memory_type_idx == (uint32_t)-1) {
        memory_type_idx = find_mem_type(vk::MemoryPropertyFlagBits::eHostVisible);
    }
    if (memory_type_idx == (uint32_t)-1) {
        memory_type_idx = find_mem_type(vk::MemoryPropertyFlags());
    }
    if (memory_type_idx == (uint32_t)-1) {
        device->device.destroyBuffer(vk_buf);
        ggml_vk_AHardwareBuffer_release(ahb);
        return nullptr;
    }

    // pNext chain: alloc_info -> [mem_flags_info ->] import_info.
    VkImportAndroidHardwareBufferInfoANDROID import_info = {};
    import_info.sType  = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID;
    import_info.buffer = ahb;

    VkMemoryAllocateInfo alloc_info = {};
    alloc_info.sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    alloc_info.allocationSize  = ahb_props.allocationSize;
    alloc_info.memoryTypeIndex = memory_type_idx;
    alloc_info.pNext           = &import_info;

    VkMemoryAllocateFlagsInfo mem_flags_info = {};
    if (device->buffer_device_address) {
        mem_flags_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO;
        mem_flags_info.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT;
        mem_flags_info.pNext = &import_info;
        alloc_info.pNext     = &mem_flags_info;
    }

    VkDeviceMemory dev_mem;
    if (d.vkAllocateMemory(device->device, &alloc_info, nullptr, &dev_mem) != VK_SUCCESS) {
        device->device.destroyBuffer(vk_buf);
        ggml_vk_AHardwareBuffer_release(ahb);
        return nullptr;
    }

    device->device.bindBufferMemory(vk_buf, (vk::DeviceMemory)dev_mem, 0);

    vk_buffer buf = std::make_shared<vk_buffer_struct>();
    buf->buffer                = vk_buf;
    buf->device_memory         = (vk::DeviceMemory)dev_mem;
    buf->device                = device;
    buf->size                  = size;
    buf->memory_property_flags = mem_props.memoryTypes[memory_type_idx].propertyFlags;

    if (device->buffer_device_address) {
        const vk::BufferDeviceAddressInfo addressInfo(buf->buffer);
        buf->bda_addr = device->device.getBufferAddress(addressInfo);
    }

    // Map for host access. A failed map leaves ptr == nullptr; the buffer is
    // still usable through staged copies.
    try {
        buf->ptr = device->device.mapMemory((vk::DeviceMemory)dev_mem, 0, VK_WHOLE_SIZE);
    } catch (...) {
        buf->ptr = nullptr;
    }

    // The import gave Vulkan its own reference to the AHB; drop ours so the
    // AHB is freed when the device memory is.
    ggml_vk_AHardwareBuffer_release(ahb);

    device->memory_logger->log_allocation(buf, size);
    return buf;
}
#endif
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
|
||||
try {
|
||||
return ggml_vk_create_buffer(device, size, {req_flags, fallback_flags});
|
||||
|
|
@ -2654,6 +2836,13 @@ static vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk:
|
|||
static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
|
||||
vk_buffer buf;
|
||||
try {
|
||||
#if defined(__ANDROID__)
|
||||
static bool use_ahb = getenv("GGML_VK_AHB") != nullptr;
|
||||
if (use_ahb && device->ahb_buffer_support) {
|
||||
buf = ggml_vk_create_buffer_ahb(device, size);
|
||||
if (buf) return buf;
|
||||
}
|
||||
#endif
|
||||
if (device->prefer_host_memory) {
|
||||
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
|
||||
vk::MemoryPropertyFlagBits::eDeviceLocal});
|
||||
|
|
@ -3073,6 +3262,25 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
// Xe2/Xe3 with coopmat enabled - warptile performance tuning
|
||||
l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
} else if (device->vendor_id == VK_VENDOR_ID_QUALCOMM) {
|
||||
uint32_t s = device->subgroup_size;
|
||||
m_warptile = m_warptile_mmq = m_warptile_mmq_int = m_warptile_mmq_k = m_warptile_mmq_int_k =
|
||||
m_warptile_id = m_warptile_mmqid = m_warptile_mmqid_int = m_warptile_mmqid_int_k =
|
||||
{ 64, 64, 64, s, s, 16, 8, 8, 1, 1, s };
|
||||
s_warptile = s_warptile_mmq = s_warptile_mmq_int = s_warptile_mmq_k = s_warptile_mmq_int_k =
|
||||
s_warptile_id = s_warptile_mmqid = s_warptile_mmqid_int = s_warptile_mmqid_int_k =
|
||||
{ 32, 32, 64, s, s, 16, 8, 8, 1, 1, s };
|
||||
} else if (device->vendor_id == VK_VENDOR_ID_ARM && device->subgroup_size >= 16) {
|
||||
uint32_t s = device->subgroup_size;
|
||||
s_warptile = s_warptile_mmq = s_warptile_mmq_int = s_warptile_mmq_k = s_warptile_mmq_int_k =
|
||||
s_warptile_id = s_warptile_mmqid = s_warptile_mmqid_int = s_warptile_mmqid_int_k =
|
||||
{ 32, 32, 64, s, s, 16, 4, 4, 1, 1, s };
|
||||
m_warptile = m_warptile_mmq = m_warptile_mmq_int = m_warptile_mmq_k = m_warptile_mmq_int_k =
|
||||
m_warptile_id = m_warptile_mmqid = m_warptile_mmqid_int = m_warptile_mmqid_int_k =
|
||||
{ 64, 64, 64, s, s, 16, 4, 4, 1, 1, s };
|
||||
l_warptile = l_warptile_mmq = l_warptile_mmq_int = l_warptile_mmq_k = l_warptile_mmq_int_k =
|
||||
l_warptile_id = l_warptile_mmqid = l_warptile_mmqid_int = l_warptile_mmqid_int_k =
|
||||
{ 64, 64, 64, s, s, 16, 4, 4, 1, 1, s };
|
||||
}
|
||||
|
||||
l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
|
||||
|
|
@ -3082,6 +3290,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
m_align = 64;
|
||||
s_align = 32;
|
||||
|
||||
if (device->vendor_id == VK_VENDOR_ID_QUALCOMM) {
|
||||
m_align = 128;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) {
|
||||
ggml_type t = (ggml_type)i;
|
||||
// Disable medium and large matrix multiplication if not enough shared memory is available
|
||||
|
|
@ -4688,6 +4900,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||
|
||||
device->physical_device.getProperties2(&props2);
|
||||
device->properties = props2.properties;
|
||||
device->non_coherent_atom_size = device->properties.limits.nonCoherentAtomSize;
|
||||
device->vendor_id = device->properties.vendorID;
|
||||
device->driver_id = driver_props.driverID;
|
||||
|
||||
|
|
@ -4735,6 +4948,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||
// Limit batching of allocations to 1GB by default to avoid fragmentation issues
|
||||
device->suballocation_block_size = 1024*1024*1024;
|
||||
}
|
||||
|
||||
if (device->vendor_id == VK_VENDOR_ID_ARM) {
|
||||
// ARM Mali optimization: disable fp16-matrix to prevent crashes, limit memory blocks
|
||||
device->coopmat_support = false;
|
||||
device->suballocation_block_size = 256 * 1024 * 1024;
|
||||
} else if (device->vendor_id == VK_VENDOR_ID_QUALCOMM) {
|
||||
// Qualcomm Adreno optimization: disable fp16-matrix and int8-dotprod to prevent crashes, limit memory blocks
|
||||
device->coopmat_support = false;
|
||||
device->integer_dot_product = false;
|
||||
device->suballocation_block_size = 256 * 1024 * 1024;
|
||||
}
|
||||
device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size);
|
||||
|
||||
device->subgroup_size = subgroup_props.subgroupSize;
|
||||
|
|
@ -5061,6 +5285,23 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
||||
}
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
bool ahb_support = false;
|
||||
for (const auto& properties : ext_props) {
|
||||
if (strcmp(VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_EXTENSION_NAME, properties.extensionName) == 0) {
|
||||
ahb_support = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ahb_support) {
|
||||
device_extensions.push_back(VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_EXTENSION_NAME);
|
||||
device_extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
|
||||
device_extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME);
|
||||
device->ahb_buffer_support = true;
|
||||
VK_LOG_DEBUG("ggml_vulkan: Android Hardware Buffer support enabled");
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(VK_KHR_cooperative_matrix)
|
||||
if (device->coopmat_support) {
|
||||
// Query supported shapes
|
||||
|
|
@ -5213,6 +5454,15 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||
device->mul_mat_id_m[i] = true;
|
||||
device->mul_mat_id_s[i] = true;
|
||||
break;
|
||||
case VK_VENDOR_ID_ARM:
|
||||
case VK_VENDOR_ID_QUALCOMM:
|
||||
device->mul_mat_l[i] = false;
|
||||
device->mul_mat_id_l[i] = false;
|
||||
device->mul_mat_m[i] = true;
|
||||
device->mul_mat_s[i] = true;
|
||||
device->mul_mat_id_m[i] = true;
|
||||
device->mul_mat_id_s[i] = true;
|
||||
break;
|
||||
case VK_VENDOR_ID_APPLE:
|
||||
device->mul_mat_l[i] = false;
|
||||
device->mul_mat_m[i] = true;
|
||||
|
|
@ -6477,11 +6727,18 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
|
|||
VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
|
||||
// Buffer is already mapped
|
||||
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
|
||||
}
|
||||
|
||||
if (!(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent)) {
|
||||
const size_t atom_size = dst->device->non_coherent_atom_size;
|
||||
const size_t flush_offset = (offset / atom_size) * atom_size;
|
||||
const size_t flush_end = ((offset + height * width + atom_size - 1) / atom_size) * atom_size;
|
||||
const size_t flush_size = std::min(flush_end - flush_offset, dst->size - flush_offset);
|
||||
vk::MappedMemoryRange range(dst->device_memory, flush_offset, flush_size);
|
||||
(void) dst->device->device.flushMappedMemoryRanges(1, &range);
|
||||
}
|
||||
} else {
|
||||
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
|
||||
|
||||
|
|
@ -6577,7 +6834,14 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
|
|||
// through PCIe is sufficient fast reading back data from PCIe is slower than going through
|
||||
// the HW device to host copy path.
|
||||
if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
|
||||
GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
if (!(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent)) {
|
||||
const size_t atom_size = src->device->non_coherent_atom_size;
|
||||
const size_t inv_offset = (offset / atom_size) * atom_size;
|
||||
const size_t inv_end = ((offset + size + atom_size - 1) / atom_size) * atom_size;
|
||||
const size_t inv_size = std::min(inv_end - inv_offset, src->size - inv_offset);
|
||||
vk::MappedMemoryRange range(src->device_memory, inv_offset, inv_size);
|
||||
(void) src->device->device.invalidateMappedMemoryRanges(1, &range);
|
||||
}
|
||||
|
||||
memcpy(dst, (uint8_t *) src->ptr + offset, size);
|
||||
} else {
|
||||
|
|
@ -6654,6 +6918,15 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
|
|||
if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
|
||||
dst->device->uma) {
|
||||
memset((uint8_t*)dst->ptr + offset, c, size);
|
||||
|
||||
if (!(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent)) {
|
||||
const size_t atom_size = dst->device->non_coherent_atom_size;
|
||||
const size_t flush_offset = (offset / atom_size) * atom_size;
|
||||
const size_t flush_end = ((offset + size + atom_size - 1) / atom_size) * atom_size;
|
||||
const size_t flush_size = std::min(flush_end - flush_offset, dst->size - flush_offset);
|
||||
vk::MappedMemoryRange range(dst->device_memory, flush_offset, flush_size);
|
||||
(void) dst->device->device.flushMappedMemoryRanges(1, &range);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -13870,6 +14143,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
// (and scaled down based on model size, so smaller models submit earlier).
|
||||
// Also submit at least every 100 nodes, in case there are workloads without as much matmul.
|
||||
int nodes_per_submit = 100;
|
||||
if (ctx->device->vendor_id == VK_VENDOR_ID_QUALCOMM) {
|
||||
nodes_per_submit = 1000;
|
||||
}
|
||||
int submitted_nodes = 0;
|
||||
int submit_count = 0;
|
||||
uint64_t mul_mat_bytes = 0;
|
||||
|
|
|
|||
Loading…
Reference in New Issue