ggml webgpu: add support for emscripten builds (#17184)
* Faster tensors (#8): add fast matrix and matrix/vector multiplication
* Use map for shader replacements instead of pair of strings (sketched below)
* Wasm (#9)
* webgpu : fix build on emscripten
* more debugging stuff
* test-backend-ops: force single thread on wasm
* fix single-thread case for init_tensor_uniform
* use jspi
* add pthread
* test: remember to set n_thread for cpu backend
* Add buffer label and enable dawn-specific toggles to turn off some checks
* Intermediate state
* Fast working f16/f32 vec4
* Working float fast mul mat
* Clean up naming of mul_mat to match logical model, start work on q mul_mat
* Setup for subgroup matrix mat mul
* Basic working subgroup matrix
* Working subgroup matrix tiling
* Handle weirder sg matrix sizes (but still % sg matrix size)
* Working start to gemv
* Working f16 accumulation with shared memory staging
* Print out available subgroup matrix configurations
* Vectorize dst stores for sg matrix shader
* Gemv working scalar
* Minor set_rows optimization (#4)
* updated optimization, fixed errors
* non-vectorized version now dispatches one thread per element
* Simplify
* Change logic for set_rows pipelines

---------

Co-authored-by: Neha Abbas <nehaabbas@macbookpro.lan>
Co-authored-by: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local>
Co-authored-by: Reese Levine <reeselevine1@gmail.com>

* Comment on dawn toggles
* Working subgroup matrix code for (semi)generic sizes
* Remove some comments
* Cleanup code
* Update dawn version and move to portable subgroup size
* Try to fix new dawn release
* Update subgroup size comment
* Only check for subgroup matrix configs if they are supported
* Add toggles for subgroup matrix/f16 support on nvidia+vulkan
* Make row/col naming consistent
* Refactor shared memory loading
* Move sg matrix stores to correct file
* Working q4_0
* Formatting
* Work with emscripten builds
* Fix test-backend-ops emscripten for f16/quantized types
* Use emscripten memory64 to support get_memory
* Add build flags and try ci

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* Remove extra whitespace
* Move wasm single-thread logic out of test-backend-ops for cpu backend
* Disable multiple threads for emscripten single-thread builds in ggml_graph_plan
* Fix .gitignore
* Add memory64 option and remove unneeded macros for setting threads to 1

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
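The map-based shader replacement noted above amounts to textual substitution of placeholder tokens in the WGSL source before pipeline creation, since values such as the subgroup matrix dimensions are only known at runtime from the adapter. A minimal sketch of the idea, assuming a hypothetical free-standing helper (the in-tree function is ggml_webgpu_process_shader_repls and may differ in details):

    #include <map>
    #include <string>

    // Replace every occurrence of each placeholder key in the WGSL source
    // with its value; a std::map keeps the replacement set extensible,
    // unlike the earlier single pair of strings.
    static std::string process_shader_repls(std::string src, const std::map<std::string, std::string> & repls) {
        for (const auto & [key, value] : repls) {
            size_t pos = 0;
            while ((pos = src.find(key, pos)) != std::string::npos) {
                src.replace(pos, key.size(), value);
                pos += value.size();
            }
        }
        return src;
    }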
@@ -547,6 +547,46 @@ jobs:
           # This is using llvmpipe and runs slower than other backends
           ctest -L main --verbose --timeout 3600
 
+  ubuntu-24-wasm-webgpu:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-wasm-webgpu
+          evict-old-files: 1d
+
+      - name: Install Emscripten
+        run: |
+          git clone https://github.com/emscripten-core/emsdk.git
+          cd emsdk
+          ./emsdk install latest
+          ./emsdk activate latest
+
+      - name: Fetch emdawnwebgpu
+        run: |
+          DAWN_TAG="v20251027.212519"
+          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
+          echo "Downloading ${EMDAWN_PKG}"
+          curl -L -o emdawn.zip \
+            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
+          unzip emdawn.zip
+
+      - name: Build WASM WebGPU
+        run: |
+          source emsdk/emsdk_env.sh
+          emcmake cmake -B build-wasm \
+            -DGGML_WEBGPU=ON \
+            -DLLAMA_CURL=OFF \
+            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
+
+          cmake --build build-wasm --target test-backend-ops -j $(nproc)
+
   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04
     container: rocm/dev-ubuntu-22.04:6.1.2
@@ -134,3 +134,5 @@ poetry.toml
 # IDE
 /*.code-workspace
 /.windsurf/
+# emscripten
+a.out.*
@@ -33,10 +33,24 @@ endif()
 
 option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
 
+option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
+
 if (EMSCRIPTEN)
     set(BUILD_SHARED_LIBS_DEFAULT OFF)
 
-    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
+    # Use 64-bit memory to support backend_get_memory queries
+    # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
+    if (LLAMA_WASM_MEM64)
+        add_compile_options("-sMEMORY64=1")
+        add_link_options("-sMEMORY64=1")
+    endif()
+    add_link_options("-sALLOW_MEMORY_GROWTH=1")
+
+    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
+    option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
+    if (LLAMA_BUILD_HTML)
+        set(CMAKE_EXECUTABLE_SUFFIX ".html")
+    endif()
 else()
     if (MINGW)
         set(BUILD_SHARED_LIBS_DEFAULT OFF)
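Note on the MEMORY64 option above: on a default wasm32 build, size_t is 32 bits, so memory totals of 4 GiB or more cannot be represented in the values a backend_get_memory query returns, which is what the comment about supporting those queries refers to. A small standalone illustration (values are illustrative only, not from the commit):

    #include <cstddef>
    #include <cstdio>

    int main() {
        // 8 GiB as a size_t: exact under -sMEMORY64 (64-bit size_t),
        // but truncated on a wasm32 build where size_t is 32 bits.
        size_t total = (size_t) 8 * 1024 * 1024 * 1024;
        std::printf("sizeof(size_t) = %zu, total = %zu bytes\n", sizeof(size_t), total);
        return 0;
    }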
@@ -30,6 +30,7 @@
 #include <thread> // for hardware_concurrency
 #include <vector>
 
+#ifndef __EMSCRIPTEN__
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
@@ -41,6 +42,8 @@
 #else
 #include <sys/syslimits.h>
 #endif
+#endif
+
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
 using json = nlohmann::ordered_json;
@@ -902,6 +902,8 @@ std::string fs_get_cache_directory() {
         cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
         cache_directory = std::getenv("LOCALAPPDATA");
+#elif defined(__EMSCRIPTEN__)
+        GGML_ABORT("not implemented on this platform");
 #else
 #  error Unknown architecture
 #endif
@@ -24,6 +24,7 @@
 #include "http.h"
 #endif
 
+#ifndef __EMSCRIPTEN__
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
@@ -35,6 +36,8 @@
 #else
 #include <sys/syslimits.h>
 #endif
+#endif
+
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
 // isatty
@@ -226,7 +226,8 @@ option(GGML_WEBGPU "ggml: use WebGPU"
 option(GGML_WEBGPU_DEBUG        "ggml: enable WebGPU debug output" OFF)
 option(GGML_WEBGPU_CPU_PROFILE  "ggml: enable WebGPU profiling (CPU)" OFF)
 option(GGML_WEBGPU_GPU_PROFILE  "ggml: enable WebGPU profiling (GPU)" OFF)
+option(GGML_WEBGPU_JSPI         "ggml: use JSPI for WebGPU" ON)
 option(GGML_ZDNN "ggml: use zDNN" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -2698,6 +2698,11 @@ struct ggml_cplan ggml_graph_plan(
         n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
     }
 
+#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
+    // Emscripten without pthreads support can only use a single thread
+    n_threads = 1;
+#endif
+
     size_t work_size = 0;
 
     struct ggml_cplan cplan;
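For context, emcc predefines __EMSCRIPTEN_PTHREADS__ only when compiling with -pthread, so the guard above is a compile-time test of whether the wasm binary can spawn worker threads at all. A standalone sketch of the same check:

    #include <cstdio>

    int main() {
    #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
        std::printf("wasm build without -pthread: clamping n_threads to 1\n");
    #else
        std::printf("threads available\n");
    #endif
        return 0;
    }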
@@ -39,8 +39,23 @@ add_dependencies(ggml-webgpu generate_shaders)
 if(EMSCRIPTEN)
     set(EMDAWNWEBGPU_DIR "" CACHE PATH "Path to emdawnwebgpu_pkg")
 
-    target_compile_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
-    target_link_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
+    if(NOT EMDAWNWEBGPU_DIR)
+        # default built-in port
+        target_compile_options(ggml-webgpu PRIVATE "--use-port=emdawnwebgpu")
+        target_link_options(ggml-webgpu INTERFACE "--use-port=emdawnwebgpu")
+    else()
+        # custom port
+        target_compile_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
+        target_link_options(ggml-webgpu INTERFACE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
+    endif()
+
+    if (GGML_WEBGPU_JSPI)
+        target_compile_options(ggml-webgpu PRIVATE "-fwasm-exceptions")
+        target_link_options(ggml-webgpu INTERFACE "-sJSPI" "-fwasm-exceptions")
+    else()
+        target_compile_options(ggml-webgpu PRIVATE "-fexceptions")
+        target_link_options(ggml-webgpu INTERFACE "-sASYNCIFY" "-exceptions")
+    endif()
 else()
     find_package(Dawn REQUIRED)
     set(DawnWebGPU_TARGET dawn::webgpu_dawn)
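Both link modes above solve the same problem: WebGPU in the browser is asynchronous, and its callbacks only run when control returns to the JavaScript event loop, so synchronous-looking C++ must be able to suspend. -sJSPI uses the newer JavaScript Promise Integration proposal, while -sASYNCIFY rewrites the wasm to unwind and rewind the stack; the build pairs each mode with a matching exception scheme. A minimal sketch of code that depends on one of them being enabled (not the backend's actual wait loop):

    #ifdef __EMSCRIPTEN__
    # include <emscripten/emscripten.h>
    #endif

    // A plain busy-wait would deadlock in a browser; emscripten_sleep()
    // suspends the wasm stack (via JSPI or ASYNCIFY) so queued WebGPU
    // callbacks get a chance to fire.
    static void wait_until(const volatile bool & done) {
    #ifdef __EMSCRIPTEN__
        while (!done) {
            emscripten_sleep(1);  // yield to the browser event loop
        }
    #else
        while (!done) {
            // native builds can block on wgpu::Instance::WaitAny instead
        }
    #endif
    }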
@@ -48,6 +63,9 @@ endif()
 
 if (GGML_WEBGPU_DEBUG)
     target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1)
+    if(EMSCRIPTEN)
+        target_link_options(ggml-webgpu INTERFACE "-sASSERTIONS=2")
+    endif()
 endif()
 
 if (GGML_WEBGPU_CPU_PROFILE)
@@ -9,6 +9,10 @@
 #include "ggml-impl.h"
 #include "ggml-wgsl-shaders.hpp"
 
+#ifdef __EMSCRIPTEN__
+#  include <emscripten/emscripten.h>
+#endif
+
 #include <webgpu/webgpu_cpp.h>
 
 #include <atomic>
@@ -261,9 +265,12 @@ struct webgpu_context_struct {
     wgpu::Queue  queue;
     wgpu::Limits limits;
 
-    bool supports_subgroup_matrix = false;
     uint32_t subgroup_size;
 
+#ifndef __EMSCRIPTEN__
+    bool supports_subgroup_matrix = false;
     wgpu::SubgroupMatrixConfig subgroup_matrix_config;
+#endif
 
     // Separate this out from limits since on some Metal systems, the limit returned by
     // querying the limits is higher than the actual allowed maximum.
@@ -449,8 +456,8 @@ static void ggml_backend_webgpu_wait(webgpu_context & ct
     // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads,
     // inflight_max may be 0, meaning that we must wait on all futures.
     uint64_t timeout_ms = block ? UINT64_MAX : 0;
-    uint     inflight_threads = ctx->inflight_threads;
-    uint     inflight_max     = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
+    uint32_t inflight_threads = ctx->inflight_threads;
+    uint32_t inflight_max     = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
     while (futures.size() >= inflight_max && futures.size() > 0) {
         ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX);
         futures.erase(futures.begin());
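The uint to uint32_t changes here (and in graph_compute below) are portability fixes rather than behavior changes: uint is a POSIX typedef from <sys/types.h>, not a standard C++ type, and is not guaranteed to exist on every toolchain the emscripten build targets:

    #include <cstdint>        // uint32_t is guaranteed by the C++ standard
    // #include <sys/types.h> // `uint` is POSIX-only and may be absent

    uint32_t inflight_threads = 0;  // same width, portable on native and wasm builds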
@@ -986,6 +993,7 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
         pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized];
         uint32_t wg_m;
         uint32_t wg_n;
+#ifndef __EMSCRIPTEN__
         if (ctx->supports_subgroup_matrix) {
             // The total number of subgroups/workgroups needed per matrix.
             uint32_t wg_m_sg_tile =
@@ -995,11 +1003,15 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
                 WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->subgroup_matrix_config.N;
             wg_n = (dst->ne[1] + wg_n_sg_tile - 1) / wg_n_sg_tile;
         } else {
+#endif
             uint32_t tile_m_s = WEBGPU_MUL_MAT_TILE_M * WEBGPU_MUL_MAT_WG_SIZE_M;
             uint32_t tile_n_s = WEBGPU_MUL_MAT_TILE_N * WEBGPU_MUL_MAT_WG_SIZE_N;
             wg_m = (dst->ne[0] + tile_m_s - 1) / tile_m_s;
            wg_n = (dst->ne[1] + tile_n_s - 1) / tile_n_s;
+#ifndef __EMSCRIPTEN__
         }
+#endif
 
         wg_x = wg_m * wg_n * dst->ne[2] * dst->ne[3];
     }
 }
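The (x + tile - 1) / tile expressions in this hunk are integer ceiling division: they count how many whole tiles are needed to cover a dimension. A worked example with illustrative numbers (not taken from the code):

    #include <cstdint>
    #include <cstdio>

    static uint32_t ceil_div(uint32_t n, uint32_t tile) {
        return (n + tile - 1) / tile;
    }

    int main() {
        // With a tile spanning 64 output rows: 100 rows need 2 workgroups,
        // 128 rows need exactly 2, and 129 rows spill into a 3rd.
        std::printf("%u %u %u\n", ceil_div(100, 64), ceil_div(128, 64), ceil_div(129, 64));  // prints: 2 2 3
        return 0;
    }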
@@ -1419,8 +1431,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
             commands.push_back(*cmd);
         }
         // compute the batch size based on the number of inflight threads
-        uint inflight_threads = ctx->inflight_threads;
-        uint batch_size = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
+        uint32_t inflight_threads = ctx->inflight_threads;
+        uint32_t batch_size = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
                                    WEBGPU_COMMAND_SUBMIT_BATCH_SIZE);
         if (commands.size() >= batch_size) {
             futures.push_back(ggml_backend_webgpu_submit(ctx, commands));
@@ -1758,6 +1770,17 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
                                 wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32");
 
+    std::string proc_mul_mat_f32_f32;
+    std::string proc_mul_mat_f32_f32_vec;
+    std::string proc_mul_mat_f16_f32;
+    std::string proc_mul_mat_f16_f32_vec;
+    std::string proc_mul_mat_f16_f16;
+    std::string proc_mul_mat_f16_f16_vec;
+    std::string proc_mul_mat_q4_0_f32;
+    std::string proc_mul_mat_q4_0_f32_vec;
+
+    std::vector<wgpu::ConstantEntry> mul_mat_constants;
+#ifndef __EMSCRIPTEN__
     if (webgpu_ctx->supports_subgroup_matrix) {
         std::map<std::string, std::string> sg_matrix_repls;
        sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->subgroup_size);
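The mul_mat_constants vector introduced here carries WebGPU pipeline-overridable constants: each wgpu::ConstantEntry key must name an `override` declaration in the WGSL, and its value is applied at pipeline-creation time (the API stores values as doubles). A minimal sketch of building one entry, assuming a shader that declares `override TILE_K : u32;`:

    #include <webgpu/webgpu_cpp.h>

    // Overrides the shader's `override TILE_K : u32;` at pipeline creation;
    // unlike the string replacements, no shader source is rewritten.
    static wgpu::ConstantEntry make_tile_k(double tile_k) {
        wgpu::ConstantEntry e;
        e.key   = "TILE_K";  // must match the override's name in the WGSL
        e.value = tile_k;
        return e;
    }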
@@ -1770,100 +1793,57 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
         sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.N);
         sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.K);
 
-        std::string proc_mul_mat_subgroup_matrix_f32_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f32_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f16 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f16_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_q4_0_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_q4_0_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls);
-
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32.c_str(), "mul_mat_subgroup_matrix_f32_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f32_f32_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32.c_str(), "mul_mat_subgroup_matrix_f16_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f16_f32_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16.c_str(), "mul_mat_subgroup_matrix_f16_f16");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f16_f16_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32.c_str(), "mul_mat_subgroup_matrix_q4_0_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_q4_0_f32_vec");
+        proc_mul_mat_f32_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
+        proc_mul_mat_f32_f32_vec =
+            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32_vec, sg_matrix_repls);
+        proc_mul_mat_f16_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
+        proc_mul_mat_f16_f32_vec =
+            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32_vec, sg_matrix_repls);
+        proc_mul_mat_f16_f16 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
+        proc_mul_mat_f16_f16_vec =
+            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls);
+        proc_mul_mat_q4_0_f32 =
+            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls);
+        proc_mul_mat_q4_0_f32_vec =
+            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls);
     } else {
-        std::vector<wgpu::ConstantEntry> mul_mat_reg_tile_constants(3);
-        mul_mat_reg_tile_constants[0].key   = "TILE_K";
-        mul_mat_reg_tile_constants[0].value = WEBGPU_MUL_MAT_TILE_K;
-        mul_mat_reg_tile_constants[1].key   = "WORKGROUP_SIZE_M";
-        mul_mat_reg_tile_constants[1].value = WEBGPU_MUL_MAT_WG_SIZE_M;
-        mul_mat_reg_tile_constants[2].key   = "WORKGROUP_SIZE_N";
-        mul_mat_reg_tile_constants[2].value = WEBGPU_MUL_MAT_WG_SIZE_N;
+#endif
+        mul_mat_constants.push_back({ .key = "TILE_K", .value = WEBGPU_MUL_MAT_TILE_K });
+        mul_mat_constants.push_back({ .key = "WORKGROUP_SIZE_M", .value = WEBGPU_MUL_MAT_WG_SIZE_M });
+        mul_mat_constants.push_back({ .key = "WORKGROUP_SIZE_N", .value = WEBGPU_MUL_MAT_WG_SIZE_N });
 
         std::map<std::string, std::string> reg_repls;
         reg_repls["WEBGPU_TILE_M"] = std::to_string(WEBGPU_MUL_MAT_TILE_M);
         reg_repls["WEBGPU_TILE_N"] = std::to_string(WEBGPU_MUL_MAT_TILE_N);
 
-        // Process each reg-tile shader with tile replacements.
-        // Keep the processed strings in-scope so .c_str() remains valid.
-        std::string proc_mul_mat_reg_tile_f32_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_f32_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f16 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f16_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_q4_0_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_q4_0_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
-
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32.c_str(),
-                                         "mul_mat_reg_tile_f32_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_f32_f32_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32.c_str(),
-                                         "mul_mat_reg_tile_f16_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_f16_f32_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16.c_str(),
-                                         "mul_mat_reg_tile_f16_f16", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16_vec.c_str(),
-                                         "mul_mat_reg_tile_f16_f16_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32.c_str(),
-                                         "mul_mat_reg_tile_q4_0_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants);
+        proc_mul_mat_f32_f32      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
+        proc_mul_mat_f32_f32_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
+        proc_mul_mat_f16_f32      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
+        proc_mul_mat_f16_f32_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
+        proc_mul_mat_f16_f16      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
+        proc_mul_mat_f16_f16_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
+        proc_mul_mat_q4_0_f32     = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
+        proc_mul_mat_q4_0_f32_vec = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
+#ifndef __EMSCRIPTEN__
     }
+#endif
+
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f32_f32.c_str(), "mul_mat_f32_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f32_f32_vec.c_str(), "mul_mat_f32_f32_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f32.c_str(), "mul_mat_f16_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f32_vec.c_str(), "mul_mat_f16_f32_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f16.c_str(), "mul_mat_f16_f16", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f16_vec.c_str(), "mul_mat_f16_f16_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_q4_0_f32.c_str(), "mul_mat_q4_0_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_q4_0_f32_vec.c_str(), "mul_mat_q4_0_f32_vec", mul_mat_constants);
 
     std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
     mul_mat_vec_constants[0].key = "WORKGROUP_SIZE";
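One reason for the restructuring above: the processed WGSL strings are now assigned into proc_mul_mat_* variables declared before the #ifndef block, rather than being locals of each branch, because the unified ggml_webgpu_create_pipeline2 calls at the end consume a const char * obtained via .c_str(). The owning std::string must therefore outlive those calls, as the removed comment ("Keep the processed strings in-scope so .c_str() remains valid") already warned. A distilled illustration of the lifetime issue, with hypothetical names:

    #include <string>

    static std::string process() { return "...wgsl source..."; }
    static void create_pipeline(const char * /* src */) { /* ... */ }

    int main() {
        // Bug: storing the pointer of a temporary; the string dies at the
        // end of the full expression and the pointer dangles afterwards.
        // const char * src = process().c_str();
        // create_pipeline(src);

        std::string kept = process();    // hoisted to an enclosing scope
        create_pipeline(kept.c_str());   // valid: `kept` outlives the call
        return 0;
    }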
@@ -2384,13 +2364,17 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
 
     webgpu_context ctx = reg_ctx->webgpu_ctx;
 
+    wgpu::RequestAdapterOptions options = {};
+
+#ifndef __EMSCRIPTEN__
     // TODO: track need for these toggles: https://issues.chromium.org/issues/42251215
     const char * const adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
     wgpu::DawnTogglesDescriptor adapterTogglesDesc;
     adapterTogglesDesc.enabledToggles     = adapterEnabledToggles;
     adapterTogglesDesc.enabledToggleCount = 2;
-    wgpu::RequestAdapterOptions options = {};
     options.nextInChain = &adapterTogglesDesc;
+#endif
 
     ctx->instance.WaitAny(ctx->instance.RequestAdapter(
         &options, wgpu::CallbackMode::AllowSpontaneous,
         [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
@@ -2407,10 +2391,12 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     ctx->max_wg_size_x = 288;  // default value
 
     wgpu::AdapterInfo info{};
+#ifndef __EMSCRIPTEN__
     wgpu::AdapterPropertiesSubgroupMatrixConfigs subgroup_matrix_configs{};
     if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
         info.nextInChain = &subgroup_matrix_configs;
     }
+#endif
     ctx->adapter.GetInfo(&info);
 
     wgpu::SupportedFeatures features;
@@ -2418,6 +2404,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     // we require f16 support
     GGML_ASSERT(ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
 
+#ifndef __EMSCRIPTEN__
     // Only support square f16 matrices of size 8 or 16 for now
     bool valid_subgroup_matrix_config = false;
     if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
@@ -2433,36 +2420,27 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
         }
     }
 
+    ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
+#endif
     // For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
     // Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
     ctx->subgroup_size = info.subgroupMaxSize;
-    ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
 
     // Initialize device
-    std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
-                                                         wgpu::FeatureName::ImplicitDeviceSynchronization };
+    std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16 };
+#ifndef __EMSCRIPTEN__
+    required_features.push_back(wgpu::FeatureName::ImplicitDeviceSynchronization);
     if (ctx->supports_subgroup_matrix) {
         required_features.push_back(wgpu::FeatureName::Subgroups);
         required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
     }
+#endif
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
     required_features.push_back(wgpu::FeatureName::TimestampQuery);
 #endif
 
-    // Enable Dawn-specific toggles to increase native performance
-    // TODO: Don't enable for WASM builds, they won't have an effect anyways
-    // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
-    // only for native performance?
-    const char * const deviceEnabledToggles[]  = { "skip_validation", "disable_robustness", "disable_workgroup_init",
-                                                   "disable_polyfills_on_integer_div_and_mod" };
-    const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
-    wgpu::DawnTogglesDescriptor deviceTogglesDesc;
-    deviceTogglesDesc.enabledToggles      = deviceEnabledToggles;
-    deviceTogglesDesc.enabledToggleCount  = 4;
-    deviceTogglesDesc.disabledToggles     = deviceDisabledToggles;
-    deviceTogglesDesc.disabledToggleCount = 1;
-
     wgpu::DeviceDescriptor dev_desc;
     dev_desc.requiredLimits   = &ctx->limits;
     dev_desc.requiredFeatures = required_features.data();
@@ -2480,7 +2458,23 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
             GGML_ABORT("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason),
                        std::string(message).c_str());
         });
 
+#ifndef __EMSCRIPTEN__
+    // Enable Dawn-specific toggles to increase native performance
+    // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
+    // only for native performance?
+    const char * const deviceEnabledToggles[]  = { "skip_validation", "disable_robustness", "disable_workgroup_init",
+                                                   "disable_polyfills_on_integer_div_and_mod" };
+    const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
+    wgpu::DawnTogglesDescriptor deviceTogglesDesc;
+    deviceTogglesDesc.enabledToggles      = deviceEnabledToggles;
+    deviceTogglesDesc.enabledToggleCount  = 4;
+    deviceTogglesDesc.disabledToggles     = deviceDisabledToggles;
+    deviceTogglesDesc.disabledToggleCount = 1;
+
     dev_desc.nextInChain = &deviceTogglesDesc;
+#endif
 
     ctx->instance.WaitAny(ctx->adapter.RequestDevice(
         &dev_desc, wgpu::CallbackMode::AllowSpontaneous,
         [ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
@@ -2578,18 +2572,27 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
     ctx.name         = GGML_WEBGPU_NAME;
     ctx.device_count = 1;
 
-    const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" };
-
-    wgpu::DawnTogglesDescriptor instanceTogglesDesc;
-    instanceTogglesDesc.enabledToggles     = instanceEnabledToggles;
-    instanceTogglesDesc.enabledToggleCount = 1;
     wgpu::InstanceDescriptor instance_descriptor{};
     std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
     instance_descriptor.requiredFeatures     = instance_features.data();
     instance_descriptor.requiredFeatureCount = instance_features.size();
+
+#ifndef __EMSCRIPTEN__
+    const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" };
+    wgpu::DawnTogglesDescriptor instanceTogglesDesc;
+    instanceTogglesDesc.enabledToggles     = instanceEnabledToggles;
+    instanceTogglesDesc.enabledToggleCount = 1;
     instance_descriptor.nextInChain          = &instanceTogglesDesc;
+#endif
+
     webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
+
+#ifdef __EMSCRIPTEN__
+    if (webgpu_ctx->instance == nullptr) {
+        GGML_LOG_ERROR("ggml_webgpu: Failed to create WebGPU instance. Make sure either -sASYNCIFY or -sJSPI is set\n");
+        return nullptr;
+    }
+#endif
     GGML_ASSERT(webgpu_ctx->instance != nullptr);
 
     static ggml_backend_reg reg = {
@@ -0,0 +1,110 @@
+const http = require('http');
+const fs = require('fs').promises;
+const path = require('path');
+
+// This file is used for testing the wasm build from emscripten
+// Example build command:
+//   emcmake cmake -B build-wasm -DGGML_WEBGPU=ON -DLLAMA_CURL=OFF
+//   cmake --build build-wasm --target test-backend-ops -j
+
+const PORT = 8080;
+const STATIC_DIR = path.join(__dirname, '../build-wasm/bin');
+console.log(`Serving static files from: ${STATIC_DIR}`);
+
+const mimeTypes = {
+    '.html': 'text/html',
+    '.js': 'text/javascript',
+    '.css': 'text/css',
+    '.png': 'image/png',
+    '.jpg': 'image/jpeg',
+    '.gif': 'image/gif',
+    '.svg': 'image/svg+xml',
+    '.json': 'application/json',
+    '.woff': 'font/woff',
+    '.woff2': 'font/woff2',
+};
+
+async function generateDirListing(dirPath, reqUrl) {
+    const files = await fs.readdir(dirPath);
+    let html = `
+        <!DOCTYPE html>
+        <html>
+        <head>
+            <title>Directory Listing</title>
+            <style>
+                body { font-family: Arial, sans-serif; padding: 20px; }
+                ul { list-style: none; padding: 0; }
+                li { margin: 5px 0; }
+                a { text-decoration: none; color: #0066cc; }
+                a:hover { text-decoration: underline; }
+            </style>
+        </head>
+        <body>
+            <h1>Directory: ${reqUrl}</h1>
+            <ul>
+    `;
+
+    if (reqUrl !== '/') {
+        html += `<li><a href="../">../ (Parent Directory)</a></li>`;
+    }
+
+    for (const file of files) {
+        const filePath = path.join(dirPath, file);
+        const stats = await fs.stat(filePath);
+        const link = encodeURIComponent(file) + (stats.isDirectory() ? '/' : '');
+        html += `<li><a href="${link}">${file}${stats.isDirectory() ? '/' : ''}</a></li>`;
+    }
+
+    html += `
+            </ul>
+        </body>
+        </html>
+    `;
+    return html;
+}
+
+const server = http.createServer(async (req, res) => {
+    try {
+        // Set COOP and COEP headers
+        res.setHeader('Cross-Origin-Opener-Policy', 'same-origin');
+        res.setHeader('Cross-Origin-Embedder-Policy', 'require-corp');
+        res.setHeader('Cache-Control', 'no-store, no-cache, must-revalidate, proxy-revalidate');
+        res.setHeader('Pragma', 'no-cache');
+        res.setHeader('Expires', '0');
+
+        const filePath = path.join(STATIC_DIR, decodeURIComponent(req.url));
+        const stats = await fs.stat(filePath);
+
+        if (stats.isDirectory()) {
+            const indexPath = path.join(filePath, 'index.html');
+            try {
+                const indexData = await fs.readFile(indexPath);
+                res.writeHeader(200, { 'Content-Type': 'text/html' });
+                res.end(indexData);
+            } catch {
+                // No index.html, generate directory listing
+                const dirListing = await generateDirListing(filePath, req.url);
+                res.writeHeader(200, { 'Content-Type': 'text/html' });
+                res.end(dirListing);
+            }
+        } else {
+            const ext = path.extname(filePath).toLowerCase();
+            const contentType = mimeTypes[ext] || 'application/octet-stream';
+            const data = await fs.readFile(filePath);
+            res.writeHeader(200, { 'Content-Type': contentType });
+            res.end(data);
+        }
+    } catch (err) {
+        if (err.code === 'ENOENT') {
+            res.writeHeader(404, { 'Content-Type': 'text/plain' });
+            res.end('404 Not Found');
+        } else {
+            res.writeHeader(500, { 'Content-Type': 'text/plain' });
+            res.end('500 Internal Server Error');
+        }
+    }
+});
+
+server.listen(PORT, () => {
+    console.log(`Server running at http://localhost:${PORT}/`);
+});
@@ -41,12 +41,18 @@
 #include <vector>
 #include <unordered_map>
 
+#ifdef __EMSCRIPTEN__
+#  define N_THREADS 1
+#else
+#  define N_THREADS std::thread::hardware_concurrency()
+#endif
+
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
     size_t nels = ggml_nelements(tensor);
     std::vector<float> data(nels);
     {
         // parallel initialization
-        static const size_t n_threads = std::thread::hardware_concurrency();
+        static const size_t n_threads = N_THREADS;
         // static RNG initialization (revisit if n_threads stops being constant)
         static std::vector<std::default_random_engine> generators = []() {
             std::random_device rd;
@@ -65,6 +71,9 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
             }
         };
 
+        if (n_threads == 1) {
+            init_thread(0, 0, nels);
+        } else {
         std::vector<std::future<void>> tasks;
         tasks.reserve(n_threads);
         for (size_t i = 0; i < n_threads; i++) {
@@ -76,6 +85,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
             t.get();
         }
     }
+    }
 
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
         ggml_backend_tensor_set(tensor, data.data(), 0, nels * sizeof(float));
@@ -105,19 +115,25 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
         };
 
         const size_t min_blocks_per_thread = 1;
-        const size_t n_threads = std::min<size_t>(std::thread::hardware_concurrency()/2,
-                                                  std::max<size_t>(1, n_blocks / min_blocks_per_thread));
+        const size_t n_quant_threads = std::min<size_t>(std::max<size_t>(N_THREADS/2, 1),
+                                                        std::max<size_t>(1, n_blocks / min_blocks_per_thread));
+
+        if (n_quant_threads == 1) {
+            // single-threaded quantization: do all blocks in the current thread
+            quantize_thread(0, n_blocks);
+        } else {
         std::vector<std::future<void>> tasks;
-        tasks.reserve(n_threads);
-        for (size_t i = 0; i < n_threads; i++) {
-            size_t start = i*n_blocks/n_threads;
-            size_t end   = (i+1)*n_blocks/n_threads;
+        tasks.reserve(n_quant_threads);
+        for (size_t i = 0; i < n_quant_threads; i++) {
+            size_t start = i*n_blocks/n_quant_threads;
+            size_t end   = (i+1)*n_blocks/n_quant_threads;
             tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
         }
         for (auto & t : tasks) {
             t.get();
         }
+        }
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
@@ -8363,7 +8379,7 @@ int main(int argc, char ** argv) {
             auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
             if (ggml_backend_set_n_threads_fn) {
                 // TODO: better value for n_threads
-                ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
+                ggml_backend_set_n_threads_fn(backend, N_THREADS);
             }
 
             size_t free, total;  // NOLINT
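The n_threads == 1 and n_quant_threads == 1 special cases above run the work inline instead of going through std::async, because a single-threaded wasm build cannot create worker threads, so the async launch would fail rather than silently serialize. A generic sketch of the pattern, with a hypothetical helper name:

    #include <cstddef>
    #include <future>
    #include <vector>

    // Run f over [0, n) split across n_threads; fall back to an inline call
    // when only one thread is available (e.g. wasm without -pthread).
    template <typename F>
    static void run_parallel(size_t n_threads, size_t n, F f) {
        if (n_threads <= 1) {
            f((size_t) 0, n);  // no thread creation on single-threaded builds
            return;
        }
        std::vector<std::future<void>> tasks;
        tasks.reserve(n_threads);
        for (size_t i = 0; i < n_threads; i++) {
            tasks.push_back(std::async(std::launch::async, f, i * n / n_threads, (i + 1) * n / n_threads));
        }
        for (auto & t : tasks) {
            t.get();
        }
    }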