Merge branch 'master' into quantize

Ed Addario 2025-08-24 20:47:53 +01:00
commit ccaab24441
No known key found for this signature in database
GPG Key ID: E7875815A3230993
78 changed files with 5197 additions and 1557 deletions


@@ -2,14 +2,30 @@ ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build

# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html

# Install build tools
- RUN apt update && apt install -y git build-essential cmake wget
RUN apt update && apt install -y git build-essential cmake wget xz-utils

- # Install Vulkan SDK and cURL
- RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
- wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
- apt update -y && \
- apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
# Install Vulkan SDK
ARG VULKAN_VERSION=1.4.321.1
RUN ARCH=$(uname -m) && \
wget -qO /tmp/vulkan-sdk.tar.xz https://sdk.lunarg.com/sdk/download/${VULKAN_VERSION}/linux/vulkan-sdk-linux-${ARCH}-${VULKAN_VERSION}.tar.xz && \
mkdir -p /opt/vulkan && \
tar -xf /tmp/vulkan-sdk.tar.xz -C /tmp --strip-components=1 && \
mv /tmp/${ARCH}/* /opt/vulkan/ && \
rm -rf /tmp/*

# Install cURL and Vulkan SDK dependencies
RUN apt install -y libcurl4-openssl-dev curl \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev

# Set environment variables
ENV VULKAN_SDK=/opt/vulkan
ENV PATH=$VULKAN_SDK/bin:$PATH
ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib:$LD_LIBRARY_PATH
ENV CMAKE_PREFIX_PATH=$VULKAN_SDK:$CMAKE_PREFIX_PATH
ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig:$PKG_CONFIG_PATH

# Build it
WORKDIR /app


@@ -151,6 +151,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
- [x] [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa)
</details>


@@ -1755,7 +1755,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.warmup = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"-dt", "--defrag-thold"}, "N",
- string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
string_format("KV cache defragmentation threshold (DEPRECATED)"),
[](common_params & params, const std::string & value) {
- params.defrag_thold = std::stof(value);
GGML_UNUSED(params);
GGML_UNUSED(value);
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
}
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
add_opt(common_arg(


@@ -1361,6 +1361,26 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
"<|end|>",
};
if (!inputs.json_schema.is_null()) {
data.grammar_lazy = false;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
auto schema = inputs.json_schema;
builder.resolve_refs(schema);
auto not_end = builder.add_rule("not-end",
"[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
auto analysis = builder.add_rule("analysis",
"\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
auto final = builder.add_rule("final",
"\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
builder.add_schema("response", schema)
);
builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
});
}
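For orientation, this is the kind of assistant output the grammar above is built to accept (an illustrative string composed from the rules, not taken from the commit; the JSON payload stands in for content that matches inputs.json_schema):

// Illustrative only: one string accepted by the grammar built above,
// with an optional analysis channel followed by the final channel.
const char * example_gpt_oss_output =
    "<|channel|>analysis<|message|>reasoning about the request<|end|>"
    "<|start|>assistant<|channel|>final<|message|>{\"answer\": 42}";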
if (inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2121,7 +2141,7 @@ static common_chat_params common_chat_templates_apply_jinja(
}
// GPT-OSS
- if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
if (src.find("<|channel|>") != std::string::npos) {
return common_chat_params_init_gpt_oss(tmpl, params);
}


@@ -1152,7 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.attention_type = params.attention_type;
- cparams.defrag_thold = params.defrag_thold;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;


@@ -288,7 +288,6 @@ struct common_params {
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
- float defrag_thold = 0.1f; // KV cache defragmentation threshold
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading


@@ -5854,6 +5854,11 @@ class OlmoModel(TextModel):
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("SeedOssForCausalLM")
class SeedOssModel(TextModel):
model_arch = gguf.MODEL_ARCH.SEED_OSS
@ModelBase.register("Olmo2ForCausalLM")
class Olmo2Model(TextModel):
model_arch = gguf.MODEL_ARCH.OLMO2


@@ -265,8 +265,9 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ✅ | ❓ | ❓ |
- | Q5_0 | 🚫 | 🚫 | ❓ | ❓ |
- | Q5_1 | 🚫 | 🚫 | ❓ | ❓ |
| MXFP4 | 🚫 | 🚫 | ❓ | ❓ |
| Q5_0 | ✅ | ✅ | ❓ | ❓ |
| Q5_1 | ✅ | ✅ | ❓ | ❓ |
| Q8_0 | ✅ | ✅ | ❓ | ❓ |
| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
| Q3_K | ✅ | ✅ | ❓ | ❓ |
@@ -291,4 +292,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself
- Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 31, 2025.
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Aug 22, 2025.


@@ -17,7 +17,7 @@
"
" start the llama.cpp server with a FIM-compatible model. for example:
"
- " $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 256
"
" --batch-size [512, model max context]
"


@@ -512,6 +512,7 @@ extern "C" {
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D,
GGML_OP_CONV_3D,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
@@ -1940,6 +1941,23 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_conv_3d(
struct ggml_context * ctx,
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
struct ggml_tensor * b, // input [W, H, D, C * N]
int s0, // stride
int s1,
int s2,
int p0, // padding
int p1,
int p2,
int d0, // dilation
int d1,
int d2,
int n_channels,
int n_batch,
int n_channels_out);
enum ggml_op_pool {
GGML_OP_POOL_MAX,
GGML_OP_POOL_AVG,


@@ -1355,15 +1355,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
std::vector<int32_t> ids;
std::vector<ggml_bitset_t> used_ids;
- for (int i = 0; i < sched->n_splits; i++) {
- struct ggml_backend_sched_split * split = &splits[i];
for (int split_id = 0; split_id < sched->n_splits; split_id++) {
struct ggml_backend_sched_split * split = &splits[split_id];
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];
// copy the input tensors to the split backend
- for (int j = 0; j < split->n_inputs; j++) {
- ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
- struct ggml_tensor * input = split->inputs[j];
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
struct ggml_tensor * input = split->inputs[input_id];
struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
ggml_backend_t ids_backend = split_backend;
// if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
// in that case, we use the original ids tensor
for (int i = input_id + 1; i < split->n_inputs; i++) {
if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
ids_tensor = split->inputs[i];
ids_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
break;
}
}
if (ids_tensor != prev_ids_tensor) {
ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
- ggml_backend_tensor_get_async(split_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
- ggml_backend_synchronize(split_backend);
ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
ggml_backend_synchronize(ids_backend);
// find the used experts
used_ids.clear();
@@ -1409,6 +1421,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
GGML_ASSERT(id >= 0 && id < n_expert);
ggml_bitset_set(used_ids.data(), id);
}
}


@@ -867,6 +867,86 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
return acl_tensor;
}
/**
* @brief Fills a tensor with a scalar value.
*
* This function fills the destination tensor `acl_dst` with the scalar value
* `scalar`.
*
* @param ctx The context for the CANN backend operations.
* @param scalar The scalar value used to fill the tensor.
* @param acl_dst The destination tensor to be filled with the scalar value.
*/
static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
aclTensor* acl_dst) {
auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
ggml_cann_release_resources(ctx, acl_scalar);
}
/**
* @brief Get or expand a cached float32 tensor filled with a scalar value.
*
* This function manages cached device memory for float32 tensors. If the current
* cache size is insufficient for the requested tensor shape, the old memory will
* be released and new memory will be allocated. The allocated buffer is then
* initialized either with zeros (when @p value == 0.0f) or with the given scalar
* value using CANN operations. Finally, an aclTensor object is created from the
* cached memory and returned.
*
* @param ctx The CANN backend context that manages device memory.
* @param buffer A pointer to the cached device buffer (will be allocated
* or reallocated if necessary).
* @param cache_element The current number of cached elements. This will be
* updated when the cache is expanded.
* @param ne The tensor shape array (number of elements in each dimension).
* @param nb The stride size for each dimension.
* @param dims The number of tensor dimensions.
* @param value The scalar value used to fill the tensor (supports zero
* initialization via memset or arbitrary values via fill_scalar).
* @return An aclTensor pointer created from the cached buffer.
*/
static aclTensor* get_f32_cache_acl_tensor(
ggml_backend_cann_context& ctx,
void** buffer,
int64_t &cache_element,
int64_t* ne,
size_t* nb,
int64_t dims,
float value) {
// Calculate total number of elements
int64_t n_element = 1;
for (int i = 0; i < dims; i++) {
n_element *= ne[i];
}
size_t size = n_element * sizeof(float);
// Allocate or expand cache if needed
if (cache_element < n_element) {
if (*buffer != nullptr) {
aclrtFree(*buffer);
*buffer = nullptr;
}
ACL_CHECK(aclrtMalloc(buffer, size, ACL_MEM_MALLOC_HUGE_FIRST));
cache_element = n_element;
// Initialize cache
if (value == 0.0f) {
ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream()));
} else {
int64_t pool_ne[1] = { n_element };
size_t pool_nb[1] = { sizeof(float) };
aclTensor* acl_value = ggml_cann_create_tensor(
*buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1);
aclnn_fill_scalar(ctx, value, acl_value);
ggml_cann_release_resources(ctx, acl_value);
}
}
return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims);
}
void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];
@@ -875,20 +955,39 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
- size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
- ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
- aclTensor* acl_gamma = aclnn_values(
- ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
- ggml_cann_type_mapping(src->type), ggml_element_size(src));
// build gamma, one...
size_t acl_gamma_nb[GGML_MAX_DIMS];
acl_gamma_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
}
aclTensor* acl_gamma = get_f32_cache_acl_tensor(
ctx,
&ctx.f32_one_cache,
ctx.f32_one_cache_element,
src->ne,
acl_gamma_nb,
1, // dims
1.0f // value
);
// build rstd, zero...
size_t acl_rstd_nb[GGML_MAX_DIMS];
acl_rstd_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1];
}
aclTensor* acl_rstd = get_f32_cache_acl_tensor(
ctx,
&ctx.f32_zero_cache,
ctx.f32_zero_cache_element,
src->ne,
acl_rstd_nb,
GGML_MAX_DIMS,
0.0f // value
);
- size_t zero_tensor_n_bytes =
- src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src);
- ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes);
- aclTensor* acl_rstd =
- aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
- src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
- ggml_element_size(src));
GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
}
@@ -903,14 +1002,13 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
const int n_past = ((int32_t*)dst->op_params)[0];
- size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *
- src->ne[3] * ggml_element_size(src);
- ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
- aclTensor* mask_tensor =
- aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
- src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
- ggml_element_size(src), value);
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src));
void* buffer = one_tensor_allocator.get();
aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type),
ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS);
aclnn_fill_scalar(ctx, value, mask_tensor);
aclScalar* alpha = nullptr;
float alphaValue = 1.0f;
@@ -1277,23 +1375,6 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
tmp_permute_tensor, tmp_mul_tensor, acl_dst);
}
- /**
-  * @brief Fills a tensor with a scalar value.
-  *
-  * This function fills the destination tensor `acl_dst` with the scalar value
-  * `scalar`.
-  *
-  * @param ctx The context for the CANN backend operations.
-  * @param scalar The scalar value used to fill the tensor.
-  * @param acl_dst The destination tensor to be filled with the scalar value.
-  */
- static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
-     aclTensor* acl_dst) {
-     auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
-     GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
-     ggml_cann_release_resources(ctx, acl_scalar);
- }
/**
* @brief Raises each element of a tensor to the power of the corresponding
* element in another tensor.


@@ -379,6 +379,10 @@ struct ggml_backend_cann_context {
cann_task_queue task_queue;
bool async_mode;
bool support_set_rows;
void* f32_zero_cache = nullptr;
void* f32_one_cache = nullptr;
int64_t f32_zero_cache_element = 0;
int64_t f32_one_cache_element = 0;
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */


@@ -150,8 +150,6 @@
#elif defined(__s390x__)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
- #define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
- #define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K


@@ -23,6 +23,27 @@
#define UNUSED GGML_UNUSED
#if defined(__VXE__) || defined(__VXE2__)
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
#define B8(c,s ) B7(c,s, c), B7(c,s, s)
// precomputed tables for expanding 8bits to 8 bytes:
static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4
static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
// permute mask for byteswapping
static const uint8x16_t v_kperm = (const uint8x16_t){
7, 6, 5, 4, 3, 2, 1, 0,
15, 14, 13, 12, 11, 10, 9, 8
};
#endif
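As a reading aid, the property these macros bake into the tables can be stated directly; a minimal self-check sketch (my own illustration of the assumed layout, not code from this commit) would be:

// Assumed property of the tables above: byte j of entry i holds bit j of i
// shifted into the high nibble, complemented for table_b2b_1.
static int check_b2b_tables(void) {
    for (int i = 0; i < 256; i++) {
        for (int j = 0; j < 8; j++) {
            const uint8_t b0 = (table_b2b_0[i] >> (8*j)) & 0xFF;
            const uint8_t b1 = (table_b2b_1[i] >> (8*j)) & 0xFF;
            if (b0 != (uint8_t)( ((i >> j) & 1)      << 4)) return 0;
            if (b1 != (uint8_t)((((i >> j) & 1) ^ 1) << 4)) return 0;
        }
    }
    return 1;
}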
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);
assert(k % QK8_0 == 0);
@@ -241,6 +262,301 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
assert(n % qk == 0);
assert(qk == QK5_0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q5_0 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
int ib = 0;
float sumf = 0.0f;
#if defined(__VXE__) || defined(__VXE2__)
float32x4_t v_sum0 = vec_splats(0.0f);
float32x4_t v_sum1 = vec_splats(0.0f);
uint32_t qh0, qh1;
uint64_t tmp0[4], tmp1[4];
const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
#pragma GCC unroll 4
for (; ib + 1 < nb; ib += 2) {
const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0];
const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
memcpy(&qh0, x0->qh, sizeof(qh0));
memcpy(&qh1, x1->qh, sizeof(qh1));
tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF];
tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF];
tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
tmp0[3] = table_b2b_1[(qh0 >> 24) ];
tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF];
tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF];
tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
tmp1[3] = table_b2b_1[(qh1 >> 24) ];
int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
// required for fixing the byteorder
v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs);
const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs);
int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l);
const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h);
const int8x16_t v_x1lf = vec_sub(v_x1l, v_qh1l);
const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h);
const int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs);
const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs);
const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs);
const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
const float32x4_t v_xy0f = vec_float(v_xy0);
const float32x4_t v_xy1f = vec_float(v_xy1);
const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
}
sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1);
#pragma GCC unroll 4
for (; ib < nb; ++ib) {
const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
uint32_t qh;
memcpy(&qh, x0->qh, sizeof(qh));
uint64_t tmp[4];
tmp[0] = table_b2b_1[(qh >> 0) & 0xFF];
tmp[1] = table_b2b_1[(qh >> 8) & 0xFF];
tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
tmp[3] = table_b2b_1[(qh >> 24) ];
int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
// required for fixing the byteorder
v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs);
int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
const int8x16_t v_xlf = vec_sub(v_xl, v_qhl);
const int8x16_t v_xhf = vec_sub(v_xh, v_qhh);
const int8x16_t v_yl = vec_xl(0, (const int8_t *)y0->qs);
const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
const float32x4_t v_xyf = vec_float(v_xy);
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
sumf += vec_hsum(v_acc);
}
*s = sumf;
#else
UNUSED(nb);
UNUSED(x);
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_1;
const int nb = n / qk;
assert(n % qk == 0);
assert(qk == QK5_1);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q5_1 * GGML_RESTRICT x = vx;
const block_q8_1 * GGML_RESTRICT y = vy;
int ib = 0;
float sumf = 0.0f;
#if defined(__VXE__) || defined(__VXE2__)
float32x4_t v_sum0 = vec_splats(0.0f);
float32x4_t v_sum1 = vec_splats(0.0f);
float summs0 = 0.0f;
float summs1 = 0.0f;
uint32_t qh0;
uint32_t qh1;
uint64_t tmp0[4];
uint64_t tmp1[4];
const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
#pragma GCC unroll 4
for (; ib + 1 < nb; ib += 2) {
const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0];
const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
memcpy(&qh0, x0->qh, sizeof(qh0));
memcpy(&qh1, x1->qh, sizeof(qh1));
tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF];
tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF];
tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
tmp0[3] = table_b2b_0[(qh0 >> 24) ];
tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF];
tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF];
tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
tmp1[3] = table_b2b_0[(qh1 >> 24) ];
int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
// required for fixing the byteorder
v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
const uint8x16_t v_x0 = vec_xl(0, x0->qs);
const uint8x16_t v_x1 = vec_xl(0, x1->qs);
const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l);
const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h);
const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l);
const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h);
const int8x16_t v_y0l = vec_xl(0 , y0->qs);
const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs);
const int8x16_t v_y1l = vec_xl(0 , y1->qs);
const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs);
const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
const float32x4_t v_xy0f = vec_float(v_xy0);
const float32x4_t v_xy1f = vec_float(v_xy1);
const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
}
sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
#pragma GCC unroll 4
for (; ib < nb; ++ib) {
const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
uint32_t qh;
memcpy(&qh, x0->qh, sizeof(qh));
uint64_t tmp[4];
tmp[0] = table_b2b_0[(qh >> 0) & 0xFF];
tmp[1] = table_b2b_0[(qh >> 8) & 0xFF];
tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
tmp[3] = table_b2b_0[(qh >> 24) ];
int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
// required for fixing the byteorder
v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
const uint8x16_t v_x = vec_xl(0, x0->qs);
const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
const int8x16_t v_xlf = vec_or(v_xl, v_qhl);
const int8x16_t v_xhf = vec_or(v_xh, v_qhh);
const int8x16_t v_yl = vec_xl(0 , y0->qs);
const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs);
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
const float32x4_t v_xyf = vec_float(v_xy);
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
sumf += vec_hsum(v_acc) + summs;
}
*s = sumf;
#else
UNUSED(nb);
UNUSED(x);
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;


@@ -486,6 +486,14 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
return v_abo + v_abe;
}
/**
* @see https://github.com/ggml-org/llama.cpp/pull/14037
*/
inline float vec_hsum(float32x4_t v) {
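// adding the reversed vector gives { a+d, b+c, c+b, d+a }; the first two lanes then sum to a+b+c+d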
float32x4_t v_temp = v + vec_reve(v);
return v_temp[0] + v_temp[1];
}
inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
return acc + (vec_unpackh(p) + vec_unpackl(p));


@@ -1880,6 +1880,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_conv_2d(params, tensor);
} break;
case GGML_OP_CONV_3D:
{
ggml_compute_forward_conv_3d(params, tensor);
} break;
case GGML_OP_CONV_2D_DW:
{
ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2252,6 +2256,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_BACK:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_3D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_TRANSPOSE_2D:
@@ -2773,6 +2778,7 @@ struct ggml_cplan ggml_graph_plan(
}
} break;
case GGML_OP_CONV_2D:
case GGML_OP_CONV_3D:
{
cur = GGML_IM2COL_WORK_SIZE;
} break;


@@ -7207,6 +7207,148 @@ void ggml_compute_forward_conv_2d(
ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type);
}
// ggml_compute_forward_conv_3d
static void ggml_compute_forward_conv_3d_impl(const ggml_compute_params * params,
const ggml_tensor * kernel,
const ggml_tensor * src,
ggml_tensor * dst,
ggml_type kernel_type) {
GGML_ASSERT(ggml_is_contiguous(kernel));
GGML_ASSERT(kernel_type == GGML_TYPE_F16 || kernel_type == GGML_TYPE_F32);
GGML_ASSERT(kernel->type == kernel_type);
const ggml_type_traits * traits = ggml_get_type_traits(kernel_type);
const int32_t s0 = dst->op_params[0];
const int32_t s1 = dst->op_params[1];
const int32_t s2 = dst->op_params[2];
const int32_t p0 = dst->op_params[3];
const int32_t p1 = dst->op_params[4];
const int32_t p2 = dst->op_params[5];
const int32_t d0 = dst->op_params[6];
const int32_t d1 = dst->op_params[7];
const int32_t d2 = dst->op_params[8];
const int32_t c = dst->op_params[9];
const int32_t n = dst->op_params[10];
const int32_t oc = dst->op_params[11];
const int64_t src_w = src->ne[0];
const int64_t src_h = src->ne[1];
const int64_t src_d = src->ne[2];
const int64_t knl_w = kernel->ne[0];
const int64_t knl_h = kernel->ne[1];
const int64_t knl_d = kernel->ne[2];
const int64_t dst_w = dst->ne[0];
const int64_t dst_h = dst->ne[1];
const int64_t dst_d = dst->ne[2];
const float * src_data = (float *) src->data;
void * knl_data = kernel->data;
float * dst_data = (float *) dst->data;
const int64_t knl_n_per_channel = knl_w * knl_h * knl_d;
const int64_t knl_n_total = knl_n_per_channel * c;
const int64_t patch_total = n * dst_w * dst_h * dst_d;
const int64_t space_per_patch = knl_n_total * traits->type_size + oc * sizeof(float);
const int64_t batch_size = params->wsize / space_per_patch;
const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch;
GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
void * tmp = params->wdata;
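// same strategy as conv_2d above: im2col each batch of output patches into the
// work buffer, run one GEMM against the flattened kernel, then scatter the
// result back into dst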
for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
const int64_t patch_start_batch = batch_i * patches_per_batch;
const int64_t patch_end_batch = std::min(patch_start_batch + patches_per_batch, patch_total);
const int64_t patch_n_in_batch = patch_end_batch - patch_start_batch;
const int64_t patch_per_thread = (patch_n_in_batch + params->nth - 1) / params->nth;
const int64_t patch_start = patch_start_batch + params->ith * patch_per_thread;
const int64_t patch_end = std::min(patch_start + patch_per_thread, patch_end_batch);
for (int64_t p = patch_start; p < patch_end; ++p) {
const int64_t p_in_batch = p % (dst_w * dst_h * dst_d);
const int64_t p_in_depth = p_in_batch % (dst_w * dst_h);
const int64_t batch_idx = p / (dst_w * dst_h * dst_d);
const int64_t dst_z = p_in_batch / (dst_w * dst_h);
const int64_t dst_y = p_in_depth / dst_w;
const int64_t dst_x = p_in_depth % dst_w;
char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n_total * traits->type_size;
for (int64_t ic = 0; ic < c; ++ic) {
for (int64_t kz = 0; kz < knl_d; ++kz) {
for (int64_t ky = 0; ky < knl_h; ++ky) {
for (int64_t kx = 0; kx < knl_w; ++kx) {
const int64_t sz = dst_z * s2 + kz * d2 - p2;
const int64_t sy = dst_y * s1 + ky * d1 - p1;
const int64_t sx = dst_x * s0 + kx * d0 - p0;
int64_t dst_idx = ic * knl_n_per_channel + kz * (knl_h * knl_w) + ky * knl_w + kx;
float src_val;
if (sz < 0 || sz >= src_d || sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
src_val = 0.0f;
} else {
const int64_t cn_idx = batch_idx * c + ic;
const float * src_ptr = (const float *)((const char *)src_data + sx*src->nb[0] + sy*src->nb[1] + sz*src->nb[2] + cn_idx*src->nb[3]);
src_val = *src_ptr;
}
char * element_ptr = dst_row + dst_idx * traits->type_size;
if (kernel_type == GGML_TYPE_F32) {
*(float *)element_ptr = src_val;
} else if (kernel_type == GGML_TYPE_F16) {
*(ggml_fp16_t *)element_ptr = GGML_CPU_FP32_TO_FP16(src_val);
}
}
}
}
}
}
ggml_barrier(params->threadpool);
float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n_total * traits->type_size);
ggml_call_mul_mat(kernel_type, params, patch_n_in_batch, oc, knl_n_total, tmp, knl_data, gemm_output);
ggml_barrier(params->threadpool);
const int64_t permute_per_thread = (patch_n_in_batch + params->nth - 1) / params->nth;
const int64_t permute_start = params->ith * permute_per_thread;
const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n_in_batch);
for (int64_t i = permute_start; i < permute_end; ++i) {
const int64_t p = patch_start_batch + i;
const int64_t p_in_batch = p % (dst_w * dst_h * dst_d);
const int64_t p_in_depth = p_in_batch % (dst_w * dst_h);
const int64_t batch_idx = p / (dst_w * dst_h * dst_d);
const int64_t dst_z = p_in_batch / (dst_w * dst_h);
const int64_t dst_y = p_in_depth / dst_w;
const int64_t dst_x = p_in_depth % dst_w;
for (int64_t ioc = 0; ioc < oc; ++ioc) {
const float value = gemm_output[i * oc + ioc];
const int64_t ocn_idx = batch_idx * oc + ioc;
float * dst_ptr = (float *)((char *)dst_data + dst_x*dst->nb[0] + dst_y*dst->nb[1] + dst_z*dst->nb[2] + ocn_idx*dst->nb[3]);
*dst_ptr = value;
}
}
}
}
void ggml_compute_forward_conv_3d(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
ggml_compute_forward_conv_3d_impl(params, src0, src1, dst, src0->type);
}
// ggml_compute_forward_conv_transpose_2d
void ggml_compute_forward_conv_transpose_2d(


@@ -70,6 +70,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);


@@ -258,7 +258,7 @@ static __global__ void flash_attn_tile_ext_f16(
const half val = hexp(sink - kqmax[j0/nwarps]);
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
if (threadIdx.x == 0) {
- kqsum[j0/nwarps].x = __hadd(kqsum[j0/nwarps].x, val);
kqsum[j0/nwarps].x = __hadd(__low2half(kqsum[j0/nwarps]), val);
}
#pragma unroll


@@ -49,6 +49,7 @@
#include "ggml-cuda/wkv.cuh"
#include "ggml-cuda/gla.cuh"
#include "ggml-cuda/set-rows.cuh"
#include "ggml-cuda/pad_reflect_1d.cuh"
#include "ggml.h"
#include <algorithm>
@@ -2352,6 +2353,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
case GGML_OP_PAD_REFLECT_1D:
ggml_cuda_op_pad_reflect_1d(ctx, dst);
break;
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3481,15 +3485,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_CONV_TRANSPOSE_2D:
case GGML_OP_POOL_2D:
case GGML_OP_SUM:
- case GGML_OP_SUM_ROWS:
- case GGML_OP_MEAN:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
return true;
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_GROUP_NORM:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:


@@ -0,0 +1,82 @@
#include "pad_reflect_1d.cuh"
static __global__ void pad_reflect_1d_kernel_f32(
const void * __restrict__ src0,
void * __restrict__ dst,
const int64_t ne0,
const int64_t ne00,
const int64_t ne01,
const int64_t ne02,
const int64_t ne03,
const int64_t nb00,
const int64_t nb01,
const int64_t nb02,
const int64_t nb03,
const int64_t nb0,
const int64_t nb1,
const int64_t nb2,
const int64_t nb3,
const int p0,
const int p1) {
const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
const int64_t i1 = blockIdx.x;
if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) {
return;
}
const char * src0_ptr = (const char *)src0 + i3*nb03 + i2*nb02 + i1*nb01;
char * dst_ptr = (char *)dst + i3*nb3 + i2*nb2 + i1*nb1;
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
float value;
if (i0 < p0) {
// Left padding - reflect
value = *(const float *)(src0_ptr + (p0 - i0) * nb00);
} else if (i0 < ne0 - p1) {
// Middle - copy
value = *(const float *)(src0_ptr + (i0 - p0) * nb00);
} else {
// Right padding - reflect
int64_t src_idx = (ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1;
value = *(const float *)(src0_ptr + src_idx * nb00);
}
*(float *)(dst_ptr + i0 * nb0) = value;
}
}
void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int32_t * opts = (const int32_t *) dst->op_params;
const int p0 = opts[0];
const int p1 = opts[1];
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne0 = dst->ne[0];
GGML_ASSERT(ne0 == ne00 + p0 + p1);
const dim3 block_dims(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1, 1);
const dim3 grid_dims(ne01, ne02, ne03);
pad_reflect_1d_kernel_f32<<<grid_dims, block_dims, 0, stream>>>(
src0->data, dst->data,
ne0, ne00, ne01, ne02, ne03,
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
p0, p1
);
}
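To make the indexing above concrete, a minimal CPU reference for the same reflect-1d mapping (an illustrative sketch of my own, not part of the commit): a row of ne00 elements is padded to ne00 + p0 + p1, so [s0, s1, s2, s3, s4] with p0 = p1 = 2 becomes [s2, s1, s0, s1, s2, s3, s4, s3, s2].

// CPU reference for the reflect-1d indexing used by the kernel above (illustration only).
static void pad_reflect_1d_ref(const float * src, float * dst, int ne00, int p0, int p1) {
    const int ne0 = ne00 + p0 + p1;
    for (int i0 = 0; i0 < ne0; i0++) {
        if (i0 < p0) {
            dst[i0] = src[p0 - i0];               // left reflection
        } else if (i0 < ne0 - p1) {
            dst[i0] = src[i0 - p0];               // pass-through copy
        } else {
            dst[i0] = src[2*ne00 + p0 - i0 - 2];  // right reflection (same index as the kernel's src_idx)
        }
    }
}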


@@ -0,0 +1,5 @@
#include "common.cuh"
#define CUDA_PAD_REFLECT_1D_BLOCK_SIZE 256
void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -4391,10 +4391,11 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return true;
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
- case GGML_OP_POOL_2D:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_POOL_2D:
case GGML_OP_ACC:
case GGML_OP_PAD:
case GGML_OP_LEAKY_RELU:

File diff suppressed because it is too large.


@@ -1,20 +1,34 @@
#version 450
#extension GL_EXT_shader_16bit_storage : require
#if ADD_RMS
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_basic : enable
#endif
#include "types.comp"
#include "generic_binary_head.comp"
const uint num_threads = 256;
layout (binding = 3, std430) buffer PartialBuf {float partial_sums[];};
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
#if ADD_RMS
// XXX TODO this could be sized based on number of subgroups, but that's not considered a constant
shared FLOAT_TYPE sumsh[num_threads];
#endif
void main() {
uint idx = get_idx();
uint orig_idx = idx;
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
const uint num_iter = 2;
FLOAT_TYPE sum_sq = 0;
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
if (idx >= p.ne) {
continue;
@@ -22,8 +36,34 @@ void main() {
uint i00, i01, i02, i03;
get_indices(idx, i00, i01, i02, i03);
- data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
FLOAT_TYPE sum = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]);
sum_sq += sum*sum;
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(sum);
idx += num_threads;
}
#if ADD_RMS
if (p.param3 != 0) {
// reduce the sum within each subgroup, then across subgroups
const uint NumSubgroups = num_threads / gl_SubgroupSize;
sum_sq = subgroupAdd(sum_sq);
if (gl_SubgroupInvocationID == 0) {
sumsh[gl_SubgroupID] = sum_sq;
}
barrier();
[[unroll]] for (uint s = NumSubgroups / 2; s > 0; s >>= 1) {
if (gl_SubgroupID < s && gl_SubgroupInvocationID == 0) {
sum_sq += sumsh[gl_SubgroupID + s];
sumsh[gl_SubgroupID] = sum_sq;
}
barrier();
}
if (gl_SubgroupID == 0 && gl_SubgroupInvocationID == 0) {
partial_sums[orig_idx / (num_iter * num_threads)] = sum_sq;
}
}
#endif
}


@@ -9,6 +9,10 @@ layout (constant_id = 4) const uint32_t HSV = 32;
layout (constant_id = 5) const uint32_t Clamp = 0;
layout (constant_id = 6) const uint32_t D_split = 16;
// Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths
const uint32_t HSK_pad = (HSK + 15) & ~15;
const uint32_t HSV_pad = (HSV + 15) & ~15;
layout (push_constant) uniform parameter {
uint32_t N;
uint32_t KV;


@@ -46,14 +46,14 @@ const uint32_t MatBc = 16;
shared FLOAT_TYPE tmpsh[gl_WorkGroupSize.x];
shared ACC_TYPEV4 tmpshv4[gl_WorkGroupSize.x];
- const uint32_t qstride = HSK / 4 + 2; // in units of f16vec4
const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4
shared f16vec4 Qf[Br * qstride];
// Avoid padding for hsk==256 to make it fit in 48KB shmem.
const uint32_t sfshstride = (HSK <= 128) ? (Br + 8) : Br;
shared ACC_TYPE sfsh[Bc * sfshstride];
- const uint32_t kshstride = HSK / 4 + 2; // in units of f16vec4
const uint32_t kshstride = HSK_pad / 4 + 2; // in units of f16vec4
shared f16vec4 ksh[Bc * kshstride];
shared float slope[Br];
@@ -74,6 +74,21 @@ void main() {
#define tile_row(r) (row_tid * rows_per_thread + (r))
// Zero-initialize shared memory for Q/K when HSK is not a multiple of 16 (HSK_pad > HSK).
if ((HSK % 16) != 0) {
[[unroll]] for (uint i = 0; i < Br * qstride; i += gl_WorkGroupSize.x) {
if (i + tid < Br * qstride) {
Qf[i + tid] = f16vec4(0);
}
}
[[unroll]] for (uint i = 0; i < Bc * kshstride; i += gl_WorkGroupSize.x) {
if (i + tid < Bc * kshstride) {
ksh[i + tid] = f16vec4(0);
}
}
barrier();
}
uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;
[[unroll]] for (uint32_t idx = 0; idx < Br * HSK / 4; idx += gl_WorkGroupSize.x) {
@@ -151,14 +166,14 @@ void main() {
}
barrier();
- // K * Q^T -> S^T: Bc x HSK * HSK x Br -> Bc x Br
// K * Q^T -> S^T: Bc x HSK_pad * HSK_pad x Br -> Bc x Br
// Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16
// This is written transposed in order to allow for N being 8 if implementations need it
coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> SfMat = coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
coopmat<float16_t, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
coopmat<float16_t, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
- for (uint32_t d = 0; d < HSK / 16; ++d) {
for (uint32_t d = 0; d < HSK_pad / 16; ++d) {
coopMatLoad(QMat, Qf, d * 16 / 4, qstride, gl_CooperativeMatrixLayoutColumnMajor);
uint coord = (gl_SubgroupID * MatBc) * kshstride + d * 16 / 4;


@@ -104,16 +104,16 @@
tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);
- coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseAccumulator> Q;
- coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseA> Qf16;
coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseAccumulator> Q;
coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA> Qf16;
uint32_t q_offset = iq2*p.nb02+iq3*p.nb03;
- coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK));
coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK_pad));
- Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseA>(Q);
Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA>(Q);
Qf16 *= float16_t(p.scale);
- coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(0);
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
@@ -140,10 +140,10 @@
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
- coopmat<float16_t, gl_ScopeWorkgroup, HSK, Bc, gl_MatrixUseB> K_T;
coopmat<float16_t, gl_ScopeWorkgroup, HSK_pad, Bc, gl_MatrixUseB> K_T;
uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13;
- coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK), tensorViewTranspose DECODEFUNC);
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose DECODEFUNC);
S = coopMatMulAdd(Qf16, K_T, S);
if (p.logit_softcap != 0.0f) {
@@ -208,31 +208,31 @@
rowsum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0.0);
rowsum = coopMatMulAdd(P_A, One, rowsum);
- coopmat<float16_t, gl_ScopeWorkgroup, Bc, HSV, gl_MatrixUseB> V;
coopmat<float16_t, gl_ScopeWorkgroup, Bc, HSV_pad, gl_MatrixUseB> V;
uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
- coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV) DECODEFUNC);
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad) DECODEFUNC);
L = eM*L + rowsum;
// This is the "diagonal" matrix in the paper, but since we do componentwise
// multiply rather than matrix multiply it has the diagonal element smeared
// across the row
- coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> eMdiag;
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> eMdiag;
// resize eM by using smear/reduce
coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
// multiply with fp16 accumulation, then add to O.
- coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> PV = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(0);
coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> PV = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
PV = coopMatMulAdd(P_A, V, PV);
- O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(PV);
O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(PV);
}
// If there is split_k, then the split_k resolve shader does the final
// division by L. Store the intermediate O value and per-row m and L values.
if (p.k_num > 1) {
- coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(O);
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(O);
uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num);
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
@@ -243,16 +243,16 @@
return;
}
- coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> Ldiag;
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> Ldiag;
// resize L by using smear/reduce
coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce);
if ((p.mask_n_head_log2 & SINK_ENABLE_BIT) != 0) {
- coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> S;
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> S;
coopMatPerElementNV(S, S, perElemOpGetSink, iq2);
- coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> Mr;
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> Mr;
// resize M by using smear/reduce
coopMatReduceNV(Mr, M, gl_CooperativeMatrixReduceRowNV, smearReduce);
@@ -285,7 +285,7 @@
uint32_t o_offset = iq3*p.ne2*p.ne1*HSV;
- coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(O);
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(O);
if (p.gqa_ratio > 1) {
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
} else {
@@ -295,6 +295,6 @@
// permute dimensions
tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2);
coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, HSV), tensorViewPermute); coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, HSV_pad), tensorViewPermute);
} }
} }
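Editor's note: the hunks above swap the raw head sizes HSK/HSV for padded variants (HSK_pad/HSV_pad) in every cooperative-matrix type and tensor slice, so head sizes that are not a multiple of the matrix granularity can still go through coopMatLoadTensorNV/coopMatStoreTensorNV; the out-of-range columns are assumed to read back as zero, so the padding does not change the results. A minimal sketch of the padding rule, not part of the patch — the 16-element granularity here is an assumption, the real value is chosen by the Vulkan host code:

#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring how a padded head size like HSK_pad/HSV_pad
// could be derived: round up to the cooperative-matrix granularity.
static constexpr uint32_t pad_head_size(uint32_t hs, uint32_t granularity = 16) {
    return (hs + granularity - 1) & ~(granularity - 1);
}

int main() {
    assert(pad_head_size(64)  == 64);   // already aligned: unchanged
    assert(pad_head_size(72)  == 80);   // non-multiple head sizes get padded up
    assert(pad_head_size(576) == 576);
    return 0;
}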

@ -17,6 +17,9 @@
#ifdef COOPMAT #ifdef COOPMAT
#extension GL_KHR_cooperative_matrix : enable #extension GL_KHR_cooperative_matrix : enable
#extension GL_KHR_memory_scope_semantics : enable #extension GL_KHR_memory_scope_semantics : enable
#endif
#if defined(COOPMAT) || defined(MUL_MAT_ID_USE_SUBGROUPS)
#extension GL_KHR_shader_subgroup_basic : enable #extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_ballot : enable #extension GL_KHR_shader_subgroup_ballot : enable
#endif #endif
@ -103,15 +106,75 @@ layout (constant_id = 10) const uint WARP = 32;
shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE]; shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE];
shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE]; shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE];
#define NUM_WARPS (BLOCK_SIZE / WARP)
#ifdef MUL_MAT_ID #ifdef MUL_MAT_ID
shared u16vec2 row_ids[4096]; shared u16vec2 row_ids[4096];
uint _ne1; uint _ne1;
#ifdef COOPMAT
shared uint _ne1_sh;
#endif
#endif // MUL_MAT_ID
#define NUM_WARPS (BLOCK_SIZE / WARP) #ifdef MUL_MAT_ID_USE_SUBGROUPS
shared uvec4 ballots_sh[NUM_WARPS];
void load_row_ids(uint expert_idx, bool nei0_is_pow2) {
_ne1 = 0;
uint num_elements = p.nei1 * p.nei0;
uint nei0shift = findLSB(p.nei0);
uint ids[16];
uint iter = 0;
for (uint j = 0; j < num_elements; j += BLOCK_SIZE) {
// prefetch up to 16 elements
if (iter == 0) {
[[unroll]] for (uint k = 0; k < 16; ++k) {
uint i = j + gl_LocalInvocationIndex + k*BLOCK_SIZE;
bool in_range = i < num_elements;
uint ii1;
if (nei0_is_pow2) {
ii1 = i >> nei0shift;
} else {
ii1 = i / p.nei0;
}
uint ii0 = i - ii1 * p.nei0;
ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
}
}
uint i = j + gl_LocalInvocationIndex;
bool in_range = i < num_elements;
uint ii1;
if (nei0_is_pow2) {
ii1 = i >> nei0shift;
} else {
ii1 = i / p.nei0;
}
uint ii0 = i - ii1 * p.nei0;
uint id = ids[iter++];
uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
ballots_sh[gl_SubgroupID] = ballot;
barrier();
uint subgroup_base = 0;
uint total = 0;
for (uint k = 0; k < gl_NumSubgroups; ++k) {
if (k == gl_SubgroupID) {
subgroup_base = total;
}
total += subgroupBallotBitCount(ballots_sh[k]);
}
barrier();
uint idx = subgroup_base + subgroupBallotExclusiveBitCount(ballot);
if (in_range && id == expert_idx) {
row_ids[_ne1 + idx] = u16vec2(ii0, ii1);
}
_ne1 += total;
iter &= 15;
}
barrier();
}
#endif // MUL_MAT_ID_USE_SUBGROUPS
#endif // MUL_MAT_ID
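Editor's note: load_row_ids replaces the old single-subgroup search — every invocation in the workgroup tests one (ii0, ii1) pair, the per-subgroup ballots are staged in ballots_sh, and an exclusive prefix over the subgroup match counts gives each match a unique slot in row_ids. A CPU-side sketch of that compaction idea (the names and the two-pass structure are illustrative, not the shader's):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Ballot-style stream compaction: pass 1 counts matches per "subgroup"
// (the ballot bit counts staged in ballots_sh), pass 2 writes each match at
// subgroup_base + matches_before_it_within_the_subgroup.
static std::vector<size_t> compact_matches(const std::vector<uint32_t> & ids,
                                           uint32_t expert_idx, size_t subgroup_size) {
    const size_t n = ids.size();
    const size_t num_subgroups = (n + subgroup_size - 1) / subgroup_size;

    // pass 1: per-subgroup match counts (subgroupBallotBitCount analogue)
    std::vector<size_t> counts(num_subgroups, 0);
    for (size_t i = 0; i < n; ++i) {
        counts[i / subgroup_size] += (ids[i] == expert_idx);
    }

    // exclusive prefix over subgroups (the subgroup_base loop in the shader)
    std::vector<size_t> base(num_subgroups, 0);
    size_t total = 0;
    for (size_t s = 0; s < num_subgroups; ++s) {
        base[s] = total;
        total  += counts[s];
    }

    // pass 2: scatter (subgroupBallotExclusiveBitCount analogue)
    std::vector<size_t> out(total);
    std::vector<size_t> within(num_subgroups, 0);
    for (size_t i = 0; i < n; ++i) {
        if (ids[i] == expert_idx) {
            const size_t s = i / subgroup_size;
            out[base[s] + within[s]++] = i;
        }
    }
    return out;
}

int main() {
    const std::vector<uint32_t> ids = {2, 0, 2, 1, 2, 2, 0, 1};
    assert((compact_matches(ids, 2, 4) == std::vector<size_t>{0, 2, 4, 5}));
    return 0;
}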
#ifdef COOPMAT #ifdef COOPMAT
shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS]; shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
@ -177,45 +240,12 @@ void main() {
const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B / BK; const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B / BK;
#ifdef MUL_MAT_ID #ifdef MUL_MAT_ID
#ifdef COOPMAT #ifdef MUL_MAT_ID_USE_SUBGROUPS
// Spread the search across all elements in the first subgroup if (bitCount(p.nei0) == 1) {
if (gl_SubgroupID == 0) { load_row_ids(expert_idx, true);
_ne1 = 0; } else {
uint num_elements = p.nei1 * p.nei0; load_row_ids(expert_idx, false);
uint ids[16];
uint iter = 0;
for (uint j = 0; j < num_elements; j += gl_SubgroupSize) {
// prefetch up to 16 elements
if (iter == 0) {
[[unroll]] for (uint k = 0; k < 16; ++k) {
uint i = j + gl_SubgroupInvocationID + k*gl_SubgroupSize;
bool in_range = i < num_elements;
uint ii1 = i / p.nei0;
uint ii0 = i % p.nei0;
ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
} }
}
uint i = j + gl_SubgroupInvocationID;
bool in_range = i < num_elements;
uint ii1 = i / p.nei0;
uint ii0 = i % p.nei0;
uint id = ids[iter++];
uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
uint idx = subgroupBallotExclusiveBitCount(ballot);
if (in_range && id == expert_idx) {
row_ids[_ne1 + idx] = u16vec2(ii0, ii1);
}
_ne1 += subgroupBallotBitCount(ballot);
iter &= 15;
}
_ne1_sh = _ne1;
}
barrier();
_ne1 = _ne1_sh;
#else #else
_ne1 = 0; _ne1 = 0;
for (uint ii1 = 0; ii1 < p.nei1; ii1++) { for (uint ii1 = 0; ii1 < p.nei1; ii1++) {
@ -19,6 +19,7 @@
#endif #endif
#include "types.comp" #include "types.comp"
#include "utils.comp"
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
@ -99,7 +100,8 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB {
}; };
uint _ne1; uint _ne1;
shared uint _ne1_sh; layout (constant_id = 5) const uint subgroup_size = 32;
shared uvec4 ballots_sh[BLOCK_SIZE / subgroup_size];
B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2]) B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{ {
@ -128,6 +130,64 @@ D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem
return elem; return elem;
} }
void load_row_ids(uint expert_idx, bool nei0_is_pow2) {
_ne1 = 0;
uint num_elements = p.nei1 * p.nei0;
uint nei0shift = findLSB(p.nei0);
uint ids[16];
uint iter = 0;
for (uint j = 0; j < num_elements; j += BLOCK_SIZE) {
// prefetch up to 16 elements
if (iter == 0) {
[[unroll]] for (uint k = 0; k < 16; ++k) {
uint i = j + gl_LocalInvocationIndex + k*BLOCK_SIZE;
bool in_range = i < num_elements;
uint ii1;
if (nei0_is_pow2) {
ii1 = i >> nei0shift;
} else {
ii1 = i / p.nei0;
}
uint ii0 = i - ii1 * p.nei0;
ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
}
}
uint i = j + gl_LocalInvocationIndex;
bool in_range = i < num_elements;
uint ii1;
if (nei0_is_pow2) {
ii1 = i >> nei0shift;
} else {
ii1 = i / p.nei0;
}
uint ii0 = i - ii1 * p.nei0;
uint id = ids[iter++];
uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
ballots_sh[gl_SubgroupID] = ballot;
barrier();
uint subgroup_base = 0;
uint total = 0;
for (uint k = 0; k < gl_NumSubgroups; ++k) {
if (k == gl_SubgroupID) {
subgroup_base = total;
}
total += subgroupBallotBitCount(ballots_sh[k]);
}
barrier();
uint idx = subgroup_base + subgroupBallotExclusiveBitCount(ballot);
if (in_range && id == expert_idx) {
row_ids[_ne1 + idx] = u16vec4(fastmod(ii0, p.ne11), ii1, ii0, 0);
}
_ne1 += total;
iter &= 15;
}
barrier();
}
#endif #endif
void main() { void main() {
@ -157,44 +217,11 @@ void main() {
const uint ic = gl_WorkGroupID.y; const uint ic = gl_WorkGroupID.y;
#ifdef MUL_MAT_ID #ifdef MUL_MAT_ID
// Spread the search across all elements in the first subgroup if (bitCount(p.nei0) == 1) {
if (gl_SubgroupID == 0) { load_row_ids(expert_idx, true);
_ne1 = 0; } else {
uint num_elements = p.nei1 * p.nei0; load_row_ids(expert_idx, false);
uint ids[16];
uint iter = 0;
for (uint j = 0; j < num_elements; j += gl_SubgroupSize) {
// prefetch up to 16 elements
if (iter == 0) {
[[unroll]] for (uint k = 0; k < 16; ++k) {
uint i = j + gl_SubgroupInvocationID + k*gl_SubgroupSize;
bool in_range = i < num_elements;
uint ii1 = i / p.nei0;
uint ii0 = i % p.nei0;
ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
} }
}
uint i = j + gl_SubgroupInvocationID;
bool in_range = i < num_elements;
uint ii1 = i / p.nei0;
uint ii0 = i % p.nei0;
uint id = ids[iter++];
uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
uint idx = subgroupBallotExclusiveBitCount(ballot);
if (in_range && id == expert_idx) {
row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0);
}
_ne1 += subgroupBallotBitCount(ballot);
iter &= 15;
}
_ne1_sh = _ne1;
}
barrier();
_ne1 = _ne1_sh;
// Workgroup has no work // Workgroup has no work
if (ic * BN >= _ne1) return; if (ic * BN >= _ne1) return;
@ -3,6 +3,10 @@
#extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_nonuniform_qualifier : enable #extension GL_EXT_nonuniform_qualifier : enable
#extension GL_EXT_control_flow_attributes : require #extension GL_EXT_control_flow_attributes : require
#if ADD_RMS
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_basic : enable
#endif
#include "rte.comp" #include "rte.comp"
#include "types.comp" #include "types.comp"
@ -14,11 +18,18 @@ layout (push_constant) uniform parameter2
uint ne20; uint ne21; uint ne22; uint ne23; uint ne20; uint ne21; uint ne22; uint ne23;
// strides for srcs+dst // strides for srcs+dst
uint nb[8][4]; uint nb[12][4];
uint rms_partials;
} p; } p;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];} a[]; // Workaround for MoltenVK Bug, see https://github.com/ggml-org/llama.cpp/issues/15498
layout (binding = 0) writeonly buffer D {D_TYPE data_d[];} d[]; // layout (binding = 0) readonly buffer A {A_TYPE data_a[];} a[];
// layout (binding = 0) writeonly buffer D {D_TYPE data_d[];} d[];
layout (binding = 0) buffer A {A_TYPE data_a[];} a[];
layout (binding = 0) buffer D {D_TYPE data_d[];} d[];
layout (binding = 0, std430) buffer PartialBuf {float partial_sums[];} partials[];
layout(constant_id = 0) const uint num_srcs = 2; layout(constant_id = 0) const uint num_srcs = 2;
@ -42,14 +53,22 @@ const uint num_threads = 256;
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in; layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
#if ADD_RMS
// XXX TODO this could be sized based on the number of subgroups, but that's not considered a constant
shared FLOAT_TYPE sumsh[num_threads];
#endif
void main() { void main() {
uint idx = get_idx(); uint idx = get_idx();
uint orig_idx = idx;
uint ne = p.ne20 * p.ne21 * p.ne22 * p.ne23; uint ne = p.ne20 * p.ne21 * p.ne22 * p.ne23;
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
const uint num_iter = 2; const uint num_iter = 2;
FLOAT_TYPE sum_sq = 0;
[[unroll]] for (uint i = 0; i < num_iter; ++i) { [[unroll]] for (uint i = 0; i < num_iter; ++i) {
if (idx >= ne) { if (idx >= ne) {
continue; continue;
@ -61,8 +80,32 @@ void main() {
[[unroll]] for (uint s = 0; s < num_srcs; ++s) { [[unroll]] for (uint s = 0; s < num_srcs; ++s) {
sum += FLOAT_TYPE(a[s].data_a[src_idx(s, i00, i01, i02, i03)]); sum += FLOAT_TYPE(a[s].data_a[src_idx(s, i00, i01, i02, i03)]);
} }
sum_sq += sum*sum;
d[num_srcs].data_d[dst_idx(i00, i01, i02, i03)] = D_TYPE(sum); d[num_srcs].data_d[dst_idx(i00, i01, i02, i03)] = D_TYPE(sum);
idx += num_threads; idx += num_threads;
} }
#if ADD_RMS
if (p.rms_partials != 0) {
// reduce the sum within each subgroup, then across subgroups
const uint NumSubgroups = num_threads / gl_SubgroupSize;
sum_sq = subgroupAdd(sum_sq);
if (gl_SubgroupInvocationID == 0) {
sumsh[gl_SubgroupID] = sum_sq;
}
barrier();
[[unroll]] for (uint s = NumSubgroups / 2; s > 0; s >>= 1) {
if (gl_SubgroupID < s && gl_SubgroupInvocationID == 0) {
sum_sq += sumsh[gl_SubgroupID + s];
sumsh[gl_SubgroupID] = sum_sq;
}
barrier();
}
if (gl_SubgroupID == 0 && gl_SubgroupInvocationID == 0) {
partials[num_srcs + 1].partial_sums[orig_idx / (num_iter * num_threads)] = sum_sq;
}
}
#endif
} }
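Editor's note: with ADD_RMS set, the add shader also accumulates the sum of squares of the fused result and reduces it in two stages — subgroupAdd within each subgroup, then a shared-memory tree across subgroups — before invocation 0 writes one partial per workgroup into partial_sums. A host-side sketch of the same two-stage reduction (subgroup size 32 is an assumption; like the shader, the tree stage assumes a power-of-two subgroup count):

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

// Two-stage reduction sketch: per-"subgroup" sums first (the subgroupAdd),
// then a tree over the per-subgroup results (the s >>= 1 loop over sumsh[]).
static float workgroup_sum(const std::vector<float> & lane_vals, size_t subgroup_size = 32) {
    std::vector<float> sumsh;
    for (size_t i = 0; i < lane_vals.size(); i += subgroup_size) {
        const size_t end = std::min(lane_vals.size(), i + subgroup_size);
        sumsh.push_back(std::accumulate(lane_vals.begin() + i, lane_vals.begin() + end, 0.0f));
    }
    for (size_t s = sumsh.size() / 2; s > 0; s >>= 1) {   // requires power-of-two size
        for (size_t k = 0; k < s; ++k) {
            sumsh[k] += sumsh[k + s];
        }
    }
    return sumsh[0];   // what invocation 0 would store into partial_sums[]
}

int main() {
    const std::vector<float> vals(256, 1.0f);   // num_threads = 256 lanes, 8 subgroups of 32
    assert(workgroup_sum(vals) == 256.0f);
    return 0;
}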
@ -10,9 +10,9 @@ layout (constant_id = 1) const bool do_multiply = false;
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE sum[BLOCK_SIZE]; shared FLOAT_TYPE sumsh[BLOCK_SIZE];
void main() { void rms_norm(uint num_iters) {
const uint ncols = p.ne00; const uint ncols = p.ne00;
const uint nrows = gl_NumWorkGroups.x; const uint nrows = gl_NumWorkGroups.x;
const uint nchannels = gl_NumWorkGroups.y; const uint nchannels = gl_NumWorkGroups.y;
@ -30,38 +30,76 @@ void main() {
uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset(); uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset();
uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset(); uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset();
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp FLOAT_TYPE sum = FLOAT_TYPE(0.0f); // partial sum for thread in warp
[[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { [[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) {
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[a_offset + col]); FLOAT_TYPE xi = FLOAT_TYPE(0);
sum[tid] += xi * xi; if (col < ncols) {
xi = FLOAT_TYPE(data_a[a_offset + col]);
}
sum += xi * xi;
} }
sumsh[tid] = sum;
// sum up partial sums and write back result // sum up partial sums and write back result
barrier(); barrier();
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) { if (tid < s) {
sum[tid] += sum[tid + s]; sum += sumsh[tid + s];
sumsh[tid] = sum;
} }
barrier(); barrier();
} }
sum = sumsh[0];
const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(ncols); const FLOAT_TYPE mean = sum / FLOAT_TYPE(ncols);
const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1)); const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1));
if (do_multiply) { if (do_multiply) {
if (ncols > p.ne10) { if (ncols > p.ne10) {
[[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { [[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) {
if (col >= ncols) {
continue;
}
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + fastmod(col, p.ne10)])); data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + fastmod(col, p.ne10)]));
} }
} else { } else {
[[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { [[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) {
if (col >= ncols) {
continue;
}
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col])); data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col]));
} }
} }
} else { } else {
[[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { [[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) {
if (col >= ncols) {
continue;
}
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col])); data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]));
} }
} }
} }
void main() {
// instantiate the rms_norm function for several different
// dimensions, to allow loop unrolling
uint num_blocks = (p.ne00 + BLOCK_SIZE - 1) / BLOCK_SIZE;
if (num_blocks > 32) {
rms_norm(num_blocks);
} else if (num_blocks > 16) {
rms_norm(32);
} else if (num_blocks > 8) {
rms_norm(16);
} else if (num_blocks > 4) {
rms_norm(8);
} else if (num_blocks == 4) {
rms_norm(4);
} else if (num_blocks == 3) {
rms_norm(3);
} else if (num_blocks == 2) {
rms_norm(2);
} else if (num_blocks == 1) {
rms_norm(1);
}
}
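Editor's note: main() now picks a fixed trip count (the smallest bucket covering num_blocks) and calls rms_norm with it, which lets the compiler fully unroll the column loops; iterations past ncols are masked by the new col >= ncols guards. A small C++ analogue of that "dispatch on a compile-time trip count" pattern, purely illustrative:

#include <cassert>
#include <cstdint>

// Worker with a runtime trip count; iterations past n are masked,
// like the `if (col >= ncols) continue;` guards in the shader.
static float sum_sq_n(const float * x, uint32_t n, uint32_t num_iters) {
    float s = 0.0f;
    for (uint32_t i = 0; i < num_iters; ++i) {
        if (i >= n) {
            continue;
        }
        s += x[i] * x[i];
    }
    return s;
}

// Same worker, but the trip count is a compile-time constant -> unrollable.
template <uint32_t NUM_ITERS>
static float sum_sq_fixed(const float * x, uint32_t n) {
    return sum_sq_n(x, n, NUM_ITERS);
}

// Dispatch: small sizes go through fixed (unrolled) trip counts,
// large ones fall back to the runtime path, mirroring rms_norm(num_blocks).
static float sum_sq(const float * x, uint32_t n) {
    if (n > 8) return sum_sq_n(x, n, n);
    if (n > 4) return sum_sq_fixed<8>(x, n);
    if (n > 2) return sum_sq_fixed<4>(x, n);
    if (n > 1) return sum_sq_fixed<2>(x, n);
    return sum_sq_fixed<1>(x, n);
}

int main() {
    const float x[3] = {1.0f, 2.0f, 3.0f};
    assert(sum_sq(x, 3) == 14.0f);   // masked 4-iteration path
    return 0;
}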
@ -0,0 +1,65 @@
#version 450
#include "generic_binary_head.comp"
#include "types.comp"
#extension GL_EXT_control_flow_attributes : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_basic : enable
#define BLOCK_SIZE 128
layout (constant_id = 1) const bool do_multiply = false;
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
layout (binding = 3, std430) readonly buffer PartialsBuf {float partial_sums[];};
shared FLOAT_TYPE sumsh[BLOCK_SIZE];
void main() {
const uint ncols = p.ne00;
const uint nrows = gl_NumWorkGroups.x;
const uint nchannels = gl_NumWorkGroups.y;
const uint row = 0;
const uint channel = gl_WorkGroupID.y;
const uint samp = gl_WorkGroupID.z;
// The work is split across multiple workgroups in the x dimension. Each invocation
// processes one element
const uint tid = gl_GlobalInvocationID.x;
const uint stride_row = p.nb01;
const uint stride_channel = p.nb02;
const uint stride_sample = p.nb03;
uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset();
uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset();
uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset();
FLOAT_TYPE sum = FLOAT_TYPE(0.0f); // partial sum for thread in warp
uint32_t num_partials = p.param3;
for (uint32_t i = gl_SubgroupInvocationID; i < num_partials; i += gl_SubgroupSize) {
sum += partial_sums[i];
}
sum = subgroupAdd(sum);
uint col = tid;
if (col >= ncols) {
return;
}
const FLOAT_TYPE mean = sum / FLOAT_TYPE(ncols);
const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1));
if (do_multiply) {
if (ncols > p.ne10) {
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + fastmod(col, p.ne10)]));
} else {
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col]));
}
} else {
data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]));
}
}
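Editor's note: this new shader closes the fused add+RMS path — one subgroup sums the per-workgroup partials produced by the add shader, and the rest is the usual RMS normalization, so numerically it matches the single-pass rms_norm. A sketch of the contract (eps stands in for p.param1; not part of the patch):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

// What rms_norm_partials computes from the partial sums of squares:
// scale = 1 / sqrt(mean(sum_i x_i^2) + eps)
static float rms_scale_from_partials(const std::vector<float> & partial_sums,
                                     uint32_t ncols, float eps) {
    float sum = 0.0f;
    for (float p : partial_sums) {   // the subgroupAdd over partial_sums[] in the shader
        sum += p;
    }
    const float mean = sum / (float) ncols;
    return 1.0f / std::sqrt(mean + eps);
}

int main() {
    // two workgroups each contributed a partial sum of squares; 8 columns total
    const std::vector<float> partials = {10.0f, 6.0f};   // total 16 -> mean 2
    const float scale = rms_scale_from_partials(partials, 8, 0.0f);
    assert(std::fabs(scale - 1.0f / std::sqrt(2.0f)) < 1e-6f);
    return 0;
}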
@ -1,9 +1,9 @@
#version 450 #version 450
#include "generic_head.comp"
#include "types.comp" #include "types.comp"
#extension GL_EXT_control_flow_attributes : enable #extension GL_EXT_control_flow_attributes : enable
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@ -11,16 +11,49 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 0) const uint BLOCK_SIZE = 32;
layout (push_constant) uniform parameter
{
uint n_cols;
uint ne01, ne02;
uint nb01, nb02, nb03;
uint nb11, nb12, nb13;
float weight;
uint misalign_offsets;
uint ne0_12mp, ne0_12L;
uint ne0_1mp, ne0_1L;
} p;
uint get_aoffset() { return p.misalign_offsets >> 16; }
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
// see init_fastdiv_values in ggml-vulkan.cpp
uint fastdiv(uint n, uint mp, uint L) {
uint msbs, lsbs;
// msbs = mulhi(n, mp)
umulExtended(n, mp, msbs, lsbs);
return (msbs + n) >> L;
}
shared FLOAT_TYPE tmp[BLOCK_SIZE]; shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() { void main() {
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint col = gl_LocalInvocationID.x; const uint col = gl_LocalInvocationID.x;
const float weight = p.weight;
tmp[col] = FLOAT_TYPE(0.0f); const uint i03 = fastdiv(row, p.ne0_12mp, p.ne0_12L);
const uint i03_offset = i03 * p.ne01*p.ne02;
const uint i02 = fastdiv(row - i03_offset, p.ne0_1mp, p.ne0_1L);
const uint i01 = row - i03_offset - i02*p.ne01;
for (uint i = col; i < p.KX; i += BLOCK_SIZE) { const uint src_idx = get_aoffset() + i01 * p.nb01 + i02 * p.nb02 + i03 * p.nb03;
tmp[col] += FLOAT_TYPE(data_a[row*p.KX + i]); const uint dst_idx = get_doffset() + i01 * p.nb11 + i02 * p.nb12 + i03 * p.nb13;
tmp[col] = FLOAT_TYPE(0.0);
for (uint i = col; i < p.n_cols; i += BLOCK_SIZE) {
tmp[col] += FLOAT_TYPE(data_a[src_idx + i]);
} }
barrier(); barrier();
@ -32,6 +65,6 @@ void main() {
} }
if (col == 0) { if (col == 0) {
data_d[row] = D_TYPE(tmp[0]); data_d[dst_idx] = D_TYPE(tmp[0] * weight);
} }
} }
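Editor's note: the sum_rows rewrite stops assuming contiguous rows — it decomposes the flattened row index into (i01, i02, i03) with the fastdiv above, applies the real source/destination strides, and folds in a weight. fastdiv is the usual multiply-high-then-shift division; the host precomputes a magic multiplier mp and shift L per divisor (init_fastdiv_values in ggml-vulkan.cpp). A stand-alone sketch of one standard derivation of such constants, matching the (mulhi(n, mp) + n) >> L form — the host-side details may differ:

#include <cassert>
#include <cstdint>

struct fastdiv_vals { uint32_t mp, L; };

// One standard derivation of the magic constants so that n/d == (mulhi(n, mp) + n) >> L.
// The 32-bit add-then-shift form is exact for n < 2^31, which covers the index
// ranges involved here; larger n needs the overflow-safe variant.
static fastdiv_vals init_fastdiv(uint32_t d) {
    uint32_t L = 0;
    while (L < 32 && (uint64_t(1) << L) < d) {
        ++L;
    }
    const uint32_t mp = (uint32_t) (((uint64_t(1) << 32) * ((uint64_t(1) << L) - d)) / d + 1);
    return { mp, L };
}

static uint32_t fastdiv(uint32_t n, fastdiv_vals v) {
    const uint32_t hi = (uint32_t) (((uint64_t) n * v.mp) >> 32);  // the umulExtended msbs
    return (hi + n) >> v.L;
}

int main() {
    const uint32_t ds[] = {1u, 3u, 7u, 12u, 4096u, 1000000u};
    for (uint32_t d : ds) {
        const fastdiv_vals v = init_fastdiv(d);
        const uint32_t ns[] = {0u, 1u, d - 1, d, d + 1, 123456789u, 2147483647u};
        for (uint32_t n : ns) {
            assert(fastdiv(n, v) == n / d);
        }
    }
    return 0;
}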
@ -68,6 +68,12 @@ const std::vector<std::string> type_names = {
"bf16", "bf16",
}; };
enum MatMulIdType {
NONE,
DEFAULT,
SUBGROUP,
};
namespace { namespace {
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) { void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {
#ifdef _WIN32 #ifdef _WIN32
@ -293,7 +299,7 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat, coopmat2, f16acc)); compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat, coopmat2, f16acc));
} }
void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool f16acc) { void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool coopmat2, bool f16acc) {
std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4"; std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4";
std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4"; std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4";
std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4"; std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4";
@ -303,9 +309,13 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool
}; };
std::string shader_name = "matmul"; std::string shader_name = "matmul";
if (matmul_id) { if (matmul_id_type == MatMulIdType::DEFAULT) {
base_dict["MUL_MAT_ID"] = "1"; base_dict["MUL_MAT_ID"] = "1";
shader_name = "matmul_id"; shader_name = "matmul_id";
} else if (matmul_id_type == MatMulIdType::SUBGROUP) {
base_dict["MUL_MAT_ID"] = "1";
base_dict["MUL_MAT_ID_USE_SUBGROUPS"] = "1";
shader_name = "matmul_id_subgroup";
} }
if (fp16) { if (fp16) {
@ -389,7 +399,7 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool
} }
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
if (!coopmat && !coopmat2 && !matmul_id && (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "q8_0")) { if (!coopmat && !coopmat2 && matmul_id_type == MatMulIdType::NONE && (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "q8_0")) {
string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc); string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc);
} }
#endif #endif
@ -401,27 +411,29 @@ void process_shaders() {
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}}; std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
// matmul // matmul
for (const auto& matmul_id : {false, true}) { for (const MatMulIdType& matmul_id_type : {MatMulIdType::NONE, MatMulIdType::DEFAULT, MatMulIdType::SUBGROUP}) {
// No coopmats // No coopmats
// fp32 // fp32
matmul_shaders(false, matmul_id, false, false, false); matmul_shaders(false, matmul_id_type, false, false, false);
// fp16, fp32acc and fp16acc // fp16, fp32acc and fp16acc
matmul_shaders(true, matmul_id, false, false, false); matmul_shaders(true, matmul_id_type, false, false, false);
matmul_shaders(true, matmul_id, false, false, true); matmul_shaders(true, matmul_id_type, false, false, true);
if (matmul_id_type != MatMulIdType::DEFAULT) {
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) #if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
// Coopmat, fp32acc and fp16acc // Coopmat, fp32acc and fp16acc
matmul_shaders(true, matmul_id, true, false, false); matmul_shaders(true, matmul_id_type, true, false, false);
matmul_shaders(true, matmul_id, true, false, true); matmul_shaders(true, matmul_id_type, true, false, true);
#endif #endif
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
// Coopmat2, fp32acc and fp16acc // Coopmat2, fp32acc and fp16acc
matmul_shaders(true, matmul_id, false, true, false); matmul_shaders(true, matmul_id_type, false, true, false);
matmul_shaders(true, matmul_id, false, true, true); matmul_shaders(true, matmul_id_type, false, true, true);
#endif #endif
} }
}
// flash attention // flash attention
for (const auto& f16acc : {false, true}) { for (const auto& f16acc : {false, true}) {
@ -503,6 +515,7 @@ void process_shaders() {
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("rms_norm_partials_f32", "rms_norm_partials.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
@ -538,13 +551,15 @@ void process_shaders() {
s += std::string(dst_f16 ? "_f16" : "_f32"); s += std::string(dst_f16 ? "_f16" : "_f32");
return s; return s;
}; };
for (std::string op : {"add", "sub", "mul", "div"}) { for (std::string op : {"add", "sub", "mul", "div", "add_rms", }) {
for (auto src0_f16 : {false, true}) { for (auto src0_f16 : {false, true}) {
for (auto src1_f16 : {false, true}) { for (auto src1_f16 : {false, true}) {
for (auto dst_f16 : {false, true}) { for (auto dst_f16 : {false, true}) {
for (auto rte : {false, true}) { for (auto rte : {false, true}) {
auto source = op == "add_rms" ? std::string("add") : op;
auto name = op + get_suffix(src0_f16, src1_f16, dst_f16) + (rte ? "_rte" : ""); auto name = op + get_suffix(src0_f16, src1_f16, dst_f16) + (rte ? "_rte" : "");
string_to_spv(name.c_str(), op + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}}); auto add_rms = op == "add_rms" ? "1" : "0";
string_to_spv(name.c_str(), source + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}, {"ADD_RMS" , add_rms}});
} }
} }
} }
@ -687,7 +702,8 @@ void process_shaders() {
string_to_spv("add_id_f32", "add_id.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("add_id_f32", "add_id.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("multi_add_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}}); string_to_spv("multi_add_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}, {"ADD_RMS" , "0"}});
string_to_spv("multi_add_rms_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}, {"ADD_RMS" , "1"}});
for (auto &c : compiles) { for (auto &c : compiles) {
c.wait(); c.wait();
@ -745,7 +761,7 @@ void write_output_files() {
} }
std::string suffixes[2] = {"_f32", "_f16"}; std::string suffixes[2] = {"_f32", "_f16"};
for (const char *op : {"add", "sub", "mul", "div"}) { for (const char *op : {"add", "sub", "mul", "div", "add_rms"}) {
fprintf(hdr, "extern unsigned char *%s_data[2][2][2][2];\n", op); fprintf(hdr, "extern unsigned char *%s_data[2][2][2][2];\n", op);
fprintf(hdr, "extern uint64_t %s_len[2][2][2][2];\n", op); fprintf(hdr, "extern uint64_t %s_len[2][2][2][2];\n", op);
std::string data = "unsigned char *" + std::string(op) + "_data[2][2][2][2] = "; std::string data = "unsigned char *" + std::string(op) + "_data[2][2][2][2] = ";
@ -20,8 +20,8 @@ add_custom_command(
COMMAND ${CMAKE_COMMAND} -E make_directory ${SHADER_OUTPUT_DIR} COMMAND ${CMAKE_COMMAND} -E make_directory ${SHADER_OUTPUT_DIR}
COMMAND ${CMAKE_COMMAND} -E env PYTHONIOENCODING=utf-8 COMMAND ${CMAKE_COMMAND} -E env PYTHONIOENCODING=utf-8
${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py
--input "${SHADER_DIR}" --input_dir "${SHADER_DIR}"
--output "${SHADER_HEADER}" --output_file "${SHADER_HEADER}"
DEPENDS ${WGSL_SHADER_FILES} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py DEPENDS ${WGSL_SHADER_FILES} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py
VERBATIM VERBATIM
) )
@ -118,13 +118,11 @@ struct webgpu_context_struct {
std::recursive_mutex mutex; std::recursive_mutex mutex;
bool device_init = false;
webgpu_buf_pool param_buf_pool; webgpu_buf_pool param_buf_pool;
webgpu_buf_pool set_rows_error_buf_pool; webgpu_buf_pool set_rows_error_buf_pool;
wgpu::ComputePipeline memset_pipeline; wgpu::ComputePipeline memset_pipeline;
wgpu::ComputePipeline mul_mat_pipeline; wgpu::ComputePipeline mul_mat_pipeline[30][2];
wgpu::ComputePipeline set_rows_pipeline; wgpu::ComputePipeline set_rows_pipeline;
wgpu::ComputePipeline cpy_pipeline; wgpu::ComputePipeline cpy_pipeline;
@ -238,7 +236,7 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) {
wgpu::CallbackMode::AllowSpontaneous, wgpu::CallbackMode::AllowSpontaneous,
[](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
if (status != wgpu::QueueWorkDoneStatus::Success) { if (status != wgpu::QueueWorkDoneStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data); GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str());
} }
}), }),
UINT64_MAX); UINT64_MAX);
@ -278,7 +276,7 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
wgpu::CallbackMode::AllowSpontaneous, wgpu::CallbackMode::AllowSpontaneous,
[ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { [ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
if (status != wgpu::QueueWorkDoneStatus::Success) { if (status != wgpu::QueueWorkDoneStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data); GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str());
} }
// Free the staged buffers // Free the staged buffers
ctx->param_buf_pool.free_bufs(staged_param_bufs); ctx->param_buf_pool.free_bufs(staged_param_bufs);
@ -294,7 +292,7 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
wgpu::CallbackMode::AllowSpontaneous, wgpu::CallbackMode::AllowSpontaneous,
[ctx, error_bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) { [ctx, error_bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) {
if (status != wgpu::MapAsyncStatus::Success) { if (status != wgpu::MapAsyncStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", message.data); GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str());
} else { } else {
const uint32_t * error_data = (const uint32_t *) error_bufs.host_buf.GetConstMappedRange(); const uint32_t * error_data = (const uint32_t *) error_bufs.host_buf.GetConstMappedRange();
if (*error_data) { if (*error_data) {
@ -331,6 +329,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
// To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and // To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and
// debug statements in the shader, and then call this function after encoding the commands and submitting them. // debug statements in the shader, and then call this function after encoding the commands and submitting them.
static void ggml_backend_webgpu_debug(webgpu_context & ctx) { static void ggml_backend_webgpu_debug(webgpu_context & ctx) {
ggml_backend_webgpu_submit_queue(ctx);
wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize()); encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize());
wgpu::CommandBuffer commands = encoder.Finish(); wgpu::CommandBuffer commands = encoder.Finish();
@ -421,15 +420,6 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx,
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, true); ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, true);
} }
static size_t ggml_backend_webgpu_tensor_offset(const ggml_tensor * tensor) {
return webgpu_tensor_offset(tensor) + tensor->view_offs;
}
static wgpu::Buffer ggml_backend_webgpu_tensor_buf(const ggml_tensor * tensor) {
ggml_backend_webgpu_buffer_context * ctx = (ggml_backend_webgpu_buffer_context *) tensor->buffer->context;
return ctx->buffer;
}
/** End WebGPU Actions */ /** End WebGPU Actions */
/** GGML Backend Interface */ /** GGML Backend Interface */
@ -447,19 +437,36 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) {
GGML_UNUSED(ctx); GGML_UNUSED(ctx);
} }
static size_t ggml_webgpu_tensor_offset(const ggml_tensor * tensor) {
return webgpu_tensor_offset(tensor) + tensor->view_offs;
}
static wgpu::Buffer ggml_webgpu_tensor_buf(const ggml_tensor * tensor) {
ggml_backend_webgpu_buffer_context * ctx = (ggml_backend_webgpu_buffer_context *) tensor->buffer->context;
return ctx->buffer;
}
static size_t ggml_webgpu_tensor_misalignment(webgpu_context & ctx, ggml_tensor * t) {
size_t offset = ggml_webgpu_tensor_offset(t);
return offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
}
static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, ggml_tensor * t) {
size_t offset = ggml_webgpu_tensor_offset(t);
return offset & ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
}
static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor * t) {
return (ggml_nbytes(t) + ggml_webgpu_tensor_misalignment(ctx, t) + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) &
~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1);
}
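Editor's note: these helpers centralize the offset juggling the copy path was doing inline — bind at the largest offset satisfying minStorageBufferOffsetAlignment, hand the leftover to the shader as an element offset, and round the binding size up to the storage-binding multiple so the window still covers the tensor. A quick sketch of the arithmetic (the 256/4 values in the test are assumptions, not queried limits):

#include <cassert>
#include <cstdint>

struct binding { uint64_t offset, size, misalign; };

// Bind at the largest aligned offset <= the tensor offset; the remainder
// (misalign) is passed into the shader as an element offset.
static binding make_binding(uint64_t tensor_offset, uint64_t tensor_bytes,
                            uint64_t min_align, uint64_t size_mult) {
    binding b;
    b.misalign = tensor_offset & (min_align - 1);          // assumes power-of-two alignment
    b.offset   = tensor_offset & ~(min_align - 1);
    b.size     = (tensor_bytes + b.misalign + size_mult - 1) & ~(size_mult - 1);
    return b;
}

int main() {
    // e.g. minStorageBufferOffsetAlignment = 256, binding size multiple = 4 (assumed)
    const binding b = make_binding(/*tensor_offset=*/1030, /*tensor_bytes=*/100, 256, 4);
    assert(b.offset == 1024 && b.misalign == 6 && b.size == 108);
    assert(b.offset + b.size >= 1030 + 100);   // aligned window still covers the tensor
    return 0;
}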
static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
size_t src_offset = ggml_backend_webgpu_tensor_offset(src);
// assumes power of 2 offset alignment
size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
// align to minimum offset alignment
src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst);
size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
uint32_t ne = (uint32_t) ggml_nelements(dst); uint32_t ne = (uint32_t) ggml_nelements(dst);
std::vector<uint32_t> params = { ne, std::vector<uint32_t> params = { ne,
(uint32_t) (src_misalignment / ggml_type_size(src->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
(uint32_t) (dst_misalignment / ggml_type_size(dst->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
// Convert byte-strides to element-strides // Convert byte-strides to element-strides
(uint32_t) (src->nb[0] / ggml_type_size(src->type)), (uint32_t) (src->nb[0] / ggml_type_size(src->type)),
(uint32_t) (src->nb[1] / ggml_type_size(src->type)), (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
@ -477,15 +484,13 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor
std::vector<wgpu::BindGroupEntry> entries = { std::vector<wgpu::BindGroupEntry> entries = {
{ .binding = 0, { .binding = 0,
.buffer = ggml_backend_webgpu_tensor_buf(src), .buffer = ggml_webgpu_tensor_buf(src),
.offset = src_offset, .offset = ggml_webgpu_tensor_align_offset(ctx, src),
.size = (ggml_nbytes(src) + src_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & .size = ggml_webgpu_tensor_binding_size(ctx, src) },
~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) },
{ .binding = 1, { .binding = 1,
.buffer = ggml_backend_webgpu_tensor_buf(dst), .buffer = ggml_webgpu_tensor_buf(dst),
.offset = dst_offset, .offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = (ggml_nbytes(dst) + dst_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & .size = ggml_webgpu_tensor_binding_size(ctx, dst) }
~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) }
}; };
size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX;
@ -504,21 +509,9 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
error_bufs.host_buf.Unmap(); error_bufs.host_buf.Unmap();
} }
size_t src_offset = ggml_backend_webgpu_tensor_offset(src); std::vector<uint32_t> params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
// assumes power of 2 offset alignment (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)),
size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
// align to minimum offset alignment
src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
size_t idx_offset = ggml_backend_webgpu_tensor_offset(idx);
size_t idx_misalignment = idx_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
idx_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst);
size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
std::vector<uint32_t> params = { (uint32_t) (src_misalignment / ggml_type_size(src->type)),
(uint32_t) (idx_misalignment / ggml_type_size(idx->type)),
(uint32_t) (dst_misalignment / ggml_type_size(dst->type)),
// Convert byte-strides to element-strides // Convert byte-strides to element-strides
(uint32_t) (src->nb[1] / ggml_type_size(src->type)), (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
(uint32_t) (src->nb[2] / ggml_type_size(src->type)), (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
@ -540,17 +533,17 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
std::vector<wgpu::BindGroupEntry> entries = { std::vector<wgpu::BindGroupEntry> entries = {
{ .binding = 0, { .binding = 0,
.buffer = ggml_backend_webgpu_tensor_buf(src), .buffer = ggml_webgpu_tensor_buf(src),
.offset = ggml_backend_webgpu_tensor_offset(src), .offset = ggml_webgpu_tensor_align_offset(ctx, src),
.size = ggml_nbytes(src) }, .size = ggml_webgpu_tensor_binding_size(ctx, src) },
{ .binding = 1, { .binding = 1,
.buffer = ggml_backend_webgpu_tensor_buf(idx), .buffer = ggml_webgpu_tensor_buf(idx),
.offset = ggml_backend_webgpu_tensor_offset(idx), .offset = ggml_webgpu_tensor_align_offset(ctx, idx),
.size = ggml_nbytes(idx) }, .size = ggml_webgpu_tensor_binding_size(ctx, idx) },
{ .binding = 2, { .binding = 2,
.buffer = ggml_backend_webgpu_tensor_buf(dst), .buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_backend_webgpu_tensor_offset(dst), .offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_nbytes(dst) }, .size = ggml_webgpu_tensor_binding_size(ctx, dst) },
{ .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() } { .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() }
}; };
@ -565,15 +558,18 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
std::vector<uint32_t> params = { std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
(uint32_t) dst->ne[1], // number of rows in result (M) (uint32_t) dst->ne[1], // number of rows in result (M)
(uint32_t) dst->ne[0], // number of columns in result (N) (uint32_t) dst->ne[0], // number of columns in result (N)
(uint32_t) src0->ne[0], // number of columns in src0/src1 (K) (uint32_t) src0->ne[0], // number of columns in src0/src1 (K)
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 1 (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 1
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 1 (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 1
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 2 (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 2
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 2 (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 2
(uint32_t) (src0->nb[3] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 3 (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 3
(uint32_t) (src1->nb[3] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 3 (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 3
(uint32_t) src0->ne[2], // batch size in dimension 2 (uint32_t) src0->ne[2], // batch size in dimension 2
(uint32_t) src0->ne[3], // batch size in dimension 3 (uint32_t) src0->ne[3], // batch size in dimension 3
(uint32_t) (src1->ne[2] / src0->ne[2]), // broadcast in dimension 2 (uint32_t) (src1->ne[2] / src0->ne[2]), // broadcast in dimension 2
@ -582,22 +578,22 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t
std::vector<wgpu::BindGroupEntry> entries = { std::vector<wgpu::BindGroupEntry> entries = {
{ .binding = 0, { .binding = 0,
.buffer = ggml_backend_webgpu_tensor_buf(src0), .buffer = ggml_webgpu_tensor_buf(src0),
.offset = ggml_backend_webgpu_tensor_offset(src0), .offset = ggml_webgpu_tensor_align_offset(ctx, src0),
.size = ggml_nbytes(src0) }, .size = ggml_webgpu_tensor_binding_size(ctx, src0) },
{ .binding = 1, { .binding = 1,
.buffer = ggml_backend_webgpu_tensor_buf(src1), .buffer = ggml_webgpu_tensor_buf(src1),
.offset = ggml_backend_webgpu_tensor_offset(src1), .offset = ggml_webgpu_tensor_align_offset(ctx, src1),
.size = ggml_nbytes(src1) }, .size = ggml_webgpu_tensor_binding_size(ctx, src1) },
{ .binding = 2, { .binding = 2,
.buffer = ggml_backend_webgpu_tensor_buf(dst), .buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_backend_webgpu_tensor_offset(dst), .offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_nbytes(dst) } .size = ggml_webgpu_tensor_binding_size(ctx, dst) },
}; };
uint32_t wg_x = uint32_t wg_x =
(dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE; (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE;
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline, params, entries, wg_x); ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x);
} }
// Returns true if node has enqueued work into the queue, false otherwise // Returns true if node has enqueued work into the queue, false otherwise
@ -827,7 +823,7 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b
wgpu::Buffer buf; wgpu::Buffer buf;
ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, ggml_webgpu_create_buffer(ctx->webgpu_ctx->device,
buf, buf,
size, (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1),
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst,
"allocated_buffer"); "allocated_buffer");
@ -907,7 +903,94 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) {
} }
static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat"); ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F32][GGML_TYPE_F32],
wgsl_mul_mat_f32_f32,
"mul_mat_f32_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F16],
wgsl_mul_mat_f16_f16,
"mul_mat_f16_f16");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F32],
wgsl_mul_mat_f16_f32,
"mul_mat_f16_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32],
wgsl_mul_mat_q4_0_f32,
"mul_mat_q4_0_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32],
wgsl_mul_mat_q4_1_f32,
"mul_mat_q4_1_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_0][GGML_TYPE_F32],
wgsl_mul_mat_q5_0_f32,
"mul_mat_q5_0_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_1][GGML_TYPE_F32],
wgsl_mul_mat_q5_1_f32,
"mul_mat_q5_1_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q8_0][GGML_TYPE_F32],
wgsl_mul_mat_q8_0_f32,
"mul_mat_q8_0_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q2_K][GGML_TYPE_F32],
wgsl_mul_mat_q2_k_f32,
"mul_mat_q2_k_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q3_K][GGML_TYPE_F32],
wgsl_mul_mat_q3_k_f32,
"mul_mat_q3_k_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_K][GGML_TYPE_F32],
wgsl_mul_mat_q4_k_f32,
"mul_mat_q4_k_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_K][GGML_TYPE_F32],
wgsl_mul_mat_q5_k_f32,
"mul_mat_q5_k_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q6_K][GGML_TYPE_F32],
wgsl_mul_mat_q6_k_f32,
"mul_mat_q6_k_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XXS][GGML_TYPE_F32],
wgsl_mul_mat_iq2_xxs_f32,
"mul_mat_iq2_xxs_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XS][GGML_TYPE_F32],
wgsl_mul_mat_iq2_xs_f32,
"mul_mat_iq2_xs_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_S][GGML_TYPE_F32],
wgsl_mul_mat_iq2_s_f32,
"mul_mat_iq2_s_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_XXS][GGML_TYPE_F32],
wgsl_mul_mat_iq3_xxs_f32,
"mul_mat_iq3_xxs_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_S][GGML_TYPE_F32],
wgsl_mul_mat_iq3_s_f32,
"mul_mat_iq3_s_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_S][GGML_TYPE_F32],
wgsl_mul_mat_iq1_s_f32,
"mul_mat_iq1_s_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_M][GGML_TYPE_F32],
wgsl_mul_mat_iq1_m_f32,
"mul_mat_iq1_m_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_NL][GGML_TYPE_F32],
wgsl_mul_mat_iq4_nl_f32,
"mul_mat_iq4_nl_f32");
ggml_webgpu_create_pipeline(webgpu_ctx->device,
webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
wgsl_mul_mat_iq4_xs_f32,
"mul_mat_iq4_xs_f32");
} }
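Editor's note: mul_mat now dispatches through a pipeline table indexed by the source types, and supports_op (further down) advertises exactly the combinations created here; the narrow second dimension works because F32 and F16 are the first two ggml type ids. A minimal sketch of that guard-then-lookup pattern — the enum, table size, and names are illustrative, not the backend's real tables:

#include <cassert>

enum example_type { T_F32 = 0, T_F16 = 1, T_Q4_0 = 2, T_COUNT };

struct pipeline { bool valid = false; };
static pipeline pipelines[T_COUNT][2];   // second index: src1 is F32 or F16

// Only dispatch for (src0, src1) pairs that were created at init time.
static bool supports(example_type src0, example_type src1) {
    return src1 <= T_F16 && pipelines[src0][src1].valid;
}

int main() {
    pipelines[T_F32][T_F32].valid  = true;
    pipelines[T_F16][T_F32].valid  = true;
    pipelines[T_F16][T_F16].valid  = true;
    pipelines[T_Q4_0][T_F32].valid = true;

    assert(supports(T_Q4_0, T_F32));
    assert(!supports(T_Q4_0, T_F16));   // quantized src0 only pairs with f32 src1
    return 0;
}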
static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) { static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
@ -933,79 +1016,6 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co
ggml_backend_webgpu_device_context * dev_ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context); ggml_backend_webgpu_device_context * dev_ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx;
// Multiple threads may try to initialize the device
std::lock_guard<std::recursive_mutex> lock(webgpu_ctx->mutex);
if (!webgpu_ctx->device_init) {
// Initialize device
std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
wgpu::FeatureName::ImplicitDeviceSynchronization };
wgpu::DeviceDescriptor dev_desc;
dev_desc.requiredLimits = &webgpu_ctx->limits;
dev_desc.requiredFeatures = required_features.data();
dev_desc.requiredFeatureCount = required_features.size();
dev_desc.SetDeviceLostCallback(
wgpu::CallbackMode::AllowSpontaneous,
[](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
GGML_UNUSED(device);
GGML_LOG_ERROR(
"ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason), message.data);
});
dev_desc.SetUncapturedErrorCallback(
[](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) {
GGML_UNUSED(device);
GGML_LOG_ERROR(
"ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason), message.data);
});
webgpu_ctx->instance.WaitAny(
webgpu_ctx->adapter.RequestDevice(
&dev_desc,
wgpu::CallbackMode::AllowSpontaneous,
[webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
if (status != wgpu::RequestDeviceStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data);
return;
}
webgpu_ctx->device = std::move(device);
}),
UINT64_MAX);
GGML_ASSERT(webgpu_ctx->device != nullptr);
// Initialize (compute) queue
webgpu_ctx->queue = webgpu_ctx->device.GetQueue();
// Create buffer pool for shader parameters
webgpu_ctx->param_buf_pool.init(webgpu_ctx->device,
WEBGPU_NUM_PARAM_BUFS,
WEBGPU_PARAMS_BUF_SIZE_BYTES,
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform,
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite);
webgpu_ctx->set_rows_error_buf_pool.init(webgpu_ctx->device,
WEBGPU_NUM_SET_ROWS_ERROR_BUFS,
WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage,
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead);
ggml_webgpu_init_memset_pipeline(webgpu_ctx);
ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx);
ggml_webgpu_init_set_rows_pipeline(webgpu_ctx);
ggml_webgpu_init_cpy_pipeline(webgpu_ctx);
#ifdef GGML_WEBGPU_DEBUG
// Initialize debug buffers
ggml_webgpu_create_buffer(webgpu_ctx->device,
webgpu_ctx->debug_host_buf,
WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
"debug_host_buf");
ggml_webgpu_create_buffer(webgpu_ctx->device,
webgpu_ctx->debug_dev_buf,
WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc,
"debug_dev_buf");
#endif
webgpu_ctx->device_init = true;
}
static ggml_backend_webgpu_context backend_ctx; static ggml_backend_webgpu_context backend_ctx;
backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name;
backend_ctx.webgpu_ctx = webgpu_ctx; backend_ctx.webgpu_ctx = webgpu_ctx;
@ -1053,10 +1063,45 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_OP_VIEW: case GGML_OP_VIEW:
case GGML_OP_PERMUTE: case GGML_OP_PERMUTE:
return true; return true;
case GGML_OP_CPY | GGML_OP_SET_ROWS: case GGML_OP_CPY:
case GGML_OP_SET_ROWS:
return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32; return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; {
switch (op->src[1]->type) {
case GGML_TYPE_F16:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_TYPE_F32:
switch (op->src[0]->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
return true;
default:
return false;
}
default:
return false;
}
}
default: default:
return false; return false;
} }
@ -1123,20 +1168,87 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
wgpu::AdapterInfo info{}; wgpu::AdapterInfo info{};
ctx->adapter.GetInfo(&info); ctx->adapter.GetInfo(&info);
// Initialize device
std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
wgpu::FeatureName::ImplicitDeviceSynchronization };
wgpu::DeviceDescriptor dev_desc;
dev_desc.requiredLimits = &ctx->limits;
dev_desc.requiredFeatures = required_features.data();
dev_desc.requiredFeatureCount = required_features.size();
dev_desc.SetDeviceLostCallback(
wgpu::CallbackMode::AllowSpontaneous,
[](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
GGML_UNUSED(device);
GGML_LOG_ERROR(
"ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason), std::string(message).c_str());
});
dev_desc.SetUncapturedErrorCallback(
[](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) {
GGML_UNUSED(device);
GGML_LOG_ERROR(
"ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason), std::string(message).c_str());
});
ctx->instance.WaitAny(ctx->adapter.RequestDevice(
&dev_desc,
wgpu::CallbackMode::AllowSpontaneous,
[ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
if (status != wgpu::RequestDeviceStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", std::string(message).c_str());
return;
}
ctx->device = std::move(device);
}),
UINT64_MAX);
GGML_ASSERT(ctx->device != nullptr);
// Initialize (compute) queue
ctx->queue = ctx->device.GetQueue();
// Create buffer pool for shader parameters
ctx->param_buf_pool.init(ctx->device,
WEBGPU_NUM_PARAM_BUFS,
WEBGPU_PARAMS_BUF_SIZE_BYTES,
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform,
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite);
ctx->set_rows_error_buf_pool.init(ctx->device,
WEBGPU_NUM_SET_ROWS_ERROR_BUFS,
WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage,
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead);
ggml_webgpu_init_memset_pipeline(ctx);
ggml_webgpu_init_mul_mat_pipeline(ctx);
ggml_webgpu_init_set_rows_pipeline(ctx);
ggml_webgpu_init_cpy_pipeline(ctx);
#ifdef GGML_WEBGPU_DEBUG
// Initialize debug buffers
ggml_webgpu_create_buffer(ctx->device,
ctx->debug_host_buf,
WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
"debug_host_buf");
ggml_webgpu_create_buffer(ctx->device,
ctx->debug_dev_buf,
WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc,
"debug_dev_buf");
#endif
static ggml_backend_webgpu_device_context device_ctx; static ggml_backend_webgpu_device_context device_ctx;
device_ctx.webgpu_ctx = ctx; device_ctx.webgpu_ctx = ctx;
device_ctx.device_name = GGML_WEBGPU_NAME; device_ctx.device_name = GGML_WEBGPU_NAME;
device_ctx.device_desc = std::string(info.description.data); device_ctx.device_desc = info.description;
GGML_LOG_INFO( GGML_LOG_INFO(
"ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | " "ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | "
"device_desc: %s\n", "device_desc: %s\n",
info.vendorID, info.vendorID,
info.vendor.data, std::string(info.vendor).c_str(),
info.architecture.data, std::string(info.architecture).c_str(),
info.deviceID, info.deviceID,
info.device.data, std::string(info.device).c_str(),
info.description.data); std::string(info.description).c_str());
// See GGML Backend Device Interface section // See GGML Backend Device Interface section
static ggml_backend_device device = { static ggml_backend_device device = {


@ -1,35 +1,85 @@
import os import os
import re
import ast
import argparse import argparse
def escape_triple_quotes(wgsl): def extract_block(text, name):
# Simple defense in case of embedded """ pattern = rf'#define\({name}\)\s*(.*?)#end\({name}\)'
return wgsl.replace('"""', '\\"""') match = re.search(pattern, text, re.DOTALL)
if not match:
raise ValueError(f"Missing block: {name}")
return match.group(1).strip()
def to_cpp_string_literal(varname, content): def parse_decls(decls_text):
return f'const char* wgsl_{varname} = R"({content})";\n' decls = {}
for name, code in re.findall(r'#decl\((.*?)\)\s*(.*?)#enddecl\(\1\)', decls_text, re.DOTALL):
decls[name.strip()] = code.strip()
return decls
def replace_placeholders(shader_text, replacements):
for key, val in replacements.items():
# Match {{KEY}} literally, where KEY is escaped
pattern = r'{{\s*' + re.escape(key) + r'\s*}}'
shader_text = re.sub(pattern, str(val), shader_text)
return shader_text
def write_shader(shader_name, shader_code, output_dir, outfile):
if output_dir:
wgsl_filename = os.path.join(output_dir, f"{shader_name}.wgsl")
with open(wgsl_filename, "w", encoding="utf-8") as f_out:
f_out.write(shader_code)
outfile.write(f'const char* wgsl_{shader_name} = R"({shader_code})";\n\n')
def generate_variants(shader_path, output_dir, outfile):
shader_base_name = shader_path.split("/")[-1].split(".")[0]
with open(shader_path, "r", encoding="utf-8") as f:
text = f.read()
try:
variants = ast.literal_eval(extract_block(text, "VARIANTS"))
except ValueError:
write_shader(shader_base_name, text, output_dir, outfile)
else:
decls_map = parse_decls(extract_block(text, "DECLS"))
shader_template = extract_block(text, "SHADER")
for variant in variants:
decls = variant["DECLS"]
decls_code = ""
for key in decls:
if key not in decls_map:
raise ValueError(f"DECLS key '{key}' not found.")
decls_code += decls_map[key] + "\n\n"
shader_variant = replace_placeholders(shader_template, variant["REPLS"])
final_shader = re.sub(r'\bDECLS\b', decls_code, shader_variant)
output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]])
write_shader(output_name, final_shader, output_dir, outfile)
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True) parser.add_argument("--input_dir", required=True)
parser.add_argument('--output', required=True) parser.add_argument("--output_file", required=True)
parser.add_argument("--output_dir")
args = parser.parse_args() args = parser.parse_args()
with open(args.output, 'w', encoding='utf-8') as out: if args.output_dir:
os.makedirs(args.output_dir, exist_ok=True)
with open(args.output_file, "w", encoding="utf-8") as out:
out.write("// Auto-generated shader embedding\n\n") out.write("// Auto-generated shader embedding\n\n")
for fname in sorted(os.listdir(args.input)): for fname in sorted(os.listdir(args.input_dir)):
if not fname.endswith('.wgsl'): if fname.endswith(".wgsl"):
continue generate_variants(os.path.join(args.input_dir, fname), args.output_dir, out)
shader_path = os.path.join(args.input, fname)
varname = os.path.splitext(fname)[0]
with open(shader_path, 'r', encoding='utf-8') as f:
content = f.read()
content = escape_triple_quotes(content)
out.write(to_cpp_string_literal(varname, content))
out.write('\n')
if __name__ == '__main__': if __name__ == "__main__":
main() main()


@ -19,20 +19,20 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
let start = params.offset; let start = params.offset;
let end = params.offset + params.size; let end = params.offset + params.size;
for (var j: u32 = 0u; j < bytes_per_thread; j = j + 1u) { for (var j: u32 = 0u; j < bytes_per_thread; j += 4) {
let byte_index = start + i + j; let byte_index = start + i + j;
if (byte_index + 4u <= end) { if (byte_index + 4 <= end) {
output_buffer[(byte_index >> 2u)] = params.value; output_buffer[byte_index >> 2] = params.value;
} else { } else {
// Handle tail (unaligned) // Handle tail (unaligned)
for (var k: u32 = 0u; k < 4u; k = k + 1u) { for (var k: u32 = 0; k < 4; k++) {
let idx = byte_index + k; let idx = byte_index + k;
if (idx < end) { if (idx < end) {
let word_idx = idx >> 2u; let word_idx = idx >> 2;
let byte_offset = (idx & 3u) * 8u; let bit_offset = (idx & 3) * 8u;
let mask = ~(0xffu << byte_offset); let mask = ~(0xffu << bit_offset);
let existing = output_buffer[word_idx]; let existing = output_buffer[word_idx];
output_buffer[word_idx] = (existing & mask) | ((params.value & 0xffu) << byte_offset); output_buffer[word_idx] = (existing & mask) | (params.value & (0xffu << bit_offset));
} }
} }
} }
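The unaligned-tail path above read-modify-writes a single byte inside a 32-bit word of the output buffer. A host-side C++ sketch of the same masking, for illustration only (the helper name and the little-endian byte order are assumptions of the sketch, not part of the shader):

```cpp
#include <cstdint>
#include <cstdio>

// Replace the byte at byte position (idx & 3) of `word` with the corresponding
// byte of `value`, where `value` is the fill byte replicated into a 32-bit word.
static uint32_t set_tail_byte(uint32_t word, uint32_t idx, uint32_t value) {
    const uint32_t bit_offset = (idx & 3u) * 8u;          // 0, 8, 16 or 24
    const uint32_t mask       = ~(0xffu << bit_offset);   // clear the target byte
    return (word & mask) | (value & (0xffu << bit_offset));
}

int main() {
    const uint32_t word  = 0xAABBCCDDu;
    const uint32_t value = 0x11111111u; // fill byte 0x11 replicated, as the shader receives it
    printf("%08x\n", set_tail_byte(word, 2, value)); // prints aa11ccdd
    return 0;
}
```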

File diff suppressed because it is too large


@ -1,56 +0,0 @@
struct MulMatParams {
m: u32,
n: u32,
k: u32,
// all strides are in elements
stride_01: u32,
stride_11: u32,
stride_02: u32,
stride_12: u32,
stride_03: u32,
stride_13: u32,
bs02: u32,
bs03: u32,
broadcast2: u32,
broadcast3: u32
};
@group(0) @binding(0) var<storage, read_write> src0: array<f32>; // N rows, K columns
@group(0) @binding(1) var<storage, read_write> src1: array<f32>; // M rows, K columns (transposed)
@group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns
@group(0) @binding(3) var<uniform> params: MulMatParams;
@compute @workgroup_size(64)
fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3;
if (global_id.x >= total) {
return;
}
let dst2_stride = params.m * params.n;
let dst3_stride = dst2_stride * params.bs02 * params.broadcast2;
let dst3_idx = global_id.x / dst3_stride;
let src03_idx = dst3_idx / params.broadcast3; // src0 may be broadcast along the third dimension
let src13_idx = dst3_idx; // src1 is not broadcast
let dst3_rem = global_id.x % dst3_stride;
let dst2_idx = dst3_rem / dst2_stride;
let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension
let src12_idx = dst2_idx; // src1 is not broadcast
let dst2_rem = dst3_rem % dst2_stride;
let row = dst2_rem / params.n; // output row
let col = dst2_rem % params.n; // output column
var sum = 0.0;
for (var i: u32 = 0u; i < params.k; i = i + 1u) {
let src0_idx = src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01 + i;
let src1_idx = src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11 + i;
sum = sum + src0[src0_idx] * src1[src1_idx];
}
dst[dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum;
}
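For reference, the removed naive F32×F32 shader assigned one output element per invocation and recovered the batch, row and column indices from the flat invocation id. A host-side C++ transcription of that index math (a sketch for clarity, not part of the new generated shaders):

```cpp
#include <cstdint>
#include <cstdio>

struct mul_mat_dims {
    uint32_t m, n, bs02, bs03, broadcast2, broadcast3;
};

// Decompose a flat invocation id into (dst3_idx, dst2_idx, row, col),
// mirroring the index math of the removed shader.
static void decompose(uint32_t gid, const mul_mat_dims & p,
                      uint32_t & dst3_idx, uint32_t & dst2_idx,
                      uint32_t & row, uint32_t & col) {
    const uint32_t dst2_stride = p.m * p.n;
    const uint32_t dst3_stride = dst2_stride * p.bs02 * p.broadcast2;

    dst3_idx = gid / dst3_stride;
    const uint32_t dst3_rem = gid % dst3_stride;

    dst2_idx = dst3_rem / dst2_stride;
    const uint32_t dst2_rem = dst3_rem % dst2_stride;

    row = dst2_rem / p.n;
    col = dst2_rem % p.n;
}

int main() {
    const mul_mat_dims p = { /*m=*/4, /*n=*/3, /*bs02=*/2, /*bs03=*/1, /*broadcast2=*/1, /*broadcast3=*/1 };
    uint32_t d3, d2, row, col;
    decompose(/*gid=*/17, p, d3, d2, row, col);
    // dst2_stride = 12, dst3_stride = 24 -> d3 = 0, d2 = 1, row = 1, col = 2
    printf("%u %u %u %u\n", d3, d2, row, col);
    return 0;
}
```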


@ -975,6 +975,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"IM2COL", "IM2COL",
"IM2COL_BACK", "IM2COL_BACK",
"CONV_2D", "CONV_2D",
"CONV_3D",
"CONV_2D_DW", "CONV_2D_DW",
"CONV_TRANSPOSE_2D", "CONV_TRANSPOSE_2D",
"POOL_1D", "POOL_1D",
@ -1017,7 +1018,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GLU", "GLU",
}; };
static_assert(GGML_OP_COUNT == 88, "GGML_OP_COUNT != 88"); static_assert(GGML_OP_COUNT == 89, "GGML_OP_COUNT != 89");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
@ -1077,6 +1078,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"im2col(x)", "im2col(x)",
"im2col_back(x)", "im2col_back(x)",
"conv_2d(x)", "conv_2d(x)",
"conv_3d(x)",
"conv_2d_dw(x)", "conv_2d_dw(x)",
"conv_transpose_2d(x)", "conv_transpose_2d(x)",
"pool_1d(x)", "pool_1d(x)",
@ -1119,7 +1121,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"glu(x)", "glu(x)",
}; };
static_assert(GGML_OP_COUNT == 88, "GGML_OP_COUNT != 88"); static_assert(GGML_OP_COUNT == 89, "GGML_OP_COUNT != 89");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -4480,6 +4482,56 @@ struct ggml_tensor * ggml_conv_2d_direct(
return result; return result;
} }
// ggml_conv_3d
struct ggml_tensor * ggml_conv_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int s2,
int p0,
int p1,
int p2,
int d0,
int d1,
int d2,
int c,
int n,
int oc) {
GGML_ASSERT(a->ne[3] == (int64_t) c * oc);
GGML_ASSERT(b->ne[3] == (int64_t) c * n);
int64_t ne[4];
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
ne[2] = ggml_calc_conv_output_size(b->ne[2], a->ne[2], s2, p2, d2);
ne[3] = (int64_t) oc * n;
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
ggml_set_op_params_i32(result, 0, s0);
ggml_set_op_params_i32(result, 1, s1);
ggml_set_op_params_i32(result, 2, s2);
ggml_set_op_params_i32(result, 3, p0);
ggml_set_op_params_i32(result, 4, p1);
ggml_set_op_params_i32(result, 5, p2);
ggml_set_op_params_i32(result, 6, d0);
ggml_set_op_params_i32(result, 7, d1);
ggml_set_op_params_i32(result, 8, d2);
ggml_set_op_params_i32(result, 9, c);
ggml_set_op_params_i32(result, 10, n);
ggml_set_op_params_i32(result, 11, oc);
result->op = GGML_OP_CONV_3D;
result->src[0] = a;
result->src[1] = b;
return result;
}
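A minimal usage sketch for the new ggml_conv_3d, assuming this revision of ggml declares it in ggml.h; the F32 tensor types, sizes and the graph-free inspection of the output shape are choices of the sketch, not requirements of the API:

```cpp
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int c = 2, n = 1, oc = 4; // input channels, batch size, output channels

    // kernel: 3x3x3 spatial, ne[3] must equal c*oc
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 3, c * oc);
    // input: 16x16x16 spatial, ne[3] must equal c*n
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 16, c * n);

    // stride 1, padding 1, dilation 1 in all three dims
    struct ggml_tensor * out = ggml_conv_3d(ctx, a, b, 1, 1, 1, 1, 1, 1, 1, 1, 1, c, n, oc);

    printf("out: %lld x %lld x %lld x %lld\n",
           (long long) out->ne[0], (long long) out->ne[1],
           (long long) out->ne[2], (long long) out->ne[3]); // expected: 16 x 16 x 16 x 4

    ggml_free(ctx);
    return 0;
}
```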
// ggml_conv_transpose_2d_p0 // ggml_conv_transpose_2d_p0
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {

View File

@ -385,6 +385,7 @@ class MODEL_ARCH(IntEnum):
DREAM = auto() DREAM = auto()
SMALLTHINKER = auto() SMALLTHINKER = auto()
LLADA = auto() LLADA = auto()
SEED_OSS = auto()
class VISION_PROJECTOR_TYPE(IntEnum): class VISION_PROJECTOR_TYPE(IntEnum):
@ -717,6 +718,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.DREAM: "dream", MODEL_ARCH.DREAM: "dream",
MODEL_ARCH.SMALLTHINKER: "smallthinker", MODEL_ARCH.SMALLTHINKER: "smallthinker",
MODEL_ARCH.LLADA: "llada", MODEL_ARCH.LLADA: "llada",
MODEL_ARCH.SEED_OSS: "seed_oss",
} }
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@ -1973,6 +1975,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.SEED_OSS: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
],
MODEL_ARCH.OLMOE: [ MODEL_ARCH.OLMOE: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -2590,6 +2606,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.OUTPUT,
], ],
MODEL_ARCH.SMALLTHINKER: [ MODEL_ARCH.SMALLTHINKER: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,

View File

@ -312,7 +312,7 @@ extern "C" {
float yarn_beta_fast; // YaRN low correction dim float yarn_beta_fast; // YaRN low correction dim
float yarn_beta_slow; // YaRN high correction dim float yarn_beta_slow; // YaRN high correction dim
uint32_t yarn_orig_ctx; // YaRN original context size uint32_t yarn_orig_ctx; // YaRN original context size
float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default) float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
ggml_backend_sched_eval_callback cb_eval; ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data; void * cb_eval_user_data;


@ -28,7 +28,6 @@ LLAMA_BENCH_DB_FIELDS = [
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
"defrag_thold",
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth",
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
] ]
@ -38,7 +37,6 @@ LLAMA_BENCH_DB_TYPES = [
"TEXT", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "TEXT", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
"TEXT", "INTEGER", "INTEGER", "TEXT", "TEXT", "INTEGER", "TEXT", "INTEGER", "INTEGER", "TEXT", "TEXT", "INTEGER",
"TEXT", "INTEGER", "INTEGER", "INTEGER", "TEXT", "TEXT", "TEXT", "INTEGER", "INTEGER", "INTEGER", "TEXT", "TEXT",
"REAL",
"INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
"TEXT", "INTEGER", "INTEGER", "REAL", "REAL", "TEXT", "INTEGER", "INTEGER", "REAL", "REAL",
] ]


@ -93,6 +93,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DREAM, "dream" }, { LLM_ARCH_DREAM, "dream" },
{ LLM_ARCH_SMALLTHINKER, "smallthinker" }, { LLM_ARCH_SMALLTHINKER, "smallthinker" },
{ LLM_ARCH_LLADA, "llada" }, { LLM_ARCH_LLADA, "llada" },
{ LLM_ARCH_SEED_OSS, "seed_oss" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
@ -2010,6 +2011,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" }, { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
} }
}, },
{ {
@ -2067,6 +2069,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_SEED_OSS,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{ {
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
{ {


@ -97,6 +97,7 @@ enum llm_arch {
LLM_ARCH_DREAM, LLM_ARCH_DREAM,
LLM_ARCH_SMALLTHINKER, LLM_ARCH_SMALLTHINKER,
LLM_ARCH_LLADA, LLM_ARCH_LLADA,
LLM_ARCH_SEED_OSS,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };


@ -16,10 +16,10 @@
static std::string trim(const std::string & str) { static std::string trim(const std::string & str) {
size_t start = 0; size_t start = 0;
size_t end = str.size(); size_t end = str.size();
while (start < end && isspace(str[start])) { while (start < end && isspace(static_cast<unsigned char>(str[start]))) {
start += 1; start += 1;
} }
while (end > start && isspace(str[end - 1])) { while (end > start && isspace(static_cast<unsigned char>(str[end - 1]))) {
end -= 1; end -= 1;
} }
return str.substr(start, end - start); return str.substr(start, end - start);
@ -69,6 +69,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE }, { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE }, { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 }, { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
}; };
llm_chat_template llm_chat_template_from_str(const std::string & name) { llm_chat_template llm_chat_template_from_str(const std::string & name) {
@ -201,6 +202,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE; return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) { } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
return LLM_CHAT_TEMPLATE_KIMI_K2; return LLM_CHAT_TEMPLATE_KIMI_K2;
} else if (tmpl_contains("<seed:bos>")) {
return LLM_CHAT_TEMPLATE_SEED_OSS;
} }
return LLM_CHAT_TEMPLATE_UNKNOWN; return LLM_CHAT_TEMPLATE_UNKNOWN;
} }
@ -752,6 +755,14 @@ int32_t llm_chat_apply_template(
if (add_ass) { if (add_ass) {
ss << "<|im_assistant|>assistant<|im_middle|>"; ss << "<|im_assistant|>assistant<|im_middle|>";
} }
} else if (tmpl == LLM_CHAT_TEMPLATE_SEED_OSS) {
for (auto message: chat) {
std::string role(message->role);
ss << "<seed:bos>" << role << "\n" << (role == "assistant" ? trim(message->content) : message->content) << "<seed:eos>";
}
if (add_ass) {
ss << "<seed:bos>assistant\n";
}
} else { } else {
// template not supported // template not supported
return -1; return -1;
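The Seed-OSS branch above wraps each turn in <seed:bos>role, a newline, the content and <seed:eos>, and with add_ass leaves an open assistant header. A small stand-alone C++ illustration of the string it produces for one user turn (the message content is an arbitrary example):

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
    struct message { std::string role, content; };
    const std::vector<message> chat = { { "user", "Hello" } };

    std::string ss;
    for (const auto & m : chat) {
        ss += "<seed:bos>" + m.role + "\n" + m.content + "<seed:eos>";
    }
    ss += "<seed:bos>assistant\n"; // add_ass == true

    std::cout << ss;
    // output (real newlines shown as \n): <seed:bos>user\nHello<seed:eos><seed:bos>assistant\n
    return 0;
}
```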


@ -49,6 +49,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_OPENAI_MOE, LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE, LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
LLM_CHAT_TEMPLATE_KIMI_K2, LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_UNKNOWN, LLM_CHAT_TEMPLATE_UNKNOWN,
}; };


@ -39,7 +39,6 @@ llama_context::llama_context(
cparams.yarn_attn_factor = params.yarn_attn_factor; cparams.yarn_attn_factor = params.yarn_attn_factor;
cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
cparams.embeddings = params.embeddings; cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv; cparams.offload_kqv = params.offload_kqv;
cparams.flash_attn = params.flash_attn; cparams.flash_attn = params.flash_attn;
@ -978,7 +977,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
bool did_optimize = false; bool did_optimize = false;
// handle any pending defrags/shifts // handle any pending shifts/copies
memory_update(false); memory_update(false);
llama_memory_context_ptr mctx; llama_memory_context_ptr mctx;


@ -24,7 +24,6 @@ struct llama_cparams {
float yarn_attn_factor; float yarn_attn_factor;
float yarn_beta_fast; float yarn_beta_fast;
float yarn_beta_slow; float yarn_beta_slow;
float defrag_thold;
bool embeddings; bool embeddings;
bool causal_attn; bool causal_attn;


@ -153,3 +153,28 @@ bool llama_hparams::is_swa(uint32_t il) const {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
bool llama_hparams::has_kv(uint32_t il) const {
if (n_layer_kv_from_start >= 0) {
if (il < (uint32_t) n_layer_kv_from_start) {
return true;
}
return false;
}
// by default, all layers have kv
return true;
}
uint32_t llama_hparams::n_layer_kv() const {
uint32_t res = 0;
for (uint32_t il = 0; il < n_layer; ++il) {
if (has_kv(il)) {
res++;
}
}
return res;
}
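The two helpers above make per-layer KV presence a property of the hparams: with n_layer_kv_from_start >= 0 only the first that many layers get a cache, otherwise every layer does. A stand-alone sketch of the same rule (the layer count is chosen for illustration; Gemma3n sets the cutoff to 20 further down in this commit):

```cpp
#include <cstdint>
#include <cstdio>

// Mirror of llama_hparams::has_kv() for illustration.
static bool has_kv(int32_t n_layer_kv_from_start, uint32_t il) {
    if (n_layer_kv_from_start >= 0) {
        return il < (uint32_t) n_layer_kv_from_start;
    }
    return true; // by default, all layers have KV
}

int main() {
    const uint32_t n_layer = 30;       // illustrative layer count
    const int32_t  cutoff  = 20;       // e.g. Gemma3n-style: first 20 layers cached

    uint32_t n_layer_kv = 0;
    for (uint32_t il = 0; il < n_layer; ++il) {
        n_layer_kv += has_kv(cutoff, il) ? 1 : 0;
    }
    printf("layers with KV: %u of %u\n", n_layer_kv, n_layer); // prints: 20 of 30
    return 0;
}
```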


@ -41,6 +41,7 @@ struct llama_hparams {
uint32_t n_embd; uint32_t n_embd;
uint32_t n_embd_features = 0; uint32_t n_embd_features = 0;
uint32_t n_layer; uint32_t n_layer;
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
uint32_t n_rot; uint32_t n_rot;
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@ -221,6 +222,11 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const; uint32_t n_pos_per_embd() const;
bool is_swa(uint32_t il) const; bool is_swa(uint32_t il) const;
bool has_kv(uint32_t il) const;
// number of layers for which has_kv() returns true
uint32_t n_layer_kv() const;
}; };
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable"); static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");


@ -22,9 +22,26 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
uint32_t kv_size, uint32_t kv_size,
uint32_t n_seq_max, uint32_t n_seq_max,
uint32_t n_ubatch, uint32_t n_ubatch,
uint32_t n_pad) : hparams(model.hparams), unified(unified) { uint32_t n_pad,
llama_kv_cache::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); }; const layer_filter_cb & filter,
llama_kv_cache::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); }; const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {
// chain filters
const layer_filter_cb filter_base = [&](int32_t il) {
if (filter && !filter(il)) {
return false;
}
return !model.hparams.is_swa(il);
};
const layer_filter_cb filter_swa = [&](int32_t il) {
if (filter && !filter(il)) {
return false;
}
return model.hparams.is_swa(il);
};
const uint32_t size_base = kv_size; const uint32_t size_base = kv_size;
@ -41,16 +58,16 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
kv_base = std::make_unique<llama_kv_cache>( kv_base = std::make_unique<llama_kv_cache>(
model, std::move(filter_base), type_k, type_v, model, type_k, type_v,
v_trans, offload, unified, size_base, n_seq_max, n_pad, v_trans, offload, unified, size_base, n_seq_max, n_pad,
0, LLAMA_SWA_TYPE_NONE); 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
kv_swa = std::make_unique<llama_kv_cache>( kv_swa = std::make_unique<llama_kv_cache>(
model, std::move(filter_swa), type_k, type_v, model, type_k, type_v,
v_trans, offload, unified, size_swa, n_seq_max, n_pad, v_trans, offload, unified, size_swa, n_seq_max, n_pad,
hparams.n_swa, hparams.swa_type); hparams.n_swa, hparams.swa_type, filter_swa, reuse);
} }
void llama_kv_cache_iswa::clear(bool data) { void llama_kv_cache_iswa::clear(bool data) {


@ -20,11 +20,13 @@ public:
bool v_trans, bool v_trans,
bool offload, bool offload,
bool swa_full, bool swa_full,
bool , bool unified,
uint32_t kv_size, uint32_t kv_size,
uint32_t n_seq_max, uint32_t n_seq_max,
uint32_t n_ubatch, uint32_t n_ubatch,
uint32_t n_pad); uint32_t n_pad,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse);
~llama_kv_cache_iswa() = default; ~llama_kv_cache_iswa() = default;


@ -18,7 +18,6 @@
llama_kv_cache::llama_kv_cache( llama_kv_cache::llama_kv_cache(
const llama_model & model, const llama_model & model,
layer_filter_cb && filter,
ggml_type type_k, ggml_type type_k,
ggml_type type_v, ggml_type type_v,
bool v_trans, bool v_trans,
@ -28,21 +27,15 @@ llama_kv_cache::llama_kv_cache(
uint32_t n_seq_max, uint32_t n_seq_max,
uint32_t n_pad, uint32_t n_pad,
uint32_t n_swa, uint32_t n_swa,
llama_swa_type swa_type) : llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse) :
model(model), hparams(model.hparams), v_trans(v_trans), model(model), hparams(model.hparams), v_trans(v_trans),
n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
GGML_ASSERT(kv_size % n_pad == 0); GGML_ASSERT(kv_size % n_pad == 0);
// TODO: this is temporary until we support passing reuse layer filters [KV_REUSE] const uint32_t n_layer_kv = hparams.n_layer_kv();
auto n_layer_cache = hparams.n_layer;
if (model.arch == LLM_ARCH_GEMMA3N) {
n_layer_cache = 20;
}
if (model.arch == LLM_ARCH_GLM4_MOE) {
// GLM-4.5: Only process up to last layer, skip final NextN layer
n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
}
// create a context for each buffer type // create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@ -50,7 +43,7 @@ llama_kv_cache::llama_kv_cache(
auto it = ctx_map.find(buft); auto it = ctx_map.find(buft);
if (it == ctx_map.end()) { if (it == ctx_map.end()) {
ggml_init_params params = { ggml_init_params params = {
/*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_cache*ggml_tensor_overhead()), /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
/*.mem_buffer =*/ NULL, /*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
@ -97,9 +90,14 @@ llama_kv_cache::llama_kv_cache(
__func__, hparams.n_embd_v_gqa_max()); __func__, hparams.n_embd_v_gqa_max());
} }
for (uint32_t il = 0; il < n_layer_cache; il++) { for (uint32_t il = 0; il < hparams.n_layer; il++) {
if (!hparams.has_kv(il)) {
LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
continue;
}
if (filter && !filter(il)) { if (filter && !filter(il)) {
LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il); LLAMA_LOG_DEBUG("%s: layer %3d: filtered\n", __func__, il);
continue; continue;
} }
@ -147,23 +145,27 @@ llama_kv_cache::llama_kv_cache(
layers.push_back({ il, k, v, k_stream, v_stream, }); layers.push_back({ il, k, v, k_stream, v_stream, });
} }
// TODO: this is temporary until we support passing reuse layer filters [KV_REUSE] if (reuse) {
if (model.arch == LLM_ARCH_GEMMA3N) { LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);
LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1);
for (uint32_t il = n_layer_cache; il < hparams.n_layer; il++) { for (uint32_t il = 0; il < hparams.n_layer; il++) {
if (filter && !filter(il)) { const int32_t il_reuse = reuse(il);
LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
if (il_reuse < 0) {
LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
continue; continue;
} }
const bool is_swa = hparams.is_swa(il); if (filter && !filter(il)) {
const uint32_t il_reuse = n_layer_cache - (is_swa ? 2 : 1); LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
continue;
}
GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end()); GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
map_layer_ids[il] = map_layer_ids[il_reuse]; map_layer_ids[il] = map_layer_ids[il_reuse];
LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d, isw = %d\n", __func__, il, il_reuse, is_swa); LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
} }
} }
@ -525,39 +527,11 @@ llama_memory_context_ptr llama_kv_cache::init_full() {
} }
llama_memory_context_ptr llama_kv_cache::init_update(llama_context * lctx, bool optimize) { llama_memory_context_ptr llama_kv_cache::init_update(llama_context * lctx, bool optimize) {
GGML_UNUSED(optimize);
bool do_shift = get_has_shift(); bool do_shift = get_has_shift();
defrag_info dinfo; return std::make_unique<llama_kv_cache_context>(this, lctx, do_shift, std::move(sc_info));
// see if we need to defrag
if (n_stream == 1) {
// note : for now do not consider defrag for n_stream > 1
const auto & cells = v_cells[seq_to_stream[0]];
bool do_defrag = optimize;
const auto thold = lctx->get_cparams().defrag_thold;
if (!do_defrag && thold > 0.0f) {
const auto n_kv = cells.used_max_p1();
// - do not defrag small contexts (i.e. < 2048 tokens)
// - count the padding towards the number of used tokens
const float fragmentation = n_kv >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f;
if (fragmentation > thold) {
LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
do_defrag = true;
}
}
if (do_defrag) {
dinfo = defrag_prepare(lctx->graph_max_nodes());
}
}
return std::make_unique<llama_kv_cache_context>(this, lctx, do_shift, std::move(dinfo), std::move(sc_info));
} }
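For context, the heuristic removed here only considered contexts of at least 2048 cells and measured fragmentation as 1 - (used + n_pad) / n_kv, where n_kv is the highest used cell index plus one. For example, with n_kv = 4096, used = 3000 and n_pad = 32 the metric is roughly 1 - 3032/4096 ≈ 0.26, so any defrag_thold below 0.26 would have requested a defrag; with the option deprecated, init_update() now only schedules shifts and stream copies.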
llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_ubatch> & ubatches) { llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_ubatch> & ubatches) {
@ -629,7 +603,7 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
return res; return res;
} }
bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info) { bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info) {
bool updated = false; bool updated = false;
auto * sched = lctx->get_sched(); auto * sched = lctx->get_sched();
@ -699,53 +673,6 @@ bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const defrag_in
} }
} }
if (!dinfo.empty()) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
// note: for now do not consider defrag for n_stream > 1
auto & cells = v_cells[seq_to_stream[0]];
auto & head = v_heads[seq_to_stream[0]];
// apply moves:
{
const auto n_kv = dinfo.ids.size();
for (uint32_t i = 0; i < n_kv; ++i) {
assert(dinfo.ids[i] <= n_kv);
if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) {
continue;
}
cells.mv(i, dinfo.ids[i]);
}
// reset the head so we can find the first free slot during the next ubatch
head = 0;
}
ggml_backend_sched_reset(sched);
auto * res = lctx->get_gf_res_reserve();
res->reset();
auto * gf = build_graph_defrag(res, lctx, dinfo);
if (!ggml_backend_sched_alloc_graph(sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
return updated;
}
res->set_inputs(nullptr);
if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__);
return updated;
}
updated = true;
}
return updated; return updated;
} }
@ -1525,283 +1452,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
return gf; return gf;
} }
ggml_cgraph * llama_kv_cache::build_graph_defrag(
llm_graph_result * res,
llama_context * lctx,
const defrag_info & dinfo) const {
auto * ctx = res->get_ctx();
auto * gf = res->get_gf();
GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
const auto & cells = v_cells[0];
const auto & ids = dinfo.ids;
const auto & cparams = lctx->get_cparams();
#if 0
// CPU defrag
//
// TODO: optimizations are possible:
// - multiple threads
// - avoid copying to the host memory when already there
//
// likely not worth the effort, as we have ggml_graph based defrag
//
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const uint32_t kv_size = size;
std::vector<uint8_t> buf_k;
std::vector<uint8_t> buf_v;
for (uint32_t il = 0; il < n_layer; ++il) {
const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
const size_t v_size_el = ggml_type_size(v_l[il]->type);
const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
buf_k.resize(k_size);
buf_v.resize(v_size);
ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
// batch move [i, i+nm) to [id, id+nm)
// note: cells can move only to a lower index
for (uint32_t i = 0; i < n_kv; ++i) {
const uint32_t id = ids[i];
if (i == id || id == n_kv) {
continue;
}
uint32_t nm = 1;
while (i + nm < n_kv && ids[i + nm] == id + nm) {
nm++;
}
// move keys
{
const int64_t os = i*k_size_row;
const int64_t od = id*k_size_row;
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
}
// move values (note: they are transposed)
{
const int64_t os = i;
const int64_t od = id;
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
}
}
i += nm - 1;
}
ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
for (uint32_t i = 0; i < ids.size(); ++i) {
const uint32_t id = ids[i];
if (i == id || id == ids.size()) {
continue;
}
uint32_t nm = 1;
while (i + nm < ids.size() && ids[i + nm] == id + nm) {
nm++;
}
for (const auto & layer : layers) {
const uint32_t il = layer.il;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k,
n_embd_k_gqa, nm,
ggml_row_size(layer.k->type, n_embd_k_gqa),
ggml_row_size(layer.k->type, n_embd_k_gqa*i));
ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k,
n_embd_k_gqa, nm,
ggml_row_size(layer.k->type, n_embd_k_gqa),
ggml_row_size(layer.k->type, n_embd_k_gqa*id));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx, layer.v,
n_embd_v_gqa, nm,
ggml_row_size(layer.v->type, n_embd_v_gqa),
ggml_row_size(layer.v->type, n_embd_v_gqa*i));
view_v_dst = ggml_view_2d(ctx, layer.v,
n_embd_v_gqa, nm,
ggml_row_size(layer.v->type, n_embd_v_gqa),
ggml_row_size(layer.v->type, n_embd_v_gqa*id));
} else {
view_v_src = ggml_view_2d(ctx, layer.v,
nm, n_embd_v_gqa,
ggml_row_size(layer.v->type, cells.size()),
ggml_row_size(layer.v->type, i));
view_v_dst = ggml_view_2d(ctx, layer.v,
nm, n_embd_v_gqa,
ggml_row_size(layer.v->type, cells.size()),
ggml_row_size(layer.v->type, id));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
}
i += nm - 1;
}
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif
return gf;
}
llama_kv_cache::defrag_info llama_kv_cache::defrag_prepare(int32_t n_max_nodes) const {
GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
const auto & cells = v_cells[0];
const uint32_t n_layer = layers.size();
const uint32_t n_kv = cells.used_max_p1();
const uint32_t n_used = cells.get_used();
assert(n_used <= n_kv);
//const int64_t t_start = ggml_time_us();
// number of cells moved
uint32_t n_moves = 0;
// each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
// - source view, destination view, copy operation
// - x2 for keys and values
//const uint32_t max_moves = max_nodes()/(6*n_layer);
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
// determine which KV cells to move where
defrag_info res;
auto & ids = res.ids;
ids.resize(n_kv, n_kv);
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
if (!cells.is_empty(i0)) {
ids[i0] = i0;
continue;
}
// found a hole - fill it with data from the end of the cache
uint32_t nh = 1;
// determine the size of the hole
while (i0 + nh < n_used && cells.is_empty(i0 + nh)) {
nh++;
}
uint32_t nf = 0;
uint32_t is = n_kv - 1;
// starting from the end, find nh non-empty cells
for (; is > i0; --is) {
if (cells.is_empty(is) || ids[is] != n_kv) {
continue;
}
// non-empty cell which is not yet moved
nf++;
if (nf == nh) {
break;
}
}
// this can only happen if `n_used` is not accurate, which would be a bug
GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
nf = 0;
uint32_t i1 = is;
// are we moving a continuous block of memory?
bool cont = false;
// should we stop searching for the next move?
bool stop = false;
// go back and move the nf cells to the hole
for (; i1 < n_kv; ++i1) {
if (cells.is_empty(i1) || ids[i1] != n_kv) {
if (n_moves == max_moves) {
stop = true;
break;
}
cont = false;
continue;
}
// this cell goes to (i0 + nf)
ids[i1] = i0 + nf;
if (!cont) {
n_moves++;
cont = true;
}
nf++;
if (nf == nh) {
break;
}
}
if (stop || n_moves == max_moves) {
break;
}
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
i0 += nh - 1;
}
if (n_moves == 0) {
return {};
}
LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
return res;
}
bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const { bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
assert(p0 >= 0 && p1 >= 0); assert(p0 >= 0 && p1 >= 0);
@ -2300,9 +1950,8 @@ llama_kv_cache_context::llama_kv_cache_context(
llama_kv_cache * kv, llama_kv_cache * kv,
llama_context * lctx, llama_context * lctx,
bool do_shift, bool do_shift,
defrag_info dinfo, stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), sc_info(std::move(sc_info)) {
stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)), sc_info(std::move(sc_info)) { if (!do_shift && this->sc_info.empty()) {
if (!do_shift && this->dinfo.empty() && this->sc_info.empty()) {
status = LLAMA_MEMORY_STATUS_NO_UPDATE; status = LLAMA_MEMORY_STATUS_NO_UPDATE;
} }
} }
@ -2330,7 +1979,7 @@ bool llama_kv_cache_context::apply() {
// no ubatches -> this is a KV cache update // no ubatches -> this is a KV cache update
if (ubatches.empty()) { if (ubatches.empty()) {
kv->update(lctx, do_shift, dinfo, sc_info); kv->update(lctx, do_shift, sc_info);
return true; return true;
} }


@ -21,20 +21,6 @@ class llama_kv_cache : public llama_memory_i {
public: public:
static uint32_t get_padding(const llama_cparams & cparams); static uint32_t get_padding(const llama_cparams & cparams);
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
struct defrag_info {
bool empty() const {
return ids.empty();
}
// contains information about which cell moves where:
// - cell i moves to ids[i]
// - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
std::vector<uint32_t> ids;
};
struct stream_copy_info { struct stream_copy_info {
bool empty() const { bool empty() const {
assert(ssrc.size() == sdst.size()); assert(ssrc.size() == sdst.size());
@ -94,7 +80,6 @@ public:
llama_kv_cache( llama_kv_cache(
const llama_model & model, const llama_model & model,
layer_filter_cb && filter,
ggml_type type_k, ggml_type type_k,
ggml_type type_v, ggml_type type_v,
bool v_trans, bool v_trans,
@ -104,7 +89,9 @@ public:
uint32_t n_seq_max, uint32_t n_seq_max,
uint32_t n_pad, uint32_t n_pad,
uint32_t n_swa, uint32_t n_swa,
llama_swa_type swa_type); llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse);
~llama_kv_cache() = default; ~llama_kv_cache() = default;
@ -173,7 +160,7 @@ public:
// return empty vector on failure // return empty vector on failure
slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches); slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info); bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info);
// find a slot of kv cells that can hold the ubatch // find a slot of kv cells that can hold the ubatch
// if cont == true, then the slot must be continuous // if cont == true, then the slot must be continuous
@ -254,9 +241,6 @@ private:
// model layer id -> KV cache layer id // model layer id -> KV cache layer id
std::unordered_map<int32_t, int32_t> map_layer_ids; std::unordered_map<int32_t, int32_t> map_layer_ids;
// return non-empty vector if cells have been moved
defrag_info defrag_prepare(int32_t n_max_nodes) const;
size_t total_size() const; size_t total_size() const;
size_t size_k_bytes() const; size_t size_k_bytes() const;
@ -277,11 +261,6 @@ private:
llm_graph_result * res, llm_graph_result * res,
llama_context * lctx) const; llama_context * lctx) const;
ggml_cgraph * build_graph_defrag(
llm_graph_result * res,
llama_context * lctx,
const defrag_info & dinfo) const;
struct cell_ranges_t { struct cell_ranges_t {
uint32_t strm; uint32_t strm;
@ -299,7 +278,6 @@ class llama_kv_cache_context : public llama_memory_context_i {
public: public:
// some shorthands // some shorthands
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t; using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
using defrag_info = llama_kv_cache::defrag_info;
using stream_copy_info = llama_kv_cache::stream_copy_info; using stream_copy_info = llama_kv_cache::stream_copy_info;
// used for errors // used for errors
@ -314,7 +292,6 @@ public:
llama_kv_cache * kv, llama_kv_cache * kv,
llama_context * lctx, llama_context * lctx,
bool do_shift, bool do_shift,
defrag_info dinfo,
stream_copy_info sc_info); stream_copy_info sc_info);
// used to create a batch procesing context from a batch // used to create a batch procesing context from a batch
@ -374,8 +351,6 @@ private:
bool do_shift = false; bool do_shift = false;
defrag_info dinfo;
stream_copy_info sc_info; stream_copy_info sc_info;
// //


@ -77,24 +77,24 @@ public:
} }
// move cell isrc to idst (used during defrag) // move cell isrc to idst (used during defrag)
void mv(uint32_t isrc, uint32_t idst) { //void mv(uint32_t isrc, uint32_t idst) {
assert(isrc < pos.size()); // assert(isrc < pos.size());
assert(idst < pos.size()); // assert(idst < pos.size());
assert(pos[idst] == -1); // assert(pos[idst] == -1);
assert(pos[isrc] != -1); // assert(pos[isrc] != -1);
pos [idst] = pos [isrc]; // pos [idst] = pos [isrc];
shift[idst] = shift[isrc]; // shift[idst] = shift[isrc];
seq [idst] = seq [isrc]; // seq [idst] = seq [isrc];
pos [isrc] = -1; // pos [isrc] = -1;
shift[isrc] = 0; // shift[isrc] = 0;
seq [isrc].reset(); // seq [isrc].reset();
used.erase (isrc); // used.erase (isrc);
used.insert(idst); // used.insert(idst);
} //}
// copy the state of cells [i, i + n) (used for save/restore the state of the cells) // copy the state of cells [i, i + n) (used for save/restore the state of the cells)
llama_kv_cells cp(uint32_t i, uint32_t n) const { llama_kv_cells cp(uint32_t i, uint32_t n) const {


@ -27,14 +27,11 @@ llama_memory_hybrid::llama_memory_hybrid(
bool offload, bool offload,
bool unified, bool unified,
/* layer filters */ /* layer filters */
layer_filter_cb && filter_attn, const layer_filter_cb & filter_attn,
layer_filter_cb && filter_recr) : const layer_filter_cb & filter_recr) :
hparams(model.hparams), hparams(model.hparams),
mem_attn(new llama_kv_cache( mem_attn(new llama_kv_cache(
model, model,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
type_k, type_k,
type_v, type_v,
v_trans, v_trans,
@ -44,18 +41,22 @@ llama_memory_hybrid::llama_memory_hybrid(
n_seq_max, n_seq_max,
n_pad, n_pad,
n_swa, n_swa,
swa_type swa_type,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
nullptr
)), )),
mem_recr(new llama_memory_recurrent( mem_recr(new llama_memory_recurrent(
model, model,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr,
type_r, type_r,
type_s, type_s,
offload, offload,
rs_size, rs_size,
n_seq_max n_seq_max,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr
)) {} )) {}
llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {


@ -18,10 +18,6 @@
class llama_memory_hybrid : public llama_memory_i { class llama_memory_hybrid : public llama_memory_i {
public: public:
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
llama_memory_hybrid( llama_memory_hybrid(
const llama_model & model, const llama_model & model,
/* attn */ /* attn */
@ -41,8 +37,8 @@ public:
bool offload, bool offload,
bool unified, bool unified,
/* layer filters */ /* layer filters */
layer_filter_cb && filter_attn = nullptr, const layer_filter_cb & filter_attn = nullptr,
layer_filter_cb && filter_recr = nullptr); const layer_filter_cb & filter_recr = nullptr);
~llama_memory_hybrid() = default; ~llama_memory_hybrid() = default;


@ -17,12 +17,12 @@
llama_memory_recurrent::llama_memory_recurrent( llama_memory_recurrent::llama_memory_recurrent(
const llama_model & model, const llama_model & model,
layer_filter_cb && filter,
ggml_type type_r, ggml_type type_r,
ggml_type type_s, ggml_type type_s,
bool offload, bool offload,
uint32_t mem_size, uint32_t mem_size,
uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) { uint32_t n_seq_max,
const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
const int32_t n_layer = hparams.n_layer; const int32_t n_layer = hparams.n_layer;
head = 0; head = 0;


@ -15,18 +15,14 @@
// see the implementation of llama_kv_cache_context_i for an example how to do it // see the implementation of llama_kv_cache_context_i for an example how to do it
class llama_memory_recurrent : public llama_memory_i { class llama_memory_recurrent : public llama_memory_i {
public: public:
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
llama_memory_recurrent( llama_memory_recurrent(
const llama_model & model, const llama_model & model,
layer_filter_cb && filter,
ggml_type type_r, ggml_type type_r,
ggml_type type_s, ggml_type type_s,
bool offload, bool offload,
uint32_t mem_size, uint32_t mem_size,
uint32_t n_seq_max); uint32_t n_seq_max,
const layer_filter_cb & filter);
~llama_memory_recurrent() = default; ~llama_memory_recurrent() = default;


@ -3,6 +3,7 @@
#include "llama.h" #include "llama.h"
#include <memory> #include <memory>
#include <functional>
struct llama_ubatch; struct llama_ubatch;
@ -64,6 +65,13 @@ using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
// general concept of LLM memory // general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types // the KV cache is a type of LLM memory, but there can be other types
struct llama_memory_i { struct llama_memory_i {
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
// this callback is used to specify which layers should reuse memory from other layers
// return negative value to indicate that the layer il should not reuse memory
using layer_reuse_cb = std::function<int32_t(int32_t il)>;
virtual ~llama_memory_i() = default; virtual ~llama_memory_i() = default;
// split the input batch into a set of ubatches and verify that they can fit into the cache // split the input batch into a set of ubatches and verify that they can fit into the cache
@ -77,7 +85,7 @@ struct llama_memory_i {
// simulate full cache, used for allocating worst-case compute buffers // simulate full cache, used for allocating worst-case compute buffers
virtual llama_memory_context_ptr init_full() = 0; virtual llama_memory_context_ptr init_full() = 0;
// prepare for any pending memory updates, such as shifts, defrags, etc. // prepare for any pending memory updates, such as shifts, copies, etc.
// status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0; virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;
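The layer_reuse_cb introduced above replaces the hard-coded Gemma3n special case in the KV cache: the cache asks the callback which earlier layer a KV-less layer should map onto. A hedged sketch of what such a callback could look like (the cutoff and the reuse target are illustrative assumptions, not the actual model wiring):

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>

int main() {
    using layer_reuse_cb = std::function<int32_t(int32_t)>;

    const int32_t n_layer_kv_from_start = 20; // first 20 layers own a KV cache (illustrative)

    // Layers without their own cache reuse the last cached layer; -1 means "no reuse".
    const layer_reuse_cb reuse = [=](int32_t il) -> int32_t {
        return il < n_layer_kv_from_start ? -1 : n_layer_kv_from_start - 1;
    };

    printf("layer 25 -> reuse %d\n", reuse(25)); // 19
    printf("layer  5 -> reuse %d\n", reuse(5));  // -1 (has its own cache)
    return 0;
}
```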


@ -83,6 +83,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_32B: return "32B"; case LLM_TYPE_32B: return "32B";
case LLM_TYPE_34B: return "34B"; case LLM_TYPE_34B: return "34B";
case LLM_TYPE_35B: return "35B"; case LLM_TYPE_35B: return "35B";
case LLM_TYPE_36B: return "36B";
case LLM_TYPE_40B: return "40B"; case LLM_TYPE_40B: return "40B";
case LLM_TYPE_65B: return "65B"; case LLM_TYPE_65B: return "65B";
case LLM_TYPE_70B: return "70B"; case LLM_TYPE_70B: return "70B";
@ -1114,6 +1115,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(5); hparams.set_swa_pattern(5);
hparams.n_layer_kv_from_start = 20;
hparams.rope_freq_base_train_swa = 10000.0f; hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f; hparams.rope_freq_scale_train_swa = 1.0f;
hparams.f_attention_scale = 1.0f; hparams.f_attention_scale = 1.0f;
@ -1288,6 +1290,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_SEED_OSS:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 64: type = LLM_TYPE_36B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_OLMOE: case LLM_ARCH_OLMOE:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -1471,6 +1481,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// NextN/MTP parameters // NextN/MTP parameters
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
// TODO: when MTP is implemented, this should probably be updated if needed
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer) case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer) case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
@ -3967,6 +3980,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
} }
} break; } break;
case LLM_ARCH_SEED_OSS:
{
const uint32_t head_dim = hparams.n_embd_head_k;
const int64_t n_qo_dim = n_head * head_dim;
const int64_t n_kv_dim = n_head_kv * head_dim;
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
}
} break;
case LLM_ARCH_OLMOE: case LLM_ARCH_OLMOE:
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -5476,6 +5526,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) { for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i]; auto & layer = layers[i];
@ -10473,7 +10528,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
const int64_t n_embd_altup; const int64_t n_embd_altup;
const int64_t n_altup; const int64_t n_altup;
const int i_altup_act; const int i_altup_act;
const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
const int n_layer_sparsity = 10; // number of layers using activation sparsity const int n_layer_sparsity = 10; // number of layers using activation sparsity
const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95) const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
@ -10523,8 +10577,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
// this block is made to closely resemble Gemma3p5DecoderLayer in the Python code // this block is made to closely resemble Gemma3p5DecoderLayer in the Python code
const bool has_kv = (il < n_layer_kv);
const float freq_base_l = model.get_rope_freq_base (cparams, il); const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il); const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
@ -10544,7 +10596,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens] ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
// self-attention // self-attention
if (has_kv) { if (hparams.has_kv(il)) {
// compute Q and K and RoPE them // compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
@ -10584,7 +10636,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
} else { } else {
// no KV layers // reuse KV cache of earlier layers
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@ -17787,8 +17839,7 @@ struct llm_build_lfm2 : public llm_graph_context {
cb(cur, "model.embedding_norm", -1); cb(cur, "model.embedding_norm", -1);
res->t_embd = cur; res->t_embd = cur;
// lm_head is tied with embeddings cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.tok_embd, cur);
cb(cur, "lm_head", -1); cb(cur, "lm_head", -1);
res->t_logits = cur; res->t_logits = cur;
@ -17930,6 +17981,137 @@ struct llm_build_lfm2 : public llm_graph_context {
} }
}; };
struct llm_build_seed_oss : public llm_graph_context {
llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp,
model.layers[il].attn_post_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
};
template <bool iswa> template <bool iswa>
struct llm_build_smallthinker : public llm_graph_context{ struct llm_build_smallthinker : public llm_graph_context{
llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){ llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
@ -18075,12 +18257,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
if (llm_arch_is_recurrent(arch)) { if (llm_arch_is_recurrent(arch)) {
res = new llama_memory_recurrent( res = new llama_memory_recurrent(
*this, *this,
nullptr,
GGML_TYPE_F32, GGML_TYPE_F32,
GGML_TYPE_F32, GGML_TYPE_F32,
cparams.offload_kqv, cparams.offload_kqv,
std::max((uint32_t) 1, cparams.n_seq_max), std::max((uint32_t) 1, cparams.n_seq_max),
cparams.n_seq_max); cparams.n_seq_max,
nullptr);
} else if (llm_arch_is_hybrid(arch)) { } else if (llm_arch_is_hybrid(arch)) {
const auto padding = llama_kv_cache::get_padding(cparams); const auto padding = llama_kv_cache::get_padding(cparams);
@ -18121,6 +18303,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
llama_memory_i::layer_reuse_cb reuse = nullptr;
if (arch == LLM_ARCH_GEMMA3N) {
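// layers at or beyond n_layer_kv_from_start have no KV cache of their own:
// SWA layers reuse the cache of layer n_layer_kv_from_start - 2, full-attention layers that of n_layer_kv_from_start - 1
// (a return value of -1 means the layer keeps its own cache)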
reuse = [&](int32_t il) {
if (il >= (int32_t) hparams.n_layer_kv_from_start) {
return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
}
return -1;
};
}
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
GGML_ASSERT(hparams.is_swa_any()); GGML_ASSERT(hparams.is_swa_any());
@ -18135,13 +18329,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
n_ctx_per_stream, n_ctx_per_stream,
cparams.n_seq_max, cparams.n_seq_max,
cparams.n_ubatch, cparams.n_ubatch,
padding); padding,
nullptr,
reuse);
} else { } else {
GGML_ASSERT(!hparams.is_swa_any()); GGML_ASSERT(!hparams.is_swa_any());
res = new llama_kv_cache( res = new llama_kv_cache(
*this, *this,
nullptr,
params.type_k, params.type_k,
params.type_v, params.type_v,
!cparams.flash_attn, !cparams.flash_attn,
@ -18151,7 +18346,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
cparams.n_seq_max, cparams.n_seq_max,
padding, padding,
hparams.n_swa, hparams.n_swa,
hparams.swa_type); hparams.swa_type,
nullptr,
nullptr);
} }
} }
} }
@ -18468,6 +18665,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{ {
llm = std::make_unique<llm_build_bailingmoe>(*this, params); llm = std::make_unique<llm_build_bailingmoe>(*this, params);
} break; } break;
case LLM_ARCH_SEED_OSS:
{
llm = std::make_unique<llm_build_seed_oss>(*this, params);
} break;
case LLM_ARCH_DOTS1: case LLM_ARCH_DOTS1:
{ {
llm = std::make_unique<llm_build_dots1>(*this, params); llm = std::make_unique<llm_build_dots1>(*this, params);
@ -18526,6 +18727,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
return llm->res->get_gf(); return llm->res->get_gf();
} }
// //
// interface implementation // interface implementation
// //
@ -18720,6 +18922,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_LFM2: case LLM_ARCH_LFM2:
case LLM_ARCH_SMALLTHINKER: case LLM_ARCH_SMALLTHINKER:
case LLM_ARCH_GLM4_MOE: case LLM_ARCH_GLM4_MOE:
case LLM_ARCH_SEED_OSS:
return LLAMA_ROPE_TYPE_NEOX; return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL: case LLM_ARCH_QWEN2VL:
View File
@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_32B, LLM_TYPE_32B,
LLM_TYPE_34B, LLM_TYPE_34B,
LLM_TYPE_35B, LLM_TYPE_35B,
LLM_TYPE_36B,
LLM_TYPE_40B, LLM_TYPE_40B,
LLM_TYPE_65B, LLM_TYPE_65B,
LLM_TYPE_70B, LLM_TYPE_70B,
View File
@ -2858,6 +2858,7 @@ struct test_rms_norm_mul_add : public test_case {
const std::array<int64_t, 4> ne; const std::array<int64_t, 4> ne;
const float eps; const float eps;
const bool broadcast; const bool broadcast;
const bool multi_add; // test a sequence of adds feeding into rms_norm
std::string op_desc(ggml_tensor * t) override { std::string op_desc(ggml_tensor * t) override {
GGML_UNUSED(t); GGML_UNUSED(t);
@ -2867,13 +2868,13 @@ struct test_rms_norm_mul_add : public test_case {
bool run_whole_graph() override { return true; } bool run_whole_graph() override { return true; }
std::string vars() override { std::string vars() override {
return VARS_TO_STR4(type, ne, eps, broadcast); return VARS_TO_STR5(type, ne, eps, broadcast, multi_add);
} }
test_rms_norm_mul_add(ggml_type type = GGML_TYPE_F32, test_rms_norm_mul_add(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {64, 5, 4, 3}, std::array<int64_t, 4> ne = {64, 5, 4, 3},
float eps = 1e-6f, bool broadcast = false) float eps = 1e-6f, bool broadcast = false, bool multi_add = false)
: type(type), ne(ne), eps(eps), broadcast(broadcast) {} : type(type), ne(ne), eps(eps), broadcast(broadcast), multi_add(multi_add) {}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
std::array<int64_t, 4> broadcast_dims = {ne[0]*2, ne[1]*3, ne[2]*3, ne[3]*4}; std::array<int64_t, 4> broadcast_dims = {ne[0]*2, ne[1]*3, ne[2]*3, ne[3]*4};
@ -2891,6 +2892,9 @@ struct test_rms_norm_mul_add : public test_case {
// Use a, b and c early, so we don't end up with an OP_NONE between rms_norm and mul // Use a, b and c early, so we don't end up with an OP_NONE between rms_norm and mul
a = ggml_add(ctx, ggml_add(ctx, a, b), c); a = ggml_add(ctx, ggml_add(ctx, a, b), c);
if (multi_add) {
a = ggml_add(ctx, ggml_add(ctx, a, b), c);
}
ggml_tensor * out = ggml_add(ctx, ggml_mul(ctx, ggml_rms_norm(ctx, a, eps), b), c); ggml_tensor * out = ggml_add(ctx, ggml_mul(ctx, ggml_rms_norm(ctx, a, eps), b), c);
ggml_set_name(out, "out"); ggml_set_name(out, "out");
@ -4091,6 +4095,75 @@ struct test_conv_2d_dw : public test_case {
} }
}; };
// GGML_OP_CONV_3D
struct test_conv_3d : public test_case {
// Logical 5D dimensions
const int64_t N, IC, ID, IH, IW;
const int64_t OC, KD, KH, KW;
// Conv params
const int s0, s1, s2;
const int p0, p1, p2;
const int d0, d1, d2;
// Types
const ggml_type type_kernel;
std::string op_desc(ggml_tensor * t) override {
GGML_UNUSED(t);
return "CONV_3D";
}
std::string vars() override {
return VARS_TO_STR11(N, IC, ID, IH, IW, OC, KD, KH, KW, s0, s1) + "," +
VARS_TO_STR8(s2, p0, p1, p2, d0, d1, d2, type_kernel);
}
double max_nmse_err() override {
return 5e-4;
}
uint64_t op_flops(ggml_tensor * t) override {
GGML_UNUSED(t);
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
};
const int64_t OD = calc_conv_output_size(ID, KD, s2, p2, d2);
const int64_t OH = calc_conv_output_size(IH, KH, s1, p1, d1);
const int64_t OW = calc_conv_output_size(IW, KW, s0, p0, d0);
return (uint64_t)N * OC * OD * OH * OW * (2 * IC * KD * KH * KW - 1);
}
test_conv_3d(
int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW,
int64_t OC, int64_t KD, int64_t KH, int64_t KW,
int s0, int s1, int s2,
int p0, int p1, int p2,
int d0, int d1, int d2,
ggml_type type_kernel
) : N(N), IC(IC), ID(ID), IH(IH), IW(IW),
OC(OC), KD(KD), KH(KH), KW(KW),
s0(s0), s1(s1), s2(s2),
p0(p0), p1(p1), p2(p2),
d0(d0), d1(d1), d2(d2),
type_kernel(type_kernel) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
// GGML input tensor is packed as [W, H, D, C*N]
const int64_t ne_input[] = {IW, IH, ID, IC * N};
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input);
ggml_set_name(input, "input");
// GGML kernel tensor is packed as [KW, KH, KD, IC*OC]
const int64_t ne_kernel[] = {KW, KH, KD, IC * OC};
ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel);
ggml_set_name(kernel, "kernel");
ggml_tensor * out = ggml_conv_3d(ctx, kernel, input, s0, s1, s2, p0, p1, p2, d0, d1, d2, (int)IC, (int)N, (int)OC);
ggml_set_name(out, "out");
return out;
}
};
// GGML_OP_CONCAT // GGML_OP_CONCAT
struct test_concat : public test_case { struct test_concat : public test_case {
const ggml_type type; const ggml_type type;
@ -4231,20 +4304,32 @@ struct test_sum : public test_case {
struct test_sum_rows : public test_case { struct test_sum_rows : public test_case {
const ggml_type type; const ggml_type type;
const std::array<int64_t, 4> ne; const std::array<int64_t, 4> ne;
const bool permute;
const bool slice;
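// when set, the input is permuted and/or sliced into a non-contiguous view before ggml_sum_rows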
std::string vars() override { std::string vars() override {
return VARS_TO_STR2(type, ne); return VARS_TO_STR4(type, ne, permute, slice);
} }
test_sum_rows(ggml_type type = GGML_TYPE_F32, test_sum_rows(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 5, 4, 3}) std::array<int64_t, 4> ne = {10, 5, 4, 3},
: type(type), ne(ne) {} bool permute = false, bool slice = false)
: type(type), ne(ne), permute(permute), slice(slice) {}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_param(a); ggml_set_param(a);
ggml_set_name(a, "a"); ggml_set_name(a, "a");
if (slice) {
a = ggml_view_4d(ctx, a,
ne[0], ne[1], ne[2] / 2, ne[3] - 1,
a->nb[1], a->nb[2] * 2, a->nb[3], /*offset=*/a->nb[3]);
}
if (permute) {
a = ggml_permute(ctx, a, 0, 2, 3, 1);
}
ggml_tensor * out = ggml_sum_rows(ctx, a); ggml_tensor * out = ggml_sum_rows(ctx, a);
ggml_set_name(out, "out"); ggml_set_name(out, "out");
@ -5528,6 +5613,61 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, false)); test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, false));
test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, true)); test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, true));
// CONV_3D
auto calc_conv_output_size_3d = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
};
for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
for (int N : {1, 2}) {
for (int IC : {1, 3}) {
for (int OC : {1, 4}) {
for (int s0 : {1, 2}) {
for (int p1 : {0, 1}) {
for (int d2 : {1, 2}) {
int64_t IW = 20, IH = 22, ID = 18;
int64_t KW = 3, KH = 3, KD = 3;
int s1 = s0, s2 = s0;
int p0 = p1, p2 = p1;
int d0 = d2, d1 = d2;
if (calc_conv_output_size_3d(IW, KW, s0, p0, d0) <= 0 ||
calc_conv_output_size_3d(IH, KH, s1, p1, d1) <= 0 ||
calc_conv_output_size_3d(ID, KD, s2, p2, d2) <= 0) {
continue;
}
test_cases.emplace_back(new test_conv_3d(
N, IC, ID, IH, IW,
OC, KD, KH, KW,
s0, s1, s2, p0, p1, p2, d0, d1, d2,
kernel_type));
// Asymmetric kernel and params
int64_t asym_KW = 5, asym_KH = 1, asym_KD = 3;
int asym_s0 = 2, asym_s1 = 1, asym_s2 = 1;
int asym_p0 = 2, asym_p1 = 0, asym_p2 = 1;
int asym_d0 = 1, asym_d1 = 1, asym_d2 = 2;
if (calc_conv_output_size_3d(IW, asym_KW, asym_s0, asym_p0, asym_d0) <= 0 ||
calc_conv_output_size_3d(IH, asym_KH, asym_s1, asym_p1, asym_d1) <= 0 ||
calc_conv_output_size_3d(ID, asym_KD, asym_s2, asym_p2, asym_d2) <= 0) {
continue;
}
test_cases.emplace_back(new test_conv_3d(
N, IC, ID, IH, IW,
OC, asym_KD, asym_KH, asym_KW,
asym_s0, asym_s1, asym_s2, asym_p0, asym_p1, asym_p2, asym_d0, asym_d1, asym_d2,
kernel_type));
}
}
}
}
}
}
// Case with kernel size 1
test_cases.emplace_back(new test_conv_3d(1, 4, 8, 8, 8, 8, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, kernel_type));
}
for(uint32_t Cout : {1, 9}){ for(uint32_t Cout : {1, 9}){
for(uint32_t Cin : {1, 7}){ for(uint32_t Cin : {1, 7}){
for(uint32_t K : {1, 3, 1337}){ for(uint32_t K : {1, 3, 1337}){
@ -5706,6 +5846,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true)); test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
} }
for (uint32_t n : {1, 511, 1025, 8192, 33*512}) {
for (bool multi_add : {false, true}) {
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {n, 1, 1, 1}, 1e-6f, false, multi_add));
}
}
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f)); test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));
@ -6071,6 +6216,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_sum()); test_cases.emplace_back(new test_sum());
test_cases.emplace_back(new test_sum_rows()); test_cases.emplace_back(new test_sum_rows());
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, true, false));
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, false, true));
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, true, true));
test_cases.emplace_back(new test_mean()); test_cases.emplace_back(new test_mean());
test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1, 1, 1 })); test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1, 1, 1 }));
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1, 1, 1 })); test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1, 1, 1 }));
@ -6091,8 +6239,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_timestep_embedding()); test_cases.emplace_back(new test_timestep_embedding());
test_cases.emplace_back(new test_leaky_relu()); test_cases.emplace_back(new test_leaky_relu());
for (int hsk : { 64, 80, 128, 192, 256, 576 }) { for (int hsk : { 40, 64, 80, 128, 192, 256, 576 }) {
for (int hsv : { 64, 80, 128, 192, 256, 512 }) { for (int hsv : { 40, 64, 80, 128, 192, 256, 512 }) {
if (hsk != 192 && hsk != 576 && hsk != hsv) continue; if (hsk != 192 && hsk != 576 && hsk != hsv) continue;
if (hsk == 192 && (hsv != 128 && hsv != 192)) continue; if (hsk == 192 && (hsv != 128 && hsv != 192)) continue;
if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA
View File
@ -290,6 +290,14 @@ int main(void) {
/* .bos_token= */ "", /* .bos_token= */ "",
/* .eos_token= */ "", /* .eos_token= */ "",
}, },
{
/* .name= */ "ByteDance-Seed/Seed-OSS-36B-Instruct",
/* .template_str */ "{# <seed:bos> #}{%- for message in messages %}{%- if message.role in [\"user\", \"system\"] %}{{ bos_token + message.role + \"\\n\" + message.content + eos_token }}{%- elif message.role == \"assistant\" %}{{ bos_token + message.role }}{%- if message.content is defined and message.content is string and message.content|trim|length > 0 %}{{ \"\\n\" + message.content|trim + eos_token }}{%- endif %}{%- else %}{{ bos_token + message.role + \"\\n\" + message.content + eos_token }}{%- endif %}{%- endfor %}{%- if add_generation_prompt %}{{ bos_token + \"assistant\\n\" }}{%- endif %}",
/* .expected_output= */ "<seed:bos>system\nYou are a helpful assistant<seed:eos><seed:bos>user\nHello<seed:eos><seed:bos>assistant\nHi there<seed:eos><seed:bos>user\nWho are you<seed:eos><seed:bos>assistant\nI am an assistant<seed:eos><seed:bos>user\nAnother question<seed:eos><seed:bos>assistant\n",
/* .expected_output_jinja= */ "<seed:bos>system\nYou are a helpful assistant<seed:eos><seed:bos>user\nHello<seed:eos><seed:bos>assistant\nHi there<seed:eos><seed:bos>user\nWho are you<seed:eos><seed:bos>assistant\nI am an assistant<seed:eos><seed:bos>user\nAnother question<seed:eos><seed:bos>assistant\n",
/* .bos_token= */ "<seed:bos>",
/* .eos_token= */ "<seed:eos>",
}
}; };
std::vector<char> formatted_chat(1024); std::vector<char> formatted_chat(1024);
int32_t res; int32_t res;
View File
@ -358,7 +358,7 @@ static std::pair<int, int> test_forward_backward(
double accuracy; double accuracy;
double accuracy_unc; double accuracy_unc;
ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc); ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
const bool subtest_ok = ndata == 0 && loss == 0.0 && std::isnan(loss_unc) && std::isnan(accuracy) && std::isnan(accuracy_unc); const bool subtest_ok = ndata == 0 && almost_equal(loss, 0.0, 1e-6) && std::isnan(loss_unc) && std::isnan(accuracy) && std::isnan(accuracy_unc);
helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass); helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass);
} }
@ -381,10 +381,12 @@ static std::pair<int, int> test_forward_backward(
{ {
float weights; float weights;
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float)); ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
const bool subtest_ok = weights == ndata/2; const bool subtest_ok = almost_equal(weights, ndata/2, 1e-10);
helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass); helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass);
} }
{ {
constexpr double atol = 1e-10;
int64_t ndata; int64_t ndata;
ggml_opt_result_ndata(cd.result, &ndata); ggml_opt_result_ndata(cd.result, &ndata);
bool subtest_ok = ndata == 6; bool subtest_ok = ndata == 6;
@ -392,7 +394,7 @@ static std::pair<int, int> test_forward_backward(
double loss; double loss;
double loss_unc; double loss_unc;
ggml_opt_result_loss(cd.result, &loss, &loss_unc); ggml_opt_result_loss(cd.result, &loss, &loss_unc);
subtest_ok = subtest_ok && loss == 33.0 && almost_equal(loss_unc, sqrt(3.5), 1e-10); subtest_ok = subtest_ok && almost_equal(loss, 33.0, atol) && almost_equal(loss_unc, sqrt(3.5), atol);
double accuracy; double accuracy;
double accuracy_unc; double accuracy_unc;
@ -437,7 +439,7 @@ static std::pair<int, int> test_forward_backward(
{ {
float weights; float weights;
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float)); ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
const bool subtest_ok = weights == -ndata * .5; const bool subtest_ok = almost_equal(weights, -ndata * 0.5, 1e-10);
helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass); helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass);
} }
{ {
@ -448,7 +450,7 @@ static std::pair<int, int> test_forward_backward(
double loss; double loss;
double loss_unc; double loss_unc;
ggml_opt_result_loss(cd.result, &loss, &loss_unc); ggml_opt_result_loss(cd.result, &loss, &loss_unc);
subtest_ok = subtest_ok && loss == 18.0 && (shuffle || loss_unc == 0.0); subtest_ok = subtest_ok && almost_equal(loss, 18.0, 1e-10) && (shuffle || loss_unc == 0.0);
double accuracy; double accuracy;
double accuracy_unc; double accuracy_unc;
@ -550,10 +552,12 @@ static std::pair<int, int> test_idata_split(
if (adamw) { if (adamw) {
float weights; float weights;
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float)); ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
const bool subtest_ok = weights == ndata/2 - epoch*idata_split; const bool subtest_ok = almost_equal(weights, ndata/2 - epoch*idata_split, 1e-10);
helper_after_test_idata_split(optim, __func__, high_level, epoch, "weights", subtest_ok, ntest, npass); helper_after_test_idata_split(optim, __func__, high_level, epoch, "weights", subtest_ok, ntest, npass);
} }
if (adamw) { if (adamw) {
constexpr double atol = 1e-10;
int64_t ndata_result; int64_t ndata_result;
ggml_opt_result_ndata(cd.result, &ndata_result); ggml_opt_result_ndata(cd.result, &ndata_result);
bool subtest_ok = ndata_result == idata_split; bool subtest_ok = ndata_result == idata_split;
@ -561,7 +565,7 @@ static std::pair<int, int> test_idata_split(
double loss; double loss;
double loss_unc; double loss_unc;
ggml_opt_result_loss(cd.result, &loss, &loss_unc); ggml_opt_result_loss(cd.result, &loss, &loss_unc);
subtest_ok = subtest_ok && loss == 28.0 - epoch*16.0 && loss_unc == 0.0; subtest_ok = subtest_ok && almost_equal(loss, 28.0 - epoch*16.0, atol) && almost_equal(loss_unc, 0.0, atol);
double accuracy; double accuracy;
double accuracy_unc; double accuracy_unc;
@ -571,6 +575,8 @@ static std::pair<int, int> test_idata_split(
helper_after_test_idata_split(optim, __func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass); helper_after_test_idata_split(optim, __func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass);
} }
if (adamw) { if (adamw) {
constexpr double atol = 1e-10;
int64_t ndata_result; int64_t ndata_result;
ggml_opt_result_ndata(cd.result2, &ndata_result); ggml_opt_result_ndata(cd.result2, &ndata_result);
bool subtest_ok = ndata_result == ndata - idata_split; bool subtest_ok = ndata_result == ndata - idata_split;
@ -578,7 +584,7 @@ static std::pair<int, int> test_idata_split(
double loss; double loss;
double loss_unc; double loss_unc;
ggml_opt_result_loss(cd.result2, &loss, &loss_unc); ggml_opt_result_loss(cd.result2, &loss, &loss_unc);
subtest_ok = subtest_ok && loss == 15.0 - epoch*8 && almost_equal(loss_unc, sqrt(0.5), 1e-10); subtest_ok = subtest_ok && almost_equal(loss, 15.0 - epoch*8, atol) && almost_equal(loss_unc, sqrt(0.5), atol);
double accuracy; double accuracy;
double accuracy_unc; double accuracy_unc;
@ -687,22 +693,24 @@ static std::pair<int, int> test_gradient_accumulation(
} }
bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW; bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
if (adamw) { if (adamw) {
constexpr double atol = 1e-6;
float weights; float weights;
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float)); ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
const bool subtest_ok = weights == (ndata/2) - epoch; const bool subtest_ok = almost_equal(weights, (ndata/2) - epoch, atol);
helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass); helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass);
} }
{ {
constexpr double atol = 1e-6;
int64_t ndata_result; int64_t ndata_result;
ggml_opt_result_ndata(cd.result, &ndata_result); ggml_opt_result_ndata(cd.result, &ndata_result);
bool subtest_ok = ndata_result == ndata/nbatch_physical; bool subtest_ok = almost_equal(ndata_result, ndata/nbatch_physical, atol);
double loss; double loss;
ggml_opt_result_loss(cd.result, &loss, /*loss_unc =*/ nullptr); ggml_opt_result_loss(cd.result, &loss, /*loss_unc =*/ nullptr);
if (loss_type == GGML_OPT_LOSS_TYPE_SUM) { if (loss_type == GGML_OPT_LOSS_TYPE_SUM) {
subtest_ok = subtest_ok && loss == (39.0 - epoch*6.0); subtest_ok = subtest_ok && almost_equal(loss, (39.0 - epoch*6.0), atol);
} else if (loss_type == GGML_OPT_LOSS_TYPE_MEAN) { } else if (loss_type == GGML_OPT_LOSS_TYPE_MEAN) {
subtest_ok = subtest_ok && almost_equal(loss, (39.0 - epoch*6.0) / ndata, 1e-6); subtest_ok = subtest_ok && almost_equal(loss, (39.0 - epoch*6.0) / ndata, atol);
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
View File
@ -43,7 +43,6 @@ test parameters:
-ub, --ubatch-size <n> (default: 512) -ub, --ubatch-size <n> (default: 512)
-ctk, --cache-type-k <t> (default: f16) -ctk, --cache-type-k <t> (default: f16)
-ctv, --cache-type-v <t> (default: f16) -ctv, --cache-type-v <t> (default: f16)
-dt, --defrag-thold <f> (default: -1)
-t, --threads <n> (default: system dependent) -t, --threads <n> (default: system dependent)
-C, --cpu-mask <hex,hex> (default: 0x0) -C, --cpu-mask <hex,hex> (default: 0x0)
--cpu-strict <0|1> (default: 0) --cpu-strict <0|1> (default: 0)
View File
@ -245,7 +245,6 @@ struct cmd_params {
std::vector<int> n_ubatch; std::vector<int> n_ubatch;
std::vector<ggml_type> type_k; std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v; std::vector<ggml_type> type_v;
std::vector<float> defrag_thold;
std::vector<int> n_threads; std::vector<int> n_threads;
std::vector<std::string> cpu_mask; std::vector<std::string> cpu_mask;
std::vector<bool> cpu_strict; std::vector<bool> cpu_strict;
@ -282,7 +281,6 @@ static const cmd_params cmd_params_defaults = {
/* n_ubatch */ { 512 }, /* n_ubatch */ { 512 },
/* type_k */ { GGML_TYPE_F16 }, /* type_k */ { GGML_TYPE_F16 },
/* type_v */ { GGML_TYPE_F16 }, /* type_v */ { GGML_TYPE_F16 },
/* defrag_thold */ { -1.0f },
/* n_threads */ { cpu_get_num_math() }, /* n_threads */ { cpu_get_num_math() },
/* cpu_mask */ { "0x0" }, /* cpu_mask */ { "0x0" },
/* cpu_strict */ { false }, /* cpu_strict */ { false },
@ -346,8 +344,6 @@ static void print_usage(int /* argc */, char ** argv) {
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", printf(" -ctv, --cache-type-v <t> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -dt, --defrag-thold <f> (default: %s)\n",
join(cmd_params_defaults.defrag_thold, ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", printf(" -t, --threads <n> (default: %s)\n",
join(cmd_params_defaults.n_threads, ",").c_str()); join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
@ -533,13 +529,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break; break;
} }
params.type_v.insert(params.type_v.end(), types.begin(), types.end()); params.type_v.insert(params.type_v.end(), types.begin(), types.end());
} else if (arg == "-dt" || arg == "--defrag-thold") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<float>(argv[i], split_delim);
params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
} else if (arg == "-t" || arg == "--threads") { } else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -849,9 +838,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.type_v.empty()) { if (params.type_v.empty()) {
params.type_v = cmd_params_defaults.type_v; params.type_v = cmd_params_defaults.type_v;
} }
if (params.defrag_thold.empty()) {
params.defrag_thold = cmd_params_defaults.defrag_thold;
}
if (params.n_gpu_layers.empty()) { if (params.n_gpu_layers.empty()) {
params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
} }
@ -910,7 +896,6 @@ struct cmd_params_instance {
int n_ubatch; int n_ubatch;
ggml_type type_k; ggml_type type_k;
ggml_type type_v; ggml_type type_v;
float defrag_thold;
int n_threads; int n_threads;
std::string cpu_mask; std::string cpu_mask;
bool cpu_strict; bool cpu_strict;
@ -1007,7 +992,6 @@ struct cmd_params_instance {
cparams.n_ubatch = n_ubatch; cparams.n_ubatch = n_ubatch;
cparams.type_k = type_k; cparams.type_k = type_k;
cparams.type_v = type_v; cparams.type_v = type_v;
cparams.defrag_thold = defrag_thold;
cparams.offload_kqv = !no_kv_offload; cparams.offload_kqv = !no_kv_offload;
cparams.flash_attn = flash_attn; cparams.flash_attn = flash_attn;
cparams.embeddings = embeddings; cparams.embeddings = embeddings;
@ -1037,7 +1021,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & nub : params.n_ubatch) for (const auto & nub : params.n_ubatch)
for (const auto & tk : params.type_k) for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v) for (const auto & tv : params.type_v)
for (const auto & defrag_thold : params.defrag_thold)
for (const auto & nkvo : params.no_kv_offload) for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn) for (const auto & fa : params.flash_attn)
for (const auto & nt : params.n_threads) for (const auto & nt : params.n_threads)
@ -1058,7 +1041,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_ubatch = */ nub, /* .n_ubatch = */ nub,
/* .type_k = */ tk, /* .type_k = */ tk,
/* .type_v = */ tv, /* .type_v = */ tv,
/* .defrag_thold = */ defrag_thold,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .cpu_mask = */ cm, /* .cpu_mask = */ cm,
/* .cpu_strict = */ cs, /* .cpu_strict = */ cs,
@ -1091,7 +1073,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_ubatch = */ nub, /* .n_ubatch = */ nub,
/* .type_k = */ tk, /* .type_k = */ tk,
/* .type_v = */ tv, /* .type_v = */ tv,
/* .defrag_thold = */ defrag_thold,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .cpu_mask = */ cm, /* .cpu_mask = */ cm,
/* .cpu_strict = */ cs, /* .cpu_strict = */ cs,
@ -1124,7 +1105,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_ubatch = */ nub, /* .n_ubatch = */ nub,
/* .type_k = */ tk, /* .type_k = */ tk,
/* .type_v = */ tv, /* .type_v = */ tv,
/* .defrag_thold = */ defrag_thold,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .cpu_mask = */ cm, /* .cpu_mask = */ cm,
/* .cpu_strict = */ cs, /* .cpu_strict = */ cs,
@ -1166,7 +1146,6 @@ struct test {
int poll; int poll;
ggml_type type_k; ggml_type type_k;
ggml_type type_v; ggml_type type_v;
float defrag_thold;
int n_gpu_layers; int n_gpu_layers;
llama_split_mode split_mode; llama_split_mode split_mode;
int main_gpu; int main_gpu;
@ -1201,7 +1180,6 @@ struct test {
poll = inst.poll; poll = inst.poll;
type_k = inst.type_k; type_k = inst.type_k;
type_v = inst.type_v; type_v = inst.type_v;
defrag_thold = inst.defrag_thold;
n_gpu_layers = inst.n_gpu_layers; n_gpu_layers = inst.n_gpu_layers;
split_mode = inst.split_mode; split_mode = inst.split_mode;
main_gpu = inst.main_gpu; main_gpu = inst.main_gpu;
@ -1257,7 +1235,6 @@ struct test {
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
"defrag_thold",
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
}; };
@ -1277,7 +1254,7 @@ struct test {
field == "use_mmap" || field == "embeddings") { field == "use_mmap" || field == "embeddings") {
return BOOL; return BOOL;
} }
if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") { if (field == "avg_ts" || field == "stddev_ts") {
return FLOAT; return FLOAT;
} }
return STRING; return STRING;
@ -1344,7 +1321,6 @@ struct test {
std::to_string(flash_attn), std::to_string(flash_attn),
tensor_split_str, tensor_split_str,
tensor_buft_overrides_str, tensor_buft_overrides_str,
std::to_string(defrag_thold),
std::to_string(use_mmap), std::to_string(use_mmap),
std::to_string(embeddings), std::to_string(embeddings),
std::to_string(no_op_offload), std::to_string(no_op_offload),
@ -1611,9 +1587,6 @@ struct markdown_printer : public printer {
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
fields.emplace_back("type_v"); fields.emplace_back("type_v");
} }
if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
fields.emplace_back("defrag_thold");
}
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
fields.emplace_back("main_gpu"); fields.emplace_back("main_gpu");
} }
View File
@ -3513,7 +3513,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
const int height = img->ny; const int height = img->ny;
const int total_factor = params.patch_size * params.proj_scale_factor; const int total_factor = params.patch_size * params.proj_scale_factor;
constexpr int min_image_tokens = 64; constexpr int min_image_tokens = 64;
constexpr int max_image_tokens = 256; constexpr int max_image_tokens = 1024;
const float min_pixels = min_image_tokens * total_factor * total_factor; const float min_pixels = min_image_tokens * total_factor * total_factor;
const float max_pixels = max_image_tokens * total_factor * total_factor; const float max_pixels = max_image_tokens * total_factor * total_factor;
View File
@ -66,7 +66,7 @@ The project is under active development, and we are [looking for feedback and co
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) | | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) | | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
@ -226,6 +226,10 @@ services:
### Multimodal support ### Multimodal support
Multimodal support was added in [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) and is currently an experimental feature. Multimodal support was added in [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) and is currently an experimental feature.
It is currently available in the following endpoints:
- The OAI-compatible chat endpoint.
- The non-OAI-compatible completions endpoint.
- The non-OAI-compatible embeddings endpoint.
For more details, please refer to [multimodal documentation](../../docs/multimodal.md) For more details, please refer to [multimodal documentation](../../docs/multimodal.md)
@ -400,12 +404,15 @@ These input shapes and data type are allowed for `prompt`:
- Single string: `"string"` - Single string: `"string"`
- Single sequence of tokens: `[12, 34, 56]` - Single sequence of tokens: `[12, 34, 56]`
- Mixed tokens and strings: `[12, 34, "string", 56, 78]` - Mixed tokens and strings: `[12, 34, "string", 56, 78]`
- A JSON object which optionally contains multimodal data: `{ "prompt_string": "string", "multimodal_data": ["base64"] }`
Multiple prompts are also supported. In this case, the completion result will be an array. Multiple prompts are also supported. In this case, the completion result will be an array.
- Only strings: `["string1", "string2"]` - Only strings: `["string1", "string2"]`
- Strings and sequences of tokens: `["string1", [12, 34, 56]]` - Strings, JSON objects, and sequences of tokens: `["string1", [12, 34, 56], { "prompt_string": "string", "multimodal_data": ["base64"]}]`
- Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]` - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string", { "prompt_string": "string" }]`
Note on `multimodal_data` in JSON object prompts: this must be an array of strings containing base64-encoded multimodal data such as images or audio. The `prompt_string` must contain the same number of MTMD media markers, which act as placeholders for the data passed in this parameter; the media items are substituted in order. The marker string (e.g. `<__media__>`) can be obtained from `mtmd_default_marker()`, defined in [the MTMD C API](https://github.com/ggml-org/llama.cpp/blob/5fd160bbd9d70b94b5b11b0001fd7f477005e4a0/tools/mtmd/mtmd.h#L87). A client *must not* send this field unless the server has the multimodal capability; check `/models` or `/v1/models` for the `multimodal` capability before making a multimodal request.
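As a rough illustration, the sketch below sends such a JSON object prompt from Python. The server address, the image file, and the exact shape of the `/models` response are assumptions made for the example; the `/completion` endpoint, the `prompt_string`/`multimodal_data` keys, and the default `<__media__>` marker come from the documentation above.

```python
import base64
import requests

base_url = "http://localhost:8080"  # assumed server address

# a client should first confirm the server reports the "multimodal" capability
caps = requests.get(f"{base_url}/models").json()["models"][0].get("capabilities", [])  # response shape assumed
if "multimodal" not in caps:
    raise RuntimeError("server was not started with a multimodal projector")

# one base64 string per <__media__> marker in prompt_string, substituted in order
with open("image.png", "rb") as f:  # hypothetical input file
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

res = requests.post(f"{base_url}/completion", json={
    "prompt": {
        "prompt_string": "Describe this image: <__media__>",
        "multimodal_data": [img_b64],
    },
    "n_predict": 64,
})
res.raise_for_status()
print(res.json()["content"])
```

Sending the same request to a server without multimodal support is expected to be rejected rather than silently ignored.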
`temperature`: Adjust the randomness of the generated text. Default: `0.8` `temperature`: Adjust the randomness of the generated text. Default: `0.8`
@ -477,8 +484,6 @@ These words will not be included in the completion, so make sure to add them to
`t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled. `t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1`
`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true` `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
@ -638,12 +643,12 @@ Returns a JSON object with a field `prompt` containing a string of the input mes
The same as [the embedding example](../embedding) does. The same as [the embedding example](../embedding) does.
This endpoint also supports multimodal embeddings. See the documentation for the `/completions` endpoint for details on how to send a multimodal prompt.
*Options:* *Options:*
`content`: Set the text to process. `content`: Set the text to process.
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`embd_normalize`: Normalization for pooled embeddings. Can be one of the following values: `embd_normalize`: Normalization for pooled embeddings. Can be one of the following values:
``` ```
-1: No normalization -1: No normalization
View File
@ -274,7 +274,6 @@ def start_server_background(args):
server_args.extend(['--batch-size', args.batch_size]) server_args.extend(['--batch-size', args.batch_size])
server_args.extend(['--ubatch-size', args.ubatch_size]) server_args.extend(['--ubatch-size', args.ubatch_size])
server_args.extend(['--n-predict', args.max_tokens * 2]) server_args.extend(['--n-predict', args.max_tokens * 2])
server_args.extend(['--defrag-thold', "0.1"])
server_args.append('--cont-batching') server_args.append('--cont-batching')
server_args.append('--metrics') server_args.append('--metrics')
server_args.append('--flash-attn') server_args.append('--flash-attn')
View File
@ -4309,6 +4309,7 @@ int main(int argc, char ** argv) {
}; };
const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
bool has_mtmd = ctx_server.mctx != nullptr;
json data = { json data = {
{ {
"template", common_chat_templates_source(ctx_server.chat_templates.get()), "template", common_chat_templates_source(ctx_server.chat_templates.get()),
@ -4330,7 +4331,7 @@ int main(int argc, char ** argv) {
{"quantization_level", ""} {"quantization_level", ""}
}}, }},
{"model_info", ""}, {"model_info", ""},
{"capabilities", {"completion"}} {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})}
}; };
res_ok(res, data); res_ok(res, data);
@ -4356,56 +4357,15 @@ int main(int argc, char ** argv) {
// TODO: this log can become very long, put it behind a flag or think about a more compact format // TODO: this log can become very long, put it behind a flag or think about a more compact format
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str()); //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
// process files
mtmd::bitmaps bitmaps;
const bool has_mtmd = ctx_server.mctx != nullptr;
{
if (!has_mtmd && !files.empty()) {
throw std::runtime_error("This server does not support multimodal");
}
for (auto & file : files) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
if (!bmp.ptr) {
throw std::runtime_error("Failed to load image or audio file");
}
// calculate bitmap hash (for KV caching)
std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
bmp.set_id(hash.c_str());
bitmaps.entries.push_back(std::move(bmp));
}
}
// process prompt // process prompt
std::vector<server_tokens> inputs; std::vector<server_tokens> inputs;
if (oaicompat && has_mtmd) { if (oaicompat && ctx_server.mctx != nullptr) {
// multimodal // This is the path used by the OAI-compatible chat endpoint with MTMD. TODO: it can be moved to the path below.
std::string prompt_str = prompt.get<std::string>(); inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
mtmd_input_text inp_txt = {
prompt_str.c_str(),
/* add_special */ true,
/* parse_special */ true,
};
mtmd::input_chunks chunks(mtmd_input_chunks_init());
auto bitmaps_c_ptr = bitmaps.c_ptr();
int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
chunks.ptr.get(),
&inp_txt,
bitmaps_c_ptr.data(),
bitmaps_c_ptr.size());
if (tokenized != 0) {
throw std::runtime_error("Failed to tokenize prompt");
}
server_tokens tmp(chunks, true);
inputs.push_back(std::move(tmp));
} else { } else {
// non-multimodal version // Everything else, including multimodal completions.
auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
for (auto & p : tokenized_prompts) {
auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
inputs.push_back(std::move(tmp));
}
} }
tasks.reserve(inputs.size()); tasks.reserve(inputs.size());
@ -4574,7 +4534,7 @@ int main(int argc, char ** argv) {
data["input_extra"] = input_extra; // default to empty array if it's not exist data["input_extra"] = input_extra; // default to empty array if it's not exist
std::string prompt = json_value(data, "prompt", std::string()); std::string prompt = json_value(data, "prompt", std::string());
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true); std::vector<server_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true);
SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
data["prompt"] = format_infill( data["prompt"] = format_infill(
ctx_server.vocab, ctx_server.vocab,
@ -4585,7 +4545,7 @@ int main(int argc, char ** argv) {
ctx_server.params_base.n_predict, ctx_server.params_base.n_predict,
ctx_server.slots[0].n_ctx, // TODO: there should be a better way ctx_server.slots[0].n_ctx, // TODO: there should be a better way
ctx_server.params_base.spm_infill, ctx_server.params_base.spm_infill,
tokenized_prompts[0] tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal.
); );
std::vector<raw_buffer> files; // dummy std::vector<raw_buffer> files; // dummy
@ -4634,7 +4594,7 @@ int main(int argc, char ** argv) {
if (current_state == SERVER_STATE_READY) { if (current_state == SERVER_STATE_READY) {
model_meta = ctx_server.model_meta(); model_meta = ctx_server.model_meta();
} }
bool has_mtmd = ctx_server.mctx != nullptr;
json models = { json models = {
{"models", { {"models", {
{ {
@ -4646,7 +4606,7 @@ int main(int argc, char ** argv) {
{"type", "model"}, {"type", "model"},
{"description", ""}, {"description", ""},
{"tags", {""}}, {"tags", {""}},
{"capabilities", {"completion"}}, {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})},
{"parameters", ""}, {"parameters", ""},
{"details", { {"details", {
{"parent_model", ""}, {"parent_model", ""},
@ -4763,7 +4723,7 @@ int main(int argc, char ** argv) {
} }
} }
auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
for (const auto & tokens : tokenized_prompts) { for (const auto & tokens : tokenized_prompts) {
// this check is necessary for models that do not add BOS token to the input // this check is necessary for models that do not add BOS token to the input
if (tokens.empty()) { if (tokens.empty()) {
@ -4791,7 +4751,7 @@ int main(int argc, char ** argv) {
task.id = ctx_server.queue_tasks.get_new_id(); task.id = ctx_server.queue_tasks.get_new_id();
task.index = i; task.index = i;
task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr); task.prompt_tokens = std::move(tokenized_prompts[i]);
// OAI-compat // OAI-compat
task.params.oaicompat = oaicompat; task.params.oaicompat = oaicompat;
@ -4878,7 +4838,10 @@ int main(int argc, char ** argv) {
return; return;
} }
llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0]; std::vector<server_tokens> tokenized_queries = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, query, /* add_special */ false, true);
if (tokenized_queries.size() != 1) {
res_error(res, format_error_response("\"query\" must contain only a single prompt", ERROR_TYPE_INVALID_REQUEST));
return;
}
// create and queue the task // create and queue the task
json responses = json::array(); json responses = json::array();
@ -4886,14 +4849,14 @@ int main(int argc, char ** argv) {
std::unordered_set<int> task_ids; std::unordered_set<int> task_ids;
{ {
std::vector<server_task> tasks; std::vector<server_task> tasks;
auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true); auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
tasks.reserve(tokenized_docs.size()); tasks.reserve(tokenized_docs.size());
for (size_t i = 0; i < tokenized_docs.size(); i++) { for (size_t i = 0; i < tokenized_docs.size(); i++) {
auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]); auto tmp = format_rerank(ctx_server.vocab, tokenized_queries[0], tokenized_docs[i]);
server_task task = server_task(SERVER_TASK_TYPE_RERANK); server_task task = server_task(SERVER_TASK_TYPE_RERANK);
task.id = ctx_server.queue_tasks.get_new_id(); task.id = ctx_server.queue_tasks.get_new_id();
task.index = i; task.index = i;
task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr); task.prompt_tokens = std::move(tmp);
tasks.push_back(std::move(task)); tasks.push_back(std::move(task));
} }

View File

@ -6,6 +6,8 @@ from utils import *
server = ServerPreset.tinyllama2() server = ServerPreset.tinyllama2()
JSON_MULTIMODAL_KEY = "multimodal_data"
JSON_PROMPT_STRING_KEY = "prompt_string"
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def create_server(): def create_server():
@ -231,6 +233,28 @@ def test_nocache_long_input_prompt():
}) })
assert res.status_code == 400 assert res.status_code == 400
def test_json_prompt_no_mtmd():
global server
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": { JSON_PROMPT_STRING_KEY: "I believe the meaning of life is" },
"seed": 42,
"temperature": 1.0,
"cache_prompt": False,
})
assert res.status_code == 200
def test_json_prompt_mtmd_error_when_not_supported():
global server
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": { JSON_PROMPT_STRING_KEY: "I believe the meaning of life is <__media__>", JSON_MULTIMODAL_KEY: "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" },
"seed": 42,
"temperature": 1.0,
"cache_prompt": False,
})
# MTMD is disabled on this model, so this should fail.
assert res.status_code != 200
def test_completion_with_tokens_input(): def test_completion_with_tokens_input():
global server global server
@ -269,6 +293,20 @@ def test_completion_with_tokens_input():
assert len(res.body) == 2 assert len(res.body) == 2
assert res.body[0]["content"] == res.body[1]["content"] assert res.body[0]["content"] == res.body[1]["content"]
# mixed JSON and tokens
res = server.make_request("POST", "/completion", data={
"prompt": [
tokens,
{
JSON_PROMPT_STRING_KEY: "I believe the meaning of life is",
},
],
})
assert res.status_code == 200
assert type(res.body) == list
assert len(res.body) == 2
assert res.body[0]["content"] == res.body[1]["content"]
# mixed string and tokens in one sequence # mixed string and tokens in one sequence
res = server.make_request("POST", "/completion", data={ res = server.make_request("POST", "/completion", data={
"prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str], "prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str],

View File

@ -10,21 +10,48 @@ IMG_URL_1 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/9
response = requests.get(IMG_URL_0) response = requests.get(IMG_URL_0)
response.raise_for_status() # Raise an exception for bad status codes response.raise_for_status() # Raise an exception for bad status codes
IMG_BASE64_0 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8") IMG_BASE64_URI_0 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
IMG_BASE64_0 = base64.b64encode(response.content).decode("utf-8")
response = requests.get(IMG_URL_1)
response.raise_for_status() # Raise an exception for bad status codes
IMG_BASE64_URI_1 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
IMG_BASE64_1 = base64.b64encode(response.content).decode("utf-8")
JSON_MULTIMODAL_KEY = "multimodal_data"
JSON_PROMPT_STRING_KEY = "prompt_string"
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def create_server(): def create_server():
global server global server
server = ServerPreset.tinygemma3() server = ServerPreset.tinygemma3()
def test_models_supports_multimodal_capability():
global server
server.start() # vision model may take longer to load due to download size
res = server.make_request("GET", "/models", data={})
assert res.status_code == 200
model_info = res.body["models"][0]
print(model_info)
assert "completion" in model_info["capabilities"]
assert "multimodal" in model_info["capabilities"]
def test_v1_models_supports_multimodal_capability():
global server
server.start() # vision model may take longer to load due to download size
res = server.make_request("GET", "/v1/models", data={})
assert res.status_code == 200
model_info = res.body["models"][0]
print(model_info)
assert "completion" in model_info["capabilities"]
assert "multimodal" in model_info["capabilities"]
@pytest.mark.parametrize( @pytest.mark.parametrize(
"prompt, image_url, success, re_content", "prompt, image_url, success, re_content",
[ [
# test model is trained on CIFAR-10, but it's quite dumb due to small size # test model is trained on CIFAR-10, but it's quite dumb due to small size
("What is this:\n", IMG_URL_0, True, "(cat)+"), ("What is this:\n", IMG_URL_0, True, "(cat)+"),
("What is this:\n", "IMG_BASE64_0", True, "(cat)+"), # exceptional, so that we don't cog up the log ("What is this:\n", "IMG_BASE64_URI_0", True, "(cat)+"), # exceptional, so that we don't cog up the log
("What is this:\n", IMG_URL_1, True, "(frog)+"), ("What is this:\n", IMG_URL_1, True, "(frog)+"),
("Test test\n", IMG_URL_1, True, "(frog)+"), # test invalidate cache ("Test test\n", IMG_URL_1, True, "(frog)+"), # test invalidate cache
("What is this:\n", "malformed", False, None), ("What is this:\n", "malformed", False, None),
@ -36,8 +63,8 @@ def create_server():
def test_vision_chat_completion(prompt, image_url, success, re_content): def test_vision_chat_completion(prompt, image_url, success, re_content):
global server global server
server.start(timeout_seconds=60) # vision model may take longer to load due to download size server.start(timeout_seconds=60) # vision model may take longer to load due to download size
if image_url == "IMG_BASE64_0": if image_url == "IMG_BASE64_URI_0":
image_url = IMG_BASE64_0 image_url = IMG_BASE64_URI_0
res = server.make_request("POST", "/chat/completions", data={ res = server.make_request("POST", "/chat/completions", data={
"temperature": 0.0, "temperature": 0.0,
"top_k": 1, "top_k": 1,
@ -58,3 +85,61 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
else: else:
assert res.status_code != 200 assert res.status_code != 200
@pytest.mark.parametrize(
"prompt, image_data, success, re_content",
[
# test model is trained on CIFAR-10, but it's quite dumb due to small size
("What is this: <__media__>\n", IMG_BASE64_0, True, "(cat)+"),
("What is this: <__media__>\n", IMG_BASE64_1, True, "(frog)+"),
("What is this: <__media__>\n", "malformed", False, None), # non-image data
("What is this:\n", "", False, None), # empty string
]
)
def test_vision_completion(prompt, image_data, success, re_content):
global server
server.start() # vision model may take longer to load due to download size
res = server.make_request("POST", "/completions", data={
"temperature": 0.0,
"top_k": 1,
"prompt": { JSON_PROMPT_STRING_KEY: prompt, JSON_MULTIMODAL_KEY: [ image_data ] },
})
if success:
assert res.status_code == 200
content = res.body["content"]
assert match_regex(re_content, content)
else:
assert res.status_code != 200
@pytest.mark.parametrize(
"prompt, image_data, success",
[
# test model is trained on CIFAR-10, but it's quite dumb due to small size
("What is this: <__media__>\n", IMG_BASE64_0, True), # exceptional, so that we don't cog up the log
("What is this: <__media__>\n", IMG_BASE64_1, True),
("What is this: <__media__>\n", "malformed", False), # non-image data
("What is this:\n", "base64", False), # non-image data
]
)
def test_vision_embeddings(prompt, image_data, success):
global server
server.server_embeddings = True
server.n_batch = 512
server.start() # vision model may take longer to load due to download size
res = server.make_request("POST", "/embeddings", data={
"content": [
{ JSON_PROMPT_STRING_KEY: prompt, JSON_MULTIMODAL_KEY: [ image_data ] },
{ JSON_PROMPT_STRING_KEY: prompt, JSON_MULTIMODAL_KEY: [ image_data ] },
{ JSON_PROMPT_STRING_KEY: prompt, },
],
})
if success:
assert res.status_code == 200
content = res.body
# Ensure embeddings are stable when multimodal.
assert content[0]['embedding'] == content[1]['embedding']
# Ensure embeddings without multimodal but same prompt do not match multimodal embeddings.
assert content[0]['embedding'] != content[2]['embedding']
else:
assert res.status_code != 200

View File

@ -123,6 +123,19 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
return false; return false;
} }
// does the array contain any individual integers (raw tokens)?
static bool json_is_array_and_contains_numbers(const json & data) {
if (data.is_array()) {
for (const auto & e : data) {
if (e.is_number_integer()) {
return true;
}
}
return false;
}
return false;
}
// get value by path(key1 / key2) // get value by path(key1 / key2)
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) { static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
json result = json::object(); json result = json::object();
@ -186,48 +199,6 @@ static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_
return prompt_tokens; return prompt_tokens;
} }
/**
* break the input "prompt" object into multiple prompt if needed, then tokenize them
* this supports these cases:
* - "prompt": "string"
* - "prompt": [12, 34, 56]
* - "prompt": [12, 34, "string", 56, 78]
* and multiple prompts (multi-tasks):
* - "prompt": ["string1", "string2"]
* - "prompt": ["string1", [12, 34, 56]]
* - "prompt": [[12, 34, 56], [78, 90, 12]]
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
*/
static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
std::vector<llama_tokens> result;
if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
// string or mixed
result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
} else if (json_is_array_of_numbers(json_prompt)) {
// array of tokens
result.push_back(json_prompt.get<llama_tokens>());
} else if (json_prompt.is_array()) {
// array of prompts
result.reserve(json_prompt.size());
for (const auto & p : json_prompt) {
if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
result.push_back(tokenize_mixed(vocab, p, add_special, parse_special));
} else if (json_is_array_of_numbers(p)) {
// array of tokens
result.push_back(p.get<llama_tokens>());
} else {
throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
}
}
} else {
throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
}
if (result.empty()) {
throw std::runtime_error("\"prompt\" must not be empty");
}
return result;
}
// return the last index of character that can form a valid string // return the last index of character that can form a valid string
// if the last character is potentially cut in half, return the index before the cut // if the last character is potentially cut in half, return the index before the cut
// if validate_utf8(text) == text.size(), then the whole text is valid utf8 // if validate_utf8(text) == text.size(), then the whole text is valid utf8
@ -262,35 +233,6 @@ static size_t validate_utf8(const std::string& text) {
// template utils // template utils
// //
// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
llama_tokens result;
// Get EOS token - use SEP token as fallback if EOS is not available
llama_token eos_token = llama_vocab_eos(vocab);
if (eos_token == LLAMA_TOKEN_NULL) {
eos_token = llama_vocab_sep(vocab);
}
result.reserve(doc.size() + query.size() + 4);
if (llama_vocab_get_add_bos(vocab)) {
result.push_back(llama_vocab_bos(vocab));
}
result.insert(result.end(), query.begin(), query.end());
if (llama_vocab_get_add_eos(vocab)) {
result.push_back(eos_token);
}
if (llama_vocab_get_add_sep(vocab)) {
result.push_back(llama_vocab_sep(vocab));
}
result.insert(result.end(), doc.begin(), doc.end());
if (llama_vocab_get_add_eos(vocab)) {
result.push_back(eos_token);
}
return result;
}
// format infill task // format infill task
static llama_tokens format_infill( static llama_tokens format_infill(
const llama_vocab * vocab, const llama_vocab * vocab,
@ -1186,6 +1128,24 @@ public:
} }
} }
// appends server tokens, updates the media map. copies media chunks.
void push_back(server_tokens & tokens) {
size_t start_pos = size();
for (size_t i = 0; i < tokens.size(); i++) {
push_back(tokens[i]);
}
if (tokens.has_mtmd) {
// Assert that the destination also has mtmd enabled before copying MTMD chunks.
// We could silently skip them instead, but asserting prevents media data from being dropped unnoticed.
GGML_ASSERT(has_mtmd);
for (auto it = tokens.map_pos_to_media.begin(); it != tokens.map_pos_to_media.end(); ++it) {
auto chunk = it->second.get();
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
map_pos_to_media[start_pos + it->first] = std::move(new_chunk);
}
}
}
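Editorial aside: the offset arithmetic above is easy to see in a plain-Python sketch (hypothetical data, not the server's actual types): when stream B is appended to stream A, a media chunk anchored at position p in B must be re-anchored at start_pos + p in the combined map.

    # Hypothetical illustration of the media-map offset logic in push_back().
    a_tokens = [101, 102, 103]              # destination stream; start_pos == 3
    b_tokens = [201, 202, 203, 204]         # appended stream
    b_media  = {1: "image-chunk"}           # media chunk anchored at position 1 of B

    start_pos = len(a_tokens)
    combined_tokens = a_tokens + b_tokens
    combined_media  = {start_pos + pos: chunk for pos, chunk in b_media.items()}

    assert combined_media == {4: "image-chunk"}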
// for compatibility with context shift and prompt truncation // for compatibility with context shift and prompt truncation
void insert(const llama_tokens & inp_tokens) { void insert(const llama_tokens & inp_tokens) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
@ -1356,3 +1316,137 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
} }
return std::to_string(hash); return std::to_string(hash);
} }
// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
static server_tokens format_rerank(const struct llama_vocab * vocab, server_tokens & query, server_tokens & doc) {
server_tokens result = {};
// Get EOS token - use SEP token as fallback if EOS is not available
llama_token eos_token = llama_vocab_eos(vocab);
if (eos_token == LLAMA_TOKEN_NULL) {
eos_token = llama_vocab_sep(vocab);
}
if (llama_vocab_get_add_bos(vocab)) {
result.push_back(llama_vocab_bos(vocab));
}
result.push_back(query);
if (llama_vocab_get_add_eos(vocab)) {
result.push_back(eos_token);
}
if (llama_vocab_get_add_sep(vocab)) {
result.push_back(llama_vocab_sep(vocab));
}
result.push_back(doc);
if (llama_vocab_get_add_eos(vocab)) {
result.push_back(eos_token);
}
return result;
}
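Editorial aside: the layout fixed by the comment above ([BOS]query[EOS][SEP]doc[EOS]) can be mirrored in a few lines of plain Python; the token IDs and the add_* flags below are made up and stand in for the vocab queries used by the C++ helper.

    # Hypothetical token IDs; real values come from the model's vocab.
    BOS, EOS, SEP = 1, 2, 3

    def format_rerank_ids(query_ids, doc_ids, add_bos=True, add_eos=True, add_sep=True):
        out = []
        if add_bos:
            out.append(BOS)
        out += query_ids
        if add_eos:
            out.append(EOS)
        if add_sep:
            out.append(SEP)
        out += doc_ids
        if add_eos:
            out.append(EOS)
        return out

    assert format_rerank_ids([10, 11], [20, 21, 22]) == [1, 10, 11, 2, 3, 20, 21, 22, 2]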
static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
mtmd::bitmaps bitmaps;
for (auto & file : files) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
if (!bmp.ptr) {
throw std::runtime_error("Failed to load image or audio file");
}
// calculate bitmap hash (for KV caching)
std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
bmp.set_id(hash.c_str());
bitmaps.entries.push_back(std::move(bmp));
}
// process the (multimodal) prompt
mtmd_input_text inp_txt = {
prompt.c_str(),
/* add_special */ true,
/* parse_special */ true,
};
mtmd::input_chunks chunks(mtmd_input_chunks_init());
auto bitmaps_c_ptr = bitmaps.c_ptr();
int32_t tokenized = mtmd_tokenize(mctx,
chunks.ptr.get(),
&inp_txt,
bitmaps_c_ptr.data(),
bitmaps_c_ptr.size());
if (tokenized != 0) {
throw std::runtime_error("Failed to tokenize prompt");
}
auto result = server_tokens(chunks, true);
return result;
}
/**
* break the input "prompt" object into multiple prompt if needed, then tokenize them
* use tokenize_input_prompts() if the input could be an array.
* this supports these cases:
* - "prompt": "string"
* - "prompt": [12, 34, 56]
* - "prompt": [12, 34, "string", 56, 78]
* - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
*/
static server_tokens tokenize_input_subprompt(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) {
constexpr char JSON_STRING_PROMPT_KEY[] = "prompt_string";
constexpr char JSON_MTMD_DATA_KEY[] = "multimodal_data";
const bool has_mtmd = mctx != nullptr;
if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
// string or mixed
llama_tokens tmp = tokenize_mixed(vocab, json_prompt, add_special, parse_special);
return server_tokens(tmp, false);
} else if (json_is_array_of_numbers(json_prompt)) {
// array of tokens
llama_tokens tmp = json_prompt.get<llama_tokens>();
return server_tokens(tmp, false);
} else if (json_prompt.contains(JSON_STRING_PROMPT_KEY)) {
// JSON object with prompt key.
if (json_prompt.contains(JSON_MTMD_DATA_KEY)) {
if (!has_mtmd) {
throw std::runtime_error("Multimodal data provided, but model does not support multimodal requests.");
}
// JSON object with prompt and multimodal key.
std::vector<raw_buffer> files;
for (const auto & entry : json_prompt.at(JSON_MTMD_DATA_KEY)) {
files.push_back(base64_decode(entry));
}
return process_mtmd_prompt(mctx, json_prompt.at(JSON_STRING_PROMPT_KEY), files);
} else {
// No multimodal data; tokenize the prompt string contained in the object.
llama_tokens tmp = tokenize_mixed(vocab, json_prompt.at(JSON_STRING_PROMPT_KEY), add_special, parse_special);
return server_tokens(tmp, false);
}
} else {
throw std::runtime_error("\"prompt\" elements must be a string, a list of tokens, a JSON object containing a prompt string, or a list of mixed strings & tokens.");
}
}
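Editorial aside: the object form handled above maps directly onto a client request body. A hedged sketch follows — the local image path, server address, and response field are assumptions, and the loaded model must actually support multimodal input:

    import base64
    import requests

    with open("cat.png", "rb") as f:                 # hypothetical local image
        img_b64 = base64.b64encode(f.read()).decode("utf-8")

    res = requests.post("http://localhost:8080/completion", json={
        "prompt": {
            "prompt_string": "What is this: <__media__>\n",
            "multimodal_data": [img_b64],
        },
        "temperature": 0.0,
    })
    print(res.json().get("content"))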
/**
* break the input "prompt" object into multiple prompt if needed, then tokenize them
* this supports these cases:
* - "prompt": "string"
* - "prompt": [12, 34, 56]
* - "prompt": [12, 34, "string", 56, 78]
* - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
* and multiple prompts (multi-tasks):
* - "prompt": ["string1", "string2"]
* - "prompt": ["string1", [12, 34, 56]]
* - "prompt": [[12, 34, 56], [78, 90, 12]]
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}]
*/
static std::vector<server_tokens> tokenize_input_prompts(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) {
std::vector<server_tokens> result;
if (json_prompt.is_array() && !json_is_array_and_contains_numbers(json_prompt)) {
result.reserve(json_prompt.size());
for (const auto & p : json_prompt) {
result.push_back(tokenize_input_subprompt(vocab, mctx, p, add_special, parse_special));
}
} else {
result.push_back(tokenize_input_subprompt(vocab, mctx, json_prompt, add_special, parse_special));
}
if (result.empty()) {
throw std::runtime_error("\"prompt\" must not be empty");
}
return result;
}
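Editorial aside on the dispatch above: an array that directly contains integers is treated as one pre-tokenized prompt, while any other array is treated as a batch whose elements each go through tokenize_input_subprompt(). A hedged client-side sketch of a mixed batch (default server address assumed):

    import requests

    res = requests.post("http://localhost:8080/completion", json={
        "prompt": [
            "a plain string prompt",                      # tokenized as text
            [12, 34, 56],                                 # already-tokenized prompt
            {"prompt_string": "an object-form prompt"},   # object form, no multimodal data
        ],
    })
    # One result per prompt is expected here; by contrast, "prompt": [12, 34, "x", 56]
    # contains raw integers and would be treated as a single mixed prompt.
    print(len(res.json()))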