28 lines
1013 B
Plaintext
28 lines
1013 B
Plaintext
#include "common.cuh"
|
|
#include "ggml.h"
|
|
|
|
#include <initializer_list>
|
|
|
|
struct ggml_cuda_topk_moe_args {
|
|
bool sigmoid{};
|
|
bool softmax{};
|
|
bool delayed_softmax{};
|
|
bool prob_bias{};
|
|
bool norm{};
|
|
bool scale{};
|
|
};
|
|
|
|
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
|
|
const ggml_tensor * logits,
|
|
ggml_tensor * weights,
|
|
ggml_tensor * ids,
|
|
const ggml_tensor * clamp,
|
|
const ggml_tensor * scale,
|
|
const ggml_tensor * bias,
|
|
const ggml_cuda_topk_moe_args & args);
|
|
|
|
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * gating_op,
|
|
const ggml_tensor * weights,
|
|
const ggml_tensor * logits,
|
|
const ggml_tensor * ids);
|