llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh

28 lines
1013 B
Plaintext

#include "common.cuh"
#include "ggml.h"
#include <initializer_list>
struct ggml_cuda_topk_moe_args {
bool sigmoid{};
bool softmax{};
bool delayed_softmax{};
bool prob_bias{};
bool norm{};
bool scale{};
};
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
const ggml_tensor * logits,
ggml_tensor * weights,
ggml_tensor * ids,
const ggml_tensor * clamp,
const ggml_tensor * scale,
const ggml_tensor * bias,
const ggml_cuda_topk_moe_args & args);
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * gating_op,
const ggml_tensor * weights,
const ggml_tensor * logits,
const ggml_tensor * ids);