157 lines
4.5 KiB
C
157 lines
4.5 KiB
C
#ifndef HTP_MSG_H
|
|
#define HTP_MSG_H
|
|
|
|
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
|
|
|
|
// ggml-common.h must be included prior to this header
|
|
|
|
// Bit mask selecting which stages of the Ops are enabled.
// Intended for debugging and profiling; the bits may be OR-ed together.
enum {
    HTP_OPMASK_QUEUE    = 0x1,  // Enable queueing (i.e. calls into the DSP)
    HTP_OPMASK_QUANTIZE = 0x2,  // Enable quantize
    HTP_OPMASK_COMPUTE  = 0x4,  // Enable compute
};
|
|
|
|
// Per-op flag bits; each flag occupies its own bit so they can be combined.
enum {
    HTP_OPFLAGS_SKIP_QUANTIZE = 0x1,  // Skip dynamic quantization (reuse quantized tensors)
    HTP_OPFLAGS_SKIP_COMPUTE  = 0x2,  // Skip actual computation (used for profiling)
    HTP_OPFLAGS_EARLY_WAKEUP  = 0x4,  // Send early wakeup notification
};
|
|
|
|
// Status codes for an HTP request.
// Numbering starts at 1, so zero is not a valid status value.
enum htp_status {
    HTP_STATUS_OK             = 1,
    HTP_STATUS_INTERNAL_ERR   = 2,
    HTP_STATUS_NO_SUPPORT     = 3,
    HTP_STATUS_INVAL_PARAMS   = 4,
    HTP_STATUS_VTCM_TOO_SMALL = 5,
};
|
|
|
|
// Tensor data types understood by the HTP code.
//
// Each value must stay equal to the corresponding ggml_type value. The list is
// duplicated here because the full ggml.h cannot be included in the htp build;
// static_asserts in the cpp code check that the two stay in sync.
//
// HTP_TYPE_COUNT has no explicit value, so it is one past HTP_TYPE_MXFP4 (40),
// not the number of enumerators listed here.
enum htp_data_type {
    HTP_TYPE_F32   = 0,
    HTP_TYPE_F16   = 1,
    HTP_TYPE_Q4_0  = 2,
    HTP_TYPE_Q8_0  = 8,
    HTP_TYPE_MXFP4 = 39,
    HTP_TYPE_COUNT
};
|
|
|
|
// Op identifiers. These values are manually translated over to HTP.
//
// !!!! DO NOT ALTER THE ORDER OF THE FIRST FOUR ENUMS !!!!
// (MUL, ADD, SUB, DIV must keep the values 0..3.)
//
// INVALID has no explicit value, so it is one past HTP_OP_ROPE (13).
enum htp_op {
    HTP_OP_MUL            = 0,
    HTP_OP_ADD            = 1,
    HTP_OP_SUB            = 2,
    HTP_OP_DIV            = 3,

    HTP_OP_MUL_MAT        = 4,
    HTP_OP_MUL_MAT_ID     = 5,
    HTP_OP_RMS_NORM       = 6,
    HTP_OP_UNARY_SILU     = 7,
    HTP_OP_GLU_SWIGLU     = 8,
    HTP_OP_GLU_SWIGLU_OAI = 9,
    HTP_OP_SOFTMAX        = 10,
    HTP_OP_ADD_ID         = 11,
    HTP_OP_ROPE           = 12,

    INVALID
};
|
|
|
|
static inline size_t htp_type_block_size(uint32_t t) {
|
|
switch (t) {
|
|
case HTP_TYPE_F32:
|
|
return 1;
|
|
case HTP_TYPE_F16:
|
|
return 1;
|
|
case HTP_TYPE_Q4_0:
|
|
return QK4_0;
|
|
case HTP_TYPE_Q8_0:
|
|
return QK8_0;
|
|
case HTP_TYPE_MXFP4:
|
|
return QK_MXFP4;
|
|
default:
|
|
assert(0 && "unsupported HTP data type");
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static inline size_t htp_type_nbytes(uint32_t t) {
|
|
switch (t) {
|
|
case HTP_TYPE_F32:
|
|
return 4;
|
|
case HTP_TYPE_F16:
|
|
return 2;
|
|
case HTP_TYPE_Q4_0:
|
|
return sizeof(block_q4_0);
|
|
case HTP_TYPE_Q8_0:
|
|
return sizeof(block_q8_0);
|
|
case HTP_TYPE_MXFP4:
|
|
return sizeof(block_mxfp4);
|
|
default:
|
|
assert(0 && "unsupported HTP data type");
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static const char * htp_type_name(uint32_t t) {
|
|
switch (t) {
|
|
case HTP_TYPE_F32:
|
|
return "fp32";
|
|
case HTP_TYPE_F16:
|
|
return "fp16";
|
|
case HTP_TYPE_Q4_0:
|
|
return "q4_0";
|
|
case HTP_TYPE_Q8_0:
|
|
return "q8_0";
|
|
case HTP_TYPE_MXFP4:
|
|
return "mxfp4";
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// Internal types: element counts of the repacked "x4x2" layouts.
// Each layout holds 4 blocks together with the next 4 blocks.
#define QK_Q4_0x4x2  256  // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
#define QK_Q8_0x4x2  256  // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
#define QK_MXFP4x4x2 256  // 4x MXFP4 blocks concat with next 4x MXFP4 blocks
|
|
|
|
// Maximum number of tensor dimensions carried in a message.
#define HTP_MAX_DIMS 4

// Tensor descriptor as carried in HTP messages.
// Every member is 32 bits wide: 2 scalars + 2 arrays of HTP_MAX_DIMS entries.
struct htp_tensor {
    uint32_t data;              // Buffer offset in the messages, and data pointer on the NSP
    uint32_t type;              // Data type
    uint32_t ne[HTP_MAX_DIMS];  // Number of elements
    uint32_t nb[HTP_MAX_DIMS];  // Stride in bytes (see ggml.h ggml_tensor)
};
|
|
|
|
#define HTP_MAX_OP_PARAMS 64
|
|
|
|
struct htp_general_req {
|
|
uint32_t op; // GGML/HTP Op
|
|
int32_t op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)];
|
|
// Params for the op, e.g. epsilon of RMS norm
|
|
uint32_t flags; // Request flags
|
|
|
|
struct htp_tensor src0; // Input0 tensor
|
|
struct htp_tensor src1; // Input1 tensor
|
|
struct htp_tensor src2; // Input2 tensor
|
|
struct htp_tensor dst; // Output tensor
|
|
|
|
// should be multiple of 64 bytes (cacheline)
|
|
};
|
|
|
|
// General response message.
// Five 32-bit fields (20 bytes) plus 44 bytes of padding: 64 bytes in total.
struct htp_general_rsp {
    uint32_t op;           // GGML/HTP Op
    uint32_t status;       // HTP_STATUS_...
    uint32_t prof_usecs;   // Number of usec per request
    uint32_t prof_cycles;  // Number of cycles per request
    uint32_t prof_pkts;    // Number of instruction packets per request
    uint8_t  unused[44];   // Pad to 64 bytes
};
|
|
|
|
// Largest message size: the size of the general request struct.
// The replacement list is parenthesized so the macro always expands to a
// single primary expression, whatever operators surround it at the use site.
#define HTP_MAX_MESSAGE_SIZE   (sizeof(struct htp_general_req))

// Maximum number of packet buffers.
#define HTP_MAX_PACKET_BUFFERS 4
|
|
|
|
#endif /* HTP_MSG_H */
|