Merge remote-tracking branch 'ngxson/xsn/server_model_management_v1_2' into allozaur/server_model_management_v1_2
This commit is contained in: commit 11c26ecfca
@@ -1,9 +1,7 @@
-ARG UBUNTU_VERSION=25.10
+ARG UBUNTU_VERSION=26.04

 FROM ubuntu:$UBUNTU_VERSION AS build

 # Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html

 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils
@@ -351,16 +351,10 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
      - name: Build
        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+          cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup

@@ -374,13 +368,6 @@ jobs:
        run: |
          pip install -r tools/server/tests/requirements.txt

-      - name: Copy Libcurl
-        id: prepare_libcurl
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
-
      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
@@ -2488,6 +2488,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.models_max = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+    add_opt(common_arg(
+        {"--models-allow-extra-args"},
+        string_format("for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: %s)", params.models_allow_extra_args ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.models_allow_extra_args = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS"));
    add_opt(common_arg(
        {"--no-models-autoload"},
        "disables automatic loading of models (default: enabled)",
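For orientation, a hedged usage sketch of how the router-server options added above might be combined. Only `--models-max` (env `LLAMA_ARG_MODELS_MAX`), `--models-allow-extra-args` (env `LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS`), and `--no-models-autoload` appear in this diff; the `--models-dir` spelling and the directory path are assumptions for illustration.

```bash
# Hypothetical sketch; only --models-max, --models-allow-extra-args and
# --no-models-autoload are taken from the diff above. --models-dir and the
# model directory path are assumptions.
llama-server --models-dir ./models --models-max 2 --no-models-autoload

# The same limit set through the environment variable named in the diff:
LLAMA_ARG_MODELS_MAX=2 llama-server --models-dir ./models
```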
@@ -462,6 +462,7 @@ struct common_params {
    std::string models_dir = ""; // directory containing models for the router server
    int models_max = 4; // maximum number of models to load simultaneously
    bool models_autoload = true; // automatically load models when requested via the router server
+    bool models_allow_extra_args = false; // allow passing extra arguments when loading models via the router server

    bool log_json = false;
@@ -4183,6 +4183,21 @@ class Qwen3MoeModel(Qwen2MoeModel):
         super().set_vocab()


+@ModelBase.register("RND1")
+class RND1Model(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.RND1
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # RND1 specific parameters
+        # RND1 uses bidirectional attention
+        self.gguf_writer.add_causal_attention(False)
+
+        if (mask_token_id := self.hparams.get("mask_token_id")) is not None:
+            self.gguf_writer.add_mask_token_id(mask_token_id)
+
+
 @ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
 class Qwen3VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
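As a usage note, the new `RND1Model` class above would presumably be exercised through the HF-to-GGUF conversion script it extends; the script name, checkpoint path, and output file below are assumptions (the GGUF file name only mirrors the RND1 example that appears later in the diffusion README).

```bash
# Hypothetical sketch: convert an RND1 checkpoint so that
# @ModelBase.register("RND1") selects RND1Model during conversion.
# Script location, paths, and file names are assumptions.
python convert_hf_to_gguf.py ./RND1-Base-0910 --outfile RND1-Base-0910.gguf --outtype f16
```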
@@ -3,7 +3,7 @@
 The example demonstrates batched generation from a given prompt

 ```bash
-./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
+./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 --kv-unified
 ...
@@ -6,8 +6,54 @@ More Info:
- https://github.com/ggml-org/llama.cpp/pull/14644
- https://github.com/ggml-org/llama.cpp/pull/14771

## Parameters
The diffusion CLI supports various parameters to control the generation process:

Example of using Dream architecture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`

### Core Diffusion Parameters
- `--diffusion-steps`: Number of diffusion steps (default: 256)
- `--diffusion-algorithm`: Algorithm for token selection
  - `0`: ORIGIN - Tokens are generated in a purely random order, from https://arxiv.org/abs/2107.03006
  - `1`: ENTROPY_BASED - Entropy-based selection
  - `2`: MARGIN_BASED - Margin-based selection
  - `3`: RANDOM - Random selection
  - `4`: CONFIDENCE_BASED - Confidence-based selection (default)
  - More documentation here: https://github.com/DreamLM/Dream
- `--diffusion-visual`: Enable live visualization during generation

Example of using LLaDA architecture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`

### Scheduling Parameters
Choose one of the following scheduling methods:

**Timestep-based scheduling:**
- `--diffusion-eps`: Epsilon value for timestep scheduling (e.g., 0.001)

**Block-based scheduling:**
- `--diffusion-block-length`: Block size for block-based scheduling (e.g., 32)

### Sampling Parameters
- `--temp`: Temperature for sampling (0.0 = greedy/deterministic, higher = more random)
- `--top-k`: Top-k filtering for sampling
- `--top-p`: Top-p (nucleus) filtering for sampling
- `--seed`: Random seed for reproducibility

### Model Parameters
- `-m`: Path to the GGUF model file
- `-p`: Input prompt text
- `-ub`: Maximum sequence length (ubatch size)
- `-c`: Context size
- `-b`: Batch size

### Examples
#### Dream architecture:
```
llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual
```

#### LLaDA architecture:
```
llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual
```

#### RND1 architecture:
```
llama-diffusion-cli -m RND1-Base-0910.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-algorithm 1 --diffusion-steps 256 --diffusion-visual --temp 0.5 --diffusion-eps 0.001
```
@@ -25,16 +25,17 @@ if(GIT_EXE)
    )
endif()

# Build the version string with optional dirty flag
set(GGML_VERSION "${GGML_VERSION_BASE}")
if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
    set(GGML_VERSION "${GGML_VERSION}-dirty")
endif()

if(NOT GGML_BUILD_COMMIT)
    set(GGML_BUILD_COMMIT "unknown")
endif()

# Build the commit string with optional dirty flag
if(DEFINED GGML_GIT_DIRTY AND GGML_GIT_DIRTY EQUAL 1)
    set(GGML_BUILD_COMMIT "${GGML_BUILD_COMMIT}-dirty")
endif()

include(CheckIncludeFileCXX)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -328,6 +328,14 @@ function(ggml_add_cpu_backend_variant tag_name)
            set(GGML_INTERNAL_${feat} OFF)
        endforeach()

        foreach (feat ${ARGN})
            set(GGML_INTERNAL_${feat} ON)
        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
+        foreach (feat RVV)
+            set(GGML_INTERNAL_${feat} OFF)
+        endforeach()
+
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()

@@ -402,6 +410,13 @@ if (GGML_CPU_ALL_VARIANTS)
        else()
            message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
        endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            ggml_add_cpu_backend_variant(riscv64_0)
+            ggml_add_cpu_backend_variant(riscv64_v RVV)
+        else()
+            message(FATAL_ERROR "Unsupported RISC-V target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
    else()
        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
    endif()
@@ -2303,9 +2303,9 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
     // calculate rope cache for fist layer in current device.
     cann_ctx->rope_cache.cached = false;

+    bool cann_graph_update_required = false;
 #ifdef USE_ACL_GRAPH
     bool use_cann_graph = true;
-    bool cann_graph_update_required = false;

     static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
     if (!prefill_use_graph) {

@@ -2336,7 +2336,6 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
     }
 #else
     bool use_cann_graph = false;
-    bool cann_graph_update_required = false;
 #endif // USE_ACL_GRAPH
     evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required);
@@ -452,6 +452,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                ggml-cpu/spacemit/ime_kernels.h
                )
        endif()
+       if(NOT GGML_CPU_ALL_VARIANTS)
            set(MARCH_STR "rv64gc")
            if (GGML_RV_ZFH)
                string(APPEND MARCH_STR "_zfh")

@@ -468,6 +469,18 @@
                string(APPEND MARCH_STR "_zicbop")
            endif()
            list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
+       else()
+           # Begin with the lowest baseline
+           set(ARCH_DEFINITIONS "")
+
+           if (GGML_INTERNAL_RVV)
+               message(STATUS "RVV enabled")
+               list(APPEND ARCH_DEFINITIONS GGML_USE_RVV)
+               list(APPEND ARCH_FLAGS -march=rv64gc_v -mabi=lp64d)
+           endif()
+
+           ggml_add_cpu_backend_features(${GGML_CPU_NAME} riscv ${ARCH_DEFINITIONS})
+       endif()
    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
        message(STATUS "s390x detected")
        list(APPEND GGML_CPU_SOURCES
@@ -51,10 +51,8 @@
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
 // repack.cpp
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
-#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
-#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
@@ -24,6 +24,29 @@

 #define UNUSED GGML_UNUSED

+static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
+                                             int16x8_t *     out_mins,
+                                             int8_t *        out_scales) {
+    constexpr uint32_t kmask1 = 0x3f3f3f3f;
+    constexpr uint32_t kmask2 = 0x0f0f0f0f;
+    constexpr uint32_t kmask3 = 0x03030303;
+    constexpr uint8_t  scales_size = 12;
+
+    uint32_t sm[3];
+    memcpy(sm, scales_in, scales_size);
+
+    const uint32_t   mins_0_3 = sm[1] & kmask1;
+    const uint32_t   mins_4_7 = ((sm[2] >> 4) & kmask2) | (((sm[1] >> 6) & kmask3) << 4);
+    const uint32x2_t mins_u32 = { mins_0_3, mins_4_7 };
+
+    *out_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins_u32)));
+
+    uint32_t scales_u32[2];
+    scales_u32[0] = sm[0] & kmask1;
+    scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
+    memcpy(out_scales, scales_u32, 8);
+}
+
 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
@ -474,6 +497,162 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_q4_K_8x8_q8_K(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
const void * GGML_RESTRICT vx,
|
||||
const void * GGML_RESTRICT vy,
|
||||
int nr,
|
||||
int nc) {
|
||||
constexpr int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
|
||||
constexpr int ncols_interleaved = 8;
|
||||
constexpr int blocklen = 8;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nr % 4 == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON)
|
||||
constexpr int col_pairs = ncols_interleaved / 2;
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||
|
||||
// 1x8 tile = 2 x 4
|
||||
float32x4_t acc_f32[ncols_interleaved / 4];
|
||||
|
||||
const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
|
||||
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
||||
|
||||
for (int i = 0; i < ncols_interleaved / 4; i++) {
|
||||
acc_f32[i] = vdupq_n_f32(0);
|
||||
}
|
||||
|
||||
for (int b = 0; b < nb; b++) {
|
||||
float32x4_t q4_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d)); // d0 d1 d2 d3
|
||||
float32x4_t q4_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4)); // d4 d5 d6 d7
|
||||
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d);
|
||||
float32x4_t sb_scale_0 = vmulq_f32(q4_d_0, q8_d);
|
||||
float32x4_t sb_scale_1 = vmulq_f32(q4_d_1, q8_d);
|
||||
float32x4_t q4_dmin_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin)); // dmin 0..3
|
||||
float32x4_t q4_dmin_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4)); // dmin 4..7
|
||||
float32x4_t sb_min_0 = vmulq_f32(q4_dmin_0, q8_d);
|
||||
float32x4_t sb_min_1 = vmulq_f32(q4_dmin_1, q8_d);
|
||||
|
||||
// interleaved bias_acc: [0]->r0 0123, [1]->r0 4567
|
||||
int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
|
||||
// 2 sb each iteration
|
||||
int32x4_t acc_lo[col_pairs];
|
||||
int32x4_t acc_hi[col_pairs];
|
||||
|
||||
// Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block
|
||||
const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8));
|
||||
int16_t bsums_arr[8];
|
||||
vst1q_s16(bsums_arr, bsums);
|
||||
for (int sb = 0; sb < QK_K / 64; sb++) {
|
||||
for (int i = 0; i < col_pairs; i++) {
|
||||
acc_lo[i] = vdupq_n_s32(0);
|
||||
acc_hi[i] = vdupq_n_s32(0);
|
||||
}
|
||||
// Need scales for the low and high nibbles
|
||||
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
|
||||
int16x8_t q4sb_mins[2]; // int16 as its needed for bias_acc later
|
||||
int16x8_t q4sb_scales[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int8_t aux_q4sb[8];
|
||||
const int offset = sb * 24 + i * 12;
|
||||
decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
|
||||
q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
|
||||
}
|
||||
|
||||
const uint8_t * q4_base = q4_ptr[b].qs + sb * QK_K;
|
||||
|
||||
// Load the 64 quants from q8K duplicated to use vecdots with the interelaved columns
|
||||
// but still need the qs to use the low and hi bits from q4
|
||||
const int8_t * q8_base = q8_ptr[b].qs + sb * 64;
|
||||
int8x16_t q8_qs[8];
|
||||
for (int i = 0; i < 8; i++) {
|
||||
q8_qs[i] = (int8x16_t) vld1q_dup_s64((const int64_t *) (q8_base + i * 8));
|
||||
}
|
||||
|
||||
// Q4s columns iterated in pairs (01, 23, 45, 67)
|
||||
for (int cp = 0; cp < col_pairs; cp++) {
|
||||
uint8x16_t q4_qs_cp_0 = vld1q_u8(q4_base + 16 * cp);
|
||||
uint8x16_t q4_qs_cp_1 = vld1q_u8(q4_base + 16 * cp + 64);
|
||||
uint8x16_t q4_qs_cp_2 = vld1q_u8(q4_base + 16 * cp + 128);
|
||||
uint8x16_t q4_qs_cp_3 = vld1q_u8(q4_base + 16 * cp + 192);
|
||||
|
||||
acc_lo[cp] =
|
||||
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_0, m4b)), q8_qs[0]); // 0 .. 7
|
||||
acc_lo[cp] =
|
||||
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_1, m4b)), q8_qs[1]); // 8 ..15
|
||||
acc_lo[cp] =
|
||||
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_2, m4b)), q8_qs[2]); // 16..23
|
||||
acc_lo[cp] =
|
||||
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_3, m4b)), q8_qs[3]); // 24..31
|
||||
|
||||
acc_hi[cp] =
|
||||
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_0, 4)), q8_qs[4]); // 32..39
|
||||
acc_hi[cp] =
|
||||
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_1, 4)), q8_qs[5]); // 40..47
|
||||
acc_hi[cp] =
|
||||
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_2, 4)), q8_qs[6]); // 48..55
|
||||
acc_hi[cp] =
|
||||
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_3, 4)), q8_qs[7]); // 56..63
|
||||
}
|
||||
|
||||
// Iterates over a pair of column pairs (4 columns) to use a single 128 register
|
||||
// p = 0 -> 0123 p2 -> 4567
|
||||
for (int i = 0, p = 0; p < col_pairs; i++, p += 2) {
|
||||
int16x4_t group_scales_lo = p == 0 ? vget_low_s16(q4sb_scales[0]) : vget_high_s16(q4sb_scales[0]);
|
||||
int16x4_t group_scales_hi = p == 0 ? vget_low_s16(q4sb_scales[1]) : vget_high_s16(q4sb_scales[1]);
|
||||
float32x4_t sb_scale = p == 0 ? sb_scale_0 : sb_scale_1;
|
||||
|
||||
// 0123 or 4567
|
||||
// TODO: Single superblock mul at the end of the superblock
|
||||
float32x4_t sumf_0 =
|
||||
vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_lo), vpaddq_s32(acc_lo[p], acc_lo[p + 1])));
|
||||
acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_0);
|
||||
|
||||
float32x4_t sumf_1 =
|
||||
vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_hi), vpaddq_s32(acc_hi[p], acc_hi[p + 1])));
|
||||
acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_1);
|
||||
}
|
||||
|
||||
// Multiply Acc bsum + mins
|
||||
// Each pair of subblocks share the same bsums
|
||||
// Load scalar bsum → broadcast to a vector (vdupq_n_s16(s)).
|
||||
int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]);
|
||||
int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]);
|
||||
|
||||
// cols 0-3 bias
|
||||
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
|
||||
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
|
||||
|
||||
// cols 4-7 bias
|
||||
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
|
||||
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
|
||||
} // for sb
|
||||
|
||||
acc_f32[0] = vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0);
|
||||
acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_1);
|
||||
} // for b
|
||||
|
||||
int base = x * ncols_interleaved;
|
||||
vst1q_f32(s + base, acc_f32[0]);
|
||||
vst1q_f32(s + base + 4, acc_f32[1]);
|
||||
} // for x
|
||||
return;
|
||||
#endif // defined(__aarch64__) && defined(__ARM_NEON)
|
||||
ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
|
@ -1889,3 +2068,212 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
|
||||
ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_K_8x8_q8_K(int n,
|
||||
float * GGML_RESTRICT s,
|
||||
size_t bs,
|
||||
const void * GGML_RESTRICT vx,
|
||||
const void * GGML_RESTRICT vy,
|
||||
int nr,
|
||||
int nc) {
|
||||
constexpr int qk = QK_K;
|
||||
const int nb = n / qk;
|
||||
|
||||
constexpr int ncols_interleaved = 8;
|
||||
constexpr int blocklen = 8;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nr % 4 == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
constexpr int q8_k_blocklen = 4;
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||
|
||||
// 8 accumulators: 2 row pairs × 4 col pairs
|
||||
float32x4_t acc_f32[blocklen];
|
||||
|
||||
for (int y = 0; y < nr / q8_k_blocklen; y++) {
|
||||
const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
||||
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
||||
|
||||
for (int i = 0; i < blocklen; i++) {
|
||||
acc_f32[i] = vdupq_n_f32(0);
|
||||
}
|
||||
|
||||
for (int b = 0; b < nb; b++) {
|
||||
// bsums pairs belongs to the same q8_k subblock
|
||||
const int16x8_t bsums[4]{
|
||||
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
|
||||
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
|
||||
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)),
|
||||
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)),
|
||||
};
|
||||
int16_t bsums_arr[4][8];
|
||||
for (int q8_row = 0; q8_row < 4; q8_row++) {
|
||||
vst1q_s16(bsums_arr[q8_row], bsums[q8_row]);
|
||||
}
|
||||
|
||||
int32x4_t sb_acc[4]; // Aux accumulators to store subblock (partial) results
|
||||
int32x4_t acc[8]; // rows 01 stored in [0][1][2][3] rows 23 stored in [4][5][6][7]
|
||||
int32x4_t bias_acc[8]; // interleaved bias_acc: [0]->r0 0123, [1]->r0 4567, [2]->r1 0123 ...
|
||||
for (int i = 0; i < 8; i++) {
|
||||
acc[i] = vdupq_n_s32(0);
|
||||
bias_acc[i] = vdupq_n_s32(0);
|
||||
}
|
||||
|
||||
for (int sb = 0; sb < QK_K / 64; sb++) {
|
||||
// Need scales for the low and high nibbles
|
||||
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
|
||||
int8_t q4sb_scales[2][8];
|
||||
int16x8_t q4sb_mins[2]; // int16 as its needed for bias_acc later
|
||||
for (int i = 0; i < 2; i++) {
|
||||
const int offset = sb * 24 + i * 12;
|
||||
decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], q4sb_scales[i]);
|
||||
}
|
||||
|
||||
// q8_ptr[b].qs has interleaved Q8 rows (01, 23)
|
||||
const int8_t * q8_base = q8_ptr[b].qs + sb * 256;
|
||||
|
||||
int8x16_t q8_qs_01[8];
|
||||
int8x16_t q8_qs_23[8];
|
||||
|
||||
// Load 32-byte per row pair, 1 subblock each time
|
||||
for (int i = 0; i < 8; i++) {
|
||||
const int offset = i * 32; // 16 for row 01, 16 for row 23
|
||||
q8_qs_01[i] = vld1q_s8(q8_base + offset);
|
||||
q8_qs_23[i] = vld1q_s8(q8_base + offset + 16);
|
||||
}
|
||||
|
||||
const int8x16_t q8s[2][8] = {
|
||||
{ q8_qs_01[0], q8_qs_01[1], q8_qs_01[2], q8_qs_01[3],
|
||||
q8_qs_01[4], q8_qs_01[5], q8_qs_01[6], q8_qs_01[7] },
|
||||
{ q8_qs_23[0], q8_qs_23[1], q8_qs_23[2], q8_qs_23[3],
|
||||
q8_qs_23[4], q8_qs_23[5], q8_qs_23[6], q8_qs_23[7] },
|
||||
};
|
||||
|
||||
// Q4s columns iterated in pairs (01, 23, 45, 67)
|
||||
for (int cp = 0; cp < ncols_interleaved / 2; cp++) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
sb_acc[i] = vdupq_n_s32(0);
|
||||
}
|
||||
|
||||
uint8x16_t q4_qs_cp_0 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 0); // 0 .. 7 & 32..39
|
||||
uint8x16_t q4_qs_cp_1 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 64); // 8 ..15 & 40..47
|
||||
uint8x16_t q4_qs_cp_2 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 128); // 16..23 & 48..55
|
||||
uint8x16_t q4_qs_cp_3 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 192); // 24..31 & 56..63
|
||||
const int8x16_t q4_nibbles[2][4] = {
|
||||
{
|
||||
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_0, m4b)),
|
||||
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_1, m4b)),
|
||||
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_2, m4b)),
|
||||
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_3, m4b)),
|
||||
},
|
||||
{
|
||||
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_0, 4)),
|
||||
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_1, 4)),
|
||||
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_2, 4)),
|
||||
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_3, 4)),
|
||||
}
|
||||
};
|
||||
|
||||
// Calculates the Qs muladd of every row pair (rp) rows 01 and 23 of q8
|
||||
// for each of the internal 32 qs subblock (blk)
|
||||
for (int rp = 0; rp < 2; rp++) {
|
||||
for (int blk = 0; blk < 2; blk++) {
|
||||
const int8x16_t * q8 = &q8s[rp][4 * blk];
|
||||
const int8x16_t * q4 = q4_nibbles[blk];
|
||||
int32x4_t acc = sb_acc[2 * rp + blk];
|
||||
// mul add for each qs in the same subblock
|
||||
for (int qs_offset = 0; qs_offset < 4; qs_offset++) {
|
||||
acc = vmmlaq_s32(acc, q4[qs_offset], q8[qs_offset]);
|
||||
}
|
||||
sb_acc[2 * rp + blk] = acc;
|
||||
}
|
||||
}
|
||||
|
||||
// Scales[i] corresponds to column i
|
||||
const int scale_offset = cp * 2;
|
||||
for (int blk = 0; blk < 2; blk++) {
|
||||
const int32x4_t block_scale = {
|
||||
(int32_t) q4sb_scales[blk][scale_offset],
|
||||
(int32_t) q4sb_scales[blk][scale_offset],
|
||||
(int32_t) q4sb_scales[blk][scale_offset + 1],
|
||||
(int32_t) q4sb_scales[blk][scale_offset + 1],
|
||||
};
|
||||
acc[cp] = vmlaq_s32(acc[cp], sb_acc[blk], block_scale);
|
||||
acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc[blk + 2], block_scale);
|
||||
}
|
||||
}
|
||||
|
||||
// Multiply Acc bsum + mins
|
||||
for (int q8_row = 0; q8_row < 4; q8_row++) {
|
||||
// Each pair of subblocks share the same bsums
|
||||
// Load scalar bsum → broadcast to a vector (vdupq_n_s16(s)).
|
||||
int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][q8_row * 2]);
|
||||
int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][q8_row * 2 + 1]);
|
||||
|
||||
bias_acc[2 * q8_row] =
|
||||
vmlal_s16(bias_acc[2 * q8_row], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
|
||||
bias_acc[2 * q8_row] =
|
||||
vmlal_s16(bias_acc[2 * q8_row], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
|
||||
bias_acc[2 * q8_row + 1] =
|
||||
vmlal_s16(bias_acc[2 * q8_row + 1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
|
||||
bias_acc[2 * q8_row + 1] =
|
||||
vmlal_s16(bias_acc[2 * q8_row + 1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
|
||||
}
|
||||
} // for sb
|
||||
|
||||
// Reorder of i8mm output with bias and output layout
|
||||
for (int i = 0; i < 8; i++) {
|
||||
int32x2x2_t aux = vzip_s32(vget_low_s32(acc[i]), vget_high_s32(acc[i]));
|
||||
acc[i] = vcombine_s32(aux.val[0], aux.val[1]);
|
||||
}
|
||||
int32x4_t reorder_acc[8] = {
|
||||
vcombine_s32(vget_low_s32(acc[0]), vget_low_s32(acc[1])),
|
||||
vcombine_s32(vget_low_s32(acc[2]), vget_low_s32(acc[3])),
|
||||
vcombine_s32(vget_high_s32(acc[0]), vget_high_s32(acc[1])),
|
||||
vcombine_s32(vget_high_s32(acc[2]), vget_high_s32(acc[3])),
|
||||
vcombine_s32(vget_low_s32(acc[4]), vget_low_s32(acc[5])),
|
||||
vcombine_s32(vget_low_s32(acc[6]), vget_low_s32(acc[7])),
|
||||
vcombine_s32(vget_high_s32(acc[4]), vget_high_s32(acc[5])),
|
||||
vcombine_s32(vget_high_s32(acc[6]), vget_high_s32(acc[7])),
|
||||
};
|
||||
|
||||
for (int i = 0; i < q8_k_blocklen; i++) {
|
||||
for (int j = 0; j < 2; j++) {
|
||||
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d[i]);
|
||||
float32x4_t q4_dmin = vcvt_f32_f16(vld1_f16((const __fp16 *) (q4_ptr[b].dmin + j * 4)));
|
||||
const float32x4_t dmins = vmulq_f32(q4_dmin, q8_d);
|
||||
|
||||
float32x4_t q4_d = vcvt_f32_f16(vld1_f16((const __fp16 *) (q4_ptr[b].d + j * 4)));
|
||||
const float32x4_t scale = vmulq_f32(q4_d, q8_d);
|
||||
|
||||
acc_f32[2 * i + j] = vmlsq_f32(acc_f32[2 * i + j], vcvtq_f32_s32(bias_acc[2 * i + j]), dmins);
|
||||
acc_f32[2 * i + j] =
|
||||
vmlaq_f32(acc_f32[2 * i + j], vcvtq_f32_s32(reorder_acc[2 * i + j]), scale);
|
||||
}
|
||||
}
|
||||
} // for b
|
||||
|
||||
// With the previous reorder, the tile is already in the correct memory layout.
|
||||
for (int i = 0; i < q8_k_blocklen; i++) {
|
||||
int row = y * q8_k_blocklen + i;
|
||||
for (int j = 0; j < 2; j++) {
|
||||
int col = x * ncols_interleaved + j * 4;
|
||||
int offset = row * bs + col;
|
||||
vst1q_f32(s + offset, acc_f32[2 * i + j]);
|
||||
}
|
||||
}
|
||||
} // for x
|
||||
} // for y
|
||||
return;
|
||||
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,35 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__riscv) && __riscv_xlen == 64
+#include <sys/auxv.h>
+
+//https://github.com/torvalds/linux/blob/master/arch/riscv/include/uapi/asm/hwcap.h#L24
+#ifndef COMPAT_HWCAP_ISA_V
+#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
+#endif
+
+struct riscv64_features {
+    bool has_rvv = false;
+
+    riscv64_features() {
+        uint32_t hwcap = getauxval(AT_HWCAP);
+
+        has_rvv = !!(hwcap & COMPAT_HWCAP_ISA_V);
+    }
+};
+
+static int ggml_backend_cpu_riscv64_score() {
+    int score = 1;
+    riscv64_features rf;
+
+#ifdef GGML_USE_RVV
+    if (!rf.has_rvv) { return 0; }
+    score += 1 << 1;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_riscv64_score)
+
+#endif // __riscv && __riscv_xlen == 64
@@ -1961,6 +1961,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                return &q4_K_8x8_q8_K;
            }
        }
+        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &q4_K_8x8_q8_K;
+            }
+        }
    } else if (cur->type == GGML_TYPE_Q2_K) {
        if (ggml_cpu_has_avx512()) {
            if (cur->ne[1] % 8 == 0) {
@ -212,6 +212,6 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
|
|||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
|
||||
static __device__ void cpy_1_scalar(const char * cxi, char * cdsti) {
|
||||
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available
|
|||
const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
|
||||
static __global__ void cpy_scalar(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
|
|
@ -40,7 +40,7 @@ static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int ne,
|
||||
static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
|
|
@ -166,7 +166,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
|
|||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static __global__ void cpy_flt_contiguous(const char * cx, char * cdst, const int64_t ne) {
|
||||
static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const int64_t ne) {
|
||||
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
|
|
@ -180,17 +180,17 @@ static __global__ void cpy_flt_contiguous(const char * cx, char * cdst, const in
|
|||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static void ggml_cpy_flt_contiguous_cuda(
|
||||
static void ggml_cpy_scalar_contiguous_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
cudaStream_t stream) {
|
||||
|
||||
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_flt_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne);
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t, bool transposed = false>
|
||||
static void ggml_cpy_flt_cuda(
|
||||
static void ggml_cpy_scalar_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
|
@ -212,11 +212,11 @@ static void ggml_cpy_flt_cuda(
|
|||
(ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
|
||||
(ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM);
|
||||
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
|
||||
cpy_flt_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
|
||||
cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
|
||||
(cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
} else {
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
}
|
||||
|
|
@ -399,94 +399,132 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
|||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_flt_cuda<float, float, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<float, float, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<float, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<float, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<float, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<float, half> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<float, half>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<float, half>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q8_0_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q8_0_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q4_0_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q4_0_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q4_1_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q4_1_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
|
||||
ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q5_0_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q5_0_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
|
||||
ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_iq4_nl_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
|
||||
ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q5_1_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q5_1_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_flt_cuda<half, half, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<half, half, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<half, half>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<half, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<half, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<half, float> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<half, float>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<half, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<nv_bfloat16, half>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, half>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<nv_bfloat16, float>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_scalar_cuda<int32_t, int32_t, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<int32_t, int32_t>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<float, int32_t>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<float, int32_t>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<int32_t, float>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<int32_t, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
|
|
|
|||
|
|
@@ -4115,6 +4115,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32) {
                return true;
            }
+            if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_I32) {
+                return true;
+            }
            if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
                return true;
            }
@ -43,6 +43,14 @@ set(HTP_CMAKE_ARGS
|
|||
-DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT}
|
||||
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG})
|
||||
|
||||
ExternalProject_Add(htp-v68
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
|
||||
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v68 -DPREBUILT_LIB_DIR="toolv19_v68")
|
||||
|
||||
ExternalProject_Add(htp-v69
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
|
||||
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v69 -DPREBUILT_LIB_DIR="toolv19_v69")
|
||||
|
||||
ExternalProject_Add(htp-v73
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
|
||||
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73")
|
||||
|
|
@ -61,6 +69,8 @@ ExternalProject_Add(htp-v81
|
|||
|
||||
# Install Hexagon skels required at runtime
|
||||
install(FILES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v68.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v69.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v73.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v75.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v79.so
|
||||
|
|
|
|||
|
|
@@ -9,6 +9,7 @@
 #include <chrono>
 #include <mutex>
+#include <string>
 #include <stdexcept>

 #ifdef _WIN32
 # include <sal.h>
@ -240,6 +241,23 @@ struct ggml_hexagon_session {
|
|||
uint32_t prof_pkts;
|
||||
};
|
||||
|
||||
static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
|
||||
names, dims, types, strides, buffs, req_flags);
|
||||
}
|
||||
|
||||
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
|
||||
// Bump pending flag (cleared in the session::flush once we get the responce)
|
||||
this->op_pending++; // atomic inc
|
||||
|
|
@@ -1912,6 +1930,15 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t
     return true;
 }

+template <typename... _TTensor>
+static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, _TTensor... tensors) {
+    return ([&]() -> bool {
+        return !tensors || !tensors->buffer ||
+               (ggml_backend_buffer_is_hexagon(tensors->buffer) &&
+                ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess);
+    }() && ...);
+}
+
 static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
@ -1959,16 +1986,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
|||
}
|
||||
|
||||
// src0 & src1 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -2016,20 +2034,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
|
|||
|
||||
// src0 (weights) must be repacked and mapped to the same session
|
||||
// src1 & sr2 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src2->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -2063,16 +2068,7 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
|
|||
}
|
||||
|
||||
// src0, src1 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -2104,20 +2100,7 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
|
|||
}
|
||||
|
||||
// src0, src1 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src2->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -2144,12 +2127,7 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
|
|||
}
|
||||
|
||||
// src0 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -2186,16 +2164,7 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
|
|||
}
|
||||
|
||||
// src0, src1 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1 && src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@@ -2248,16 +2217,7 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
}
// src0, src1 & dst must be mapped to the same session
if (src0->buffer &&
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
return false;
}
if (src1 && src1->buffer &&
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
return false;
}
if (dst->buffer &&
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
if (!hex_supported_buffer(sess, src0, src1, dst)) {
return false;
}
@@ -2269,7 +2229,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
int mode = op_params[2];
if ((mode & GGML_ROPE_TYPE_NEOX) || (mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
return false;
}
if (mode & 1) {
@@ -2312,20 +2272,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
}
// src0, src1, src2 & dst must be mapped to the same session
if (src0->buffer &&
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
return false;
}
if (src1->buffer &&
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
return false;
}
if (src2 && src2->buffer &&
(!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
return false;
}
if (dst->buffer &&
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
return false;
}
@@ -2346,6 +2293,26 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
h->nb[3] = t->nb[3];
}

static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) {
if (!t) {
return 0;
}

memset(buf, 0, sizeof(*buf));
auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
buf->fd = tensor_buf->fd;
buf->ptr = t->data;
buf->offset = (uint8_t *) t->data - tensor_buf->base;
buf->size = ggml_nbytes(t);
buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU
buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP
return 1;
}

static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) {
return static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context)->sess;
}

static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
auto sess = buf->sess;
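The call sites below use this helper to replace the hand-rolled descriptor setup. A minimal usage sketch (illustrative only; it mirrors the mul_mat path further down): static weights need no cache maintenance, CPU-written inputs flush the sender and invalidate the recipient, DSP-written outputs only flush the sender.

// Sketch, not part of the patch: packing an op's tensors for the DSP queue.
dspqueue_buffer bufs[3];
size_t n_bufs = 0;
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src0, false, false); // static weights
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true,  true);  // CPU-written input
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst,  true,  false); // DSP-written output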
@@ -2360,10 +2327,6 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
const struct ggml_tensor * src1 = op->src[1];
const struct ggml_tensor * dst = op;
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
uint64_t t1, t2;
t1 = ggml_time_us();
@ -2385,55 +2348,27 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
|
|||
}
|
||||
|
||||
dspqueue_buffer bufs[3];
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer Weights.
|
||||
// The content is static, there is no need to do any cache management
|
||||
bufs[0].fd = src0_buf->fd;
|
||||
bufs[0].ptr = src0->data;
|
||||
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[0].size = ggml_nbytes(src0);
|
||||
bufs[0].flags = 0;
|
||||
dspqueue_buffers_init(bufs, src0, false, false);
|
||||
|
||||
// Second buffer Input Activations. This is a buffer that the CPU
|
||||
// writes and the DSP reads, so we'll need to flush CPU caches and
|
||||
// invalidate DSP ones. On platforms with I/O coherency support the
|
||||
// framework will automatically skip cache operations where possible.
|
||||
bufs[1].fd = src1_buf->fd;
|
||||
bufs[1].ptr = src1->data;
|
||||
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[1].size = ggml_nbytes(src1);
|
||||
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
dspqueue_buffers_init(&bufs[1], src1, true, true);
|
||||
|
||||
// Third buffer Output Activations. We'll handle DSP
|
||||
// cache maintenance in the response message but need to flush
|
||||
// CPU caches to ensure any previously written dirty lines are
|
||||
// written out before writes from the DSP start.
|
||||
bufs[2].fd = dst_buf->fd;
|
||||
bufs[2].ptr = dst->data;
|
||||
bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[2].size = ggml_nbytes(dst);
|
||||
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
dspqueue_buffers_init(&bufs[2], dst, true, false);
|
||||
|
||||
// Primary DSP session from the src0 (normally weight) tensor
|
||||
auto sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
|
||||
names, dims, types, strides, buffs, req.flags);
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
hex_dump_dspbuf(src1, &bufs[1]);
|
||||
|
|
@@ -2463,11 +2398,6 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
const struct ggml_tensor * src2 = op->src[2];
const struct ggml_tensor * dst = op;
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
uint64_t t1, t2;
t1 = ggml_time_us();
@ -2490,66 +2420,32 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
|
|||
}
|
||||
|
||||
dspqueue_buffer bufs[4];
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer Weights.
|
||||
// The content is static, there is no need to do any cache management
|
||||
bufs[0].fd = src0_buf->fd;
|
||||
bufs[0].ptr = src0->data;
|
||||
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[0].size = ggml_nbytes(src0);
|
||||
bufs[0].flags = 0;
|
||||
dspqueue_buffers_init(bufs, src0, false, false);
|
||||
|
||||
// Second buffer Input Activations. This is a buffer that the CPU
|
||||
// writes and the DSP reads, so we'll need to flush CPU caches and
|
||||
// invalidate DSP ones. On platforms with I/O coherency support the
|
||||
// framework will automatically skip cache operations where possible.
|
||||
bufs[1].fd = src1_buf->fd;
|
||||
bufs[1].ptr = src1->data;
|
||||
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[1].size = ggml_nbytes(src1);
|
||||
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
dspqueue_buffers_init(&bufs[1], src1, true, true);
|
||||
|
||||
// Third buffer expert IDs. This is a buffer that the CPU
|
||||
// writes and the DSP reads, so we'll need to flush CPU caches and
|
||||
// invalidate DSP ones. On platforms with I/O coherency support the
|
||||
// framework will automatically skip cache operations where possible.
|
||||
bufs[2].fd = src2_buf->fd;
|
||||
bufs[2].ptr = src2->data;
|
||||
bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
|
||||
bufs[2].size = ggml_nbytes(src2);
|
||||
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
dspqueue_buffers_init(&bufs[2], src2, true, true);
|
||||
|
||||
// Forth buffer Output Activations. We'll handle DSP
|
||||
// cache maintenance in the response message but need to flush
|
||||
// CPU caches to ensure any previously written dirty lines are
|
||||
// written out before writes from the DSP start.
|
||||
bufs[3].fd = dst_buf->fd;
|
||||
bufs[3].ptr = dst->data;
|
||||
bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[3].size = ggml_nbytes(dst);
|
||||
bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
dspqueue_buffers_init(&bufs[3], dst, true, false);
|
||||
|
||||
// Primary DSP session from the src0 (normally weight) tensor
|
||||
auto sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
|
||||
names, dims, types, strides, buffs, req.flags);
|
||||
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
hex_dump_dspbuf(src1, &bufs[1]);
|
||||
|
|
@@ -2581,10 +2477,6 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
const struct ggml_tensor * src1 = node->src[1];
const struct ggml_tensor * dst = node;
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
uint64_t t1 = 0;
uint64_t t2 = 0;
@ -2621,60 +2513,30 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
|
|||
init_htp_tensor(&req.dst, dst);
|
||||
|
||||
dspqueue_buffer bufs[3];
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer = First Operand of Binary op
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
bufs[0].fd = src0_buf->fd;
|
||||
bufs[0].ptr = src0->data;
|
||||
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[0].size = ggml_nbytes(src0);
|
||||
bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
|
||||
dspqueue_buffers_init(bufs, src0, true, true);
|
||||
|
||||
// Second buffer = Second Operand of Binary op
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
bufs[1].fd = src1_buf->fd;
|
||||
bufs[1].ptr = src1->data;
|
||||
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[1].size = ggml_nbytes(src1);
|
||||
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
dspqueue_buffers_init(&bufs[1], src1, true, true);
|
||||
|
||||
// Third buffer = Output Activations. We'll handle DSP
|
||||
// cache maintenance in the response message but need to flush
|
||||
// CPU caches to ensure any previously written dirty lines are
|
||||
// written out before writes from the DSP start.
|
||||
bufs[2].fd = dst_buf->fd;
|
||||
bufs[2].ptr = dst->data;
|
||||
bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[2].size = ggml_nbytes(dst);
|
||||
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
dspqueue_buffers_init(&bufs[2], dst, true, false);
|
||||
|
||||
// Primary DSP session from the src0 tensor
|
||||
ggml_hexagon_session * sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[16 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
|
||||
ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
hex_dump_dspbuf(src1, &bufs[1]);
|
||||
|
|
@@ -2705,11 +2567,6 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
const struct ggml_tensor * src2 = node->src[2];
const struct ggml_tensor * dst = node;
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
uint64_t t1 = 0;
uint64_t t2 = 0;
@ -2741,58 +2598,19 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
|
|||
init_htp_tensor(&req.dst, dst);
|
||||
|
||||
dspqueue_buffer bufs[4];
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer = input activations
|
||||
bufs[0].fd = src0_buf->fd;
|
||||
bufs[0].ptr = src0->data;
|
||||
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[0].size = ggml_nbytes(src0);
|
||||
bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
|
||||
|
||||
dspqueue_buffers_init(bufs, src0, true, true);
|
||||
// Second buffer = experts bias
|
||||
bufs[1].fd = src1_buf->fd;
|
||||
bufs[1].ptr = src1->data;
|
||||
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[1].size = ggml_nbytes(src1);
|
||||
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
|
||||
dspqueue_buffers_init(&bufs[1], src1, true, true);
|
||||
// Third buffer = activated experts
|
||||
bufs[2].fd = src2_buf->fd;
|
||||
bufs[2].ptr = src2->data;
|
||||
bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
|
||||
bufs[2].size = ggml_nbytes(src2);
|
||||
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
|
||||
dspqueue_buffers_init(&bufs[2], src2, true, true);
|
||||
// Forth buffer = output activations
|
||||
bufs[3].fd = dst_buf->fd;
|
||||
bufs[3].ptr = dst->data;
|
||||
bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[3].size = ggml_nbytes(dst);
|
||||
bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
dspqueue_buffers_init(&bufs[3], dst, true, true);
|
||||
|
||||
// Primary DSP session from the src0 tensor
|
||||
ggml_hexagon_session * sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[16 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
|
||||
ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
|
||||
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
hex_dump_dspbuf(src1, &bufs[1]);
|
||||
|
|
@ -2886,71 +2704,33 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
|
|||
}
|
||||
|
||||
dspqueue_buffer bufs[3];
|
||||
int n_bufs = 0;
|
||||
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer = Only Operand of Unary op
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
|
||||
bufs[n_bufs].fd = src0_buf->fd;
|
||||
bufs[n_bufs].ptr = src0->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(src0);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
|
||||
++n_bufs;
|
||||
size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
|
||||
|
||||
if (src1) {
|
||||
// Second buffer = Second Operand of Binary op
|
||||
// Second buffer(nullable) = Second Operand of Binary op
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
|
||||
bufs[n_bufs].fd = src1_buf->fd;
|
||||
bufs[n_bufs].ptr = src1->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(src1);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
++n_bufs;
|
||||
}
|
||||
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
|
||||
|
||||
// Second or third buffer = Output Activations. We'll handle DSP
|
||||
// Second buffer = Output Activations. We'll handle DSP
|
||||
// cache maintenance in the response message but need to flush
|
||||
// CPU caches to ensure any previously written dirty lines are
|
||||
// written out before writes from the DSP start.
|
||||
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
|
||||
bufs[n_bufs].fd = dst_buf->fd;
|
||||
bufs[n_bufs].ptr = dst->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(dst);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
++n_bufs;
|
||||
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
|
||||
|
||||
// Primary DSP session from the src0 tensor
|
||||
ggml_hexagon_session * sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
|
||||
names, dims, types, strides, buffs, req.flags);
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
if (src1) {
|
||||
|
|
@ -3023,85 +2803,40 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
|
|||
}
|
||||
|
||||
dspqueue_buffer bufs[4];
|
||||
int n_bufs = 0;
|
||||
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
|
||||
bufs[n_bufs].fd = src0_buf->fd;
|
||||
bufs[n_bufs].ptr = src0->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(src0);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
|
||||
++n_bufs;
|
||||
size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
|
||||
|
||||
// Second buffer
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
|
||||
bufs[n_bufs].fd = src1_buf->fd;
|
||||
bufs[n_bufs].ptr = src1->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(src1);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
++n_bufs;
|
||||
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
|
||||
|
||||
if (src2) {
|
||||
// Third buffer
|
||||
// Third buffer(nullable)
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
|
||||
bufs[n_bufs].fd = src2_buf->fd;
|
||||
bufs[n_bufs].ptr = src2->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(src2);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
++n_bufs;
|
||||
}
|
||||
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
|
||||
|
||||
// Final buffer = Output Activations. We'll handle DSP
|
||||
// Second buffer = Output Activations. We'll handle DSP
|
||||
// cache maintenance in the response message but need to flush
|
||||
// CPU caches to ensure any previously written dirty lines are
|
||||
// written out before writes from the DSP start.
|
||||
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
|
||||
bufs[n_bufs].fd = dst_buf->fd;
|
||||
bufs[n_bufs].ptr = dst->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(dst);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
++n_bufs;
|
||||
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
|
||||
|
||||
// Primary DSP session from the src0 tensor
|
||||
ggml_hexagon_session * sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
|
||||
names, dims, types, strides, buffs, req.flags);
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
if (src1) {
|
||||
|
|
|
|||
|
|
@@ -390,6 +390,12 @@ int get_hex_arch_ver(int domain, int * arch) {
}
switch (arch_ver.capability & 0xff) {
case 0x68:
*arch = 68;
return 0;
case 0x69:
*arch = 69;
return 0;
case 0x73:
*arch = 73;
return 0;
@@ -66,6 +66,13 @@ static inline bool dma_queue_push(dma_queue * q,
desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1;
desc->dstbypass = 1;
desc->srcbypass = 1;
#if __HVX_ARCH__ >= 73
desc->dstbypass = 1;
desc->srcbypass = 1;
#else
desc->dstbypass = 0;
desc->srcbypass = 1;
#endif
desc->order = 0;
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
desc->src = (void *) src;
@@ -16,12 +16,7 @@
#include "hvx-utils.h"
#include "ops-utils.h"

static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) {
static const float kInf = INFINITY;
static const float kMaxExp = 88.02f; // log(INF)
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
HVX_Vector out = hvx_vec_exp_fp32(in_vec);
@@ -47,6 +42,12 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
HVX_Vector vec_out = Q6_V_vzero();

static const float kInf = INFINITY;
static const float kMaxExp = 88.02f; // log(INF)

const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);

if (0 == unaligned_loop) {
HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
HVX_Vector * p_vec_out = (HVX_Vector *) dst;
@@ -55,9 +56,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
*p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in);
*p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
} else {
*p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++);
*p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf);
}
}
} else {
@@ -67,9 +68,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
} else {
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf);
}
}
}
@@ -83,9 +84,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);

vec_out = hvx_vec_exp_fp32_guard(neg_vec_in);
vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
} else {
vec_out = hvx_vec_exp_fp32_guard(in);
vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf);
}

hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
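A scalar sketch of what the guarded exp computes per element (assuming the 88.02f threshold is meant as log(FLT_MAX); expf is from <math.h>):

// Sketch, not part of the patch: per-element behaviour of the guarded exp.
static inline float exp_f32_guard_ref(float x) {
    return (x > 88.02f) ? INFINITY : expf(x); // overflow clamps to +INF instead of producing garbage
}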
@@ -16,6 +16,15 @@
#include "hvx-utils.h"
#include "ops-utils.h"

static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
HVX_Vector out = hvx_vec_inverse_fp32(v_sf);

HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask);
const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);

return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
}

void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over;
@@ -32,19 +41,22 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
}

static const uint32_t kNanInfMask = 0x7f800000;
const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);

if (0 == unaligned_loop) {
HVX_Vector * p_vec_in = (HVX_Vector *) src;
HVX_Vector * p_vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
*p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++);
*p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
}
}
@@ -53,7 +65,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
float * dstf = (float *) dst + num_elems_whole;

HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = hvx_vec_inverse_fp32_guard(in);
HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);

hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
}
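Scalar equivalent of the new guard (a sketch): results whose exponent field is all ones (Inf or NaN) are replaced with 0, matching the 0x7f800000 mask test above (memcpy from <string.h>, uint32_t from <stdint.h>):

// Sketch, not part of the patch: per-element behaviour of the guarded reciprocal.
static inline float inverse_f32_guard_ref(float x) {
    float    out = 1.0f / x;
    uint32_t bits;
    memcpy(&bits, &out, sizeof(bits));
    return ((bits & 0x7f800000u) == 0x7f800000u) ? 0.0f : out;
}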
@@ -21,6 +21,26 @@ typedef union {
float fp32[VLEN_FP32];
} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias;

/* Q6_Vsf_equals_Vw is only available on v73+.*/
#if __HVX_ARCH__ < 73
static inline HVX_Vector int32_to_qfloat(HVX_Vector const in)
{
HVX_Vector const vzero = Q6_V_vzero();
HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
return ret;
}

static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
{
return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in));
}
#endif

static inline HVX_Vector hvx_vec_splat_fp32(float i) {
union {
float f;
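Per lane, the emulated intrinsic is intended to behave like an ordinary int32-to-float conversion; a scalar reference (an assumption for illustration, ignoring rounding-mode corner cases):

// Sketch, not part of the patch: intended per-lane effect of Q6_Vsf_equals_Vw.
static inline float sf_equals_w_ref(int32_t x) {
    return (float) x;
}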
@@ -726,24 +746,6 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
return Q6_Vsf_equals_Vqf32(r_qf);
}

static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
static const float kInf = INFINITY;
static const uint32_t kNanMask = 0x7fffffff;
static const uint32_t kNanMin = 0x7f800000;

const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);

HVX_Vector out = hvx_vec_inverse_fp32(v_sf);

const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask);
const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin);
HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask);
const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out);

return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
}

#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777
#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267
@@ -958,14 +960,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
return Q6_Vsf_equals_Vqf32(temp);
}

static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
static const float kMaxExp = -88.02f; // log(INF)

const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v,
HVX_Vector one,
HVX_Vector max_exp,
HVX_Vector min_exp) {
const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);

HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
out = Q6_V_vmux_QVV(pred_max, out, one);
return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
}

static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
@@ -977,9 +981,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;

static const float kMinExp = -87.f; // 0
static const float kMaxExp = 87.f; // 1

const HVX_Vector one = hvx_vec_splat_fp32(1.f);
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);

#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]);
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
}
}
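Scalar sketch of the guarded fast sigmoid: the approximation is only evaluated inside roughly [-87, 87]; outside that range the result is pinned to 0 or 1, avoiding the Inf/NaN cases of the unguarded polynomial (expf stands in for hvx_vec_fast_sigmoid_fp32):

// Sketch, not part of the patch: per-element behaviour of the guarded sigmoid.
static inline float fast_sigmoid_f32_guard_ref(float x) {
    if (x <= -87.f) return 0.f;
    if (x >=  87.f) return 1.f;
    return 1.f / (1.f + expf(-x));
}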
@@ -143,16 +143,25 @@ AEEResult htp_iface_disable_etm(remote_handle64 handle) {
}

static int vtcm_acquire(struct htp_context * ctx) {
int err;
if (!ctx->vtcm_valid) {
// Temporarily bump thread priority to make sure it's higher than other sessions.
// This way the resource manager will notify the other thread to release VTCM.
// Note that we need to reaquire VTCM at normal priority for this to work next time.
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
if (err != 0) {
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
abort();
}
HAP_compute_res_release_cached(ctx->vtcm_rctx);
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);

HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
if (err != 0) {
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
abort();
}
ctx->vtcm_valid = true;
}
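The acquire-check-abort sequence now appears twice in vtcm_acquire; if it grows further it could be factored into a small helper. A sketch with the same behaviour (not part of the patch; uses only calls already present above):

// Sketch, not part of the patch: shared acquire-or-abort helper.
static inline void vtcm_acquire_or_abort(struct htp_context * ctx) {
    int err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
    if (err != 0) {
        FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned) err);
        abort();
    }
}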
@@ -201,7 +210,7 @@ static int vtcm_alloc(struct htp_context * ctx) {
HAP_compute_res_attr_init(&attr);
HAP_compute_res_attr_set_serialize(&attr, 0);
HAP_compute_res_attr_set_cache_mode(&attr, 1);
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size);
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
HAP_compute_res_attr_set_hmx_param(&attr, 1);
@@ -24,6 +24,10 @@
#include "hvx-utils.h"
#include "ops-utils.h"

// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we cant include ggml.h
#define HTP_ROPE_TYPE_NORMAL 0
#define HTP_ROPE_TYPE_NEOX 2

#define htp_rope_preamble \
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
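These values have to be kept in sync with ggml.h by hand; a compile-time check on the host side (a sketch, assuming GGML_ROPE_TYPE_NORMAL and GGML_ROPE_TYPE_NEOX are visible in that translation unit) would catch drift:

// Sketch, not part of the patch: host-side consistency check.
static_assert(HTP_ROPE_TYPE_NORMAL == GGML_ROPE_TYPE_NORMAL, "HTP rope type out of sync with ggml.h");
static_assert(HTP_ROPE_TYPE_NEOX   == GGML_ROPE_TYPE_NEOX,   "HTP rope type out of sync with ggml.h");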
@@ -146,6 +150,57 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor);
}

static void hvx_calc_rope_neox_f32(const float * restrict src0,
float * restrict dst,
const int num_elems,
const float * restrict theta_cache) {
// for (int i = 0; i < num_elems; i += 2) {
//const float cos_theta = theta_cache[i + 0];
//const float sin_theta = theta_cache[i + 1];

//const float x0 = src[0];
//const float x1 = src[num_elems/2];

//dst[0] = x0*cos_theta - x1*sin_theta;
//dst[num_elems/2] = x0*sin_theta + x1*cos_theta;

//src += 1;
//dst += 1;
// }

const uint8_t * restrict src0_curr = (const uint8_t *) src0;
const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache;
uint8_t * restrict dst_curr = (uint8_t *) dst;

int step_of_1 = num_elems >> 6; // 6 because we process two vectors at once
int half_size = (sizeof(float) * (num_elems / 2));

for (int i = 0; i < step_of_1; i++) {
HVX_Vector v0 = *(HVX_Vector *) src0_curr;
HVX_Vector v1 = *(HVX_Vector *) (src0_curr + half_size);

HVX_Vector v2 = *(HVX_Vector *) theta_curr;
HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN);

HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta

HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_lo_W(vcos_sin));
HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_hi_W(vcos_sin));

HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);

*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
*(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);

src0_curr += VLEN;
theta_curr += 2 * VLEN;
dst_curr += VLEN;
}
}

static void hvx_calc_rope_f32(const float * restrict src0,
float * restrict dst,
const int num_elems,
@@ -212,6 +267,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
const struct htp_tensor * src2 = &octx->src2;
struct htp_tensor * dst = &octx->dst;

const int32_t mode = rope_ctx->mode;
const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;

htp_rope_preamble;

const int32_t * pos = (const int32_t *) src1->data;
float * dst_data_loc = dst_data;
|
||||
|
||||
if (1 == opt_path) {
|
||||
if (is_neox) {
|
||||
hvx_calc_rope_neox_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
|
||||
} else {
|
||||
hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
|
||||
}
|
||||
} else {
|
||||
for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
|
||||
const float cos_theta = wp0[i0 + 0];
|
||||
const float sin_theta = wp0[i0 + 1];
|
||||
|
||||
if (is_neox) {
|
||||
const float x0 = src_loc[0];
|
||||
const float x1 = src_loc[rope_ctx->n_dims/2];
|
||||
|
||||
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
|
||||
dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;
|
||||
|
||||
src_loc += 1;
|
||||
dst_data_loc += 1;
|
||||
} else {
|
||||
const float x0 = src_loc[0];
|
||||
const float x1 = src_loc[1];
|
||||
|
||||
|
|
@@ -263,6 +335,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
dst_data_loc += 2;
}
}
}

for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
dst_data_loc[0] = src_loc[0];
@@ -11381,13 +11381,13 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
}
}

static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool almost_ready);
static void ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool almost_ready);

// Returns true if node has enqueued work into the queue, false otherwise
// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit){
ggml_tensor * node = cgraph->nodes[node_idx];
if (ggml_is_empty(node) || !node->buffer) {
if (ggml_is_empty(node) || ggml_op_is_empty(node->op) || !node->buffer) {
return false;
}
@@ -11399,54 +11399,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
ggml_tensor * src2 = node->src[2];
ggml_tensor * src3 = node->src[3];

switch (node->op) {
// Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NONE:
return false;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_ABS:
case GGML_UNARY_OP_SOFTPLUS:
case GGML_UNARY_OP_STEP:
case GGML_UNARY_OP_ROUND:
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_TRUNC:
break;
default:
return false;
}
break;
case GGML_OP_GLU:
switch (ggml_get_glu_op(node)) {
case GGML_GLU_OP_GEGLU:
case GGML_GLU_OP_REGLU:
case GGML_GLU_OP_SWIGLU:
case GGML_GLU_OP_SWIGLU_OAI:
case GGML_GLU_OP_GEGLU_ERF:
case GGML_GLU_OP_GEGLU_QUICK:
break;
default:
return false;
}
break;
case GGML_OP_ADD:
{
if (node->op == GGML_OP_ADD) {
int next_node_idx = node_idx + 1 + ctx->num_additional_fused_ops;
if (next_node_idx < cgraph->n_nodes &&
cgraph->nodes[next_node_idx]->op == GGML_OP_RMS_NORM &&
@ -11459,72 +11412,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
|||
ctx->do_add_rms_partials = true;
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
case GGML_OP_GET_ROWS:
|
||||
case GGML_OP_ADD_ID:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_FILL:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ROLL:
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_SET_ROWS:
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_SILU_BACK:
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
case GGML_OP_L2_NORM:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_SOFT_MAX_BACK:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ROPE_BACK:
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
case GGML_OP_ARGMAX:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_IM2COL_3D:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
case GGML_OP_SSM_SCAN:
|
||||
case GGML_OP_SSM_CONV:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
break;
|
||||
default:
|
||||
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
vk_context compute_ctx;
|
||||
|
|
@ -11961,145 +11848,14 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
|||
|
||||
ctx->compute_ctx.reset();
|
||||
|
||||
bool ok = ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, almost_ready);
|
||||
if (!ok) {
|
||||
if (node->op == GGML_OP_UNARY) {
|
||||
std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
|
||||
} else if (node->op == GGML_OP_GLU) {
|
||||
std::cerr << __func__ << ": error: op not supported GLU " << node->name << " (" << ggml_glu_op_name(static_cast<ggml_glu_op>(node->op_params[0])) << ")" << std::endl;
|
||||
} else {
|
||||
std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, almost_ready);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool almost_ready = false) {
|
||||
static void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool almost_ready = false) {
|
||||
GGML_UNUSED(cgraph);
|
||||
ggml_backend_buffer * buf = nullptr;
|
||||
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_GET_ROWS:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_FILL:
|
||||
case GGML_OP_ADD_ID:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ROLL:
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_SET_ROWS:
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_SILU_BACK:
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
case GGML_OP_L2_NORM:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_SOFT_MAX_BACK:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ROPE_BACK:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
case GGML_OP_ARGMAX:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_IM2COL_3D:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
case GGML_OP_SSM_SCAN:
|
||||
case GGML_OP_SSM_CONV:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
buf = tensor->buffer;
|
||||
break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(tensor)) {
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_NEG:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
case GGML_UNARY_OP_ABS:
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
case GGML_UNARY_OP_STEP:
|
||||
case GGML_UNARY_OP_ROUND:
|
||||
case GGML_UNARY_OP_CEIL:
|
||||
case GGML_UNARY_OP_FLOOR:
|
||||
case GGML_UNARY_OP_TRUNC:
|
||||
buf = tensor->buffer;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_GLU:
|
||||
switch (ggml_get_glu_op(tensor)) {
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_SWIGLU_OAI:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
buf = tensor->buffer;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
buf = tensor->buffer;
|
||||
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
if (buf == nullptr) {
|
||||
return false;
|
||||
}
|
||||
GGML_UNUSED(tensor);
|
||||
|
||||
VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
|
||||
|
||||
|
|
@@ -12143,8 +11899,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
subctx->out_memcpys.clear();
subctx->memsets.clear();
}

return true;
}

// Clean up after graph processing is done
@@ -427,6 +427,7 @@ class MODEL_ARCH(IntEnum):
APERTUS = auto()
COGVLM = auto()
MINIMAXM2 = auto()
RND1 = auto()
PANGU_EMBED = auto()
@@ -797,6 +798,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.APERTUS: "apertus",
MODEL_ARCH.MINIMAXM2: "minimax-m2",
MODEL_ARCH.COGVLM: "cogvlm",
MODEL_ARCH.RND1: "rnd1",
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
}
@@ -2991,6 +2993,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.VISEXP_UP,
MODEL_TENSOR.VISEXP_DOWN,
],
MODEL_ARCH.RND1: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.PANGU_EMBED: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -1 +1 @@
781baf2a14d9e0aaee542b2e1bb918bfc4132199
55bc9320a4aae82af18e23eefd5de319a755d7b9
@@ -115,6 +115,7 @@ add_library(llama
models/qwen3vl-moe.cpp
models/qwen3moe.cpp
models/refact.cpp
models/rnd1.cpp
models/rwkv6-base.cpp
models/rwkv6.cpp
models/rwkv6qwen2.cpp
@@ -108,6 +108,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_APERTUS, "apertus" },
{ LLM_ARCH_MINIMAX_M2, "minimax-m2" },
{ LLM_ARCH_COGVLM, "cogvlm" },
{ LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -2446,6 +2447,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
},
},
{
LLM_ARCH_RND1,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_UNKNOWN,
{
@@ -2722,6 +2743,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
case LLM_ARCH_LLADA_MOE:
case LLM_ARCH_RND1:
return true;
default:
return false;
@@ -112,6 +112,7 @@ enum llm_arch {
LLM_ARCH_APERTUS,
LLM_ARCH_MINIMAX_M2,
LLM_ARCH_COGVLM,
LLM_ARCH_RND1,
LLM_ARCH_PANGU_EMBED,
LLM_ARCH_UNKNOWN,
};
@@ -1036,6 +1036,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_RND1:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);

ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 48: type = LLM_TYPE_30B_A3B; break;
default: type = LLM_TYPE_UNKNOWN;
}
// Set non-causal attention for diffusion models
hparams.causal_attn = false;
} break;
case LLM_ARCH_QWEN2MOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -3402,6 +3414,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} break;
case LLM_ARCH_QWEN3MOE:
case LLM_ARCH_QWEN3VLMOE:
case LLM_ARCH_RND1:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -6720,7 +6733,7 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}

if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE) {
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
}
@ -6882,6 +6895,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
case LLM_ARCH_DREAM:
|
||||
case LLM_ARCH_LLADA:
|
||||
case LLM_ARCH_LLADA_MOE:
|
||||
case LLM_ARCH_RND1:
|
||||
{
|
||||
res = nullptr;
|
||||
} break;
|
||||
|
|
@ -7075,6 +7089,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|||
llm = std::make_unique<llm_build_llada_moe>(*this, params);
|
||||
}
|
||||
break;
|
||||
case LLM_ARCH_RND1:
|
||||
{
|
||||
llm = std::make_unique<llm_build_rnd1>(*this, params);
|
||||
}
|
||||
break;
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
{
|
||||
llm = std::make_unique<llm_build_qwen2vl>(*this, params);
|
||||
|
|
@ -7595,6 +7614,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|||
case LLM_ARCH_QWEN3:
|
||||
case LLM_ARCH_QWEN3MOE:
|
||||
case LLM_ARCH_LLADA_MOE:
|
||||
case LLM_ARCH_RND1:
|
||||
case LLM_ARCH_OLMO2:
|
||||
case LLM_ARCH_OLMOE:
|
||||
case LLM_ARCH_PHI2:
|
||||
|
|
|
|||
|
|
@ -431,6 +431,10 @@ struct llm_build_refact : public llm_graph_context {
|
|||
llm_build_refact(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_rnd1 : public llm_graph_context {
|
||||
llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
||||
llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
|
|
|||
|
|
@ -0,0 +1,126 @@
|
|||
#include "models.h"
|
||||
|
||||
// RND1 is a Qwen3Moe AR model converted to diffusion model.
|
||||
llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// Non-causal attention for diffusion
|
||||
auto * inp_attn = build_attn_inp_no_cache();
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self_attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// MoE branch
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
ggml_tensor * moe_out =
|
||||
build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
cur = moe_out;
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
|
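The MoE branch above routes each token through `n_expert_used` of `n_expert` experts using softmax gating (`LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX`). A minimal sketch of that selection step, with made-up sizes and outside of ggml:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const int n_expert = 8, n_expert_used = 2; // assumed sizes, for illustration only
    std::vector<float> logits = {0.1f, 2.0f, -1.0f, 0.5f, 1.5f, 0.0f, -0.5f, 0.3f}; // router output for one token

    // softmax over the router logits
    float mx = *std::max_element(logits.begin(), logits.end());
    std::vector<float> p(n_expert);
    float sum = 0.0f;
    for (int e = 0; e < n_expert; ++e) { p[e] = std::exp(logits[e] - mx); sum += p[e]; }
    for (float & v : p) { v /= sum; }

    // pick the top n_expert_used experts; their probabilities weight the expert outputs
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return p[a] > p[b]; });
    for (int k = 0; k < n_expert_used; ++k) {
        std::printf("expert %d weight %.3f\n", idx[k], p[idx[k]]);
    }
    return 0;
}
```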
@@ -6953,9 +6953,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16,  GGML_TYPE_F16,  {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32,  GGML_TYPE_I32,  {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32,  GGML_TYPE_I32,  {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));

-   for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
+   for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
        for (bool use_view_slice : { true, false }) {
            for (std::array<int64_t, 4> ne : std::initializer_list<std::array<int64_t, 4>>{ {2, 1, 1, 1}, {2, 1, 3, 5},
                {2, 3, 5, 7}, {1, 4, 4, 1}, {1, 8, 17, 1}, {10, 10, 10, 1} }) {

@@ -13,11 +13,16 @@ endif()

set(TARGET_SRCS
    server.cpp
    utils.hpp
    server-http.cpp
    server-http.h
    server-models.cpp
    server-models.h
    server-task.cpp
    server-task.h
    server-queue.cpp
    server-queue.h
    server-common.cpp
    server-common.h
)
set(PUBLIC_ASSETS
    index.html.gz

@@ -197,6 +197,7 @@ The project is under active development, and we are [looking for feedback and co
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)<br/>(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) |
| `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) |
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |

@@ -1495,8 +1496,8 @@ The `status` object can be:
Load a model

Payload:
-- `model`: name of the model to be loaded
-- `extra_args`: (optional) an array of additional arguments to be passed to the model instance
+- `model`: name of the model to be loaded.
+- `extra_args`: (optional) an array of additional arguments to be passed to the model instance. Note: you must start the server with `--models-allow-extra-args` to enable this feature.

```json
{
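For illustration only (the model name and argument values below are placeholders, not taken from this diff), the request body for loading a model with extra arguments can be assembled with nlohmann::json; the router only honors `extra_args` when it was started with `--models-allow-extra-args`:

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::ordered_json body = {
        {"model", "my-model.gguf"},                   // placeholder model name
        {"extra_args", {"-c", "8192", "-ngl", "99"}}  // only honored with --models-allow-extra-args
    };
    std::cout << body.dump(2) << std::endl; // send this as the JSON payload of the load request
    return 0;
}
```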
Binary file not shown.

File diff suppressed because it is too large.

@@ -0,0 +1,349 @@
#pragma once

#include "common.h"
#include "log.h"
#include "llama.h"
#include "chat.h"
#include "mtmd.h"

#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>

#include <string>
#include <vector>
#include <cinttypes>

#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"

const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);

using json = nlohmann::ordered_json;

#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)

#define SRV_INF(fmt, ...) LOG_INF("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_WRN(fmt, ...) LOG_WRN("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_ERR(fmt, ...) LOG_ERR("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_DBG(fmt, ...) LOG_DBG("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)

using raw_buffer = std::vector<uint8_t>;

template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
    // Fallback null to default value
    if (body.contains(key) && !body.at(key).is_null()) {
        try {
            return body.at(key);
        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
            return default_value;
        }
    } else {
        return default_value;
    }
}
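Illustrative usage of the `json_value()` helper above (not part of the header): missing or null keys fall back to the default, and type mismatches log a warning and also fall back.

```cpp
#include <nlohmann/json.hpp>
// assumes json_value() from the header above is visible

void example() {
    nlohmann::ordered_json body = {{"n_predict", 128}, {"stream", nullptr}};
    int  n_predict = json_value(body, "n_predict", 64); // 128: key present
    bool stream    = json_value(body, "stream", false); // false: null falls back to the default
    int  top_k     = json_value(body, "top_k", 40);     // 40: key missing
    (void) n_predict; (void) stream; (void) top_k;
}
```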

// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
    ERROR_TYPE_INVALID_REQUEST,
    ERROR_TYPE_AUTHENTICATION,
    ERROR_TYPE_SERVER,
    ERROR_TYPE_NOT_FOUND,
    ERROR_TYPE_PERMISSION,
    ERROR_TYPE_UNAVAILABLE,         // custom error
    ERROR_TYPE_NOT_SUPPORTED,       // custom error
    ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error
};

// thin wrapper around common_grammar_trigger with (de)serialization functions
struct server_grammar_trigger {
    common_grammar_trigger value;

    server_grammar_trigger() = default;
    server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
    server_grammar_trigger(const json & in) {
        value.type = (common_grammar_trigger_type) in.at("type").get<int>();
        value.value = in.at("value").get<std::string>();
        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
            value.token = (llama_token) in.at("token").get<int>();
        }
    }

    json to_json() const {
        json out {
            {"type", (int) value.type},
            {"value", value.value},
        };
        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
            out["token"] = (int) value.token;
        }
        return out;
    }
};

json format_error_response(const std::string & message, const enum error_type type);

//
// random string / id
//

std::string random_string();
std::string gen_chatcmplid();
std::string gen_tool_call_id();

//
// lora utils
//

// check whether the given lora set has only aloras activated (empty => false)
bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras);

// if the two sets of loras are different, they require a cache clear unless the
// change is only from aloras to aloras.
bool lora_should_clear_cache(
        const std::vector<common_adapter_lora_info> & current,
        const std::vector<common_adapter_lora_info> & next);

std::vector<common_adapter_lora_info> parse_lora_request(
        const std::vector<common_adapter_lora_info> & lora_base,
        const json & data);

bool are_lora_equal(
        const std::vector<common_adapter_lora_info> & l1,
        const std::vector<common_adapter_lora_info> & l2);

// get the ids of all enabled loras
std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);

//
// server_tokens
//

/**
 * server_tokens is a helper to manage the input tokens and image for the server.
 * it is made this way to simplify the logic of KV cache management.
 */
struct server_tokens {
    bool has_mtmd = false;

private: // disallow accessing these members directly, risking out-of-sync

    // map a **start** index in tokens to the image chunk
    // note: the order need to be in-sync with tokens
    std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;

    // list of tokens
    // if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by media chunk
    // otherwise, it is a normal text token
    // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
    // note(2): for M-RoPE, an image can occupy different number of pos; do not assume 1-to-1 mapping tokens <-> pos
    llama_tokens tokens;

    // for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
    // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
    // idx  0   1   2   3   4     5      6      7      8      9      10
    // pos  0   1   2   3   4     5      5      5      7      7      7
    // map_idx_to_media will contain: {5, img0}, {8, img1}

public:
    server_tokens() = default;
    ~server_tokens() = default;

    // Prevent copying
    // TODO: server_tokens should be copyable - remove this:
    server_tokens(const server_tokens&) = delete;
    server_tokens& operator=(const server_tokens&) = delete;

    // Allow moving (usually implicitly generated if members are movable)
    server_tokens(server_tokens&&) = default;
    server_tokens& operator=(server_tokens&&) = default;

    // Allow accessing elements using [] operator
    llama_token operator[](size_t index) { return tokens[index]; }
    const llama_token& operator[](size_t index) const { return tokens[index]; }

    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd);
    server_tokens(const llama_tokens & tokens, bool has_mtmd);

    // for debugging
    std::string str() const;

    llama_pos pos_next() const;
    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;

    void push_back(llama_token tok);

    // will create a copy of the chunk if it contains non-text data
    void push_back(const mtmd_input_chunk * chunk);

    // appends server tokens, updates the media map. copies media chunks.
    void push_back(server_tokens & tokens);

    // for compatibility with context shift and prompt truncation
    void insert(const llama_tokens & inp_tokens);

    // for compatibility with speculative decoding, ctx shift, slot save/load
    const llama_tokens & get_text_tokens() const;

    // for compatibility with speculative decoding
    void set_token(llama_pos pos, llama_token id);

    size_t size() const { return tokens.size(); }

    bool empty() const { return tokens.empty(); }

    void clear() {
        map_idx_to_media.clear();
        tokens.clear();
    }

    void keep_first(size_t n);

    std::string detokenize(const llama_context * ctx, bool special) const;

    size_t get_common_prefix(const server_tokens & b) const;

    // make sure all text tokens are within the vocab range
    bool validate(const struct llama_context * ctx) const;

    // encode and decode the image chunk
    int32_t process_chunk(
            llama_context * ctx,
            mtmd_context * mctx,
            size_t idx,
            llama_pos pos,
            int32_t seq_id,
            size_t & n_tokens_out) const;
};


//
// tokenizer and input processing utils
//

bool json_is_array_of_numbers(const json & data);

// is array having BOTH numbers & strings?
bool json_is_array_of_mixed_numbers_strings(const json & data);

// does array have any individual integers/tokens?
bool json_is_array_and_contains_numbers(const json & data);

// get value by path(key1 / key2)
json json_get_nested_values(const std::vector<std::string> & paths, const json & js);

/**
 * this handles 2 cases:
 * - only string, example: "string"
 * - mixed string and tokens, example: [12, 34, "string", 56, 78]
 */
llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special);

// return the last index of character that can form a valid string
// if the last character is potentially cut in half, return the index before the cut
// if validate_utf8(text) == text.size(), then the whole text is valid utf8
size_t validate_utf8(const std::string& text);

// process mtmd prompt, return the server_tokens containing both text tokens and media chunks
server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);

/**
 * break the input "prompt" object into multiple prompt if needed, then tokenize them
 * this supports these cases:
 * - "prompt": "string"
 * - "prompt": [12, 34, 56]
 * - "prompt": [12, 34, "string", 56, 78]
 * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
 * and multiple prompts (multi-tasks):
 * - "prompt": ["string1", "string2"]
 * - "prompt": ["string1", [12, 34, 56]]
 * - "prompt": [[12, 34, 56], [78, 90, 12]]
 * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}]
 */
std::vector<server_tokens> tokenize_input_prompts(
        const llama_vocab * vocab,
        mtmd_context * mctx,
        const json & json_prompt,
        bool add_special,
        bool parse_special);

//
// OAI utils
//

// used by /completions endpoint
json oaicompat_completion_params_parse(const json & body);

struct oaicompat_parser_options {
    bool use_jinja;
    bool prefill_assistant;
    common_reasoning_format reasoning_format;
    std::map<std::string,std::string> chat_template_kwargs;
    common_chat_templates * tmpls;
    bool allow_image;
    bool allow_audio;
    bool enable_thinking = true;
};

// used by /chat/completions endpoint
json oaicompat_chat_params_parse(
        json & body, /* openai api json semantics */
        const oaicompat_parser_options & opt,
        std::vector<raw_buffer> & out_files);

// TODO: move it to server-task.cpp
json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);

// TODO: move it to server-task.cpp
json format_response_rerank(
        const json & request,
        const json & ranks,
        bool is_tei_format,
        std::vector<std::string> & texts,
        int top_n);

//
// other utils
//

std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);

std::string safe_json_to_str(const json & data);

std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens);

// format incomplete utf-8 multibyte character for output
std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token);

// format server-sent event (SSE), return the formatted string to send
// note: if data is a json array, it will be sent as multiple events, one per item
std::string format_sse(const json & data);

bool is_valid_utf8(const std::string & str);

//
// formatting output responses
// TODO: move these to server-task.cpp
//

llama_tokens format_prompt_infill(
        const llama_vocab * vocab,
        const json & input_prefix,
        const json & input_suffix,
        const json & input_extra,
        const int n_batch,
        const int n_predict,
        const int n_ctx,
        const bool spm_infill,
        const llama_tokens & tokens_prompt);

// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
server_tokens format_prompt_rerank(
        const struct llama_model * model,
        const struct llama_vocab * vocab,
        mtmd_context * mctx,
        const std::string & query,
        const std::string & doc);
@@ -1,6 +1,6 @@
#include "utils.hpp"
#include "common.h"
#include "server-http.h"
#include "server-common.h"

#include <cpp-httplib/httplib.h>

@@ -1,4 +1,4 @@
#include "utils.hpp"
#include "server-common.h"
#include "server-models.h"

#include "download.h"

@@ -383,10 +383,12 @@ void server_models::load(const std::string & name, const std::vector<std::string
        child_args.push_back(std::to_string(inst.meta.port));

        // append extra args
        if (base_params.models_allow_extra_args) {
            for (const auto & arg : extra_args) {
                child_args.push_back(arg);
            }
        }
    }

    std::vector<std::string> child_env = base_env; // copy
    child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port));

@@ -405,6 +407,8 @@ void server_models::load(const std::string & name, const std::vector<std::string
        if (result != 0) {
            throw std::runtime_error("failed to spawn server instance");
        }

        inst.stdin_file = subprocess_stdin(inst.subproc.get());
    }

    // start a thread to manage the child process

@@ -455,13 +459,12 @@ void server_models::load(const std::string & name, const std::vector<std::string
    cv.notify_all();
}

-static void interrupt_subprocess(subprocess_s * proc) {
+static void interrupt_subprocess(FILE * stdin_file) {
    // because subprocess.h does not provide a way to send SIGINT,
    // we will send a command to the child process to exit gracefully
-   FILE * p_stdin = subprocess_stdin(proc);
-   if (p_stdin) {
-       fprintf(p_stdin, "%s\n", CMD_EXIT);
-       fflush(p_stdin);
+   if (stdin_file) {
+       fprintf(stdin_file, "%s\n", CMD_EXIT);
+       fflush(stdin_file);
    }
}

@@ -471,7 +474,7 @@ void server_models::unload(const std::string & name) {
    if (it != mapping.end()) {
        if (it->second.meta.is_active()) {
            SRV_INF("unloading model instance name=%s\n", name.c_str());
-           interrupt_subprocess(it->second.subproc.get());
+           interrupt_subprocess(it->second.stdin_file);
            // status change will be handled by the managing thread
        } else {
            SRV_WRN("model instance name=%s is not loaded\n", name.c_str());

@@ -486,7 +489,7 @@ void server_models::unload_all() {
    for (auto & [name, inst] : mapping) {
        if (inst.meta.is_active()) {
            SRV_INF("unloading model instance name=%s\n", name.c_str());
-           interrupt_subprocess(inst.subproc.get());
+           interrupt_subprocess(inst.stdin_file);
            // status change will be handled by the managing thread
        }
        // moving the thread to join list to avoid deadlock

@@ -74,6 +74,7 @@ private:
        std::shared_ptr<subprocess_s> subproc; // shared between main thread and monitoring thread
        std::thread th;
        server_model_meta meta;
        FILE * stdin_file = nullptr;
    };

    std::mutex mutex;
@@ -0,0 +1,268 @@
#include "server-task.h"
#include "server-queue.h"

#include "log.h"

#include <chrono>

#define QUE_INF(fmt, ...) LOG_INF("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_WRN(fmt, ...) LOG_WRN("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_ERR(fmt, ...) LOG_ERR("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_DBG(fmt, ...) LOG_DBG("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)

#define RES_INF(fmt, ...) LOG_INF("res  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define RES_WRN(fmt, ...) LOG_WRN("res  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define RES_ERR(fmt, ...) LOG_ERR("res  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define RES_DBG(fmt, ...) LOG_DBG("res  %12.*s: " fmt, 12, __func__, __VA_ARGS__)

//
// server_queue
//

int server_queue::post(server_task && task, bool front) {
    std::unique_lock<std::mutex> lock(mutex_tasks);
    GGML_ASSERT(task.id != -1);
    // if this is cancel task make sure to clean up pending tasks
    if (task.type == SERVER_TASK_TYPE_CANCEL) {
        cleanup_pending_task(task.id_target);
    }
    const int task_id = task.id;
    QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
    if (front) {
        queue_tasks.push_front(std::move(task));
    } else {
        queue_tasks.push_back(std::move(task));
    }
    condition_tasks.notify_one();
    return task_id;
}

int server_queue::post(std::vector<server_task> && tasks, bool front) {
    std::unique_lock<std::mutex> lock(mutex_tasks);
    for (auto & task : tasks) {
        if (task.id == -1) {
            task.id = id++;
        }
        // if this is cancel task make sure to clean up pending tasks
        if (task.type == SERVER_TASK_TYPE_CANCEL) {
            cleanup_pending_task(task.id_target);
        }
        QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
        if (front) {
            queue_tasks.push_front(std::move(task));
        } else {
            queue_tasks.push_back(std::move(task));
        }
    }
    condition_tasks.notify_one();
    return 0;
}

void server_queue::defer(server_task && task) {
    std::unique_lock<std::mutex> lock(mutex_tasks);
    QUE_DBG("defer task, id = %d\n", task.id);
    queue_tasks_deferred.push_back(std::move(task));
    condition_tasks.notify_one();
}

int server_queue::get_new_id() {
    std::unique_lock<std::mutex> lock(mutex_tasks);
    int new_id = id++;
    return new_id;
}

void server_queue::on_new_task(std::function<void(server_task &&)> callback) {
    callback_new_task = std::move(callback);
}

void server_queue::on_update_slots(std::function<void(void)> callback) {
    callback_update_slots = std::move(callback);
}

void server_queue::pop_deferred_task() {
    std::unique_lock<std::mutex> lock(mutex_tasks);
    if (!queue_tasks_deferred.empty()) {
        queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
        queue_tasks_deferred.pop_front();
    }
    condition_tasks.notify_one();
}

void server_queue::terminate() {
    std::unique_lock<std::mutex> lock(mutex_tasks);
    running = false;
    condition_tasks.notify_all();
}

void server_queue::start_loop() {
    running = true;

    while (true) {
        QUE_DBG("%s", "processing new tasks\n");

        while (true) {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            if (!running) {
                QUE_DBG("%s", "terminate\n");
                return;
            }
            if (queue_tasks.empty()) {
                lock.unlock();
                break;
            }
            server_task task = std::move(queue_tasks.front());
            queue_tasks.pop_front();
            lock.unlock();

            QUE_DBG("processing task, id = %d\n", task.id);
            callback_new_task(std::move(task));
        }

        // all tasks in the current loop is processed, slots data is now ready
        QUE_DBG("%s", "update slots\n");

        callback_update_slots();

        QUE_DBG("%s", "waiting for new tasks\n");
        {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            if (!running) {
                QUE_DBG("%s", "terminate\n");
                return;
            }
            if (queue_tasks.empty()) {
                condition_tasks.wait(lock, [&]{
                    return (!queue_tasks.empty() || !running);
                });
            }
        }
    }
}

void server_queue::cleanup_pending_task(int id_target) {
    // no need lock because this is called exclusively by post()
    auto rm_func = [id_target](const server_task & task) {
        return task.id == id_target;
    };
    queue_tasks.erase(
        std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
        queue_tasks.end());
    queue_tasks_deferred.erase(
        std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
        queue_tasks_deferred.end());
}

//
// server_response
//

void server_response::add_waiting_task_id(int id_task) {
    RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());

    std::unique_lock<std::mutex> lock(mutex_results);
    waiting_task_ids.insert(id_task);
}

void server_response::add_waiting_tasks(const std::vector<server_task> & tasks) {
    std::unique_lock<std::mutex> lock(mutex_results);

    for (const auto & task : tasks) {
        RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
        waiting_task_ids.insert(task.id);
    }
}

void server_response::remove_waiting_task_id(int id_task) {
    RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());

    std::unique_lock<std::mutex> lock(mutex_results);
    waiting_task_ids.erase(id_task);
    // make sure to clean up all pending results
    queue_results.erase(
        std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
            return res->id == id_task;
        }),
        queue_results.end());
}

void server_response::remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
    std::unique_lock<std::mutex> lock(mutex_results);

    for (const auto & id_task : id_tasks) {
        RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
        waiting_task_ids.erase(id_task);
    }
}

server_task_result_ptr server_response::recv(const std::unordered_set<int> & id_tasks) {
    while (true) {
        std::unique_lock<std::mutex> lock(mutex_results);
        condition_results.wait(lock, [&]{
            if (!running) {
                RES_DBG("%s : queue result stop\n", __func__);
                std::terminate(); // we cannot return here since the caller is HTTP code
            }
            return !queue_results.empty();
        });

        for (size_t i = 0; i < queue_results.size(); i++) {
            if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
                server_task_result_ptr res = std::move(queue_results[i]);
                queue_results.erase(queue_results.begin() + i);
                return res;
            }
        }
    }

    // should never reach here
}

server_task_result_ptr server_response::recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
    while (true) {
        std::unique_lock<std::mutex> lock(mutex_results);

        for (int i = 0; i < (int) queue_results.size(); i++) {
            if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
                server_task_result_ptr res = std::move(queue_results[i]);
                queue_results.erase(queue_results.begin() + i);
                return res;
            }
        }

        std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
        if (!running) {
            RES_DBG("%s : queue result stop\n", __func__);
            std::terminate(); // we cannot return here since the caller is HTTP code
        }
        if (cr_res == std::cv_status::timeout) {
            return nullptr;
        }
    }

    // should never reach here
}

server_task_result_ptr server_response::recv(int id_task) {
    std::unordered_set<int> id_tasks = {id_task};
    return recv(id_tasks);
}

void server_response::send(server_task_result_ptr && result) {
    RES_DBG("sending result for task id = %d\n", result->id);

    std::unique_lock<std::mutex> lock(mutex_results);
    for (const auto & id_task : waiting_task_ids) {
        if (result->id == id_task) {
            RES_DBG("task id = %d pushed to result queue\n", result->id);

            queue_results.emplace_back(std::move(result));
            condition_results.notify_all();
            return;
        }
    }
}

void server_response::terminate() {
    running = false;
    condition_results.notify_all();
}
@@ -0,0 +1,110 @@
#pragma once

#include "server-task.h"

#include <condition_variable>
#include <deque>
#include <mutex>
#include <unordered_set>

struct server_queue {
private:
    int id = 0;
    bool running;

    // queues
    std::deque<server_task> queue_tasks;
    std::deque<server_task> queue_tasks_deferred;

    std::mutex mutex_tasks;
    std::condition_variable condition_tasks;

    // callback functions
    std::function<void(server_task &&)> callback_new_task;
    std::function<void(void)>           callback_update_slots;

public:
    // Add a new task to the end of the queue
    int post(server_task && task, bool front = false);

    // multi-task version of post()
    int post(std::vector<server_task> && tasks, bool front = false);

    // Add a new task, but defer until one slot is available
    void defer(server_task && task);

    // Get the next id for creating a new task
    int get_new_id();

    // Register function to process a new task
    void on_new_task(std::function<void(server_task &&)> callback);

    // Register the function to be called when all slots data is ready to be processed
    void on_update_slots(std::function<void(void)> callback);

    // Call when the state of one slot is changed, it will move one task from deferred to main queue
    void pop_deferred_task();

    // end the start_loop routine
    void terminate();

    /**
     * Main loop consists of these steps:
     * - Wait until a new task arrives
     * - Process the task (i.e. maybe copy data into slot)
     * - Check if multitask is finished
     * - Update all slots
     */
    void start_loop();

    // for metrics
    size_t queue_tasks_deferred_size() {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        return queue_tasks_deferred.size();
    }

private:
    void cleanup_pending_task(int id_target);
};

struct server_response {
private:
    bool running = true;

    // for keeping track of all tasks waiting for the result
    std::unordered_set<int> waiting_task_ids;

    // the main result queue (using ptr for polymorphism)
    std::vector<server_task_result_ptr> queue_results;

    std::mutex mutex_results;
    std::condition_variable condition_results;

public:
    // add the id_task to the list of tasks waiting for response
    void add_waiting_task_id(int id_task);

    void add_waiting_tasks(const std::vector<server_task> & tasks);

    // when the request is finished, we can remove task associated with it
    void remove_waiting_task_id(int id_task);

    // remove multiple tasks from waiting list
    void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks);

    // This function blocks the thread until there is a response for one of the id_tasks
    server_task_result_ptr recv(const std::unordered_set<int> & id_tasks);

    // same as recv(), but have timeout in seconds
    // if timeout is reached, nullptr is returned
    server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout);

    // single-task version of recv()
    server_task_result_ptr recv(int id_task);

    // Send a new result to a waiting id_task
    void send(server_task_result_ptr && result);

    // terminate the waiting loop
    void terminate();
};
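`server_queue` and `server_response` above follow the usual mutex-plus-condition_variable pattern: producers push under the lock and notify, the consumer waits until work arrives or the queue is terminated. A self-contained miniature of the same loop (simplified, not the server code):

```cpp
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <mutex>
#include <thread>

int main() {
    std::deque<int> tasks;
    std::mutex m;
    std::condition_variable cv;
    bool running = true;

    std::thread worker([&] {
        while (true) {
            std::unique_lock<std::mutex> lock(m);
            cv.wait(lock, [&] { return !tasks.empty() || !running; });
            if (!running && tasks.empty()) return;
            int task = tasks.front();
            tasks.pop_front();
            lock.unlock();                 // process outside the lock, like start_loop() does
            std::printf("processing task %d\n", task);
        }
    });

    for (int i = 0; i < 3; ++i) {
        { std::lock_guard<std::mutex> lock(m); tasks.push_back(i); }
        cv.notify_one();                   // mirrors condition_tasks.notify_one() in post()
    }
    { std::lock_guard<std::mutex> lock(m); running = false; }
    cv.notify_all();                       // mirrors terminate()
    worker.join();
    return 0;
}
```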
File diff suppressed because it is too large.
@@ -0,0 +1,453 @@
#pragma once

#include "common.h"
#include "llama.h"

#include <string>
#include <unordered_set>
#include <list>

// TODO: prevent including the whole server-common.h as we only use server_tokens
#include "server-common.h"

using json = nlohmann::ordered_json;

enum server_task_type {
    SERVER_TASK_TYPE_COMPLETION,
    SERVER_TASK_TYPE_EMBEDDING,
    SERVER_TASK_TYPE_RERANK,
    SERVER_TASK_TYPE_INFILL,
    SERVER_TASK_TYPE_CANCEL,
    SERVER_TASK_TYPE_NEXT_RESPONSE,
    SERVER_TASK_TYPE_METRICS,
    SERVER_TASK_TYPE_SLOT_SAVE,
    SERVER_TASK_TYPE_SLOT_RESTORE,
    SERVER_TASK_TYPE_SLOT_ERASE,
    SERVER_TASK_TYPE_SET_LORA,
};

// TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common
enum oaicompat_type {
    OAICOMPAT_TYPE_NONE,
    OAICOMPAT_TYPE_CHAT,
    OAICOMPAT_TYPE_COMPLETION,
    OAICOMPAT_TYPE_EMBEDDING,
};

enum stop_type {
    STOP_TYPE_NONE,
    STOP_TYPE_EOS,
    STOP_TYPE_WORD,
    STOP_TYPE_LIMIT,
};

struct task_params {
    bool stream          = true;
    bool include_usage   = false;
    bool cache_prompt    = true; // remember the prompt to avoid reprocessing all prompt
    bool return_tokens   = false;
    bool return_progress = false;

    int32_t n_keep    = 0;  // number of tokens to keep from initial prompt
    int32_t n_discard = 0;  // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_indent  = 0;  // minimum line indentation for the generated text in number of whitespace characters

    int64_t t_max_prompt_ms  = -1; // TODO: implement
    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

    std::vector<common_adapter_lora_info> lora;

    std::vector<std::string> antiprompt;
    std::vector<std::string> response_fields;
    bool timings_per_token = false;
    bool post_sampling_probs = false;

    struct common_params_sampling sampling;
    struct common_params_speculative speculative;

    // OAI-compat fields
    bool verbose = false;
    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    common_chat_syntax oaicompat_chat_syntax;

    // Embeddings
    int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)

    json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) const;
    json to_json(bool only_metrics = false) const;
};

struct server_task {
    int id    = -1; // to be filled by server_queue
    int index = -1; // used when there are multiple prompts (batch request)

    // used by SERVER_TASK_TYPE_CANCEL
    int id_target = -1;
    int id_slot   = -1;

    // used by SERVER_TASK_TYPE_INFERENCE
    task_params   params;
    server_tokens tokens;

    server_task_type type;

    // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
    struct slot_action {
        int slot_id;
        std::string filename;
        std::string filepath;
    };
    slot_action slot_action;

    // used by SERVER_TASK_TYPE_METRICS
    bool metrics_reset_bucket = false;

    // used by SERVER_TASK_TYPE_SET_LORA
    std::vector<common_adapter_lora_info> set_lora;

    server_task() = default;

    server_task(server_task_type type) : type(type) {}

    int32_t n_tokens() const {
        return tokens.size();
    }

    static task_params params_from_json_cmpl(
            const llama_context * ctx,
            const common_params & params_base,
            const json & data);

    // utility function
    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
        std::unordered_set<int> ids(tasks.size());
        for (size_t i = 0; i < tasks.size(); i++) {
            ids.insert(tasks[i].id);
        }
        return ids;
    }
};

struct result_timings {
    int32_t cache_n = -1;

    int32_t prompt_n = -1;
    double prompt_ms;
    double prompt_per_token_ms;
    double prompt_per_second;

    int32_t predicted_n = -1;
    double predicted_ms;
    double predicted_per_token_ms;
    double predicted_per_second;

    // Optional speculative metrics - only included when > 0
    int32_t draft_n = 0;
    int32_t draft_n_accepted = 0;

    json to_json() const;
};
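The per-token and per-second fields in `result_timings` are presumably derived from the raw counts and durations; a hedged sketch of that arithmetic (an assumption — the actual computation lives in server code not shown here):

```cpp
// assumption: n tokens processed in `ms` milliseconds
double per_token_ms(int32_t n, double ms) { return n  > 0 ? ms / n        : 0.0; }
double per_second  (int32_t n, double ms) { return ms > 0 ? 1e3 * n / ms  : 0.0; }
// e.g. prompt_per_second = per_second(prompt_n, prompt_ms)
```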

struct result_prompt_progress {
    int32_t total = 0;
    int32_t cache = 0;
    int32_t processed = 0;
    int64_t time_ms = 0;

    json to_json() const;
};

struct server_task_result {
    int id      = -1;
    int id_slot = -1;
    virtual bool is_error() {
        // only used by server_task_result_error
        return false;
    }
    virtual bool is_stop() {
        // only used by server_task_result_cmpl_*
        return true;
    }
    virtual int get_index() {
        return -1;
    }
    virtual json to_json() = 0;
    virtual ~server_task_result() = default;
};

// using shared_ptr for polymorphism of server_task_result
using server_task_result_ptr = std::unique_ptr<server_task_result>;

struct completion_token_output {
    llama_token tok;
    float prob;
    std::string text_to_send;
    struct prob_info {
        llama_token tok;
        std::string txt;
        float prob;
    };
    std::vector<prob_info> probs;

    json to_json(bool post_sampling_probs) const;

    static json probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs);

    static float logarithm(float x);

    static std::vector<unsigned char> str_to_bytes(const std::string & str);

};

struct server_task_result_cmpl_final : server_task_result {
    int index = 0;

    std::string  content;
    llama_tokens tokens;

    bool stream;
    bool include_usage;
    result_timings timings;
    std::string prompt;

    bool truncated;
    int32_t n_decoded;
    int32_t n_prompt_tokens;
    int32_t n_tokens_cached;
    bool has_new_line;
    std::string stopping_word;
    stop_type stop = STOP_TYPE_NONE;

    bool post_sampling_probs;
    std::vector<completion_token_output> probs_output;
    std::vector<std::string> response_fields;

    task_params generation_params;

    // OAI-compat fields
    bool verbose = false;
    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    common_chat_msg oaicompat_msg;

    std::vector<common_chat_msg_diff> oaicompat_msg_diffs;

    virtual int get_index() override {
        return index;
    }

    virtual bool is_stop() override {
        return true; // in stream mode, final responses are considered stop
    }

    virtual json to_json() override;

    json to_json_non_oaicompat();

    json to_json_oaicompat();

    json to_json_oaicompat_chat();

    json to_json_oaicompat_chat_stream();
};

struct server_task_result_cmpl_partial : server_task_result {
    int index = 0;

    std::string  content;
    llama_tokens tokens;

    int32_t n_decoded;
    int32_t n_prompt_tokens;

    bool post_sampling_probs;
    bool is_progress = false;
    completion_token_output prob_output;
    result_timings timings;
    result_prompt_progress progress;

    // OAI-compat fields
    bool verbose = false;
    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    std::vector<common_chat_msg_diff> oaicompat_msg_diffs;

    virtual int get_index() override {
        return index;
    }

    virtual bool is_stop() override {
        return false; // in stream mode, partial responses are not considered stop
    }

    virtual json to_json() override;

    json to_json_non_oaicompat();

    json to_json_oaicompat();

    json to_json_oaicompat_chat();
};

struct server_task_result_embd : server_task_result {
    int index = 0;
    std::vector<std::vector<float>> embedding;

    int32_t n_tokens;

    // OAI-compat fields
    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;

    virtual int get_index() override {
        return index;
    }

    virtual json to_json() override;

    json to_json_non_oaicompat();

    json to_json_oaicompat();
};

struct server_task_result_rerank : server_task_result {
    int index = 0;
    float score = -1e6;

    int32_t n_tokens;

    virtual int get_index() override {
        return index;
    }

    virtual json to_json() override;
};

struct server_task_result_error : server_task_result {
    int index = 0;
    error_type err_type = ERROR_TYPE_SERVER;
    std::string err_msg;

    // for ERROR_TYPE_EXCEED_CONTEXT_SIZE
    int32_t n_prompt_tokens = 0;
    int32_t n_ctx = 0;

    virtual bool is_error() override {
        return true;
    }

    virtual json to_json() override;
};

struct server_task_result_metrics : server_task_result {
    int n_idle_slots;
    int n_processing_slots;
    int n_tasks_deferred;
    int64_t t_start;

    // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
    uint64_t n_prompt_tokens_processed_total = 0;
    uint64_t t_prompt_processing_total       = 0;
    uint64_t n_tokens_predicted_total        = 0;
    uint64_t t_tokens_generation_total       = 0;

    uint64_t n_tokens_max = 0;

    uint64_t n_prompt_tokens_processed = 0;
    uint64_t t_prompt_processing       = 0;

    uint64_t n_tokens_predicted  = 0;
    uint64_t t_tokens_generation = 0;

    uint64_t n_decode_total     = 0;
    uint64_t n_busy_slots_total = 0;

    // while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy
    // therefore, we use json to temporarily store the slot.to_json() result
    json slots_data = json::array();

    virtual json to_json() override;
};

struct server_task_result_slot_save_load : server_task_result {
    std::string filename;
    bool is_save; // true = save, false = load

    size_t n_tokens;
    size_t n_bytes;
    double t_ms;

    virtual json to_json() override;
};

struct server_task_result_slot_erase : server_task_result {
    size_t n_erased;

    virtual json to_json() override;
};

struct server_task_result_apply_lora : server_task_result {
    virtual json to_json() override;
};

struct server_prompt_checkpoint {
    llama_pos pos_min;
    llama_pos pos_max;

    std::vector<uint8_t> data;

    size_t size() const {
        return data.size();
    }
};

struct server_prompt {
    server_tokens tokens;

    std::vector<uint8_t> data;

    std::list<server_prompt_checkpoint> checkpoints;

    size_t size() const {
        size_t res = data.size();

        for (const auto & checkpoint : checkpoints) {
            res += checkpoint.size();
        }

        return res;
    }

    int n_tokens() const {
        return tokens.size();
    }
};

struct server_prompt_cache {
    server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) {
        this->limit_size = 1024ull*1024ull*(limit_size_mib < 0 ? 0 : limit_size_mib);
        this->limit_tokens = limit_tokens;
    }

    std::list<server_prompt> states;

    // in bytes, 0 = no limit
    size_t limit_size = 0;

    // in tokens, 0 = no limit
    size_t limit_tokens = 0;

    size_t size() const;

    size_t n_tokens() const;

    server_prompt * alloc(const server_prompt & prompt, size_t state_size);

    bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot);

    void update();
};

File diff suppressed because it is too large.
@@ -28,6 +28,7 @@
        sendMessage,
        stopGeneration
    } from '$lib/stores/chat.svelte';
    import { config } from '$lib/stores/settings.svelte';
    import {
        supportsVision,
        supportsAudio,

@@ -47,6 +48,7 @@

    let { showCenteredEmpty = false } = $props();

    let disableAutoScroll = $derived(Boolean(config().disableAutoScroll));
    let autoScrollEnabled = $state(true);
    let chatScrollContainer: HTMLDivElement | undefined = $state();
    let dragCounter = $state(0);

@@ -149,7 +151,7 @@
    }

    function handleScroll() {
-       if (!chatScrollContainer) return;
+       if (disableAutoScroll || !chatScrollContainer) return;

        const { scrollTop, scrollHeight, clientHeight } = chatScrollContainer;
        const distanceFromBottom = scrollHeight - scrollTop - clientHeight;

@@ -194,8 +196,10 @@
        const extras = result?.extras;

        // Enable autoscroll for user-initiated message sending
        if (!disableAutoScroll) {
            userScrolledUp = false;
            autoScrollEnabled = true;
        }
        await sendMessage(message, extras);
        scrollChatToBottom();

@@ -241,6 +245,8 @@
    }

    function scrollChatToBottom(behavior: ScrollBehavior = 'smooth') {
        if (disableAutoScroll) return;

        chatScrollContainer?.scrollTo({
            top: chatScrollContainer?.scrollHeight,
            behavior

@@ -248,14 +254,27 @@
    }

    afterNavigate(() => {
        if (!disableAutoScroll) {
            setTimeout(() => scrollChatToBottom('instant'), INITIAL_SCROLL_DELAY);
        }
    });

    onMount(() => {
        if (!disableAutoScroll) {
            setTimeout(() => scrollChatToBottom('instant'), INITIAL_SCROLL_DELAY);
        }
    });

    $effect(() => {
        if (disableAutoScroll) {
            autoScrollEnabled = false;
            if (scrollInterval) {
                clearInterval(scrollInterval);
                scrollInterval = undefined;
            }
            return;
        }

        if (isCurrentConversationLoading && autoScrollEnabled) {
            scrollInterval = setInterval(scrollChatToBottom, AUTO_SCROLL_INTERVAL);
        } else if (scrollInterval) {

@@ -289,9 +308,11 @@
        class="mb-16 md:mb-24"
        messages={activeMessages()}
        onUserAction={() => {
            if (!disableAutoScroll) {
                userScrolledUp = false;
                autoScrollEnabled = true;
                scrollChatToBottom();
            }
        }}
    />

@@ -3,7 +3,6 @@
        Settings,
        Funnel,
        AlertTriangle,
        Brain,
        Code,
        Monitor,
        Sun,

@@ -58,6 +57,33 @@
                label: 'Paste long text to file length',
                type: 'input'
            },
            {
                key: 'enableContinueGeneration',
                label: 'Enable "Continue" button',
                type: 'checkbox',
                isExperimental: true
            },
            {
                key: 'pdfAsImage',
                label: 'Parse PDF as image',
                type: 'checkbox'
            },
            {
                key: 'askForTitleConfirmation',
                label: 'Ask for confirmation before changing conversation title',
                type: 'checkbox'
            }
        ]
    },
    {
        title: 'Display',
        icon: Monitor,
        fields: [
            {
                key: 'showThoughtInProgress',
                label: 'Show thought in progress',
                type: 'checkbox'
            },
            {
                key: 'showMessageStats',
                label: 'Show message generation statistics',

@@ -79,31 +105,14 @@
                type: 'checkbox'
            },
            {
-               key: 'pdfAsImage',
-               label: 'Parse PDF as image',
+               key: 'disableAutoScroll',
+               label: 'Disable automatic scroll',
                type: 'checkbox'
            },
            {
                key: 'renderUserContentAsMarkdown',
                label: 'Render user content as Markdown',
                type: 'checkbox'
            },
            {
                key: 'autoMicOnEmpty',
                label: 'Show microphone on empty input',
                type: 'checkbox',
                isExperimental: true
            },
            {
                key: 'enableContinueGeneration',
                label: 'Enable "Continue" button',
                type: 'checkbox',
                isExperimental: true
            },
            {
                key: 'askForTitleConfirmation',
                label: 'Ask for confirmation before changing conversation title',
                type: 'checkbox'
            }
        ]
    },

@@ -214,17 +223,6 @@
            }
        ]
    },
-   {
-       title: 'Reasoning',
-       icon: Brain,
-       fields: [
-           {
-               key: 'showThoughtInProgress',
-               label: 'Show thought in progress',
-               type: 'checkbox'
-           }
-       ]
-   },
    {
        title: 'Import/Export',
        icon: Database,

@@ -14,6 +14,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
    pasteLongTextToFileLen: 2500,
    pdfAsImage: false,
    showModelInfo: false,
    disableAutoScroll: false,
    renderUserContentAsMarkdown: false,
    modelSelectorEnabled: false,
    autoMicOnEmpty: false,

@@ -94,6 +95,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
        'Ask for confirmation before automatically changing conversation title when editing the first message.',
    pdfAsImage: 'Parse PDF as image instead of text (requires vision-capable model).',
    showModelInfo: 'Display the model name used to generate each message below the message content.',
    disableAutoScroll:
        'Disable automatic scrolling while messages stream so you can control the viewport position manually.',
    renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.',
    modelSelectorEnabled:
        'Enable the model selector in the chat input to choose the inference model. Sends the associated model field in API requests.',