Merge remote-tracking branch 'ngxson/xsn/server_model_management_v1_2' into allozaur/server_model_management_v1_2

This commit is contained in:
Aleksander Grygier 2025-11-24 16:07:05 +01:00
commit 11c26ecfca
53 changed files with 4399 additions and 3626 deletions

View File

@ -1,9 +1,7 @@
ARG UBUNTU_VERSION=25.10
ARG UBUNTU_VERSION=26.04
FROM ubuntu:$UBUNTU_VERSION AS build
# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget xz-utils

View File

@ -351,16 +351,10 @@ jobs:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
@ -374,13 +368,6 @@ jobs:
run: |
pip install -r tools/server/tests/requirements.txt
- name: Copy Libcurl
id: prepare_libcurl
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
- name: Tests
id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}

View File

@ -2488,6 +2488,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.models_max = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
add_opt(common_arg(
{"--models-allow-extra-args"},
string_format("for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: %s)", params.models_allow_extra_args ? "enabled" : "disabled"),
[](common_params & params) {
params.models_allow_extra_args = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS"));
add_opt(common_arg(
{"--no-models-autoload"},
"disables automatic loading of models (default: enabled)",

View File

@ -462,6 +462,7 @@ struct common_params {
std::string models_dir = ""; // directory containing models for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
bool models_allow_extra_args = false; // allow passing extra arguments when loading models via the router server
bool log_json = false;

View File

@ -4183,6 +4183,21 @@ class Qwen3MoeModel(Qwen2MoeModel):
super().set_vocab()
@ModelBase.register("RND1")
class RND1Model(Qwen2MoeModel):
model_arch = gguf.MODEL_ARCH.RND1
def set_gguf_parameters(self):
super().set_gguf_parameters()
# RND1 specific parameters
# RND1 uses bidirectional attention
self.gguf_writer.add_causal_attention(False)
if (mask_token_id := self.hparams.get("mask_token_id")) is not None:
self.gguf_writer.add_mask_token_id(mask_token_id)
@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration") @ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
class Qwen3VLVisionModel(MmprojModel): class Qwen3VLVisionModel(MmprojModel):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):

View File

@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt
```bash
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 --kv-unified
...

View File

@ -6,8 +6,54 @@ More Info:
- https://github.com/ggml-org/llama.cpp/pull/14644
- https://github.com/ggml-org/llama.cpp/pull/14771
## Parameters
The diffusion CLI supports various parameters to control the generation process:
Example of using Dream architecture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`
### Core Diffusion Parameters
- `--diffusion-steps`: Number of diffusion steps (default: 256)
- `--diffusion-algorithm`: Algorithm for token selection
- `0`: ORIGIN - Tokens are generated in a purely random order, from https://arxiv.org/abs/2107.03006.
- `1`: ENTROPY_BASED - Entropy-based selection
- `2`: MARGIN_BASED - Margin-based selection
- `3`: RANDOM - Random selection
- `4`: CONFIDENCE_BASED - Confidence-based selection (default)
- More documentation here https://github.com/DreamLM/Dream
- `--diffusion-visual`: Enable live visualization during generation
Example of using LLaDA architecture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`
### Scheduling Parameters
Choose one of the following scheduling methods:
**Timestep-based scheduling:**
- `--diffusion-eps`: Epsilon value for timestep scheduling (e.g., 0.001)
**Block-based scheduling:**
- `--diffusion-block-length`: Block size for block-based scheduling (e.g., 32)
### Sampling Parameters
- `--temp`: Temperature for sampling (0.0 = greedy/deterministic, higher = more random)
- `--top-k`: Top-k filtering for sampling
- `--top-p`: Top-p (nucleus) filtering for sampling
- `--seed`: Random seed for reproducibility
### Model Parameters
- `-m`: Path to the GGUF model file
- `-p`: Input prompt text
- `-ub`: Maximum sequence length (ubatch size)
- `-c`: Context size
- `-b`: Batch size
### Examples
#### Dream architecture:
```
llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual
```
#### LLaDA architecture:
```
llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual
```
#### RND1 architecture:
```
llama-diffusion-cli -m RND1-Base-0910.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-algorithm 1 --diffusion-steps 256 --diffusion-visual --temp 0.5 --diffusion-eps 0.001
```
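For readers skimming the algorithm list above, here is a minimal sketch of what confidence-based selection boils down to: at each diffusion step, score every still-masked position by its predicted top-token probability and unmask the highest-scoring ones first. This is illustrative only, not taken from the CLI source, and all names below are made up; ENTROPY_BASED and MARGIN_BASED follow the same pattern with a different per-position score.

```cpp
// Hedged sketch of confidence-based token selection (--diffusion-algorithm 4).
// Illustrative only; not the llama-diffusion-cli implementation.
#include <algorithm>
#include <cstddef>
#include <vector>

struct masked_pos {
    size_t index;      // position in the sequence
    float  confidence; // top-token probability predicted at this position
};

// Choose which `k` masked positions to reveal at the current diffusion step.
static std::vector<size_t> pick_by_confidence(std::vector<masked_pos> masked, size_t k) {
    std::sort(masked.begin(), masked.end(),
              [](const masked_pos & a, const masked_pos & b) { return a.confidence > b.confidence; });
    std::vector<size_t> chosen;
    for (size_t i = 0; i < masked.size() && i < k; ++i) {
        chosen.push_back(masked[i].index);
    }
    return chosen;
}
```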

View File

@ -25,16 +25,17 @@ if(GIT_EXE)
)
endif()
# Build the version string with optional dirty flag
set(GGML_VERSION "${GGML_VERSION_BASE}")
if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
set(GGML_VERSION "${GGML_VERSION}-dirty")
endif()
if(NOT GGML_BUILD_COMMIT)
set(GGML_BUILD_COMMIT "unknown")
endif()
# Build the commit string with optional dirty flag
if(DEFINED GGML_GIT_DIRTY AND GGML_GIT_DIRTY EQUAL 1)
set(GGML_BUILD_COMMIT "${GGML_BUILD_COMMIT}-dirty")
endif()
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

View File

@ -328,6 +328,14 @@ function(ggml_add_cpu_backend_variant tag_name)
set(GGML_INTERNAL_${feat} OFF)
endforeach()
foreach (feat ${ARGN})
set(GGML_INTERNAL_${feat} ON)
endforeach()
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
foreach (feat RVV)
set(GGML_INTERNAL_${feat} OFF)
endforeach()
foreach (feat ${ARGN})
set(GGML_INTERNAL_${feat} ON)
endforeach()
@ -402,6 +410,13 @@ if (GGML_CPU_ALL_VARIANTS)
else()
message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
ggml_add_cpu_backend_variant(riscv64_0)
ggml_add_cpu_backend_variant(riscv64_v RVV)
else()
message(FATAL_ERROR "Unsupported RISC-V target OS: ${CMAKE_SYSTEM_NAME}")
endif()
else()
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
endif()

View File

@ -2303,9 +2303,9 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
// calculate rope cache for first layer in current device.
cann_ctx->rope_cache.cached = false;
bool cann_graph_update_required = false;
#ifdef USE_ACL_GRAPH
bool use_cann_graph = true;
bool cann_graph_update_required = false;
static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
if (!prefill_use_graph) {
@ -2336,7 +2336,6 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
}
#else
bool use_cann_graph = false;
bool cann_graph_update_required = false;
#endif // USE_ACL_GRAPH
evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required);

View File

@ -452,22 +452,35 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/spacemit/ime_kernels.h
)
endif()
if(NOT GGML_CPU_ALL_VARIANTS)
    set(MARCH_STR "rv64gc")
    if (GGML_RV_ZFH)
        string(APPEND MARCH_STR "_zfh")
    endif()
    if (GGML_XTHEADVECTOR)
        string(APPEND MARCH_STR "_xtheadvector")
    elseif (GGML_RVV)
        string(APPEND MARCH_STR "_v")
        if (GGML_RV_ZVFH)
            string(APPEND MARCH_STR "_zvfh")
        endif()
    endif()
    if (GGML_RV_ZICBOP)
        string(APPEND MARCH_STR "_zicbop")
    endif()
    list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
else()
    # Begin with the lowest baseline
    set(ARCH_DEFINITIONS "")
    if (GGML_INTERNAL_RVV)
        message(STATUS "RVV enabled")
        list(APPEND ARCH_DEFINITIONS GGML_USE_RVV)
        list(APPEND ARCH_FLAGS -march=rv64gc_v -mabi=lp64d)
    endif()
    ggml_add_cpu_backend_features(${GGML_CPU_NAME} riscv ${ARCH_DEFINITIONS})
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
message(STATUS "s390x detected")
list(APPEND GGML_CPU_SOURCES

View File

@ -51,10 +51,8 @@
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)

View File

@ -24,6 +24,29 @@
#define UNUSED GGML_UNUSED
static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
int16x8_t * out_mins,
int8_t * out_scales) {
constexpr uint32_t kmask1 = 0x3f3f3f3f;
constexpr uint32_t kmask2 = 0x0f0f0f0f;
constexpr uint32_t kmask3 = 0x03030303;
constexpr uint8_t scales_size = 12;
uint32_t sm[3];
memcpy(sm, scales_in, scales_size);
const uint32_t mins_0_3 = sm[1] & kmask1;
const uint32_t mins_4_7 = ((sm[2] >> 4) & kmask2) | (((sm[1] >> 6) & kmask3) << 4);
const uint32x2_t mins_u32 = { mins_0_3, mins_4_7 };
*out_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins_u32)));
uint32_t scales_u32[2];
scales_u32[0] = sm[0] & kmask1;
scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
memcpy(out_scales, scales_u32, 8);
}
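The helper above unpacks a 12-byte group of packed 6-bit scales and mins into 8 scales and 8 mins, one per interleaved column. As a reading aid, here is a hedged scalar sketch of the same unpacking, assuming the usual q4_K packing (bytes 0-3 carry the low 6 bits of scales 0-3, bytes 4-7 the low 6 bits of mins 0-3, bytes 8-11 the low nibbles of scales and mins 4-7, with the high 2 bits borrowed from the top bits of bytes 0-7). It is a reference for following the vector code, not code from this patch.

```cpp
#include <cstdint>

// Illustrative scalar counterpart of decode_q4_Kx8_scales_mins (sketch only).
static void decode_scales_mins_scalar(const uint8_t * q, uint8_t * scales, uint8_t * mins) {
    for (int j = 0; j < 4; ++j) {
        scales[j]     =  q[j]     & 63;                             // low 6 bits of scales 0..3
        mins[j]       =  q[j + 4] & 63;                             // low 6 bits of mins 0..3
        scales[j + 4] = (q[j + 8] & 0x0f) | ((q[j]     >> 6) << 4); // scales 4..7
        mins[j + 4]   = (q[j + 8] >>  4)  | ((q[j + 4] >> 6) << 4); // mins 4..7
    }
}
```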
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);
assert(k % QK8_0 == 0);
@ -474,6 +497,162 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q4_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON)
constexpr int col_pairs = ncols_interleaved / 2;
const uint8x16_t m4b = vdupq_n_u8(0x0f);
// 1x8 tile = 2 x 4
float32x4_t acc_f32[ncols_interleaved / 4];
const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
for (int i = 0; i < ncols_interleaved / 4; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
float32x4_t q4_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d)); // d0 d1 d2 d3
float32x4_t q4_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4)); // d4 d5 d6 d7
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d);
float32x4_t sb_scale_0 = vmulq_f32(q4_d_0, q8_d);
float32x4_t sb_scale_1 = vmulq_f32(q4_d_1, q8_d);
float32x4_t q4_dmin_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin)); // dmin 0..3
float32x4_t q4_dmin_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4)); // dmin 4..7
float32x4_t sb_min_0 = vmulq_f32(q4_dmin_0, q8_d);
float32x4_t sb_min_1 = vmulq_f32(q4_dmin_1, q8_d);
// interleaved bias_acc: [0]->r0 0123, [1]->r0 4567
int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
// 2 sb each iteration
int32x4_t acc_lo[col_pairs];
int32x4_t acc_hi[col_pairs];
// Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block
const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8));
int16_t bsums_arr[8];
vst1q_s16(bsums_arr, bsums);
for (int sb = 0; sb < QK_K / 64; sb++) {
for (int i = 0; i < col_pairs; i++) {
acc_lo[i] = vdupq_n_s32(0);
acc_hi[i] = vdupq_n_s32(0);
}
// Need scales for the low and high nibbles
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
int16x8_t q4sb_mins[2]; // int16 as its needed for bias_acc later
int16x8_t q4sb_scales[2];
for (int i = 0; i < 2; i++) {
int8_t aux_q4sb[8];
const int offset = sb * 24 + i * 12;
decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
}
const uint8_t * q4_base = q4_ptr[b].qs + sb * QK_K;
// Load the 64 quants from q8K duplicated to use vecdots with the interleaved columns
// but still need the qs to use the low and hi bits from q4
const int8_t * q8_base = q8_ptr[b].qs + sb * 64;
int8x16_t q8_qs[8];
for (int i = 0; i < 8; i++) {
q8_qs[i] = (int8x16_t) vld1q_dup_s64((const int64_t *) (q8_base + i * 8));
}
// Q4s columns iterated in pairs (01, 23, 45, 67)
for (int cp = 0; cp < col_pairs; cp++) {
uint8x16_t q4_qs_cp_0 = vld1q_u8(q4_base + 16 * cp);
uint8x16_t q4_qs_cp_1 = vld1q_u8(q4_base + 16 * cp + 64);
uint8x16_t q4_qs_cp_2 = vld1q_u8(q4_base + 16 * cp + 128);
uint8x16_t q4_qs_cp_3 = vld1q_u8(q4_base + 16 * cp + 192);
acc_lo[cp] =
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_0, m4b)), q8_qs[0]); // 0 .. 7
acc_lo[cp] =
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_1, m4b)), q8_qs[1]); // 8 ..15
acc_lo[cp] =
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_2, m4b)), q8_qs[2]); // 16..23
acc_lo[cp] =
ggml_vdotq_s32(acc_lo[cp], vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_3, m4b)), q8_qs[3]); // 24..31
acc_hi[cp] =
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_0, 4)), q8_qs[4]); // 32..39
acc_hi[cp] =
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_1, 4)), q8_qs[5]); // 40..47
acc_hi[cp] =
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_2, 4)), q8_qs[6]); // 48..55
acc_hi[cp] =
ggml_vdotq_s32(acc_hi[cp], vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_3, 4)), q8_qs[7]); // 56..63
}
// Iterates over a pair of column pairs (4 columns) to use a single 128 register
// p = 0 -> 0123 p2 -> 4567
for (int i = 0, p = 0; p < col_pairs; i++, p += 2) {
int16x4_t group_scales_lo = p == 0 ? vget_low_s16(q4sb_scales[0]) : vget_high_s16(q4sb_scales[0]);
int16x4_t group_scales_hi = p == 0 ? vget_low_s16(q4sb_scales[1]) : vget_high_s16(q4sb_scales[1]);
float32x4_t sb_scale = p == 0 ? sb_scale_0 : sb_scale_1;
// 0123 or 4567
// TODO: Single superblock mul at the end of the superblock
float32x4_t sumf_0 =
vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_lo), vpaddq_s32(acc_lo[p], acc_lo[p + 1])));
acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_0);
float32x4_t sumf_1 =
vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_hi), vpaddq_s32(acc_hi[p], acc_hi[p + 1])));
acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_1);
}
// Multiply Acc bsum + mins
// Each pair of subblocks share the same bsums
// Load scalar bsum → broadcast to a vector (vdupq_n_s16(s)).
int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]);
int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]);
// cols 0-3 bias
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
// cols 4-7 bias
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
} // for sb
acc_f32[0] = vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0);
acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_1);
} // for b
int base = x * ncols_interleaved;
vst1q_f32(s + base, acc_f32[0]);
vst1q_f32(s + base + 4, acc_f32[1]);
} // for x
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON)
ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
@ -1889,3 +2068,212 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q4_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
const void * GGML_RESTRICT vx,
const void * GGML_RESTRICT vy,
int nr,
int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
constexpr int q8_k_blocklen = 4;
const uint8x16_t m4b = vdupq_n_u8(0x0f);
// 8 accumulators: 2 row pairs × 4 col pairs
float32x4_t acc_f32[blocklen];
for (int y = 0; y < nr / q8_k_blocklen; y++) {
const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
for (int i = 0; i < blocklen; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
// bsums pairs belongs to the same q8_k subblock
const int16x8_t bsums[4]{
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)),
};
int16_t bsums_arr[4][8];
for (int q8_row = 0; q8_row < 4; q8_row++) {
vst1q_s16(bsums_arr[q8_row], bsums[q8_row]);
}
int32x4_t sb_acc[4]; // Aux accumulators to store subblock (partial) results
int32x4_t acc[8]; // rows 01 stored in [0][1][2][3] rows 23 stored in [4][5][6][7]
int32x4_t bias_acc[8]; // interleaved bias_acc: [0]->r0 0123, [1]->r0 4567, [2]->r1 0123 ...
for (int i = 0; i < 8; i++) {
acc[i] = vdupq_n_s32(0);
bias_acc[i] = vdupq_n_s32(0);
}
for (int sb = 0; sb < QK_K / 64; sb++) {
// Need scales for the low and high nibbles
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
int8_t q4sb_scales[2][8];
int16x8_t q4sb_mins[2]; // int16 as its needed for bias_acc later
for (int i = 0; i < 2; i++) {
const int offset = sb * 24 + i * 12;
decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], q4sb_scales[i]);
}
// q8_ptr[b].qs has interleaved Q8 rows (01, 23)
const int8_t * q8_base = q8_ptr[b].qs + sb * 256;
int8x16_t q8_qs_01[8];
int8x16_t q8_qs_23[8];
// Load 32-byte per row pair, 1 subblock each time
for (int i = 0; i < 8; i++) {
const int offset = i * 32; // 16 for row 01, 16 for row 23
q8_qs_01[i] = vld1q_s8(q8_base + offset);
q8_qs_23[i] = vld1q_s8(q8_base + offset + 16);
}
const int8x16_t q8s[2][8] = {
{ q8_qs_01[0], q8_qs_01[1], q8_qs_01[2], q8_qs_01[3],
q8_qs_01[4], q8_qs_01[5], q8_qs_01[6], q8_qs_01[7] },
{ q8_qs_23[0], q8_qs_23[1], q8_qs_23[2], q8_qs_23[3],
q8_qs_23[4], q8_qs_23[5], q8_qs_23[6], q8_qs_23[7] },
};
// Q4s columns iterated in pairs (01, 23, 45, 67)
for (int cp = 0; cp < ncols_interleaved / 2; cp++) {
for (int i = 0; i < 4; i++) {
sb_acc[i] = vdupq_n_s32(0);
}
uint8x16_t q4_qs_cp_0 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 0); // 0 .. 7 & 32..39
uint8x16_t q4_qs_cp_1 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 64); // 8 ..15 & 40..47
uint8x16_t q4_qs_cp_2 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 128); // 16..23 & 48..55
uint8x16_t q4_qs_cp_3 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 16 * cp + 192); // 24..31 & 56..63
const int8x16_t q4_nibbles[2][4] = {
{
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_0, m4b)),
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_1, m4b)),
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_2, m4b)),
vreinterpretq_s8_u8(vandq_u8(q4_qs_cp_3, m4b)),
},
{
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_0, 4)),
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_1, 4)),
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_2, 4)),
vreinterpretq_s8_u8(vshrq_n_u8(q4_qs_cp_3, 4)),
}
};
// Calculates the Qs muladd of every row pair (rp) rows 01 and 23 of q8
// for each of the internal 32 qs subblock (blk)
for (int rp = 0; rp < 2; rp++) {
for (int blk = 0; blk < 2; blk++) {
const int8x16_t * q8 = &q8s[rp][4 * blk];
const int8x16_t * q4 = q4_nibbles[blk];
int32x4_t acc = sb_acc[2 * rp + blk];
// mul add for each qs in the same subblock
for (int qs_offset = 0; qs_offset < 4; qs_offset++) {
acc = vmmlaq_s32(acc, q4[qs_offset], q8[qs_offset]);
}
sb_acc[2 * rp + blk] = acc;
}
}
// Scales[i] corresponds to column i
const int scale_offset = cp * 2;
for (int blk = 0; blk < 2; blk++) {
const int32x4_t block_scale = {
(int32_t) q4sb_scales[blk][scale_offset],
(int32_t) q4sb_scales[blk][scale_offset],
(int32_t) q4sb_scales[blk][scale_offset + 1],
(int32_t) q4sb_scales[blk][scale_offset + 1],
};
acc[cp] = vmlaq_s32(acc[cp], sb_acc[blk], block_scale);
acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc[blk + 2], block_scale);
}
}
// Multiply Acc bsum + mins
for (int q8_row = 0; q8_row < 4; q8_row++) {
// Each pair of subblocks share the same bsums
// Load scalar bsum → broadcast to a vector (vdupq_n_s16(s)).
int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][q8_row * 2]);
int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][q8_row * 2 + 1]);
bias_acc[2 * q8_row] =
vmlal_s16(bias_acc[2 * q8_row], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
bias_acc[2 * q8_row] =
vmlal_s16(bias_acc[2 * q8_row], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
bias_acc[2 * q8_row + 1] =
vmlal_s16(bias_acc[2 * q8_row + 1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
bias_acc[2 * q8_row + 1] =
vmlal_s16(bias_acc[2 * q8_row + 1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
}
} // for sb
// Reorder of i8mm output with bias and output layout
for (int i = 0; i < 8; i++) {
int32x2x2_t aux = vzip_s32(vget_low_s32(acc[i]), vget_high_s32(acc[i]));
acc[i] = vcombine_s32(aux.val[0], aux.val[1]);
}
int32x4_t reorder_acc[8] = {
vcombine_s32(vget_low_s32(acc[0]), vget_low_s32(acc[1])),
vcombine_s32(vget_low_s32(acc[2]), vget_low_s32(acc[3])),
vcombine_s32(vget_high_s32(acc[0]), vget_high_s32(acc[1])),
vcombine_s32(vget_high_s32(acc[2]), vget_high_s32(acc[3])),
vcombine_s32(vget_low_s32(acc[4]), vget_low_s32(acc[5])),
vcombine_s32(vget_low_s32(acc[6]), vget_low_s32(acc[7])),
vcombine_s32(vget_high_s32(acc[4]), vget_high_s32(acc[5])),
vcombine_s32(vget_high_s32(acc[6]), vget_high_s32(acc[7])),
};
for (int i = 0; i < q8_k_blocklen; i++) {
for (int j = 0; j < 2; j++) {
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d[i]);
float32x4_t q4_dmin = vcvt_f32_f16(vld1_f16((const __fp16 *) (q4_ptr[b].dmin + j * 4)));
const float32x4_t dmins = vmulq_f32(q4_dmin, q8_d);
float32x4_t q4_d = vcvt_f32_f16(vld1_f16((const __fp16 *) (q4_ptr[b].d + j * 4)));
const float32x4_t scale = vmulq_f32(q4_d, q8_d);
acc_f32[2 * i + j] = vmlsq_f32(acc_f32[2 * i + j], vcvtq_f32_s32(bias_acc[2 * i + j]), dmins);
acc_f32[2 * i + j] =
vmlaq_f32(acc_f32[2 * i + j], vcvtq_f32_s32(reorder_acc[2 * i + j]), scale);
}
}
} // for b
// With the previous reorder, the tile is already in the correct memory layout.
for (int i = 0; i < q8_k_blocklen; i++) {
int row = y * q8_k_blocklen + i;
for (int j = 0; j < 2; j++) {
int col = x * ncols_interleaved + j * 4;
int offset = row * bs + col;
vst1q_f32(s + offset, acc_f32[2 * i + j]);
}
}
} // for x
} // for y
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}

View File

@ -0,0 +1,35 @@
#include "ggml-backend-impl.h"
#if defined(__riscv) && __riscv_xlen == 64
#include <sys/auxv.h>
//https://github.com/torvalds/linux/blob/master/arch/riscv/include/uapi/asm/hwcap.h#L24
#ifndef COMPAT_HWCAP_ISA_V
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
#endif
struct riscv64_features {
bool has_rvv = false;
riscv64_features() {
uint32_t hwcap = getauxval(AT_HWCAP);
has_rvv = !!(hwcap & COMPAT_HWCAP_ISA_V);
}
};
static int ggml_backend_cpu_riscv64_score() {
int score = 1;
riscv64_features rf;
#ifdef GGML_USE_RVV
if (!rf.has_rvv) { return 0; }
score += 1 << 1;
#endif
return score;
}
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_riscv64_score)
#endif // __riscv && __riscv_xlen == 64
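This scoring hook mirrors the existing per-arch CPU variant files: a variant that cannot run on the host returns 0, otherwise it returns a score that grows with the features it was built for, so the RVV build outranks the baseline `riscv64_0` build on RVV-capable hardware. As a hedged illustration of that idea only (the real selection lives in ggml's backend loading code, and `select_variant()` below is a made-up name), picking a variant is just a max-by-score with zero meaning "unusable":

```cpp
// Illustrative sketch; not ggml's actual backend registry code.
#include <cstdio>
#include <string>
#include <vector>

struct variant {
    std::string name;
    int score; // 0 means the variant cannot run on this CPU
};

static const variant * select_variant(const std::vector<variant> & vs) {
    const variant * best = nullptr;
    for (const auto & v : vs) {
        if (v.score > 0 && (best == nullptr || v.score > best->score)) {
            best = &v;
        }
    }
    return best;
}

int main() {
    // On an RVV-capable core both variants load, but the RVV build scores higher.
    std::vector<variant> vs = { { "riscv64_0", 1 }, { "riscv64_v", 3 } };
    const variant * v = select_variant(vs);
    if (v != nullptr) {
        std::printf("selected %s\n", v->name.c_str());
    }
    return 0;
}
```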

View File

@ -1961,6 +1961,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
return &q4_K_8x8_q8_K;
}
}
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
if (cur->ne[1] % 8 == 0) {
return &q4_K_8x8_q8_K;
}
}
} else if (cur->type == GGML_TYPE_Q2_K) {
if (ggml_cpu_has_avx512()) {
if (cur->ne[1] % 8 == 0) {

View File

@ -212,6 +212,6 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
}
template<typename src_t, typename dst_t>
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
static __device__ void cpy_1_scalar(const char * cxi, char * cdsti) {
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
}

View File

@ -12,10 +12,10 @@ const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available
const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows
template <cpy_kernel_t cpy_1> template <cpy_kernel_t cpy_1>
static __global__ void cpy_flt(const char * cx, char * cdst, const int ne, static __global__ void cpy_scalar(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13) { const int nb12, const int nb13) {
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) { if (i >= ne) {
@ -40,7 +40,7 @@ static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
} }
template <typename T> template <typename T>
static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int ne, static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13) { const int nb12, const int nb13) {
@ -166,7 +166,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
} }
template<typename src_t, typename dst_t> template<typename src_t, typename dst_t>
static __global__ void cpy_flt_contiguous(const char * cx, char * cdst, const int64_t ne) { static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const int64_t ne) {
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) { if (i >= ne) {
@ -180,17 +180,17 @@ static __global__ void cpy_flt_contiguous(const char * cx, char * cdst, const in
} }
template<typename src_t, typename dst_t> template<typename src_t, typename dst_t>
static void ggml_cpy_flt_contiguous_cuda( static void ggml_cpy_scalar_contiguous_cuda(
const char * cx, char * cdst, const int64_t ne, const char * cx, char * cdst, const int64_t ne,
cudaStream_t stream) { cudaStream_t stream) {
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>> cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne); (cx, cdst, ne);
} }
template<typename src_t, typename dst_t, bool transposed = false> template<typename src_t, typename dst_t, bool transposed = false>
static void ggml_cpy_flt_cuda( static void ggml_cpy_scalar_cuda(
const char * cx, char * cdst, const int ne, const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
@ -212,11 +212,11 @@ static void ggml_cpy_flt_cuda(
(ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
(ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM); (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM);
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
cpy_flt_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>> cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
(cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} else { } else {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>> cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} }
} }
@ -399,94 +399,132 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
} }
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
if (can_be_transposed) { if (can_be_transposed) {
ggml_cpy_flt_cuda<float, float, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<float, float, true>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else { } else {
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<float, float>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
if (contiguous_srcs) { if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream); ggml_cpy_scalar_contiguous_cuda<float, nv_bfloat16>
(src0_ddc, src1_ddc, ne, main_stream);
} else { } else {
ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<float, nv_bfloat16>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
if (contiguous_srcs) { if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<float, half> (src0_ddc, src1_ddc, ne, main_stream); ggml_cpy_scalar_contiguous_cuda<float, half>
(src0_ddc, src1_ddc, ne, main_stream);
} else { } else {
ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<float, half>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_f32_q8_0_cuda
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_q8_0_f32_cuda
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_f32_q4_0_cuda
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, ggml_cpy_q4_0_f32_cuda
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_f32_q4_1_cuda
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, ggml_cpy_q4_1_f32_cuda
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_f32_q5_0_cuda
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, ggml_cpy_q5_0_f32_cuda
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_f32_iq4_nl_cuda
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_f32_q5_1_cuda
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_q5_1_f32_cuda
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
if (can_be_transposed) { if (can_be_transposed) {
ggml_cpy_flt_cuda<half, half, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<half, half, true>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else { } else {
ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<half, half>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) { } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
if (contiguous_srcs) { if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream); ggml_cpy_scalar_contiguous_cuda<half, nv_bfloat16>
(src0_ddc, src1_ddc, ne, main_stream);
} else { } else {
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<half, nv_bfloat16>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
if (contiguous_srcs) { if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<half, float> (src0_ddc, src1_ddc, ne, main_stream); ggml_cpy_scalar_contiguous_cuda<half, float>
(src0_ddc, src1_ddc, ne, main_stream);
} else { } else {
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<half, float>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) { } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
if (can_be_transposed) { if (can_be_transposed) {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else { } else {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
if (contiguous_srcs) { if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, main_stream); ggml_cpy_scalar_contiguous_cuda<nv_bfloat16, half>
(src0_ddc, src1_ddc, ne, main_stream);
} else { } else {
ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<nv_bfloat16, half>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
if (contiguous_srcs) { if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, main_stream); ggml_cpy_scalar_contiguous_cuda<nv_bfloat16, float>
(src0_ddc, src1_ddc, ne, main_stream);
} else { } else {
ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<nv_bfloat16, float>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
if (can_be_transposed) {
ggml_cpy_scalar_cuda<int32_t, int32_t, true>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
ggml_cpy_scalar_cuda<int32_t, int32_t>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
if (contiguous_srcs) { if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, main_stream); ggml_cpy_scalar_contiguous_cuda<float, int32_t>
(src0_ddc, src1_ddc, ne, main_stream);
} else { } else {
ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<float, int32_t>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
if (contiguous_srcs) { if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, main_stream); ggml_cpy_scalar_contiguous_cuda<int32_t, float>
(src0_ddc, src1_ddc, ne, main_stream);
} else { } else {
ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_scalar_cuda<int32_t, float>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} }
} else { } else {
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,

View File

@ -4115,6 +4115,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32) {
return true;
}
if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_I32) {
return true;
}
if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
return true;
}

View File

@ -43,6 +43,14 @@ set(HTP_CMAKE_ARGS
-DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT}
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG})
ExternalProject_Add(htp-v68
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v68 -DPREBUILT_LIB_DIR="toolv19_v68")
ExternalProject_Add(htp-v69
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v69 -DPREBUILT_LIB_DIR="toolv19_v69")
ExternalProject_Add(htp-v73
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73")
@ -61,6 +69,8 @@ ExternalProject_Add(htp-v81
# Install Hexagon skels required at runtime
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v68.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v69.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v73.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v75.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v79.so

View File

@ -9,6 +9,7 @@
#include <chrono>
#include <mutex>
#include <string>
#include <stdexcept>
#ifdef _WIN32
# include <sal.h>
@ -240,6 +241,23 @@ struct ggml_hexagon_session {
uint32_t prof_pkts;
};
static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) {
char dims[64 * GGML_MAX_SRC];
char strides[64 * GGML_MAX_SRC];
char types[16 * GGML_MAX_SRC];
char buffs[64 * GGML_MAX_SRC];
char names[64 * GGML_MAX_SRC];
hex_format_op_dims(dims, op);
hex_format_op_strides(strides, op);
hex_format_op_types(types, op);
hex_format_op_buffs(buffs, op);
hex_format_op_names(names, op);
HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
names, dims, types, strides, buffs, req_flags);
}
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
// Bump pending flag (cleared in the session::flush once we get the response)
this->op_pending++; // atomic inc
@ -1912,6 +1930,15 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t
return true;
}
template <typename... _TTensor>
static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, _TTensor... tensors) {
return ([&]() -> bool {
return !tensors || !tensors->buffer ||
(ggml_backend_buffer_is_hexagon(tensors->buffer) &&
ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess);
}() && ...);
}
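The variadic helper above folds the per-tensor "same Hexagon session" test over any number of tensors: a null tensor or a tensor without a buffer is accepted, anything else must live in a Hexagon buffer bound to this session. It replaces the hand-unrolled per-tensor checks removed further down. Below is a self-contained sketch of the same fold-over-lambda pattern, with a toy `ok()` predicate standing in for the buffer/session test; the names are illustrative, not from the backend.

```cpp
#include <cstdio>

// Toy tensor with just enough state to mirror the real predicate's three cases.
struct tensor {
    const void * buffer;       // nullptr -> not allocated yet, always accepted
    bool         same_session; // stands in for "Hexagon buffer bound to this session"
};

// Stand-in for: !t || !t->buffer || (is_hexagon(t->buffer) && buffer_sess(t->buffer) == sess)
static bool ok(const tensor * t) {
    return t == nullptr || t->buffer == nullptr || t->same_session;
}

// Unary right fold over &&: expands to ok(t0) && ok(t1) && ... across the pack.
template <typename... Ts>
static bool all_ok(Ts... ts) {
    return ([&]() -> bool { return ok(ts); }() && ...);
}

int main() {
    tensor mapped  { &mapped,  true  };
    tensor foreign { &foreign, false };
    std::printf("%d %d\n", (int) all_ok(&mapped, &mapped), (int) all_ok(&mapped, &foreign)); // 1 0
    return 0;
}
```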
static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
@ -1959,16 +1986,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
}
// src0 & src1 & dst must be mapped to the same session
if (src0->buffer &&
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
return false;
}
if (src1->buffer &&
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
return false;
}
if (dst->buffer &&
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
if (!hex_supported_buffer(sess, src0, src1, dst)) {
return false;
}
@ -2016,20 +2034,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
// src0 (weights) must be repacked and mapped to the same session
// src1 & src2 & dst must be mapped to the same session
if (src0->buffer &&
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
return false;
}
if (src1->buffer &&
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
return false;
}
if (src2->buffer &&
(!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
return false;
}
if (dst->buffer &&
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
return false;
}
@ -2063,16 +2068,7 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
}
// src0, src1 & dst must be mapped to the same session
if (src0->buffer &&
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
return false;
}
if (src1->buffer &&
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
return false;
}
if (dst->buffer &&
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
if (!hex_supported_buffer(sess, src0, src1, dst)) {
return false;
}
@ -2104,20 +2100,7 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
}
// src0, src1 & dst must be mapped to the same session
if (src0->buffer &&
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
return false;
}
if (src1->buffer &&
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
return false;
}
if (src2->buffer &&
(!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
return false;
}
if (dst->buffer &&
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
return false;
}
@ -2144,12 +2127,7 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
} }
// src0 & dst must be mapped to the same session // src0 & dst must be mapped to the same session
if (src0->buffer && if (!hex_supported_buffer(sess, src0, dst)) {
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
return false;
}
if (dst->buffer &&
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
return false; return false;
} }
@ -2186,16 +2164,7 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
} }
// src0, src1 & dst must be mapped to the same session // src0, src1 & dst must be mapped to the same session
if (src0->buffer && if (!hex_supported_buffer(sess, src0, src1, dst)) {
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
return false;
}
if (src1 && src1->buffer &&
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
return false;
}
if (dst->buffer &&
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
return false; return false;
} }
@ -2248,16 +2217,7 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
} }
// src0, src1 & dst must be mapped to the same session // src0, src1 & dst must be mapped to the same session
if (src0->buffer && if (!hex_supported_buffer(sess, src0, src1, dst)) {
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
return false;
}
if (src1 && src1->buffer &&
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
return false;
}
if (dst->buffer &&
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
return false; return false;
} }
@ -2269,7 +2229,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
int mode = op_params[2]; int mode = op_params[2];
if ((mode & GGML_ROPE_TYPE_NEOX) || (mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) { if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
return false; return false;
} }
if (mode & 1) { if (mode & 1) {
@ -2312,20 +2272,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
} }
// src0, src1, src2 & dst must be mapped to the same session // src0, src1, src2 & dst must be mapped to the same session
if (src0->buffer && if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
return false;
}
if (src1->buffer &&
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
return false;
}
if (src2 && src2->buffer &&
(!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
return false;
}
if (dst->buffer &&
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
return false; return false;
} }
@ -2346,6 +2293,26 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
h->nb[3] = t->nb[3]; h->nb[3] = t->nb[3];
} }
static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) {
if (!t) {
return 0;
}
memset(buf, 0, sizeof(*buf));
auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
buf->fd = tensor_buf->fd;
buf->ptr = t->data;
buf->offset = (uint8_t *) t->data - tensor_buf->base;
buf->size = ggml_nbytes(t);
buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU
buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP
return 1;
}
static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) {
return static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context)->sess;
}
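The two helpers above absorb the descriptor boilerplate that the per-op functions below used to repeat. A minimal usage sketch, assuming tensors named src0, src1 and dst, mirroring the binary-op path further down:

dspqueue_buffer bufs[3];
size_t n_bufs = 0;
// operands written by the CPU and read by the DSP: flush host caches, invalidate DSP caches
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src0, /*flush_host=*/true, /*flush_htp=*/true);
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, /*flush_host=*/true, /*flush_htp=*/true);
// output written by the DSP: flush host caches only, DSP-side maintenance happens in the response
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, /*flush_host=*/true, /*flush_htp=*/false);
// route the request through the session that owns src0's buffer
ggml_hexagon_session * sess = get_session_from_tensor(src0);

Because the helper returns 0 for a null tensor, optional operands (src1 in the unary path, src2 in the rope path) can be passed unconditionally and n_bufs stays correct.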
static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) { static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context); auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
auto sess = buf->sess; auto sess = buf->sess;
@ -2360,10 +2327,6 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
const struct ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * src1 = op->src[1];
const struct ggml_tensor * dst = op; const struct ggml_tensor * dst = op;
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
uint64_t t1, t2; uint64_t t1, t2;
t1 = ggml_time_us(); t1 = ggml_time_us();
@ -2385,55 +2348,27 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
} }
dspqueue_buffer bufs[3]; dspqueue_buffer bufs[3];
memset(bufs, 0, sizeof(bufs));
// First buffer Weights. // First buffer Weights.
// The content is static, there is no need to do any cache management // The content is static, there is no need to do any cache management
bufs[0].fd = src0_buf->fd; dspqueue_buffers_init(bufs, src0, false, false);
bufs[0].ptr = src0->data;
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[0].size = ggml_nbytes(src0);
bufs[0].flags = 0;
// Second buffer Input Activations. This is a buffer that the CPU // Second buffer Input Activations. This is a buffer that the CPU
// writes and the DSP reads, so we'll need to flush CPU caches and // writes and the DSP reads, so we'll need to flush CPU caches and
// invalidate DSP ones. On platforms with I/O coherency support the // invalidate DSP ones. On platforms with I/O coherency support the
// framework will automatically skip cache operations where possible. // framework will automatically skip cache operations where possible.
bufs[1].fd = src1_buf->fd; dspqueue_buffers_init(&bufs[1], src1, true, true);
bufs[1].ptr = src1->data;
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[1].size = ggml_nbytes(src1);
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Third buffer Output Activations. We'll handle DSP // Third buffer Output Activations. We'll handle DSP
// cache maintenance in the response message but need to flush // cache maintenance in the response message but need to flush
// CPU caches to ensure any previously written dirty lines are // CPU caches to ensure any previously written dirty lines are
// written out before writes from the DSP start. // written out before writes from the DSP start.
bufs[2].fd = dst_buf->fd; dspqueue_buffers_init(&bufs[2], dst, true, false);
bufs[2].ptr = dst->data;
bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[2].size = ggml_nbytes(dst);
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
// Primary DSP session from the src0 (normally weight) tensor auto * sess = get_session_from_tensor(src0);
auto sess = src0_buf->sess;
if (opt_verbose) { if (opt_verbose) {
char dims[64 * GGML_MAX_SRC]; hex_print_op_info(op, sess, req.flags);
char strides[64 * GGML_MAX_SRC];
char types[16 * GGML_MAX_SRC];
char buffs[64 * GGML_MAX_SRC];
char names[64 * GGML_MAX_SRC];
hex_format_op_dims(dims, op);
hex_format_op_strides(strides, op);
hex_format_op_types(types, op);
hex_format_op_buffs(buffs, op);
hex_format_op_names(names, op);
HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
names, dims, types, strides, buffs, req.flags);
if (opt_verbose > 1) { if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]); hex_dump_dspbuf(src0, &bufs[0]);
hex_dump_dspbuf(src1, &bufs[1]); hex_dump_dspbuf(src1, &bufs[1]);
@ -2463,11 +2398,6 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
const struct ggml_tensor * src2 = op->src[2]; const struct ggml_tensor * src2 = op->src[2];
const struct ggml_tensor * dst = op; const struct ggml_tensor * dst = op;
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
uint64_t t1, t2; uint64_t t1, t2;
t1 = ggml_time_us(); t1 = ggml_time_us();
@ -2490,66 +2420,32 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
} }
dspqueue_buffer bufs[4]; dspqueue_buffer bufs[4];
memset(bufs, 0, sizeof(bufs));
// First buffer Weights. // First buffer Weights.
// The content is static, there is no need to do any cache management // The content is static, there is no need to do any cache management
bufs[0].fd = src0_buf->fd; dspqueue_buffers_init(bufs, src0, false, false);
bufs[0].ptr = src0->data;
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[0].size = ggml_nbytes(src0);
bufs[0].flags = 0;
// Second buffer Input Activations. This is a buffer that the CPU // Second buffer Input Activations. This is a buffer that the CPU
// writes and the DSP reads, so we'll need to flush CPU caches and // writes and the DSP reads, so we'll need to flush CPU caches and
// invalidate DSP ones. On platforms with I/O coherency support the // invalidate DSP ones. On platforms with I/O coherency support the
// framework will automatically skip cache operations where possible. // framework will automatically skip cache operations where possible.
bufs[1].fd = src1_buf->fd; dspqueue_buffers_init(&bufs[1], src1, true, true);
bufs[1].ptr = src1->data;
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[1].size = ggml_nbytes(src1);
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Third buffer expert IDs. This is a buffer that the CPU // Third buffer expert IDs. This is a buffer that the CPU
// writes and the DSP reads, so we'll need to flush CPU caches and // writes and the DSP reads, so we'll need to flush CPU caches and
// invalidate DSP ones. On platforms with I/O coherency support the // invalidate DSP ones. On platforms with I/O coherency support the
// framework will automatically skip cache operations where possible. // framework will automatically skip cache operations where possible.
bufs[2].fd = src2_buf->fd; dspqueue_buffers_init(&bufs[2], src2, true, true);
bufs[2].ptr = src2->data;
bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
bufs[2].size = ggml_nbytes(src2);
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Fourth buffer Output Activations. We'll handle DSP
// cache maintenance in the response message but need to flush // cache maintenance in the response message but need to flush
// CPU caches to ensure any previously written dirty lines are // CPU caches to ensure any previously written dirty lines are
// written out before writes from the DSP start. // written out before writes from the DSP start.
bufs[3].fd = dst_buf->fd; dspqueue_buffers_init(&bufs[3], dst, true, false);
bufs[3].ptr = dst->data;
bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[3].size = ggml_nbytes(dst);
bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
// Primary DSP session from the src0 (normally weight) tensor auto * sess = get_session_from_tensor(src0);
auto sess = src0_buf->sess;
if (opt_verbose) { if (opt_verbose) {
char dims[64 * GGML_MAX_SRC]; hex_print_op_info(op, sess, req.flags);
char strides[64 * GGML_MAX_SRC];
char types[16 * GGML_MAX_SRC];
char buffs[64 * GGML_MAX_SRC];
char names[64 * GGML_MAX_SRC];
hex_format_op_dims(dims, op);
hex_format_op_types(types, op);
hex_format_op_buffs(buffs, op);
hex_format_op_names(names, op);
HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
names, dims, types, strides, buffs, req.flags);
if (opt_verbose > 1) { if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]); hex_dump_dspbuf(src0, &bufs[0]);
hex_dump_dspbuf(src1, &bufs[1]); hex_dump_dspbuf(src1, &bufs[1]);
@ -2581,10 +2477,6 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
const struct ggml_tensor * src1 = node->src[1]; const struct ggml_tensor * src1 = node->src[1];
const struct ggml_tensor * dst = node; const struct ggml_tensor * dst = node;
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
uint64_t t1 = 0; uint64_t t1 = 0;
uint64_t t2 = 0; uint64_t t2 = 0;
@ -2621,60 +2513,30 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
init_htp_tensor(&req.dst, dst); init_htp_tensor(&req.dst, dst);
dspqueue_buffer bufs[3]; dspqueue_buffer bufs[3];
memset(bufs, 0, sizeof(bufs));
// First buffer = First Operand of Binary op // First buffer = First Operand of Binary op
// This is a buffer that the CPU writes and the DSP reads, so we'll // This is a buffer that the CPU writes and the DSP reads, so we'll
// need to flush CPU caches and invalidate DSP ones. On platforms // need to flush CPU caches and invalidate DSP ones. On platforms
// with I/O coherency support the framework will automatically skip // with I/O coherency support the framework will automatically skip
// cache operations where possible. // cache operations where possible.
bufs[0].fd = src0_buf->fd; dspqueue_buffers_init(bufs, src0, true, true);
bufs[0].ptr = src0->data;
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[0].size = ggml_nbytes(src0);
bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
// Second buffer = Second Operand of Binary op // Second buffer = Second Operand of Binary op
// This is a buffer that the CPU writes and the DSP reads, so we'll // This is a buffer that the CPU writes and the DSP reads, so we'll
// need to flush CPU caches and invalidate DSP ones. On platforms // need to flush CPU caches and invalidate DSP ones. On platforms
// with I/O coherency support the framework will automatically skip // with I/O coherency support the framework will automatically skip
// cache operations where possible. // cache operations where possible.
bufs[1].fd = src1_buf->fd; dspqueue_buffers_init(&bufs[1], src1, true, true);
bufs[1].ptr = src1->data;
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[1].size = ggml_nbytes(src1);
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Third buffer = Output Activations. We'll handle DSP // Third buffer = Output Activations. We'll handle DSP
// cache maintenance in the response message but need to flush // cache maintenance in the response message but need to flush
// CPU caches to ensure any previously written dirty lines are // CPU caches to ensure any previously written dirty lines are
// written out before writes from the DSP start. // written out before writes from the DSP start.
bufs[2].fd = dst_buf->fd; dspqueue_buffers_init(&bufs[2], dst, true, false);
bufs[2].ptr = dst->data;
bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[2].size = ggml_nbytes(dst);
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
// Primary DSP session from the src0 tensor auto * sess = get_session_from_tensor(src0);
ggml_hexagon_session * sess = src0_buf->sess;
if (opt_verbose) { if (opt_verbose) {
char dims[64 * GGML_MAX_SRC]; hex_print_op_info(op, sess, req.flags);
char strides[16 * GGML_MAX_SRC];
char types[16 * GGML_MAX_SRC];
char buffs[64 * GGML_MAX_SRC];
char names[64 * GGML_MAX_SRC];
hex_format_op_dims(dims, op);
hex_format_op_strides(strides, op);
hex_format_op_types(types, op);
hex_format_op_buffs(buffs, op);
hex_format_op_names(names, op);
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
if (opt_verbose > 1) { if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]); hex_dump_dspbuf(src0, &bufs[0]);
hex_dump_dspbuf(src1, &bufs[1]); hex_dump_dspbuf(src1, &bufs[1]);
@ -2705,11 +2567,6 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
const struct ggml_tensor * src2 = node->src[2]; const struct ggml_tensor * src2 = node->src[2];
const struct ggml_tensor * dst = node; const struct ggml_tensor * dst = node;
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
uint64_t t1 = 0; uint64_t t1 = 0;
uint64_t t2 = 0; uint64_t t2 = 0;
@ -2741,58 +2598,19 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
init_htp_tensor(&req.dst, dst); init_htp_tensor(&req.dst, dst);
dspqueue_buffer bufs[4]; dspqueue_buffer bufs[4];
memset(bufs, 0, sizeof(bufs));
// First buffer = input activations // First buffer = input activations
bufs[0].fd = src0_buf->fd; dspqueue_buffers_init(bufs, src0, true, true);
bufs[0].ptr = src0->data;
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[0].size = ggml_nbytes(src0);
bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
// Second buffer = experts bias // Second buffer = experts bias
bufs[1].fd = src1_buf->fd; dspqueue_buffers_init(&bufs[1], src1, true, true);
bufs[1].ptr = src1->data;
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[1].size = ggml_nbytes(src1);
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Third buffer = activated experts // Third buffer = activated experts
bufs[2].fd = src2_buf->fd; dspqueue_buffers_init(&bufs[2], src2, true, true);
bufs[2].ptr = src2->data;
bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
bufs[2].size = ggml_nbytes(src2);
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Fourth buffer = output activations
bufs[3].fd = dst_buf->fd; dspqueue_buffers_init(&bufs[3], dst, true, true);
bufs[3].ptr = dst->data;
bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[3].size = ggml_nbytes(dst);
bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
// Primary DSP session from the src0 tensor auto * sess = get_session_from_tensor(src0);
ggml_hexagon_session * sess = src0_buf->sess;
if (opt_verbose) { if (opt_verbose) {
char dims[64 * GGML_MAX_SRC]; hex_print_op_info(op, sess, req.flags);
char strides[16 * GGML_MAX_SRC];
char types[16 * GGML_MAX_SRC];
char buffs[64 * GGML_MAX_SRC];
char names[64 * GGML_MAX_SRC];
hex_format_op_dims(dims, op);
hex_format_op_strides(strides, op);
hex_format_op_types(types, op);
hex_format_op_buffs(buffs, op);
hex_format_op_names(names, op);
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
if (opt_verbose > 1) { if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]); hex_dump_dspbuf(src0, &bufs[0]);
hex_dump_dspbuf(src1, &bufs[1]); hex_dump_dspbuf(src1, &bufs[1]);
@ -2886,71 +2704,33 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
} }
dspqueue_buffer bufs[3]; dspqueue_buffer bufs[3];
int n_bufs = 0;
memset(bufs, 0, sizeof(bufs));
// First buffer = Only Operand of Unary op // First buffer = Only Operand of Unary op
// This is a buffer that the CPU writes and the DSP reads, so we'll // This is a buffer that the CPU writes and the DSP reads, so we'll
// need to flush CPU caches and invalidate DSP ones. On platforms // need to flush CPU caches and invalidate DSP ones. On platforms
// with I/O coherency support the framework will automatically skip // with I/O coherency support the framework will automatically skip
// cache operations where possible. // cache operations where possible.
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context); size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
bufs[n_bufs].fd = src0_buf->fd;
bufs[n_bufs].ptr = src0->data;
bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[n_bufs].size = ggml_nbytes(src0);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
++n_bufs;
if (src1) { // Second buffer(nullable) = Second Operand of Binary op
// Second buffer = Second Operand of Binary op // This is a buffer that the CPU writes and the DSP reads, so we'll
// This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms
// need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip
// with I/O coherency support the framework will automatically skip // cache operations where possible.
// cache operations where possible. n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
bufs[n_bufs].fd = src1_buf->fd;
bufs[n_bufs].ptr = src1->data;
bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[n_bufs].size = ggml_nbytes(src1);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
++n_bufs;
}
// Second or third buffer = Output Activations. We'll handle DSP // Second or third buffer = Output Activations. We'll handle DSP
// Second buffer = Output Activations. We'll handle DSP // Second buffer = Output Activations. We'll handle DSP
// cache maintenance in the response message but need to flush // cache maintenance in the response message but need to flush
// CPU caches to ensure any previously written dirty lines are // CPU caches to ensure any previously written dirty lines are
// written out before writes from the DSP start. // written out before writes from the DSP start.
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context); n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
bufs[n_bufs].fd = dst_buf->fd;
bufs[n_bufs].ptr = dst->data;
bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[n_bufs].size = ggml_nbytes(dst);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
++n_bufs;
// Primary DSP session from the src0 tensor // Primary DSP session from the src0 tensor
ggml_hexagon_session * sess = src0_buf->sess; auto * sess = get_session_from_tensor(src0);
if (opt_verbose) { if (opt_verbose) {
char dims[64 * GGML_MAX_SRC]; hex_print_op_info(op, sess, req.flags);
char strides[64 * GGML_MAX_SRC];
char types[16 * GGML_MAX_SRC];
char buffs[64 * GGML_MAX_SRC];
char names[64 * GGML_MAX_SRC];
hex_format_op_dims(dims, op);
hex_format_op_strides(strides, op);
hex_format_op_types(types, op);
hex_format_op_buffs(buffs, op);
hex_format_op_names(names, op);
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
names, dims, types, strides, buffs, req.flags);
if (opt_verbose > 1) { if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]); hex_dump_dspbuf(src0, &bufs[0]);
if (src1) { if (src1) {
@ -3023,85 +2803,40 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
} }
dspqueue_buffer bufs[4]; dspqueue_buffer bufs[4];
int n_bufs = 0;
memset(bufs, 0, sizeof(bufs));
// First buffer // First buffer
// This is a buffer that the CPU writes and the DSP reads, so we'll // This is a buffer that the CPU writes and the DSP reads, so we'll
// need to flush CPU caches and invalidate DSP ones. On platforms // need to flush CPU caches and invalidate DSP ones. On platforms
// with I/O coherency support the framework will automatically skip // with I/O coherency support the framework will automatically skip
// cache operations where possible. // cache operations where possible.
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context); size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
bufs[n_bufs].fd = src0_buf->fd;
bufs[n_bufs].ptr = src0->data;
bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[n_bufs].size = ggml_nbytes(src0);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
++n_bufs;
// Second buffer // Second buffer
// This is a buffer that the CPU writes and the DSP reads, so we'll // This is a buffer that the CPU writes and the DSP reads, so we'll
// need to flush CPU caches and invalidate DSP ones. On platforms // need to flush CPU caches and invalidate DSP ones. On platforms
// with I/O coherency support the framework will automatically skip // with I/O coherency support the framework will automatically skip
// cache operations where possible. // cache operations where possible.
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context); n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
bufs[n_bufs].fd = src1_buf->fd;
bufs[n_bufs].ptr = src1->data;
bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[n_bufs].size = ggml_nbytes(src1);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
++n_bufs;
if (src2) { // Third buffer(nullable)
// Third buffer // This is a buffer that the CPU writes and the DSP reads, so we'll
// This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms
// need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip
// with I/O coherency support the framework will automatically skip // cache operations where possible.
// cache operations where possible. n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
bufs[n_bufs].fd = src2_buf->fd;
bufs[n_bufs].ptr = src2->data;
bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
bufs[n_bufs].size = ggml_nbytes(src2);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
++n_bufs;
}
// Final buffer = Output Activations. We'll handle DSP // Final buffer = Output Activations. We'll handle DSP
// Second buffer = Output Activations. We'll handle DSP // Second buffer = Output Activations. We'll handle DSP
// cache maintenance in the response message but need to flush // cache maintenance in the response message but need to flush
// CPU caches to ensure any previously written dirty lines are // CPU caches to ensure any previously written dirty lines are
// written out before writes from the DSP start. // written out before writes from the DSP start.
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context); n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
bufs[n_bufs].fd = dst_buf->fd;
bufs[n_bufs].ptr = dst->data;
bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[n_bufs].size = ggml_nbytes(dst);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
++n_bufs;
// Primary DSP session from the src0 tensor // Primary DSP session from the src0 tensor
ggml_hexagon_session * sess = src0_buf->sess; auto * sess = get_session_from_tensor(src0);
if (opt_verbose) { if (opt_verbose) {
char dims[64 * GGML_MAX_SRC]; hex_print_op_info(op, sess, req.flags);
char strides[64 * GGML_MAX_SRC];
char types[16 * GGML_MAX_SRC];
char buffs[64 * GGML_MAX_SRC];
char names[64 * GGML_MAX_SRC];
hex_format_op_dims(dims, op);
hex_format_op_strides(strides, op);
hex_format_op_types(types, op);
hex_format_op_buffs(buffs, op);
hex_format_op_names(names, op);
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
names, dims, types, strides, buffs, req.flags);
if (opt_verbose > 1) { if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]); hex_dump_dspbuf(src0, &bufs[0]);
if (src1) { if (src1) {

View File

@ -390,6 +390,12 @@ int get_hex_arch_ver(int domain, int * arch) {
} }
switch (arch_ver.capability & 0xff) { switch (arch_ver.capability & 0xff) {
case 0x68:
*arch = 68;
return 0;
case 0x69:
*arch = 69;
return 0;
case 0x73: case 0x73:
*arch = 73; *arch = 73;
return 0; return 0;

View File

@ -66,6 +66,13 @@ static inline bool dma_queue_push(dma_queue * q,
desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1; desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1;
desc->dstbypass = 1; desc->dstbypass = 1;
desc->srcbypass = 1; desc->srcbypass = 1;
#if __HVX_ARCH__ >= 73
desc->dstbypass = 1;
desc->srcbypass = 1;
#else
desc->dstbypass = 0;
desc->srcbypass = 1;
#endif
desc->order = 0; desc->order = 0;
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE; desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
desc->src = (void *) src; desc->src = (void *) src;

View File

@ -16,13 +16,8 @@
#include "hvx-utils.h" #include "hvx-utils.h"
#include "ops-utils.h" #include "ops-utils.h"
static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) { static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
static const float kInf = INFINITY; const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
static const float kMaxExp = 88.02f; // log(INF)
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
HVX_Vector out = hvx_vec_exp_fp32(in_vec); HVX_Vector out = hvx_vec_exp_fp32(in_vec);
@ -47,6 +42,12 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
HVX_Vector vec_out = Q6_V_vzero(); HVX_Vector vec_out = Q6_V_vzero();
static const float kInf = INFINITY;
static const float kMaxExp = 88.02f; // log(INF)
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
if (0 == unaligned_loop) { if (0 == unaligned_loop) {
HVX_Vector * p_vec_in1 = (HVX_Vector *) src; HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
HVX_Vector * p_vec_out = (HVX_Vector *) dst; HVX_Vector * p_vec_out = (HVX_Vector *) dst;
@ -55,9 +56,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
if (true == negate) { if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
*p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in); *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
} else { } else {
*p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++); *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf);
} }
} }
} else { } else {
@ -67,9 +68,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
if (true == negate) { if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in); *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
} else { } else {
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in); *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf);
} }
} }
} }
@ -83,9 +84,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
if (true == negate) { if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
vec_out = hvx_vec_exp_fp32_guard(neg_vec_in); vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
} else { } else {
vec_out = hvx_vec_exp_fp32_guard(in); vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf);
} }
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
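A scalar analogue of the reworked guard, with the thresholds now hoisted out of the per-vector loop as in the hunk above (function name is illustrative, requires <cmath>):

static inline float exp_f32_guard_ref(float x, float max_exp, float inf) {
    // inputs above kMaxExp would overflow expf, so map them straight to +inf
    return (x > max_exp) ? inf : expf(x);
}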

View File

@ -16,6 +16,15 @@
#include "hvx-utils.h" #include "hvx-utils.h"
#include "ops-utils.h" #include "ops-utils.h"
static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask);
const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);
return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
}
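In scalar terms the new guard behaves roughly like the sketch below (illustrative, requires <cstring>): take the reciprocal, and if its exponent field is all ones, meaning Inf or NaN, return 0 instead.

static inline float inverse_f32_guard_ref(float x) {
    float r = 1.0f / x;
    uint32_t bits;
    memcpy(&bits, &r, sizeof(bits));
    return ((bits & 0x7f800000u) == 0x7f800000u) ? 0.0f : r;
}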
void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
int left_over = num_elems & (VLEN_FP32 - 1); int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over; int num_elems_whole = num_elems - left_over;
@ -32,19 +41,22 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n"); FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
} }
static const uint32_t kNanInfMask = 0x7f800000;
const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);
if (0 == unaligned_loop) { if (0 == unaligned_loop) {
HVX_Vector * p_vec_in = (HVX_Vector *) src; HVX_Vector * p_vec_in = (HVX_Vector *) src;
HVX_Vector * p_vec_out = (HVX_Vector *) dst; HVX_Vector * p_vec_out = (HVX_Vector *) dst;
#pragma unroll(4) #pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
*p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++); *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask);
} }
} else { } else {
#pragma unroll(4) #pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in); *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
} }
} }
@ -53,7 +65,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
float * dstf = (float *) dst + num_elems_whole; float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf; HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = hvx_vec_inverse_fp32_guard(in); HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
} }

View File

@ -21,6 +21,26 @@ typedef union {
float fp32[VLEN_FP32]; float fp32[VLEN_FP32];
} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias; } __attribute__((aligned(VLEN), packed)) HVX_VectorAlias;
/* Q6_Vsf_equals_Vw is only available on v73+.*/
#if __HVX_ARCH__ < 73
static inline HVX_Vector int32_to_qfloat(HVX_Vector const in)
{
HVX_Vector const vzero = Q6_V_vzero();
HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
return ret;
}
static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
{
return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in));
}
#endif
static inline HVX_Vector hvx_vec_splat_fp32(float i) { static inline HVX_Vector hvx_vec_splat_fp32(float i) {
union { union {
float f; float f;
@ -726,24 +746,6 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
return Q6_Vsf_equals_Vqf32(r_qf); return Q6_Vsf_equals_Vqf32(r_qf);
} }
static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
static const float kInf = INFINITY;
static const uint32_t kNanMask = 0x7fffffff;
static const uint32_t kNanMin = 0x7f800000;
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask);
const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin);
HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask);
const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out);
return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
}
#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 #define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777
#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267 #define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267
@ -958,14 +960,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
return Q6_Vsf_equals_Vqf32(temp); return Q6_Vsf_equals_Vqf32(temp);
} }
static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) { static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v,
static const float kMaxExp = -88.02f; // log(INF) HVX_Vector one,
HVX_Vector max_exp,
const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); HVX_Vector min_exp) {
const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);
HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero()); out = Q6_V_vmux_QVV(pred_max, out, one);
return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
} }
static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
@ -977,9 +981,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
const HVX_Vector * restrict v_src = (HVX_Vector *) src; const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst; HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
static const float kMinExp = -87.f; // below this, sigmoid saturates to 0
static const float kMaxExp = 87.f;  // above this, sigmoid saturates to 1
const HVX_Vector one = hvx_vec_splat_fp32(1.f);
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
#pragma unroll(4) #pragma unroll(4)
for (int i = 0; i < step_of_1; i++) { for (int i = 0; i < step_of_1; i++) {
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]); v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
} }
} }
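Up to the accuracy of the fast approximation, the clamped guard above behaves like the following scalar sketch, using the same +/-87 thresholds (illustrative only, requires <cmath>):

static inline float fast_sigmoid_guard_ref(float v) {
    if (v >  87.f) return 1.0f; // saturated high
    if (v < -87.f) return 0.0f; // saturated low
    return 1.0f / (1.0f + expf(-v));
}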

View File

@ -143,16 +143,25 @@ AEEResult htp_iface_disable_etm(remote_handle64 handle) {
} }
static int vtcm_acquire(struct htp_context * ctx) { static int vtcm_acquire(struct htp_context * ctx) {
int err;
if (!ctx->vtcm_valid) { if (!ctx->vtcm_valid) {
// Temporarily bump thread priority to make sure it's higher than other sessions. // Temporarily bump thread priority to make sure it's higher than other sessions.
// This way the resource manager will notify the other thread to release VTCM. // This way the resource manager will notify the other thread to release VTCM.
// Note that we need to reacquire VTCM at normal priority for this to work next time.
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10); qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
if (err != 0) {
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
abort();
}
HAP_compute_res_release_cached(ctx->vtcm_rctx); HAP_compute_res_release_cached(ctx->vtcm_rctx);
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio); qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);
HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
if (err != 0) {
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
abort();
}
ctx->vtcm_valid = true; ctx->vtcm_valid = true;
} }
@ -201,7 +210,7 @@ static int vtcm_alloc(struct htp_context * ctx) {
HAP_compute_res_attr_init(&attr); HAP_compute_res_attr_init(&attr);
HAP_compute_res_attr_set_serialize(&attr, 0); HAP_compute_res_attr_set_serialize(&attr, 0);
HAP_compute_res_attr_set_cache_mode(&attr, 1); HAP_compute_res_attr_set_cache_mode(&attr, 1);
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size); HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx); HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
HAP_compute_res_attr_set_hmx_param(&attr, 1); HAP_compute_res_attr_set_hmx_param(&attr, 1);

View File

@ -24,6 +24,10 @@
#include "hvx-utils.h" #include "hvx-utils.h"
#include "ops-utils.h" #include "ops-utils.h"
// Redefine the GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX values here because we can't include ggml.h
#define HTP_ROPE_TYPE_NORMAL 0
#define HTP_ROPE_TYPE_NEOX 2
#define htp_rope_preamble \ #define htp_rope_preamble \
const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \ const uint32_t ne01 = src0->ne[1]; \
@ -146,6 +150,57 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor); rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor);
} }
static void hvx_calc_rope_neox_f32(const float * restrict src0,
float * restrict dst,
const int num_elems,
const float * restrict theta_cache) {
// for (int i = 0; i < num_elems; i += 2) {
//     const float cos_theta = theta_cache[i + 0];
//     const float sin_theta = theta_cache[i + 1];
//     const float x0 = src[0];
//     const float x1 = src[num_elems/2];
//     dst[0]           = x0*cos_theta - x1*sin_theta;
//     dst[num_elems/2] = x0*sin_theta + x1*cos_theta;
//     src += 1;
//     dst += 1;
// }
const uint8_t * restrict src0_curr = (const uint8_t *) src0;
const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache;
uint8_t * restrict dst_curr = (uint8_t *) dst;
int step_of_1 = num_elems >> 6; // >> 6: each iteration rotates two HVX vectors (2 * VLEN_FP32 = 64 fp32 elements)
int half_size = (sizeof(float) * (num_elems / 2));
for (int i = 0; i < step_of_1; i++) {
HVX_Vector v0 = *(HVX_Vector *) src0_curr;
HVX_Vector v1 = *(HVX_Vector *) (src0_curr + half_size);
HVX_Vector v2 = *(HVX_Vector *) theta_curr;
HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN);
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_lo_W(vcos_sin));
HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_hi_W(vcos_sin));
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
*(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);
src0_curr += VLEN;
theta_curr += 2 * VLEN;
dst_curr += VLEN;
}
}
static void hvx_calc_rope_f32(const float * restrict src0, static void hvx_calc_rope_f32(const float * restrict src0,
float * restrict dst, float * restrict dst,
const int num_elems, const int num_elems,
@ -212,6 +267,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
const struct htp_tensor * src2 = &octx->src2; const struct htp_tensor * src2 = &octx->src2;
struct htp_tensor * dst = &octx->dst; struct htp_tensor * dst = &octx->dst;
const int32_t mode = rope_ctx->mode;
const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
htp_rope_preamble; htp_rope_preamble;
const int32_t * pos = (const int32_t *) src1->data; const int32_t * pos = (const int32_t *) src1->data;
@ -247,20 +305,35 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
float * dst_data_loc = dst_data; float * dst_data_loc = dst_data;
if (1 == opt_path) { if (1 == opt_path) {
hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0); if (is_neox) {
hvx_calc_rope_neox_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
} else {
hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
}
} else { } else {
for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) { for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
const float cos_theta = wp0[i0 + 0]; const float cos_theta = wp0[i0 + 0];
const float sin_theta = wp0[i0 + 1]; const float sin_theta = wp0[i0 + 1];
const float x0 = src_loc[0]; if (is_neox) {
const float x1 = src_loc[1]; const float x0 = src_loc[0];
const float x1 = src_loc[rope_ctx->n_dims/2];
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta; dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;
src_loc += 2; src_loc += 1;
dst_data_loc += 2; dst_data_loc += 1;
} else {
const float x0 = src_loc[0];
const float x1 = src_loc[1];
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta;
src_loc += 2;
dst_data_loc += 2;
}
} }
} }
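The NORMAL and NEOX branches above differ only in which elements form a rotation pair; a scalar sketch of both layouts (not the HVX path, names are illustrative):

static void rope_rotate_ref(const float * src, float * dst, int n_dims,
                            const float * theta_cache, bool is_neox) {
    const int half = n_dims / 2;
    for (int i = 0; i < half; ++i) {
        const float cos_theta = theta_cache[2 * i + 0];
        const float sin_theta = theta_cache[2 * i + 1];
        const int i0 = is_neox ? i        : 2 * i;     // first element of the pair
        const int i1 = is_neox ? i + half : 2 * i + 1; // its rotation partner
        const float x0 = src[i0];
        const float x1 = src[i1];
        dst[i0] = x0 * cos_theta - x1 * sin_theta;
        dst[i1] = x0 * sin_theta + x1 * cos_theta;
    }
}

With n_dims = 8, NEOX rotates the pairs (0,4), (1,5), (2,6), (3,7) while NORMAL rotates (0,1), (2,3), (4,5), (6,7), which is exactly the difference between the two scalar branches above.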

View File

@ -11381,13 +11381,13 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
} }
} }
static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool almost_ready); static void ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool almost_ready);
// Returns true if node has enqueued work into the queue, false otherwise
// If submit is true, all operations queued so far are submitted to Vulkan to overlap cmdlist creation and GPU execution.
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit){ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit){
ggml_tensor * node = cgraph->nodes[node_idx]; ggml_tensor * node = cgraph->nodes[node_idx];
if (ggml_is_empty(node) || !node->buffer) { if (ggml_is_empty(node) || ggml_op_is_empty(node->op) || !node->buffer) {
return false; return false;
} }
@ -11399,132 +11399,19 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
ggml_tensor * src2 = node->src[2]; ggml_tensor * src2 = node->src[2];
ggml_tensor * src3 = node->src[3]; ggml_tensor * src3 = node->src[3];
switch (node->op) { if (node->op == GGML_OP_ADD) {
// Return on empty ops to avoid generating a compute_ctx and setting exit_tensor int next_node_idx = node_idx + 1 + ctx->num_additional_fused_ops;
case GGML_OP_RESHAPE: if (next_node_idx < cgraph->n_nodes &&
case GGML_OP_VIEW: cgraph->nodes[next_node_idx]->op == GGML_OP_RMS_NORM &&
case GGML_OP_PERMUTE: cgraph->nodes[next_node_idx]->src[0] == cgraph->nodes[next_node_idx - 1] &&
case GGML_OP_TRANSPOSE: ggml_nrows(cgraph->nodes[next_node_idx]) == 1 &&
case GGML_OP_NONE: ctx->device->add_rms_fusion) {
return false; uint32_t size = ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]);
case GGML_OP_UNARY: ctx->do_add_rms_partials_offset_calculation = true;
switch (ggml_get_unary_op(node)) { if (ctx->prealloc_size_add_rms_partials_offset + size <= ctx->prealloc_size_add_rms_partials) {
case GGML_UNARY_OP_EXP: ctx->do_add_rms_partials = true;
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_ABS:
case GGML_UNARY_OP_SOFTPLUS:
case GGML_UNARY_OP_STEP:
case GGML_UNARY_OP_ROUND:
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_TRUNC:
break;
default:
return false;
}
break;
case GGML_OP_GLU:
switch (ggml_get_glu_op(node)) {
case GGML_GLU_OP_GEGLU:
case GGML_GLU_OP_REGLU:
case GGML_GLU_OP_SWIGLU:
case GGML_GLU_OP_SWIGLU_OAI:
case GGML_GLU_OP_GEGLU_ERF:
case GGML_GLU_OP_GEGLU_QUICK:
break;
default:
return false;
}
break;
case GGML_OP_ADD:
{
int next_node_idx = node_idx + 1 + ctx->num_additional_fused_ops;
if (next_node_idx < cgraph->n_nodes &&
cgraph->nodes[next_node_idx]->op == GGML_OP_RMS_NORM &&
cgraph->nodes[next_node_idx]->src[0] == cgraph->nodes[next_node_idx - 1] &&
ggml_nrows(cgraph->nodes[next_node_idx]) == 1 &&
ctx->device->add_rms_fusion) {
uint32_t size = ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]);
ctx->do_add_rms_partials_offset_calculation = true;
if (ctx->prealloc_size_add_rms_partials_offset + size <= ctx->prealloc_size_add_rms_partials) {
ctx->do_add_rms_partials = true;
}
} }
} break; }
case GGML_OP_REPEAT:
case GGML_OP_REPEAT_BACK:
case GGML_OP_GET_ROWS:
case GGML_OP_ADD_ID:
case GGML_OP_ACC:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_ADD1:
case GGML_OP_ARANGE:
case GGML_OP_FILL:
case GGML_OP_CONCAT:
case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_LOG:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_ROLL:
case GGML_OP_CPY:
case GGML_OP_SET_ROWS:
case GGML_OP_CONT:
case GGML_OP_DUP:
case GGML_OP_SILU_BACK:
case GGML_OP_NORM:
case GGML_OP_GROUP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
case GGML_OP_L2_NORM:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_SOFT_MAX_BACK:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
case GGML_OP_ARGSORT:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_ARGMAX:
case GGML_OP_COUNT_EQUAL:
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_3D:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_POOL_2D:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_TRANSPOSE_2D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_RWKV_WKV6:
case GGML_OP_RWKV_WKV7:
case GGML_OP_SSM_SCAN:
case GGML_OP_SSM_CONV:
case GGML_OP_LEAKY_RELU:
case GGML_OP_FLASH_ATTN_EXT:
case GGML_OP_OPT_STEP_ADAMW:
case GGML_OP_OPT_STEP_SGD:
break;
default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
GGML_ABORT("fatal error");
} }
vk_context compute_ctx; vk_context compute_ctx;
@ -11961,145 +11848,14 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
ctx->compute_ctx.reset(); ctx->compute_ctx.reset();
bool ok = ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, almost_ready); ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, almost_ready);
if (!ok) {
if (node->op == GGML_OP_UNARY) {
std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
} else if (node->op == GGML_OP_GLU) {
std::cerr << __func__ << ": error: op not supported GLU " << node->name << " (" << ggml_glu_op_name(static_cast<ggml_glu_op>(node->op_params[0])) << ")" << std::endl;
} else {
std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
}
}
} }
return true; return true;
} }
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool almost_ready = false) { static void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool almost_ready = false) {
GGML_UNUSED(cgraph); GGML_UNUSED(cgraph);
ggml_backend_buffer * buf = nullptr; GGML_UNUSED(tensor);
switch (tensor->op) {
case GGML_OP_ADD:
case GGML_OP_ACC:
case GGML_OP_GET_ROWS:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_ADD1:
case GGML_OP_ARANGE:
case GGML_OP_FILL:
case GGML_OP_ADD_ID:
case GGML_OP_CONCAT:
case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_LOG:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_ROLL:
case GGML_OP_CPY:
case GGML_OP_SET_ROWS:
case GGML_OP_CONT:
case GGML_OP_DUP:
case GGML_OP_SILU_BACK:
case GGML_OP_NORM:
case GGML_OP_GROUP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
case GGML_OP_L2_NORM:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_SOFT_MAX_BACK:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NONE:
case GGML_OP_ARGSORT:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_ARGMAX:
case GGML_OP_COUNT_EQUAL:
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_3D:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_POOL_2D:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_TRANSPOSE_2D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_RWKV_WKV6:
case GGML_OP_RWKV_WKV7:
case GGML_OP_SSM_SCAN:
case GGML_OP_SSM_CONV:
case GGML_OP_LEAKY_RELU:
case GGML_OP_REPEAT:
case GGML_OP_REPEAT_BACK:
case GGML_OP_OPT_STEP_ADAMW:
case GGML_OP_OPT_STEP_SGD:
buf = tensor->buffer;
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(tensor)) {
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_ABS:
case GGML_UNARY_OP_SOFTPLUS:
case GGML_UNARY_OP_STEP:
case GGML_UNARY_OP_ROUND:
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_TRUNC:
buf = tensor->buffer;
break;
default:
return false;
}
break;
case GGML_OP_GLU:
switch (ggml_get_glu_op(tensor)) {
case GGML_GLU_OP_GEGLU:
case GGML_GLU_OP_REGLU:
case GGML_GLU_OP_SWIGLU:
case GGML_GLU_OP_SWIGLU_OAI:
case GGML_GLU_OP_GEGLU_ERF:
case GGML_GLU_OP_GEGLU_QUICK:
buf = tensor->buffer;
break;
default:
return false;
}
break;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
case GGML_OP_FLASH_ATTN_EXT:
buf = tensor->buffer;
break;
default:
return false;
}
if (buf == nullptr) {
return false;
}
VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
@ -12143,8 +11899,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
subctx->out_memcpys.clear(); subctx->out_memcpys.clear();
subctx->memsets.clear(); subctx->memsets.clear();
} }
return true;
} }
// Clean up after graph processing is done // Clean up after graph processing is done

View File

@ -427,6 +427,7 @@ class MODEL_ARCH(IntEnum):
APERTUS = auto() APERTUS = auto()
COGVLM = auto() COGVLM = auto()
MINIMAXM2 = auto() MINIMAXM2 = auto()
RND1 = auto()
PANGU_EMBED = auto() PANGU_EMBED = auto()
@ -797,6 +798,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.APERTUS: "apertus", MODEL_ARCH.APERTUS: "apertus",
MODEL_ARCH.MINIMAXM2: "minimax-m2", MODEL_ARCH.MINIMAXM2: "minimax-m2",
MODEL_ARCH.COGVLM: "cogvlm", MODEL_ARCH.COGVLM: "cogvlm",
MODEL_ARCH.RND1: "rnd1",
MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
} }
@ -2991,6 +2993,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.VISEXP_UP, MODEL_TENSOR.VISEXP_UP,
MODEL_TENSOR.VISEXP_DOWN, MODEL_TENSOR.VISEXP_DOWN,
], ],
MODEL_ARCH.RND1: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.PANGU_EMBED: [ MODEL_ARCH.PANGU_EMBED: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,

View File

@ -1 +1 @@
781baf2a14d9e0aaee542b2e1bb918bfc4132199 55bc9320a4aae82af18e23eefd5de319a755d7b9

View File

@ -115,6 +115,7 @@ add_library(llama
models/qwen3vl-moe.cpp models/qwen3vl-moe.cpp
models/qwen3moe.cpp models/qwen3moe.cpp
models/refact.cpp models/refact.cpp
models/rnd1.cpp
models/rwkv6-base.cpp models/rwkv6-base.cpp
models/rwkv6.cpp models/rwkv6.cpp
models/rwkv6qwen2.cpp models/rwkv6qwen2.cpp

View File

@ -108,6 +108,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_APERTUS, "apertus" }, { LLM_ARCH_APERTUS, "apertus" },
{ LLM_ARCH_MINIMAX_M2, "minimax-m2" }, { LLM_ARCH_MINIMAX_M2, "minimax-m2" },
{ LLM_ARCH_COGVLM, "cogvlm" }, { LLM_ARCH_COGVLM, "cogvlm" },
{ LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
@ -2446,6 +2447,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" }, { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
}, },
}, },
{
LLM_ARCH_RND1,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{ {
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
{ {
@ -2722,6 +2743,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
case LLM_ARCH_DREAM: case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA: case LLM_ARCH_LLADA:
case LLM_ARCH_LLADA_MOE: case LLM_ARCH_LLADA_MOE:
case LLM_ARCH_RND1:
return true; return true;
default: default:
return false; return false;

View File

@ -112,6 +112,7 @@ enum llm_arch {
LLM_ARCH_APERTUS, LLM_ARCH_APERTUS,
LLM_ARCH_MINIMAX_M2, LLM_ARCH_MINIMAX_M2,
LLM_ARCH_COGVLM, LLM_ARCH_COGVLM,
LLM_ARCH_RND1,
LLM_ARCH_PANGU_EMBED, LLM_ARCH_PANGU_EMBED,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };

View File

@ -1036,6 +1036,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_RND1:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 48: type = LLM_TYPE_30B_A3B; break;
default: type = LLM_TYPE_UNKNOWN;
}
// Set non-causal attention for diffusion models
hparams.causal_attn = false;
} break;
case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN2MOE:
{ {
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@ -3402,6 +3414,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} break; } break;
case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3MOE:
case LLM_ARCH_QWEN3VLMOE: case LLM_ARCH_QWEN3VLMOE:
case LLM_ARCH_RND1:
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -6720,7 +6733,7 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
} }
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE) { if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
} }
@ -6882,6 +6895,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
case LLM_ARCH_DREAM: case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA: case LLM_ARCH_LLADA:
case LLM_ARCH_LLADA_MOE: case LLM_ARCH_LLADA_MOE:
case LLM_ARCH_RND1:
{ {
res = nullptr; res = nullptr;
} break; } break;
@ -7075,6 +7089,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
llm = std::make_unique<llm_build_llada_moe>(*this, params); llm = std::make_unique<llm_build_llada_moe>(*this, params);
} }
break; break;
case LLM_ARCH_RND1:
{
llm = std::make_unique<llm_build_rnd1>(*this, params);
}
break;
case LLM_ARCH_QWEN2VL: case LLM_ARCH_QWEN2VL:
{ {
llm = std::make_unique<llm_build_qwen2vl>(*this, params); llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@ -7595,6 +7614,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_QWEN3: case LLM_ARCH_QWEN3:
case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3MOE:
case LLM_ARCH_LLADA_MOE: case LLM_ARCH_LLADA_MOE:
case LLM_ARCH_RND1:
case LLM_ARCH_OLMO2: case LLM_ARCH_OLMO2:
case LLM_ARCH_OLMOE: case LLM_ARCH_OLMOE:
case LLM_ARCH_PHI2: case LLM_ARCH_PHI2:

View File

@ -431,6 +431,10 @@ struct llm_build_refact : public llm_graph_context {
llm_build_refact(const llama_model & model, const llm_graph_params & params); llm_build_refact(const llama_model & model, const llm_graph_params & params);
}; };
struct llm_build_rnd1 : public llm_graph_context {
llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_rwkv6 : public llm_build_rwkv6_base { struct llm_build_rwkv6 : public llm_build_rwkv6_base {
llm_build_rwkv6(const llama_model & model, const llm_graph_params & params); llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
}; };

126
src/models/rnd1.cpp Normal file
View File

@ -0,0 +1,126 @@
#include "models.h"
// RND1 is a Qwen3Moe AR model converted to a diffusion model.
llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
// Non-causal attention for diffusion
auto * inp_attn = build_attn_inp_no_cache();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self_attention
{
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// MoE branch
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
ggml_tensor * moe_out =
build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
cur = moe_out;
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@ -6953,9 +6953,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16 }) { for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
for (bool use_view_slice : { true, false }) { for (bool use_view_slice : { true, false }) {
for (std::array<int64_t, 4> ne : std::initializer_list<std::array<int64_t, 4>>{ {2, 1, 1, 1}, {2, 1, 3, 5}, for (std::array<int64_t, 4> ne : std::initializer_list<std::array<int64_t, 4>>{ {2, 1, 1, 1}, {2, 1, 3, 5},
{2, 3, 5, 7}, {1, 4, 4, 1}, {1, 8, 17, 1}, {10, 10, 10, 1} }) { {2, 3, 5, 7}, {1, 4, 4, 1}, {1, 8, 17, 1}, {10, 10, 10, 1} }) {

View File

@ -13,11 +13,16 @@ endif()
set(TARGET_SRCS set(TARGET_SRCS
server.cpp server.cpp
utils.hpp
server-http.cpp server-http.cpp
server-http.h server-http.h
server-models.cpp server-models.cpp
server-models.h server-models.h
server-task.cpp
server-task.h
server-queue.cpp
server-queue.h
server-common.cpp
server-common.h
) )
set(PUBLIC_ASSETS set(PUBLIC_ASSETS
index.html.gz index.html.gz

View File

@ -197,6 +197,7 @@ The project is under active development, and we are [looking for feedback and co
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) | | `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) | | `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)<br/>(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) |
| `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) | | `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) |
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) | | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
@ -1495,8 +1496,8 @@ The `status` object can be:
Load a model Load a model
Payload: Payload:
- `model`: name of the model to be loaded - `model`: name of the model to be loaded.
- `extra_args`: (optional) an array of additional arguments to be passed to the model instance - `extra_args`: (optional) an array of additional arguments to be passed to the model instance. Note: you must start the server with `--models-allow-extra-args` to enable this feature.
```json ```json
{ {

Binary file not shown.

File diff suppressed because it is too large

View File

@ -0,0 +1,349 @@
#pragma once
#include "common.h"
#include "log.h"
#include "llama.h"
#include "chat.h"
#include "mtmd.h"
#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>
#include <string>
#include <vector>
#include <cinttypes>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
using json = nlohmann::ordered_json;
#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
using raw_buffer = std::vector<uint8_t>;
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
// fall back to the default value if the key is missing or null
if (body.contains(key) && !body.at(key).is_null()) {
try {
return body.at(key);
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
return default_value;
}
} else {
return default_value;
}
}
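As a quick illustration of `json_value` (a sketch only; the request body and parameter names below are made up, not taken from this diff), the helper returns the default both when the key is missing and when the stored value has the wrong type:

```cpp
// minimal sketch, assuming this runs inside a request handler that already
// parsed the body with nlohmann::json (aliased to `json` above)
json body = json::parse(R"({"n_predict": 128, "stream": "yes"})");

int  n_predict = json_value(body, "n_predict", 64); // present, well-typed -> 128
bool stream    = json_value(body, "stream", false); // wrong type -> warning + default (false)
int  top_k     = json_value(body, "top_k", 40);     // key missing -> default (40)
```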
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
ERROR_TYPE_INVALID_REQUEST,
ERROR_TYPE_AUTHENTICATION,
ERROR_TYPE_SERVER,
ERROR_TYPE_NOT_FOUND,
ERROR_TYPE_PERMISSION,
ERROR_TYPE_UNAVAILABLE, // custom error
ERROR_TYPE_NOT_SUPPORTED, // custom error
ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error
};
// thin wrapper around common_grammar_trigger with (de)serialization functions
struct server_grammar_trigger {
common_grammar_trigger value;
server_grammar_trigger() = default;
server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
server_grammar_trigger(const json & in) {
value.type = (common_grammar_trigger_type) in.at("type").get<int>();
value.value = in.at("value").get<std::string>();
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
value.token = (llama_token) in.at("token").get<int>();
}
}
json to_json() const {
json out {
{"type", (int) value.type},
{"value", value.value},
};
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
out["token"] = (int) value.token;
}
return out;
}
};
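A small round-trip sketch for the wrapper (field values are illustrative; only the token trigger type is exercised, since that is the only enum value referenced in this header):

```cpp
// sketch: serialize a token-type trigger and rebuild it from JSON
common_grammar_trigger trig;
trig.type  = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
trig.value = "</tool_call>"; // illustrative trigger text
trig.token = 42;             // illustrative token id

server_grammar_trigger wrapped(trig);
json j = wrapped.to_json();          // {"type": <int>, "value": "</tool_call>", "token": 42}
server_grammar_trigger restored(j);  // restored.value carries the same fields
```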
json format_error_response(const std::string & message, const enum error_type type);
//
// random string / id
//
std::string random_string();
std::string gen_chatcmplid();
std::string gen_tool_call_id();
//
// lora utils
//
// check whether the given lora set has only aloras activated (empty => false)
bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras);
// if the two sets of loras are different, they require a cache clear unless the
// change is only from aloras to aloras.
bool lora_should_clear_cache(
const std::vector<common_adapter_lora_info> & current,
const std::vector<common_adapter_lora_info> & next);
std::vector<common_adapter_lora_info> parse_lora_request(
const std::vector<common_adapter_lora_info> & lora_base,
const json & data);
bool are_lora_equal(
const std::vector<common_adapter_lora_info> & l1,
const std::vector<common_adapter_lora_info> & l2);
// get the ids of all enabled loras
std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras);
//
// server_tokens
//
/**
* server_tokens is a helper to manage the input tokens and image for the server.
* it is made this way to simplify the logic of KV cache management.
*/
struct server_tokens {
bool has_mtmd = false;
private: // disallow direct access to these members to avoid them going out of sync
// map a **start** index in tokens to the image chunk
// note: the order needs to stay in sync with tokens
std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media;
// list of tokens
// if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by a media chunk
// otherwise, it is a normal text token
// note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
// note(2): for M-RoPE, an image can occupy a different number of pos; do not assume a 1-to-1 mapping tokens <-> pos
llama_tokens tokens;
// for example, with an input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1]
// idx 0 1 2 3 4 5 6 7 8 9 10
// pos 0 1 2 3 4 5 5 5 7 7 7
// map_idx_to_media will contain: {5, img0}, {8, img1}
public:
server_tokens() = default;
~server_tokens() = default;
// Prevent copying
// TODO: server_tokens should be copyable - remove this:
server_tokens(const server_tokens&) = delete;
server_tokens& operator=(const server_tokens&) = delete;
// Allow moving (usually implicitly generated if members are movable)
server_tokens(server_tokens&&) = default;
server_tokens& operator=(server_tokens&&) = default;
// Allow accessing elements using [] operator
llama_token operator[](size_t index) { return tokens[index]; }
const llama_token& operator[](size_t index) const { return tokens[index]; }
server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd);
server_tokens(const llama_tokens & tokens, bool has_mtmd);
// for debugging
std::string str() const;
llama_pos pos_next() const;
const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
void push_back(llama_token tok);
// will create a copy of the chunk if it contains non-text data
void push_back(const mtmd_input_chunk * chunk);
// appends server tokens, updates the media map. copies media chunks.
void push_back(server_tokens & tokens);
// for compatibility with context shift and prompt truncation
void insert(const llama_tokens & inp_tokens);
// for compatibility with speculative decoding, ctx shift, slot save/load
const llama_tokens & get_text_tokens() const;
// for compatibility with speculative decoding
void set_token(llama_pos pos, llama_token id);
size_t size() const { return tokens.size(); }
bool empty() const { return tokens.empty(); }
void clear() {
map_idx_to_media.clear();
tokens.clear();
}
void keep_first(size_t n);
std::string detokenize(const llama_context * ctx, bool special) const;
size_t get_common_prefix(const server_tokens & b) const;
// make sure all text tokens are within the vocab range
bool validate(const struct llama_context * ctx) const;
// encode and decode the image chunk
int32_t process_chunk(
llama_context * ctx,
mtmd_context * mctx,
size_t idx,
llama_pos pos,
int32_t seq_id,
size_t & n_tokens_out) const;
};
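To make the idx/pos bookkeeping above concrete, a minimal text-only sketch (no mtmd involved, so every token occupies exactly one position; the token ids are made up):

```cpp
// sketch: text-only prompt, idx and pos coincide
llama_tokens ids = {101, 102, 103};
server_tokens prompt(ids, /*has_mtmd=*/false);

// prompt.size() == 3 and prompt.pos_next() == 3 (tokens sit at pos 0, 1, 2)
prompt.push_back(104); // append one more text token -> size() == 4
```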
//
// tokenizer and input processing utils
//
bool json_is_array_of_numbers(const json & data);
// does the array contain BOTH numbers & strings?
bool json_is_array_of_mixed_numbers_strings(const json & data);
// does the array contain any individual integers/tokens?
bool json_is_array_and_contains_numbers(const json & data);
// get values by path (key1 / key2)
json json_get_nested_values(const std::vector<std::string> & paths, const json & js);
/**
* this handles 2 cases:
* - only string, example: "string"
* - mixed string and tokens, example: [12, 34, "string", 56, 78]
*/
llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special);
// return the length of the longest prefix of the text that is valid UTF-8
// if the last character is potentially cut in half, return the index before the cut
// if validate_utf8(text) == text.size(), then the whole text is valid utf8
size_t validate_utf8(const std::string& text);
// process mtmd prompt, return the server_tokens containing both text tokens and media chunks
server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);
/**
* break the input "prompt" object into multiple prompt if needed, then tokenize them
* this supports these cases:
* - "prompt": "string"
* - "prompt": [12, 34, 56]
* - "prompt": [12, 34, "string", 56, 78]
* - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
* and multiple prompts (multi-tasks):
* - "prompt": ["string1", "string2"]
* - "prompt": ["string1", [12, 34, 56]]
* - "prompt": [[12, 34, 56], [78, 90, 12]]
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}]
*/
std::vector<server_tokens> tokenize_input_prompts(
const llama_vocab * vocab,
mtmd_context * mctx,
const json & json_prompt,
bool add_special,
bool parse_special);
//
// OAI utils
//
// used by /completions endpoint
json oaicompat_completion_params_parse(const json & body);
struct oaicompat_parser_options {
bool use_jinja;
bool prefill_assistant;
common_reasoning_format reasoning_format;
std::map<std::string,std::string> chat_template_kwargs;
common_chat_templates * tmpls;
bool allow_image;
bool allow_audio;
bool enable_thinking = true;
};
// used by /chat/completions endpoint
json oaicompat_chat_params_parse(
json & body, /* openai api json semantics */
const oaicompat_parser_options & opt,
std::vector<raw_buffer> & out_files);
// TODO: move it to server-task.cpp
json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);
// TODO: move it to server-task.cpp
json format_response_rerank(
const json & request,
const json & ranks,
bool is_tei_format,
std::vector<std::string> & texts,
int top_n);
//
// other utils
//
std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);
std::string safe_json_to_str(const json & data);
std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens);
// format incomplete utf-8 multibyte character for output
std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token);
// format server-sent event (SSE), return the formatted string to send
// note: if data is a json array, it will be sent as multiple events, one per item
std::string format_sse(const json & data);
bool is_valid_utf8(const std::string & str);
//
// formatting output responses
// TODO: move these to server-task.cpp
//
llama_tokens format_prompt_infill(
const llama_vocab * vocab,
const json & input_prefix,
const json & input_suffix,
const json & input_extra,
const int n_batch,
const int n_predict,
const int n_ctx,
const bool spm_infill,
const llama_tokens & tokens_prompt);
// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
server_tokens format_prompt_rerank(
const struct llama_model * model,
const struct llama_vocab * vocab,
mtmd_context * mctx,
const std::string & query,
const std::string & doc);

View File

@ -1,6 +1,6 @@
#include "utils.hpp"
#include "common.h" #include "common.h"
#include "server-http.h" #include "server-http.h"
#include "server-common.h"
#include <cpp-httplib/httplib.h> #include <cpp-httplib/httplib.h>

View File

@ -1,4 +1,4 @@
#include "utils.hpp" #include "server-common.h"
#include "server-models.h" #include "server-models.h"
#include "download.h" #include "download.h"
@ -383,8 +383,10 @@ void server_models::load(const std::string & name, const std::vector<std::string
child_args.push_back(std::to_string(inst.meta.port)); child_args.push_back(std::to_string(inst.meta.port));
// append extra args // append extra args
for (const auto & arg : extra_args) { if (base_params.models_allow_extra_args) {
child_args.push_back(arg); for (const auto & arg : extra_args) {
child_args.push_back(arg);
}
} }
} }
@ -405,6 +407,8 @@ void server_models::load(const std::string & name, const std::vector<std::string
if (result != 0) { if (result != 0) {
throw std::runtime_error("failed to spawn server instance"); throw std::runtime_error("failed to spawn server instance");
} }
inst.stdin_file = subprocess_stdin(inst.subproc.get());
} }
// start a thread to manage the child process // start a thread to manage the child process
@ -455,13 +459,12 @@ void server_models::load(const std::string & name, const std::vector<std::string
cv.notify_all(); cv.notify_all();
} }
static void interrupt_subprocess(subprocess_s * proc) { static void interrupt_subprocess(FILE * stdin_file) {
// because subprocess.h does not provide a way to send SIGINT, // because subprocess.h does not provide a way to send SIGINT,
// we will send a command to the child process to exit gracefully // we will send a command to the child process to exit gracefully
FILE * p_stdin = subprocess_stdin(proc); if (stdin_file) {
if (p_stdin) { fprintf(stdin_file, "%s\n", CMD_EXIT);
fprintf(p_stdin, "%s\n", CMD_EXIT); fflush(stdin_file);
fflush(p_stdin);
} }
} }
@ -471,7 +474,7 @@ void server_models::unload(const std::string & name) {
if (it != mapping.end()) { if (it != mapping.end()) {
if (it->second.meta.is_active()) { if (it->second.meta.is_active()) {
SRV_INF("unloading model instance name=%s\n", name.c_str()); SRV_INF("unloading model instance name=%s\n", name.c_str());
interrupt_subprocess(it->second.subproc.get()); interrupt_subprocess(it->second.stdin_file);
// status change will be handled by the managing thread // status change will be handled by the managing thread
} else { } else {
SRV_WRN("model instance name=%s is not loaded\n", name.c_str()); SRV_WRN("model instance name=%s is not loaded\n", name.c_str());
@ -486,7 +489,7 @@ void server_models::unload_all() {
for (auto & [name, inst] : mapping) { for (auto & [name, inst] : mapping) {
if (inst.meta.is_active()) { if (inst.meta.is_active()) {
SRV_INF("unloading model instance name=%s\n", name.c_str()); SRV_INF("unloading model instance name=%s\n", name.c_str());
interrupt_subprocess(inst.subproc.get()); interrupt_subprocess(inst.stdin_file);
// status change will be handled by the managing thread // status change will be handled by the managing thread
} }
// moving the thread to join list to avoid deadlock // moving the thread to join list to avoid deadlock

View File

@ -74,6 +74,7 @@ private:
std::shared_ptr<subprocess_s> subproc; // shared between main thread and monitoring thread std::shared_ptr<subprocess_s> subproc; // shared between main thread and monitoring thread
std::thread th; std::thread th;
server_model_meta meta; server_model_meta meta;
FILE * stdin_file = nullptr;
}; };
std::mutex mutex; std::mutex mutex;

View File

@ -0,0 +1,268 @@
#include "server-task.h"
#include "server-queue.h"
#include "log.h"
#include <chrono>
#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define RES_INF(fmt, ...) LOG_INF("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define RES_WRN(fmt, ...) LOG_WRN("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define RES_ERR(fmt, ...) LOG_ERR("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define RES_DBG(fmt, ...) LOG_DBG("res %12.*s: " fmt, 12, __func__, __VA_ARGS__)
//
// server_queue
//
int server_queue::post(server_task && task, bool front) {
std::unique_lock<std::mutex> lock(mutex_tasks);
GGML_ASSERT(task.id != -1);
// if this is a cancel task, make sure to clean up pending tasks
if (task.type == SERVER_TASK_TYPE_CANCEL) {
cleanup_pending_task(task.id_target);
}
const int task_id = task.id;
QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
if (front) {
queue_tasks.push_front(std::move(task));
} else {
queue_tasks.push_back(std::move(task));
}
condition_tasks.notify_one();
return task_id;
}
int server_queue::post(std::vector<server_task> && tasks, bool front) {
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : tasks) {
if (task.id == -1) {
task.id = id++;
}
// if this is a cancel task, make sure to clean up pending tasks
if (task.type == SERVER_TASK_TYPE_CANCEL) {
cleanup_pending_task(task.id_target);
}
QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
if (front) {
queue_tasks.push_front(std::move(task));
} else {
queue_tasks.push_back(std::move(task));
}
}
condition_tasks.notify_one();
return 0;
}
void server_queue::defer(server_task && task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
QUE_DBG("defer task, id = %d\n", task.id);
queue_tasks_deferred.push_back(std::move(task));
condition_tasks.notify_one();
}
int server_queue::get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
int new_id = id++;
return new_id;
}
void server_queue::on_new_task(std::function<void(server_task &&)> callback) {
callback_new_task = std::move(callback);
}
void server_queue::on_update_slots(std::function<void(void)> callback) {
callback_update_slots = std::move(callback);
}
void server_queue::pop_deferred_task() {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (!queue_tasks_deferred.empty()) {
queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
queue_tasks_deferred.pop_front();
}
condition_tasks.notify_one();
}
void server_queue::terminate() {
std::unique_lock<std::mutex> lock(mutex_tasks);
running = false;
condition_tasks.notify_all();
}
void server_queue::start_loop() {
running = true;
while (true) {
QUE_DBG("%s", "processing new tasks\n");
while (true) {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (!running) {
QUE_DBG("%s", "terminate\n");
return;
}
if (queue_tasks.empty()) {
lock.unlock();
break;
}
server_task task = std::move(queue_tasks.front());
queue_tasks.pop_front();
lock.unlock();
QUE_DBG("processing task, id = %d\n", task.id);
callback_new_task(std::move(task));
}
// all tasks in the current loop are processed, slot data is now ready
QUE_DBG("%s", "update slots\n");
callback_update_slots();
QUE_DBG("%s", "waiting for new tasks\n");
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (!running) {
QUE_DBG("%s", "terminate\n");
return;
}
if (queue_tasks.empty()) {
condition_tasks.wait(lock, [&]{
return (!queue_tasks.empty() || !running);
});
}
}
}
}
void server_queue::cleanup_pending_task(int id_target) {
// no need to lock because this is called exclusively by post()
auto rm_func = [id_target](const server_task & task) {
return task.id == id_target;
};
queue_tasks.erase(
std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
queue_tasks.end());
queue_tasks_deferred.erase(
std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
queue_tasks_deferred.end());
}
//
// server_response
//
void server_response::add_waiting_task_id(int id_task) {
RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(id_task);
}
void server_response::add_waiting_tasks(const std::vector<server_task> & tasks) {
std::unique_lock<std::mutex> lock(mutex_results);
for (const auto & task : tasks) {
RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
waiting_task_ids.insert(task.id);
}
}
void server_response::remove_waiting_task_id(int id_task) {
RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(id_task);
// make sure to clean up all pending results
queue_results.erase(
std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
return res->id == id_task;
}),
queue_results.end());
}
void server_response::remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
std::unique_lock<std::mutex> lock(mutex_results);
for (const auto & id_task : id_tasks) {
RES_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
waiting_task_ids.erase(id_task);
}
}
server_task_result_ptr server_response::recv(const std::unordered_set<int> & id_tasks) {
while (true) {
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
if (!running) {
RES_DBG("%s : queue result stop\n", __func__);
std::terminate(); // we cannot return here since the caller is HTTP code
}
return !queue_results.empty();
});
for (size_t i = 0; i < queue_results.size(); i++) {
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
server_task_result_ptr res = std::move(queue_results[i]);
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
server_task_result_ptr server_response::recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
while (true) {
std::unique_lock<std::mutex> lock(mutex_results);
for (int i = 0; i < (int) queue_results.size(); i++) {
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
server_task_result_ptr res = std::move(queue_results[i]);
queue_results.erase(queue_results.begin() + i);
return res;
}
}
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
if (!running) {
RES_DBG("%s : queue result stop\n", __func__);
std::terminate(); // we cannot return here since the caller is HTTP code
}
if (cr_res == std::cv_status::timeout) {
return nullptr;
}
}
// should never reach here
}
server_task_result_ptr server_response::recv(int id_task) {
std::unordered_set<int> id_tasks = {id_task};
return recv(id_tasks);
}
void server_response::send(server_task_result_ptr && result) {
RES_DBG("sending result for task id = %d\n", result->id);
std::unique_lock<std::mutex> lock(mutex_results);
for (const auto & id_task : waiting_task_ids) {
if (result->id == id_task) {
RES_DBG("task id = %d pushed to result queue\n", result->id);
queue_results.emplace_back(std::move(result));
condition_results.notify_all();
return;
}
}
}
void server_response::terminate() {
running = false;
condition_results.notify_all();
}

110
tools/server/server-queue.h Normal file
View File

@ -0,0 +1,110 @@
#pragma once
#include "server-task.h"
#include <condition_variable>
#include <deque>
#include <mutex>
#include <unordered_set>
struct server_queue {
private:
int id = 0;
bool running;
// queues
std::deque<server_task> queue_tasks;
std::deque<server_task> queue_tasks_deferred;
std::mutex mutex_tasks;
std::condition_variable condition_tasks;
// callback functions
std::function<void(server_task &&)> callback_new_task;
std::function<void(void)> callback_update_slots;
public:
// Add a new task to the end of the queue
int post(server_task && task, bool front = false);
// multi-task version of post()
int post(std::vector<server_task> && tasks, bool front = false);
// Add a new task, but defer until one slot is available
void defer(server_task && task);
// Get the next id for creating a new task
int get_new_id();
// Register function to process a new task
void on_new_task(std::function<void(server_task &&)> callback);
// Register the function to be called when all slots data is ready to be processed
void on_update_slots(std::function<void(void)> callback);
// Call when the state of a slot changes; it will move one task from the deferred queue to the main queue
void pop_deferred_task();
// end the start_loop routine
void terminate();
/**
* Main loop consists of these steps:
* - Wait until a new task arrives
* - Process the task (i.e. maybe copy data into slot)
* - Check if multitask is finished
* - Update all slots
*/
void start_loop();
// for metrics
size_t queue_tasks_deferred_size() {
std::unique_lock<std::mutex> lock(mutex_tasks);
return queue_tasks_deferred.size();
}
private:
void cleanup_pending_task(int id_target);
};
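A rough wiring sketch for `server_queue` (the callback bodies are placeholders; in the real server they hook into slot management, and the loop runs on the main thread rather than a helper thread):

```cpp
#include "server-queue.h"
#include "server-task.h"

#include <thread>
#include <utility>

int main() {
    server_queue queue;

    queue.on_new_task([](server_task && task) {
        // in the real server: hand the task over to a slot
        (void) task;
    });
    queue.on_update_slots([]() {
        // in the real server: run one decode step over the active slots
    });

    // start_loop() blocks, so run it on a dedicated thread for this sketch
    std::thread loop([&queue]() { queue.start_loop(); });

    server_task task(SERVER_TASK_TYPE_METRICS);
    task.id = queue.get_new_id(); // post() asserts that id != -1
    queue.post(std::move(task));

    queue.terminate();
    loop.join();
    return 0;
}
```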
struct server_response {
private:
bool running = true;
// for keeping track of all tasks waiting for the result
std::unordered_set<int> waiting_task_ids;
// the main result queue (using ptr for polymorphism)
std::vector<server_task_result_ptr> queue_results;
std::mutex mutex_results;
std::condition_variable condition_results;
public:
// add the id_task to the list of tasks waiting for response
void add_waiting_task_id(int id_task);
void add_waiting_tasks(const std::vector<server_task> & tasks);
// when the request is finished, we can remove the task associated with it
void remove_waiting_task_id(int id_task);
// remove multiple tasks from waiting list
void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks);
// This function blocks the thread until there is a response for one of the id_tasks
server_task_result_ptr recv(const std::unordered_set<int> & id_tasks);
// same as recv(), but with a timeout in seconds
// if the timeout is reached, nullptr is returned
server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout);
// single-task version of recv()
server_task_result_ptr recv(int id_task);
// Send a new result to a waiting id_task
void send(server_task_result_ptr && result);
// terminate the waiting loop
void terminate();
};
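And the matching consumer-side pattern for `server_response` (the id is illustrative; the HTTP handlers follow the same register, recv, remove flow, with a worker elsewhere calling `send()`):

```cpp
// sketch: wait for the result of a single task with a timeout
server_response responses;

int id_task = 42; // in practice this comes from server_queue::get_new_id()
responses.add_waiting_task_id(id_task);

// ... elsewhere, a worker produces a result and calls responses.send(std::move(result)) ...

server_task_result_ptr res = responses.recv_with_timeout({id_task}, /*timeout_s=*/600);
if (res == nullptr) {
    // timed out
} else if (res->is_error()) {
    // error result, format it for the client
}
responses.remove_waiting_task_id(id_task);
```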

1192
tools/server/server-task.cpp Normal file

File diff suppressed because it is too large

453
tools/server/server-task.h Normal file
View File

@ -0,0 +1,453 @@
#pragma once
#include "common.h"
#include "llama.h"
#include <string>
#include <unordered_set>
#include <list>
// TODO: prevent including the whole server-common.h as we only use server_tokens
#include "server-common.h"
using json = nlohmann::ordered_json;
enum server_task_type {
SERVER_TASK_TYPE_COMPLETION,
SERVER_TASK_TYPE_EMBEDDING,
SERVER_TASK_TYPE_RERANK,
SERVER_TASK_TYPE_INFILL,
SERVER_TASK_TYPE_CANCEL,
SERVER_TASK_TYPE_NEXT_RESPONSE,
SERVER_TASK_TYPE_METRICS,
SERVER_TASK_TYPE_SLOT_SAVE,
SERVER_TASK_TYPE_SLOT_RESTORE,
SERVER_TASK_TYPE_SLOT_ERASE,
SERVER_TASK_TYPE_SET_LORA,
};
// TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common
enum oaicompat_type {
OAICOMPAT_TYPE_NONE,
OAICOMPAT_TYPE_CHAT,
OAICOMPAT_TYPE_COMPLETION,
OAICOMPAT_TYPE_EMBEDDING,
};
enum stop_type {
STOP_TYPE_NONE,
STOP_TYPE_EOS,
STOP_TYPE_WORD,
STOP_TYPE_LIMIT,
};
struct task_params {
bool stream = true;
bool include_usage = false;
bool cache_prompt = true; // remember the prompt to avoid reprocessing the whole prompt
bool return_tokens = false;
bool return_progress = false;
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict
int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<common_adapter_lora_info> lora;
std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
bool timings_per_token = false;
bool post_sampling_probs = false;
struct common_params_sampling sampling;
struct common_params_speculative speculative;
// OAI-compat fields
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
common_chat_syntax oaicompat_chat_syntax;
// Embeddings
int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) const;
json to_json(bool only_metrics = false) const;
};
struct server_task {
int id = -1; // to be filled by server_queue
int index = -1; // used when there are multiple prompts (batch request)
// used by SERVER_TASK_TYPE_CANCEL
int id_target = -1;
int id_slot = -1;
// used by SERVER_TASK_TYPE_INFERENCE
task_params params;
server_tokens tokens;
server_task_type type;
// used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
struct slot_action {
int slot_id;
std::string filename;
std::string filepath;
};
slot_action slot_action;
// used by SERVER_TASK_TYPE_METRICS
bool metrics_reset_bucket = false;
// used by SERVER_TASK_TYPE_SET_LORA
std::vector<common_adapter_lora_info> set_lora;
server_task() = default;
server_task(server_task_type type) : type(type) {}
int32_t n_tokens() const {
return tokens.size();
}
static task_params params_from_json_cmpl(
const llama_context * ctx,
const common_params & params_base,
const json & data);
// utility function
static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
std::unordered_set<int> ids(tasks.size());
for (size_t i = 0; i < tasks.size(); i++) {
ids.insert(tasks[i].id);
}
return ids;
}
};
struct result_timings {
int32_t cache_n = -1;
int32_t prompt_n = -1;
double prompt_ms;
double prompt_per_token_ms;
double prompt_per_second;
int32_t predicted_n = -1;
double predicted_ms;
double predicted_per_token_ms;
double predicted_per_second;
// Optional speculative metrics - only included when > 0
int32_t draft_n = 0;
int32_t draft_n_accepted = 0;
json to_json() const;
};
struct result_prompt_progress {
int32_t total = 0;
int32_t cache = 0;
int32_t processed = 0;
int64_t time_ms = 0;
json to_json() const;
};
struct server_task_result {
int id = -1;
int id_slot = -1;
virtual bool is_error() {
// only used by server_task_result_error
return false;
}
virtual bool is_stop() {
// only used by server_task_result_cmpl_*
return true;
}
virtual int get_index() {
return -1;
}
virtual json to_json() = 0;
virtual ~server_task_result() = default;
};
// using unique_ptr for polymorphism of server_task_result
using server_task_result_ptr = std::unique_ptr<server_task_result>;
struct completion_token_output {
llama_token tok;
float prob;
std::string text_to_send;
struct prob_info {
llama_token tok;
std::string txt;
float prob;
};
std::vector<prob_info> probs;
json to_json(bool post_sampling_probs) const;
static json probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs);
static float logarithm(float x);
static std::vector<unsigned char> str_to_bytes(const std::string & str);
};
struct server_task_result_cmpl_final : server_task_result {
int index = 0;
std::string content;
llama_tokens tokens;
bool stream;
bool include_usage;
result_timings timings;
std::string prompt;
bool truncated;
int32_t n_decoded;
int32_t n_prompt_tokens;
int32_t n_tokens_cached;
bool has_new_line;
std::string stopping_word;
stop_type stop = STOP_TYPE_NONE;
bool post_sampling_probs;
std::vector<completion_token_output> probs_output;
std::vector<std::string> response_fields;
task_params generation_params;
// OAI-compat fields
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
common_chat_msg oaicompat_msg;
std::vector<common_chat_msg_diff> oaicompat_msg_diffs;
virtual int get_index() override {
return index;
}
virtual bool is_stop() override {
return true; // in stream mode, final responses are considered stop
}
virtual json to_json() override;
json to_json_non_oaicompat();
json to_json_oaicompat();
json to_json_oaicompat_chat();
json to_json_oaicompat_chat_stream();
};
struct server_task_result_cmpl_partial : server_task_result {
int index = 0;
std::string content;
llama_tokens tokens;
int32_t n_decoded;
int32_t n_prompt_tokens;
bool post_sampling_probs;
bool is_progress = false;
completion_token_output prob_output;
result_timings timings;
result_prompt_progress progress;
// OAI-compat fields
bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
std::vector<common_chat_msg_diff> oaicompat_msg_diffs;
virtual int get_index() override {
return index;
}
virtual bool is_stop() override {
return false; // in stream mode, partial responses are not considered stop
}
virtual json to_json() override;
json to_json_non_oaicompat();
json to_json_oaicompat();
json to_json_oaicompat_chat();
};
struct server_task_result_embd : server_task_result {
int index = 0;
std::vector<std::vector<float>> embedding;
int32_t n_tokens;
// OAI-compat fields
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
virtual int get_index() override {
return index;
}
virtual json to_json() override;
json to_json_non_oaicompat();
json to_json_oaicompat();
};
struct server_task_result_rerank : server_task_result {
int index = 0;
float score = -1e6;
int32_t n_tokens;
virtual int get_index() override {
return index;
}
virtual json to_json() override;
};
struct server_task_result_error : server_task_result {
int index = 0;
error_type err_type = ERROR_TYPE_SERVER;
std::string err_msg;
// for ERROR_TYPE_EXCEED_CONTEXT_SIZE
int32_t n_prompt_tokens = 0;
int32_t n_ctx = 0;
virtual bool is_error() override {
return true;
}
virtual json to_json() override;
};
struct server_task_result_metrics : server_task_result {
int n_idle_slots;
int n_processing_slots;
int n_tasks_deferred;
int64_t t_start;
// TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
uint64_t n_prompt_tokens_processed_total = 0;
uint64_t t_prompt_processing_total = 0;
uint64_t n_tokens_predicted_total = 0;
uint64_t t_tokens_generation_total = 0;
uint64_t n_tokens_max = 0;
uint64_t n_prompt_tokens_processed = 0;
uint64_t t_prompt_processing = 0;
uint64_t n_tokens_predicted = 0;
uint64_t t_tokens_generation = 0;
uint64_t n_decode_total = 0;
uint64_t n_busy_slots_total = 0;
// while we could also use std::vector<server_slot>, this requires copying the slot object, which can be quite messy
// therefore, we use json to temporarily store the slot.to_json() result
json slots_data = json::array();
virtual json to_json() override;
};
struct server_task_result_slot_save_load : server_task_result {
std::string filename;
bool is_save; // true = save, false = load
size_t n_tokens;
size_t n_bytes;
double t_ms;
virtual json to_json() override;
};
struct server_task_result_slot_erase : server_task_result {
size_t n_erased;
virtual json to_json() override;
};
struct server_task_result_apply_lora : server_task_result {
virtual json to_json() override;
};
struct server_prompt_checkpoint {
llama_pos pos_min;
llama_pos pos_max;
std::vector<uint8_t> data;
size_t size() const {
return data.size();
}
};
struct server_prompt {
server_tokens tokens;
std::vector<uint8_t> data;
std::list<server_prompt_checkpoint> checkpoints;
size_t size() const {
size_t res = data.size();
for (const auto & checkpoint : checkpoints) {
res += checkpoint.size();
}
return res;
}
int n_tokens() const {
return tokens.size();
}
};
struct server_prompt_cache {
server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) {
this->limit_size = 1024ull*1024ull*(limit_size_mib < 0 ? 0 : limit_size_mib);
this->limit_tokens = limit_tokens;
}
std::list<server_prompt> states;
// in bytes, 0 = no limit
size_t limit_size = 0;
// in tokens, 0 = no limit
size_t limit_tokens = 0;
size_t size() const;
size_t n_tokens() const;
server_prompt * alloc(const server_prompt & prompt, size_t state_size);
bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot);
void update();
};
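For completeness, constructing the prompt cache with the two limits declared above (the numbers are illustrative, not defaults from this patch):

```cpp
// sketch: cap the cache at 256 MiB, with no token-count cap
server_prompt_cache prompt_cache(/*limit_size_mib=*/256, /*limit_tokens=*/0);
// size() / n_tokens() report current usage; alloc(), load() and update()
// are driven by the slot handling code
```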

File diff suppressed because it is too large

View File

@ -28,6 +28,7 @@
sendMessage, sendMessage,
stopGeneration stopGeneration
} from '$lib/stores/chat.svelte'; } from '$lib/stores/chat.svelte';
import { config } from '$lib/stores/settings.svelte';
import { import {
supportsVision, supportsVision,
supportsAudio, supportsAudio,
@ -47,6 +48,7 @@
let { showCenteredEmpty = false } = $props(); let { showCenteredEmpty = false } = $props();
let disableAutoScroll = $derived(Boolean(config().disableAutoScroll));
let autoScrollEnabled = $state(true); let autoScrollEnabled = $state(true);
let chatScrollContainer: HTMLDivElement | undefined = $state(); let chatScrollContainer: HTMLDivElement | undefined = $state();
let dragCounter = $state(0); let dragCounter = $state(0);
@ -149,7 +151,7 @@
} }
function handleScroll() { function handleScroll() {
if (!chatScrollContainer) return; if (disableAutoScroll || !chatScrollContainer) return;
const { scrollTop, scrollHeight, clientHeight } = chatScrollContainer; const { scrollTop, scrollHeight, clientHeight } = chatScrollContainer;
const distanceFromBottom = scrollHeight - scrollTop - clientHeight; const distanceFromBottom = scrollHeight - scrollTop - clientHeight;
@ -194,8 +196,10 @@
const extras = result?.extras; const extras = result?.extras;
// Enable autoscroll for user-initiated message sending // Enable autoscroll for user-initiated message sending
userScrolledUp = false; if (!disableAutoScroll) {
autoScrollEnabled = true; userScrolledUp = false;
autoScrollEnabled = true;
}
await sendMessage(message, extras); await sendMessage(message, extras);
scrollChatToBottom(); scrollChatToBottom();
@ -241,6 +245,8 @@
} }
function scrollChatToBottom(behavior: ScrollBehavior = 'smooth') { function scrollChatToBottom(behavior: ScrollBehavior = 'smooth') {
if (disableAutoScroll) return;
chatScrollContainer?.scrollTo({ chatScrollContainer?.scrollTo({
top: chatScrollContainer?.scrollHeight, top: chatScrollContainer?.scrollHeight,
behavior behavior
@ -248,14 +254,27 @@
} }
afterNavigate(() => { afterNavigate(() => {
setTimeout(() => scrollChatToBottom('instant'), INITIAL_SCROLL_DELAY); if (!disableAutoScroll) {
setTimeout(() => scrollChatToBottom('instant'), INITIAL_SCROLL_DELAY);
}
}); });
onMount(() => { onMount(() => {
setTimeout(() => scrollChatToBottom('instant'), INITIAL_SCROLL_DELAY); if (!disableAutoScroll) {
setTimeout(() => scrollChatToBottom('instant'), INITIAL_SCROLL_DELAY);
}
}); });
$effect(() => { $effect(() => {
if (disableAutoScroll) {
autoScrollEnabled = false;
if (scrollInterval) {
clearInterval(scrollInterval);
scrollInterval = undefined;
}
return;
}
if (isCurrentConversationLoading && autoScrollEnabled) { if (isCurrentConversationLoading && autoScrollEnabled) {
scrollInterval = setInterval(scrollChatToBottom, AUTO_SCROLL_INTERVAL); scrollInterval = setInterval(scrollChatToBottom, AUTO_SCROLL_INTERVAL);
} else if (scrollInterval) { } else if (scrollInterval) {
@ -289,9 +308,11 @@
class="mb-16 md:mb-24" class="mb-16 md:mb-24"
messages={activeMessages()} messages={activeMessages()}
onUserAction={() => { onUserAction={() => {
userScrolledUp = false; if (!disableAutoScroll) {
autoScrollEnabled = true; userScrolledUp = false;
scrollChatToBottom(); autoScrollEnabled = true;
scrollChatToBottom();
}
}} }}
/> />

View File

@ -3,7 +3,6 @@
Settings, Settings,
Funnel, Funnel,
AlertTriangle, AlertTriangle,
Brain,
Code, Code,
Monitor, Monitor,
Sun, Sun,
@ -58,6 +57,33 @@
label: 'Paste long text to file length', label: 'Paste long text to file length',
type: 'input' type: 'input'
}, },
{
key: 'enableContinueGeneration',
label: 'Enable "Continue" button',
type: 'checkbox',
isExperimental: true
},
{
key: 'pdfAsImage',
label: 'Parse PDF as image',
type: 'checkbox'
},
{
key: 'askForTitleConfirmation',
label: 'Ask for confirmation before changing conversation title',
type: 'checkbox'
}
]
},
{
title: 'Display',
icon: Monitor,
fields: [
{
key: 'showThoughtInProgress',
label: 'Show thought in progress',
type: 'checkbox'
},
{ {
key: 'showMessageStats', key: 'showMessageStats',
label: 'Show message generation statistics', label: 'Show message generation statistics',
@ -79,31 +105,14 @@
type: 'checkbox' type: 'checkbox'
}, },
{ {
key: 'pdfAsImage', key: 'disableAutoScroll',
label: 'Parse PDF as image', label: 'Disable automatic scroll',
type: 'checkbox' type: 'checkbox'
}, },
{ {
key: 'renderUserContentAsMarkdown', key: 'renderUserContentAsMarkdown',
label: 'Render user content as Markdown', label: 'Render user content as Markdown',
type: 'checkbox' type: 'checkbox'
},
{
key: 'autoMicOnEmpty',
label: 'Show microphone on empty input',
type: 'checkbox',
isExperimental: true
},
{
key: 'enableContinueGeneration',
label: 'Enable "Continue" button',
type: 'checkbox',
isExperimental: true
},
{
key: 'askForTitleConfirmation',
label: 'Ask for confirmation before changing conversation title',
type: 'checkbox'
} }
] ]
}, },
@ -214,17 +223,6 @@
} }
] ]
}, },
{
title: 'Reasoning',
icon: Brain,
fields: [
{
key: 'showThoughtInProgress',
label: 'Show thought in progress',
type: 'checkbox'
}
]
},
{ {
title: 'Import/Export', title: 'Import/Export',
icon: Database, icon: Database,

View File

@ -14,6 +14,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
pasteLongTextToFileLen: 2500, pasteLongTextToFileLen: 2500,
pdfAsImage: false, pdfAsImage: false,
showModelInfo: false, showModelInfo: false,
disableAutoScroll: false,
renderUserContentAsMarkdown: false, renderUserContentAsMarkdown: false,
modelSelectorEnabled: false, modelSelectorEnabled: false,
autoMicOnEmpty: false, autoMicOnEmpty: false,
@ -94,6 +95,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
'Ask for confirmation before automatically changing conversation title when editing the first message.', 'Ask for confirmation before automatically changing conversation title when editing the first message.',
pdfAsImage: 'Parse PDF as image instead of text (requires vision-capable model).', pdfAsImage: 'Parse PDF as image instead of text (requires vision-capable model).',
showModelInfo: 'Display the model name used to generate each message below the message content.', showModelInfo: 'Display the model name used to generate each message below the message content.',
disableAutoScroll:
'Disable automatic scrolling while messages stream so you can control the viewport position manually.',
renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.', renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.',
modelSelectorEnabled: modelSelectorEnabled:
'Enable the model selector in the chat input to choose the inference model. Sends the associated model field in API requests.', 'Enable the model selector in the chat input to choose the inference model. Sends the associated model field in API requests.',