Merge branch 'ggml-org:master' into power-law-sampler

This commit is contained in:
ddh0 2025-12-11 12:52:53 -06:00 committed by GitHub
commit 66e2d17c7f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
25 changed files with 501 additions and 268 deletions

View File

@ -56,7 +56,7 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /model
or with a server image:
```bash
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
```
## Docker With CUDA
@ -91,7 +91,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
```
## Docker With MUSA
@ -125,5 +125,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne
```bash
docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
```

View File

@ -312,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
__func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
#ifdef GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, addr, tensor);
#endif
struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
// see if we can merge with an existing block
@ -357,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
}
// otherwise, add a new block
ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
GGML_UNUSED(tensor);
}
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
@ -616,13 +607,17 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten
GGML_ASSERT(parent_size >= node_size);
if (parent_size > node_size) {
// note: we want after the freeing the chunks to continue to be aligned
struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
node_size = aligned_offset(NULL, node_size, p_alloc->alignment);
if (parent_size > node_size) {
struct buffer_address p_addr = p_hn->addr;
p_addr.offset += node_size;
size_t extra_size = parent_size - node_size;
AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
}
}
@ -706,7 +701,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
__func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
#ifdef GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, hn->addr, node);
#endif
ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
hn->allocated = false;
}

View File

@ -4630,9 +4630,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_CUMSUM:
case GGML_OP_TRI:
case GGML_OP_DIAG:
return true;
case GGML_OP_SOLVE_TRI:
return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
return true;
default:
return false;
}

View File

@ -3,6 +3,80 @@
#include "solve_tri.cuh"
#define MAX_N_FAST 64
#define MAX_K_FAST 32
static __global__ void get_batch_pointers(const float * A,
float * X,
const float ** A_ptrs,
float ** X_ptrs,
int64_t ne02,
int64_t total_batches,
size_t s02,
size_t s03,
size_t s2,
size_t s3) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= total_batches) {
return;
}
const int64_t i3 = idx / ne02;
const int64_t i2 = idx % ne02;
A_ptrs[idx] = A + i3 * s03 + i2 * s02;
X_ptrs[idx] = X + i3 * s3 + i2 * s2;
}
static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
const float * A,
const float * B,
float * X,
int n,
int k,
int64_t ne02,
int64_t ne03,
size_t s02,
size_t s03,
size_t s12,
size_t s13,
size_t s2,
size_t s3,
cudaStream_t stream) {
const float alpha = 1.0f;
const int64_t total_batches = ne02 * ne03;
if (total_batches == 0) {
return;
}
// Bulk copy B -> X (contiguous tensors)
if (X != B) {
const int64_t total_elements_BX = n * k * total_batches;
CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream));
}
const int id = ggml_cuda_get_device();
ggml_cuda_pool_alloc<const float *> A_ptrs_alloc(ctx.pool(id), total_batches);
ggml_cuda_pool_alloc<float *> X_ptrs_alloc(ctx.pool(id), total_batches);
const float ** A_ptrs_dev = A_ptrs_alloc.get();
float ** X_ptrs_dev = X_ptrs_alloc.get();
get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02,
total_batches, s02, s03, s2, s3);
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
// Yes, this is necessary, without this we get RMSE errors
CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH));
CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches));
// revert to standard mode from common.cuh
CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH));
GGML_UNUSED_VARS(s12, s13);
}
// ======================
// Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction
@ -176,20 +250,26 @@ static void solve_tri_f32_cuda(const float * A,
}
void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // A (triangular n x x matrix)
const ggml_tensor * src1 = dst->src[1]; // B (right hand side of n x k equation columns)
const ggml_tensor * src0 = dst->src[0]; // A (n×n, lower triangular)
const ggml_tensor * src1 = dst->src[1]; // B (n×k)
ggml_is_contiguous(src0);
ggml_is_contiguous(src1);
const int64_t n = src0->ne[0];
const int64_t k = src1->ne[0];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
GGML_ASSERT(n <= 64);
GGML_ASSERT(k <= 32);
solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2],
src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
if (n <= MAX_N_FAST && k <= MAX_K_FAST) {
solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
src0->ne[2], src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
dst->nb[3] / sizeof(float), ctx.stream());
} else {
solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
dst->nb[3] / sizeof(float), ctx.stream());
}
}

View File

@ -19,6 +19,9 @@
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_16BF HIPBLAS_R_16B
#define CUDA_R_32F HIPBLAS_R_32F
#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT
#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER
#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
@ -30,6 +33,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define __all_sync(mask, var) __all(var)
#define __any_sync(mask, var) __any(var)
#define cublasStrsmBatched hipblasStrsmBatched
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx

View File

@ -12,11 +12,16 @@
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH
#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT
#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER
#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
#define CUDA_R_16F MUSA_R_16F
#define CUDA_R_16BF MUSA_R_16BF
#define CUDA_R_32F MUSA_R_32F
#define cublasStrsmBatched mublasStrsmBatched
#define cublasComputeType_t cudaDataType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy

View File

@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
udata->output .resize(n_tokens);
udata->seq_id_data.reserve(n_tokens);
seq_set_t seq_set_unq;
for (size_t i = 0; i < idxs.size(); ++i) {
@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
}
udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
udata->seq_id[i] = batch.seq_id[idxs[i]];
udata->output[i] = batch.logits[idxs[i]];
for (int s = 0; s < udata->n_seq_id[i]; ++s) {
seq_set_unq.set(udata->seq_id[i][s]);
const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
udata->seq_id_data.push_back(seq_id);
seq_set_unq.set(seq_id);
}
if (udata->output[i]) {
@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
}
}
llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
for (size_t i = 0; i < idxs.size(); ++i) {
udata->seq_id[i] = seq_id_ptr;
seq_id_ptr += udata->n_seq_id[i];
}
for (uint32_t s = 0; s < n_seq_max; ++s) {
if (seq_set_unq.test(s)) {
udata->seq_idx[s] = udata->seq_id_unq.size();

View File

@ -56,13 +56,15 @@ struct llama_ubatch {
std::vector<float> embd;
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id *> seq_id;
std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
std::vector<llama_seq_id> seq_id_unq;
std::vector<int32_t> seq_idx;
std::vector<int8_t> output;
std::vector<llama_seq_id> seq_id_data;
};
// the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
// the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
std::shared_ptr<data_t> data;
};

View File

@ -7861,9 +7861,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 }));
for (bool v : {false, true}) {
for (bool circular : {false, true}) {
@ -8064,12 +8079,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
// qwen3next with CHUNK_SIZE 64
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
// qwen3next with CHUNK_SIZE 128
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 }));
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));

Binary file not shown.

View File

@ -41,7 +41,7 @@
"@tailwindcss/vite": "^4.0.0",
"@types/node": "^22",
"@vitest/browser": "^3.2.3",
"bits-ui": "^2.8.11",
"bits-ui": "^2.14.4",
"clsx": "^2.1.1",
"dexie": "^4.0.11",
"eslint": "^9.18.0",
@ -3343,17 +3343,17 @@
}
},
"node_modules/bits-ui": {
"version": "2.8.11",
"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.8.11.tgz",
"integrity": "sha512-lKN9rAk69my6j7H1D4B87r8LrHuEtfEsf1xCixBj9yViql2BdI3f04HyyyT7T1GOCpgb9+8b0B+nm3LN81Konw==",
"version": "2.14.4",
"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.14.4.tgz",
"integrity": "sha512-W6kenhnbd/YVvur+DKkaVJ6GldE53eLewur5AhUCqslYQ0vjZr8eWlOfwZnMiPB+PF5HMVqf61vXBvmyrAmPWg==",
"dev": true,
"license": "MIT",
"dependencies": {
"@floating-ui/core": "^1.7.1",
"@floating-ui/dom": "^1.7.1",
"esm-env": "^1.1.2",
"runed": "^0.29.1",
"svelte-toolbelt": "^0.9.3",
"runed": "^0.35.1",
"svelte-toolbelt": "^0.10.6",
"tabbable": "^6.2.0"
},
"engines": {
@ -3368,9 +3368,9 @@
}
},
"node_modules/bits-ui/node_modules/runed": {
"version": "0.29.2",
"resolved": "https://registry.npmjs.org/runed/-/runed-0.29.2.tgz",
"integrity": "sha512-0cq6cA6sYGZwl/FvVqjx9YN+1xEBu9sDDyuWdDW1yWX7JF2wmvmVKfH+hVCZs+csW+P3ARH92MjI3H9QTagOQA==",
"version": "0.35.1",
"resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz",
"integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==",
"dev": true,
"funding": [
"https://github.com/sponsors/huntabyte",
@ -3378,23 +3378,31 @@
],
"license": "MIT",
"dependencies": {
"esm-env": "^1.0.0"
"dequal": "^2.0.3",
"esm-env": "^1.0.0",
"lz-string": "^1.5.0"
},
"peerDependencies": {
"@sveltejs/kit": "^2.21.0",
"svelte": "^5.7.0"
},
"peerDependenciesMeta": {
"@sveltejs/kit": {
"optional": true
}
}
},
"node_modules/bits-ui/node_modules/svelte-toolbelt": {
"version": "0.9.3",
"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.9.3.tgz",
"integrity": "sha512-HCSWxCtVmv+c6g1ACb8LTwHVbDqLKJvHpo6J8TaqwUme2hj9ATJCpjCPNISR1OCq2Q4U1KT41if9ON0isINQZw==",
"version": "0.10.6",
"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz",
"integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==",
"dev": true,
"funding": [
"https://github.com/sponsors/huntabyte"
],
"dependencies": {
"clsx": "^2.1.1",
"runed": "^0.29.0",
"runed": "^0.35.1",
"style-to-object": "^1.0.8"
},
"engines": {

View File

@ -43,7 +43,7 @@
"@tailwindcss/vite": "^4.0.0",
"@types/node": "^22",
"@vitest/browser": "^3.2.3",
"bits-ui": "^2.8.11",
"bits-ui": "^2.14.4",
"clsx": "^2.1.1",
"dexie": "^4.0.11",
"eslint": "^9.18.0",

View File

@ -331,6 +331,7 @@
class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled
? 'cursor-not-allowed opacity-60'
: ''} {className}"
data-slot="chat-form"
>
<ChatAttachmentsList
bind:uploadedFiles

View File

@ -1,6 +1,5 @@
<script lang="ts">
import { Input } from '$lib/components/ui/input';
import { Search } from '@lucide/svelte';
import { SearchInput } from '$lib/components/app';
interface Props {
value?: string;
@ -15,19 +14,6 @@
onInput,
class: className
}: Props = $props();
function handleInput(event: Event) {
const target = event.target as HTMLInputElement;
value = target.value;
onInput?.(target.value);
}
</script>
<div class="relative mb-4 {className}">
<Search
class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
/>
<Input bind:value class="pl-10" oninput={handleInput} {placeholder} type="search" />
</div>
<SearchInput bind:value {placeholder} {onInput} class="mb-4 {className}" />

View File

@ -64,6 +64,7 @@ export { default as CopyToClipboardIcon } from './misc/CopyToClipboardIcon.svelt
export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte';
export { default as MarkdownContent } from './misc/MarkdownContent.svelte';
export { default as RemoveButton } from './misc/RemoveButton.svelte';
export { default as SearchInput } from './misc/SearchInput.svelte';
export { default as SyntaxHighlightedCode } from './misc/SyntaxHighlightedCode.svelte';
export { default as ModelsSelector } from './models/ModelsSelector.svelte';

View File

@ -0,0 +1,73 @@
<script lang="ts">
import { Input } from '$lib/components/ui/input';
import { Search, X } from '@lucide/svelte';
interface Props {
value?: string;
placeholder?: string;
onInput?: (value: string) => void;
onClose?: () => void;
onKeyDown?: (event: KeyboardEvent) => void;
class?: string;
id?: string;
ref?: HTMLInputElement | null;
}
let {
value = $bindable(''),
placeholder = 'Search...',
onInput,
onClose,
onKeyDown,
class: className,
id,
ref = $bindable(null)
}: Props = $props();
let showClearButton = $derived(!!value || !!onClose);
function handleInput(event: Event) {
const target = event.target as HTMLInputElement;
value = target.value;
onInput?.(target.value);
}
function handleClear() {
if (value) {
value = '';
onInput?.('');
ref?.focus();
} else {
onClose?.();
}
}
</script>
<div class="relative {className}">
<Search
class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
/>
<Input
{id}
bind:value
bind:ref
class="pl-9 {showClearButton ? 'pr-9' : ''}"
oninput={handleInput}
onkeydown={onKeyDown}
{placeholder}
type="search"
/>
{#if showClearButton}
<button
type="button"
class="absolute top-1/2 right-3 -translate-y-1/2 transform text-muted-foreground transition-colors hover:text-foreground"
onclick={handleClear}
aria-label={value ? 'Clear search' : 'Close'}
>
<X class="h-4 w-4" />
</button>
{/if}
</div>

View File

@ -2,8 +2,8 @@
import { onMount, tick } from 'svelte';
import { ChevronDown, EyeOff, Loader2, MicOff, Package, Power } from '@lucide/svelte';
import * as Tooltip from '$lib/components/ui/tooltip';
import * as Popover from '$lib/components/ui/popover';
import { cn } from '$lib/components/ui/utils';
import { portalToBody } from '$lib/utils';
import {
modelsStore,
modelOptions,
@ -17,12 +17,8 @@
import { usedModalities, conversationsStore } from '$lib/stores/conversations.svelte';
import { ServerModelStatus } from '$lib/enums';
import { isRouterMode } from '$lib/stores/server.svelte';
import { DialogModelInformation } from '$lib/components/app';
import {
MENU_MAX_WIDTH,
MENU_OFFSET,
VIEWPORT_GUTTER
} from '$lib/constants/floating-ui-constraints';
import { DialogModelInformation, SearchInput } from '$lib/components/app';
import type { ModelOption } from '$lib/types/models';
interface Props {
class?: string;
@ -145,185 +141,126 @@
return options.some((option) => option.model === currentModel);
});
let isOpen = $state(false);
let showModelDialog = $state(false);
let container: HTMLDivElement | null = null;
let menuRef = $state<HTMLDivElement | null>(null);
let triggerButton = $state<HTMLButtonElement | null>(null);
let menuPosition = $state<{
top: number;
left: number;
width: number;
placement: 'top' | 'bottom';
maxHeight: number;
} | null>(null);
let searchTerm = $state('');
let searchInputRef = $state<HTMLInputElement | null>(null);
let highlightedIndex = $state<number>(-1);
onMount(async () => {
try {
await modelsStore.fetch();
} catch (error) {
console.error('Unable to load models:', error);
}
let filteredOptions: ModelOption[] = $derived(
(() => {
const term = searchTerm.trim().toLowerCase();
if (!term) return options;
return options.filter(
(option) =>
option.model.toLowerCase().includes(term) || option.name?.toLowerCase().includes(term)
);
})()
);
// Get indices of compatible options for keyboard navigation
let compatibleIndices = $derived(
filteredOptions
.map((option, index) => (isModelCompatible(option) ? index : -1))
.filter((i) => i !== -1)
);
// Reset highlighted index when search term changes
$effect(() => {
void searchTerm;
highlightedIndex = -1;
});
function toggleOpen() {
if (loading || updating) return;
if (isRouter) {
// Router mode: show dropdown
if (isOpen) {
closeMenu();
} else {
openMenu();
}
} else {
// Single model mode: show dialog
showModelDialog = true;
}
}
async function openMenu() {
let isOpen = $state(false);
let showModelDialog = $state(false);
onMount(() => {
modelsStore.fetch().catch((error) => {
console.error('Unable to load models:', error);
});
});
function handleOpenChange(open: boolean) {
if (loading || updating) return;
if (open) {
isOpen = true;
await tick();
updateMenuPosition();
requestAnimationFrame(() => updateMenuPosition());
searchTerm = '';
highlightedIndex = -1;
// Focus search input after popover opens
tick().then(() => {
requestAnimationFrame(() => searchInputRef?.focus());
});
if (isRouter) {
modelsStore.fetchRouterModels().then(() => {
modelsStore.fetchModalitiesForLoadedModels();
});
}
} else {
isOpen = false;
searchTerm = '';
highlightedIndex = -1;
}
}
function handleTriggerClick() {
if (loading || updating) return;
if (!isRouter) {
// Single model mode: show dialog instead of popover
showModelDialog = true;
}
// For router mode, the Popover handles open/close
}
export function open() {
if (isRouter) {
openMenu();
handleOpenChange(true);
} else {
showModelDialog = true;
}
}
function closeMenu() {
if (!isOpen) return;
isOpen = false;
menuPosition = null;
handleOpenChange(false);
}
function handlePointerDown(event: PointerEvent) {
if (!container) return;
function handleSearchKeyDown(event: KeyboardEvent) {
if (event.isComposing) return;
const target = event.target as Node | null;
if (event.key === 'ArrowDown') {
event.preventDefault();
if (compatibleIndices.length === 0) return;
if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) {
closeMenu();
}
}
function handleKeydown(event: KeyboardEvent) {
if (event.key === 'Escape') {
closeMenu();
}
}
function handleResize() {
if (isOpen) {
updateMenuPosition();
}
}
function updateMenuPosition() {
if (!isOpen || !triggerButton || !menuRef) return;
const triggerRect = triggerButton.getBoundingClientRect();
const viewportWidth = window.innerWidth;
const viewportHeight = window.innerHeight;
if (viewportWidth === 0 || viewportHeight === 0) return;
const scrollWidth = menuRef.scrollWidth;
const scrollHeight = menuRef.scrollHeight;
const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2);
const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH);
const safeMaxWidth =
constrainedMaxWidth > 0 ? constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth);
const desiredMinWidth = Math.min(160, safeMaxWidth || 160);
let width = Math.min(
Math.max(triggerRect.width, scrollWidth, desiredMinWidth),
safeMaxWidth || 320
);
const availableBelow = Math.max(
0,
viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET
);
const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET);
const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2);
const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight);
function computePlacement(placement: 'top' | 'bottom') {
const available = placement === 'bottom' ? availableBelow : availableAbove;
const allowedHeight =
available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance;
const maxHeight = Math.min(scrollHeight, allowedHeight);
const height = Math.max(0, maxHeight);
let top: number;
if (placement === 'bottom') {
const rawTop = triggerRect.bottom + MENU_OFFSET;
const minTop = VIEWPORT_GUTTER;
const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
if (maxTop < minTop) {
top = minTop;
const currentPos = compatibleIndices.indexOf(highlightedIndex);
if (currentPos === -1 || currentPos === compatibleIndices.length - 1) {
highlightedIndex = compatibleIndices[0];
} else {
top = Math.min(Math.max(rawTop, minTop), maxTop);
highlightedIndex = compatibleIndices[currentPos + 1];
}
} else if (event.key === 'ArrowUp') {
event.preventDefault();
if (compatibleIndices.length === 0) return;
const currentPos = compatibleIndices.indexOf(highlightedIndex);
if (currentPos === -1 || currentPos === 0) {
highlightedIndex = compatibleIndices[compatibleIndices.length - 1];
} else {
const rawTop = triggerRect.top - MENU_OFFSET - height;
const minTop = VIEWPORT_GUTTER;
const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
if (maxTop < minTop) {
top = minTop;
} else {
top = Math.max(Math.min(rawTop, maxTop), minTop);
highlightedIndex = compatibleIndices[currentPos - 1];
}
} else if (event.key === 'Enter') {
event.preventDefault();
if (highlightedIndex >= 0 && highlightedIndex < filteredOptions.length) {
const option = filteredOptions[highlightedIndex];
if (isModelCompatible(option)) {
handleSelect(option.id);
}
} else if (compatibleIndices.length > 0) {
// No selection - highlight first compatible option
highlightedIndex = compatibleIndices[0];
}
}
return { placement, top, height, maxHeight };
}
const belowMetrics = computePlacement('bottom');
const aboveMetrics = computePlacement('top');
let metrics = belowMetrics;
if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) {
metrics = aboveMetrics;
}
let left = triggerRect.right - width;
const maxLeft = viewportWidth - VIEWPORT_GUTTER - width;
if (maxLeft < VIEWPORT_GUTTER) {
left = VIEWPORT_GUTTER;
} else {
if (left > maxLeft) {
left = maxLeft;
}
if (left < VIEWPORT_GUTTER) {
left = VIEWPORT_GUTTER;
}
}
menuPosition = {
top: Math.round(metrics.top),
left: Math.round(left),
width: Math.round(width),
placement: metrics.placement,
maxHeight: Math.round(metrics.maxHeight)
};
}
async function handleSelect(modelId: string) {
@ -356,6 +293,14 @@
if (shouldCloseMenu) {
closeMenu();
// Focus the chat textarea after model selection
requestAnimationFrame(() => {
const textarea = document.querySelector<HTMLTextAreaElement>(
'[data-slot="chat-form"] textarea'
);
textarea?.focus();
});
}
}
@ -404,10 +349,7 @@
}
</script>
<svelte:window onresize={handleResize} />
<svelte:document onpointerdown={handlePointerDown} onkeydown={handleKeydown} />
<div class={cn('relative inline-flex flex-col items-end gap-1', className)} bind:this={container}>
<div class={cn('relative inline-flex flex-col items-end gap-1', className)}>
{#if loading && options.length === 0 && isRouter}
<div class="flex items-center gap-2 text-xs text-muted-foreground">
<Loader2 class="h-3.5 w-3.5 animate-spin" />
@ -418,9 +360,8 @@
{:else}
{@const selectedOption = getDisplayOption()}
<div class="relative">
<button
type="button"
<Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}>
<Popover.Trigger
class={cn(
`inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
!isCurrentModelInCache()
@ -430,15 +371,11 @@
: isHighlightedCurrentModelActive
? 'text-foreground'
: 'text-muted-foreground',
isOpen ? 'text-foreground' : '',
className
isOpen ? 'text-foreground' : ''
)}
style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
aria-haspopup={isRouter ? 'listbox' : undefined}
aria-expanded={isRouter ? isOpen : undefined}
onclick={toggleOpen}
bind:this={triggerButton}
disabled={disabled || updating}
onclick={handleTriggerClick}
disabled={disabled || updating || !isRouter}
>
<Package class="h-3.5 w-3.5" />
@ -451,33 +388,35 @@
{:else if isRouter}
<ChevronDown class="h-3 w-3.5" />
{/if}
</button>
</Popover.Trigger>
{#if isOpen && isRouter}
<div
bind:this={menuRef}
use:portalToBody
class={cn(
'fixed z-[1000] overflow-hidden rounded-md border bg-popover shadow-lg transition-opacity',
menuPosition ? 'opacity-100' : 'pointer-events-none opacity-0'
)}
role="listbox"
style:top={menuPosition ? `${menuPosition.top}px` : undefined}
style:left={menuPosition ? `${menuPosition.left}px` : undefined}
style:width={menuPosition ? `${menuPosition.width}px` : undefined}
data-placement={menuPosition?.placement ?? 'bottom'}
<Popover.Content
class="group/popover-content w-96 max-w-[calc(100vw-2rem)] p-0"
align="end"
sideOffset={8}
collisionPadding={16}
>
<div class="flex max-h-[50dvh] flex-col overflow-hidden">
<div
class="overflow-y-auto py-1"
style:max-height={menuPosition && menuPosition.maxHeight > 0
? `${menuPosition.maxHeight}px`
: undefined}
class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0"
>
<SearchInput
id="model-search"
placeholder="Search models..."
bind:value={searchTerm}
bind:ref={searchInputRef}
onClose={closeMenu}
onKeyDown={handleSearchKeyDown}
/>
</div>
<div
class="models-list order-2 min-h-0 flex-1 overflow-y-auto group-data-[side=top]/popover-content:order-1"
>
{#if !isCurrentModelInCache() && currentModel}
<!-- Show unavailable model as first option (disabled) -->
<button
type="button"
class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-3 py-2 text-left text-sm text-red-400"
class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-4 py-2 text-left text-sm text-red-400"
role="option"
aria-selected="true"
aria-disabled="true"
@ -488,20 +427,25 @@
</button>
<div class="my-1 h-px bg-border"></div>
{/if}
{#each options as option (option.id)}
{#if filteredOptions.length === 0}
<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
{/if}
{#each filteredOptions as option, index (option.id)}
{@const status = getModelStatus(option.model)}
{@const isLoaded = status === ServerModelStatus.LOADED}
{@const isLoading = status === ServerModelStatus.LOADING}
{@const isSelected = currentModel === option.model || activeId === option.id}
{@const isCompatible = isModelCompatible(option)}
{@const isHighlighted = index === highlightedIndex}
{@const missingModalities = getMissingModalities(option)}
<div
class={cn(
'group flex w-full items-center gap-2 px-3 py-2 text-left text-sm transition focus:outline-none',
'group flex w-full items-center gap-2 px-4 py-2 text-left text-sm transition focus:outline-none',
isCompatible
? 'cursor-pointer hover:bg-muted focus:bg-muted'
: 'cursor-not-allowed opacity-50',
isSelected
isSelected || isHighlighted
? 'bg-accent text-accent-foreground'
: isCompatible
? 'hover:bg-accent hover:text-accent-foreground'
@ -509,10 +453,11 @@
isLoaded ? 'text-popover-foreground' : 'text-muted-foreground'
)}
role="option"
aria-selected={isSelected}
aria-selected={isSelected || isHighlighted}
aria-disabled={!isCompatible}
tabindex={isCompatible ? 0 : -1}
onclick={() => isCompatible && handleSelect(option.id)}
onmouseenter={() => (highlightedIndex = index)}
onkeydown={(e) => {
if (isCompatible && (e.key === 'Enter' || e.key === ' ')) {
e.preventDefault();
@ -586,8 +531,8 @@
{/each}
</div>
</div>
{/if}
</div>
</Popover.Content>
</Popover.Root>
{/if}
</div>

View File

@ -0,0 +1,19 @@
import Root from './popover.svelte';
import Close from './popover-close.svelte';
import Content from './popover-content.svelte';
import Trigger from './popover-trigger.svelte';
import Portal from './popover-portal.svelte';
export {
Root,
Content,
Trigger,
Close,
Portal,
//
Root as Popover,
Content as PopoverContent,
Trigger as PopoverTrigger,
Close as PopoverClose,
Portal as PopoverPortal
};

View File

@ -0,0 +1,7 @@
<script lang="ts">
import { Popover as PopoverPrimitive } from 'bits-ui';
let { ref = $bindable(null), ...restProps }: PopoverPrimitive.CloseProps = $props();
</script>
<PopoverPrimitive.Close bind:ref data-slot="popover-close" {...restProps} />

View File

@ -0,0 +1,37 @@
<script lang="ts">
import { Popover as PopoverPrimitive } from 'bits-ui';
import PopoverPortal from './popover-portal.svelte';
import { cn, type WithoutChildrenOrChild } from '$lib/components/ui/utils.js';
import type { ComponentProps } from 'svelte';
let {
ref = $bindable(null),
class: className,
sideOffset = 4,
side,
align = 'center',
collisionPadding = 8,
avoidCollisions = true,
portalProps,
...restProps
}: PopoverPrimitive.ContentProps & {
portalProps?: WithoutChildrenOrChild<ComponentProps<typeof PopoverPortal>>;
} = $props();
</script>
<PopoverPortal {...portalProps}>
<PopoverPrimitive.Content
bind:ref
data-slot="popover-content"
{sideOffset}
{side}
{align}
{collisionPadding}
{avoidCollisions}
class={cn(
'z-50 w-72 origin-(--bits-popover-content-transform-origin) rounded-md border bg-popover p-4 text-popover-foreground shadow-md outline-hidden data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-end-2 data-[side=right]:slide-in-from-start-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95',
className
)}
{...restProps}
/>
</PopoverPortal>

View File

@ -0,0 +1,7 @@
<script lang="ts">
import { Popover as PopoverPrimitive } from 'bits-ui';
let { ...restProps }: PopoverPrimitive.PortalProps = $props();
</script>
<PopoverPrimitive.Portal {...restProps} />

View File

@ -0,0 +1,17 @@
<script lang="ts">
import { cn } from '$lib/components/ui/utils.js';
import { Popover as PopoverPrimitive } from 'bits-ui';
let {
ref = $bindable(null),
class: className,
...restProps
}: PopoverPrimitive.TriggerProps = $props();
</script>
<PopoverPrimitive.Trigger
bind:ref
data-slot="popover-trigger"
class={cn('', className)}
{...restProps}
/>

View File

@ -0,0 +1,7 @@
<script lang="ts">
import { Popover as PopoverPrimitive } from 'bits-ui';
let { open = $bindable(false), ...restProps }: PopoverPrimitive.RootProps = $props();
</script>
<PopoverPrimitive.Root bind:open {...restProps} />

View File

@ -1,3 +1,2 @@
export const VIEWPORT_GUTTER = 8;
export const MENU_OFFSET = 6;
export const MENU_MAX_WIDTH = 320;

View File

@ -295,14 +295,21 @@ class ModelsStore {
* Fetch props for a specific model from /props endpoint
* Uses caching to avoid redundant requests
*
* In ROUTER mode, this will only fetch props if the model is loaded,
* since unloaded models return 400 from /props endpoint.
*
* @param modelId - Model identifier to fetch props for
* @returns Props data or null if fetch failed
* @returns Props data or null if fetch failed or model not loaded
*/
async fetchModelProps(modelId: string): Promise<ApiLlamaCppServerProps | null> {
// Return cached props if available
const cached = this.modelPropsCache.get(modelId);
if (cached) return cached;
if (serverStore.isRouterMode && !this.isModelLoaded(modelId)) {
return null;
}
// Avoid duplicate fetches
if (this.modelPropsFetching.has(modelId)) return null;