diff --git a/docs/docker.md b/docs/docker.md index 98502a0c50..b9e5015396 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -56,7 +56,7 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /model or with a server image: ```bash -docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 +docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 ``` ## Docker With CUDA @@ -91,7 +91,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne ```bash docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` ## Docker With MUSA @@ -125,5 +125,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne ```bash docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index a5995fdc2c..ec16cbda9f 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -312,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al } // this is a very naive implementation, but for our case the number of free blocks should be very small -static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) { +static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) { size = aligned_offset(NULL, size, alloc->alignment); - AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n", - __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks); - -#ifdef GGML_ALLOCATOR_DEBUG - remove_allocated_tensor(alloc, addr, tensor); -#endif - struct tallocr_chunk * chunk = alloc->chunks[addr.chunk]; // see if we can merge with an existing block @@ -357,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct } // otherwise, add a new block ggml_dyn_tallocr_insert_block(chunk, addr.offset, size); - - GGML_UNUSED(tensor); } static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) { @@ -616,13 +607,17 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten GGML_ASSERT(parent_size >= node_size); + // note: we want after the freeing the chunks to continue to be aligned + struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id]; + parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment); + node_size = aligned_offset(NULL, node_size, p_alloc->alignment); + if (parent_size > node_size) { - struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id]; struct buffer_address p_addr = p_hn->addr; p_addr.offset += node_size; size_t extra_size = parent_size - node_size; AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name); - ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent); + ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size); } } @@ -706,7 +701,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; size_t size = ggml_backend_buft_get_alloc_size(buft, node); - ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node); + + AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n", + __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks); +#ifdef GGML_ALLOCATOR_DEBUG + remove_allocated_tensor(alloc, hn->addr, node); +#endif + + ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size); hn->allocated = false; } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 8d17bc669a..ab0f6fe9ce 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4630,9 +4630,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_CUMSUM: case GGML_OP_TRI: case GGML_OP_DIAG: - return true; case GGML_OP_SOLVE_TRI: - return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32; + return true; + default: return false; } diff --git a/ggml/src/ggml-cuda/solve_tri.cu b/ggml/src/ggml-cuda/solve_tri.cu index e161d4dc43..177ffc268f 100644 --- a/ggml/src/ggml-cuda/solve_tri.cu +++ b/ggml/src/ggml-cuda/solve_tri.cu @@ -3,6 +3,80 @@ #include "solve_tri.cuh" #define MAX_N_FAST 64 +#define MAX_K_FAST 32 + +static __global__ void get_batch_pointers(const float * A, + float * X, + const float ** A_ptrs, + float ** X_ptrs, + int64_t ne02, + int64_t total_batches, + size_t s02, + size_t s03, + size_t s2, + size_t s3) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_batches) { + return; + } + + const int64_t i3 = idx / ne02; + const int64_t i2 = idx % ne02; + + A_ptrs[idx] = A + i3 * s03 + i2 * s02; + X_ptrs[idx] = X + i3 * s3 + i2 * s2; +} + +static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx, + const float * A, + const float * B, + float * X, + int n, + int k, + int64_t ne02, + int64_t ne03, + size_t s02, + size_t s03, + size_t s12, + size_t s13, + size_t s2, + size_t s3, + cudaStream_t stream) { + const float alpha = 1.0f; + const int64_t total_batches = ne02 * ne03; + if (total_batches == 0) { + return; + } + + // Bulk copy B -> X (contiguous tensors) + if (X != B) { + const int64_t total_elements_BX = n * k * total_batches; + CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + } + + const int id = ggml_cuda_get_device(); + + ggml_cuda_pool_alloc A_ptrs_alloc(ctx.pool(id), total_batches); + ggml_cuda_pool_alloc X_ptrs_alloc(ctx.pool(id), total_batches); + + const float ** A_ptrs_dev = A_ptrs_alloc.get(); + float ** X_ptrs_dev = X_ptrs_alloc.get(); + + get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02, + total_batches, s02, s03, s2, s3); + + CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); + + // Yes, this is necessary, without this we get RMSE errors + CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH)); + CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches)); + + // revert to standard mode from common.cuh + CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH)); + + GGML_UNUSED_VARS(s12, s13); +} // ====================== // Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction @@ -63,7 +137,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A, float x_low = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f; float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f; - const int half = WARP_SIZE; + const int half = WARP_SIZE; const int nrows_low = (n < half) ? n : half; #pragma unroll @@ -81,8 +155,8 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A, #pragma unroll for (int row = half; row < n; ++row) { - float sum = sA[row * n + lane] * x_low; - const int j = half + lane; + float sum = sA[row * n + lane] * x_low; + const int j = half + lane; if (j < row) { sum += sA[row * n + j] * x_high; } @@ -97,7 +171,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A, for (int rr = 0; rr < 2; ++rr) { const int row = rr * WARP_SIZE + lane; if (row < n) { - const float val = (row < half) ? x_low : x_high; + const float val = (row < half) ? x_low : x_high; X_batch[row * k + col_idx] = val; } } @@ -176,20 +250,26 @@ static void solve_tri_f32_cuda(const float * A, } void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; // A (triangular n x x matrix) - const ggml_tensor * src1 = dst->src[1]; // B (right hand side of n x k equation columns) + const ggml_tensor * src0 = dst->src[0]; // A (n×n, lower triangular) + const ggml_tensor * src1 = dst->src[1]; // B (n×k) ggml_is_contiguous(src0); ggml_is_contiguous(src1); - const int64_t n = src0->ne[0]; - const int64_t k = src1->ne[0]; + const int64_t n = src0->ne[0]; + const int64_t k = src1->ne[0]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; - GGML_ASSERT(n <= 64); - GGML_ASSERT(k <= 32); - - solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2], - src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), - src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float), - dst->nb[3] / sizeof(float), ctx.stream()); + if (n <= MAX_N_FAST && k <= MAX_K_FAST) { + solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, + src0->ne[2], src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), + src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float), + dst->nb[3] / sizeof(float), ctx.stream()); + } else { + solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, + ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), + src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float), + dst->nb[3] / sizeof(float), ctx.stream()); + } } diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index b7d6edf7fc..951a88d567 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -19,6 +19,9 @@ #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_16BF HIPBLAS_R_16B #define CUDA_R_32F HIPBLAS_R_32F +#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned @@ -30,6 +33,7 @@ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define __all_sync(mask, var) __all(var) #define __any_sync(mask, var) __any(var) +#define cublasStrsmBatched hipblasStrsmBatched #define cublasCreate hipblasCreate #define cublasDestroy hipblasDestroy #define cublasGemmEx hipblasGemmEx diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index 8c55a2e4e5..221e67f96a 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -12,11 +12,16 @@ #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT #define CUBLAS_OP_N MUBLAS_OP_N #define CUBLAS_OP_T MUBLAS_OP_T +#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH +#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT +#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER +#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH #define CUDA_R_16F MUSA_R_16F #define CUDA_R_16BF MUSA_R_16BF #define CUDA_R_32F MUSA_R_32F +#define cublasStrsmBatched mublasStrsmBatched #define cublasComputeType_t cudaDataType_t #define cublasCreate mublasCreate #define cublasDestroy mublasDestroy diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 86a1a4ba18..386fab04ac 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u udata->seq_idx .resize(LLAMA_MAX_SEQ, -1); udata->output .resize(n_tokens); + udata->seq_id_data.reserve(n_tokens); + seq_set_t seq_set_unq; for (size_t i = 0; i < idxs.size(); ++i) { @@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u } udata->n_seq_id[i] = batch.n_seq_id[idxs[i]]; - udata->seq_id[i] = batch.seq_id[idxs[i]]; udata->output[i] = batch.logits[idxs[i]]; for (int s = 0; s < udata->n_seq_id[i]; ++s) { - seq_set_unq.set(udata->seq_id[i][s]); + const llama_seq_id seq_id = batch.seq_id[idxs[i]][s]; + + udata->seq_id_data.push_back(seq_id); + seq_set_unq.set(seq_id); } if (udata->output[i]) { @@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u } } + llama_seq_id * seq_id_ptr = udata->seq_id_data.data(); + for (size_t i = 0; i < idxs.size(); ++i) { + udata->seq_id[i] = seq_id_ptr; + seq_id_ptr += udata->n_seq_id[i]; + } + for (uint32_t s = 0; s < n_seq_max; ++s) { if (seq_set_unq.test(s)) { udata->seq_idx[s] = udata->seq_id_unq.size(); diff --git a/src/llama-batch.h b/src/llama-batch.h index 209cf3699d..8e6fac0efa 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -56,13 +56,15 @@ struct llama_ubatch { std::vector embd; std::vector pos; std::vector n_seq_id; - std::vector seq_id; + std::vector seq_id; // these point into the seq_id_data below std::vector seq_id_unq; std::vector seq_idx; std::vector output; + + std::vector seq_id_data; }; - // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data + // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data std::shared_ptr data; }; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7be1f66038..308e752b1d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7861,9 +7861,24 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 })); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 })); for (bool v : {false, true}) { for (bool circular : {false, true}) { @@ -8064,12 +8079,13 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416)); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 })); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 })); // qwen3next with CHUNK_SIZE 64 test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 })); // qwen3next with CHUNK_SIZE 128 test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 })); test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 })); test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 })); diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index 2db04e9522..4cff76429e 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/webui/package-lock.json b/tools/server/webui/package-lock.json index 9c1c2499cf..4f37b308b1 100644 --- a/tools/server/webui/package-lock.json +++ b/tools/server/webui/package-lock.json @@ -41,7 +41,7 @@ "@tailwindcss/vite": "^4.0.0", "@types/node": "^22", "@vitest/browser": "^3.2.3", - "bits-ui": "^2.8.11", + "bits-ui": "^2.14.4", "clsx": "^2.1.1", "dexie": "^4.0.11", "eslint": "^9.18.0", @@ -3343,17 +3343,17 @@ } }, "node_modules/bits-ui": { - "version": "2.8.11", - "resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.8.11.tgz", - "integrity": "sha512-lKN9rAk69my6j7H1D4B87r8LrHuEtfEsf1xCixBj9yViql2BdI3f04HyyyT7T1GOCpgb9+8b0B+nm3LN81Konw==", + "version": "2.14.4", + "resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.14.4.tgz", + "integrity": "sha512-W6kenhnbd/YVvur+DKkaVJ6GldE53eLewur5AhUCqslYQ0vjZr8eWlOfwZnMiPB+PF5HMVqf61vXBvmyrAmPWg==", "dev": true, "license": "MIT", "dependencies": { "@floating-ui/core": "^1.7.1", "@floating-ui/dom": "^1.7.1", "esm-env": "^1.1.2", - "runed": "^0.29.1", - "svelte-toolbelt": "^0.9.3", + "runed": "^0.35.1", + "svelte-toolbelt": "^0.10.6", "tabbable": "^6.2.0" }, "engines": { @@ -3368,9 +3368,9 @@ } }, "node_modules/bits-ui/node_modules/runed": { - "version": "0.29.2", - "resolved": "https://registry.npmjs.org/runed/-/runed-0.29.2.tgz", - "integrity": "sha512-0cq6cA6sYGZwl/FvVqjx9YN+1xEBu9sDDyuWdDW1yWX7JF2wmvmVKfH+hVCZs+csW+P3ARH92MjI3H9QTagOQA==", + "version": "0.35.1", + "resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz", + "integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==", "dev": true, "funding": [ "https://github.com/sponsors/huntabyte", @@ -3378,23 +3378,31 @@ ], "license": "MIT", "dependencies": { - "esm-env": "^1.0.0" + "dequal": "^2.0.3", + "esm-env": "^1.0.0", + "lz-string": "^1.5.0" }, "peerDependencies": { + "@sveltejs/kit": "^2.21.0", "svelte": "^5.7.0" + }, + "peerDependenciesMeta": { + "@sveltejs/kit": { + "optional": true + } } }, "node_modules/bits-ui/node_modules/svelte-toolbelt": { - "version": "0.9.3", - "resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.9.3.tgz", - "integrity": "sha512-HCSWxCtVmv+c6g1ACb8LTwHVbDqLKJvHpo6J8TaqwUme2hj9ATJCpjCPNISR1OCq2Q4U1KT41if9ON0isINQZw==", + "version": "0.10.6", + "resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz", + "integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==", "dev": true, "funding": [ "https://github.com/sponsors/huntabyte" ], "dependencies": { "clsx": "^2.1.1", - "runed": "^0.29.0", + "runed": "^0.35.1", "style-to-object": "^1.0.8" }, "engines": { diff --git a/tools/server/webui/package.json b/tools/server/webui/package.json index 987a7239ed..c20ab3cfde 100644 --- a/tools/server/webui/package.json +++ b/tools/server/webui/package.json @@ -43,7 +43,7 @@ "@tailwindcss/vite": "^4.0.0", "@types/node": "^22", "@vitest/browser": "^3.2.3", - "bits-ui": "^2.8.11", + "bits-ui": "^2.14.4", "clsx": "^2.1.1", "dexie": "^4.0.11", "eslint": "^9.18.0", diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte index 7f8e38286d..78cc1c47da 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte @@ -331,6 +331,7 @@ class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled ? 'cursor-not-allowed opacity-60' : ''} {className}" + data-slot="chat-form" > - import { Input } from '$lib/components/ui/input'; - import { Search } from '@lucide/svelte'; + import { SearchInput } from '$lib/components/app'; interface Props { value?: string; @@ -15,19 +14,6 @@ onInput, class: className }: Props = $props(); - - function handleInput(event: Event) { - const target = event.target as HTMLInputElement; - - value = target.value; - onInput?.(target.value); - } -
- - - -
+ diff --git a/tools/server/webui/src/lib/components/app/index.ts b/tools/server/webui/src/lib/components/app/index.ts index 87b24598b7..8631d4fb3b 100644 --- a/tools/server/webui/src/lib/components/app/index.ts +++ b/tools/server/webui/src/lib/components/app/index.ts @@ -64,6 +64,7 @@ export { default as CopyToClipboardIcon } from './misc/CopyToClipboardIcon.svelt export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte'; export { default as MarkdownContent } from './misc/MarkdownContent.svelte'; export { default as RemoveButton } from './misc/RemoveButton.svelte'; +export { default as SearchInput } from './misc/SearchInput.svelte'; export { default as SyntaxHighlightedCode } from './misc/SyntaxHighlightedCode.svelte'; export { default as ModelsSelector } from './models/ModelsSelector.svelte'; diff --git a/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte b/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte new file mode 100644 index 0000000000..15cd6abaa9 --- /dev/null +++ b/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte @@ -0,0 +1,73 @@ + + +
+ + + + + {#if showClearButton} + + {/if} +
diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte index c4331e92f1..ac0937696d 100644 --- a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte +++ b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte @@ -2,8 +2,8 @@ import { onMount, tick } from 'svelte'; import { ChevronDown, EyeOff, Loader2, MicOff, Package, Power } from '@lucide/svelte'; import * as Tooltip from '$lib/components/ui/tooltip'; + import * as Popover from '$lib/components/ui/popover'; import { cn } from '$lib/components/ui/utils'; - import { portalToBody } from '$lib/utils'; import { modelsStore, modelOptions, @@ -17,12 +17,8 @@ import { usedModalities, conversationsStore } from '$lib/stores/conversations.svelte'; import { ServerModelStatus } from '$lib/enums'; import { isRouterMode } from '$lib/stores/server.svelte'; - import { DialogModelInformation } from '$lib/components/app'; - import { - MENU_MAX_WIDTH, - MENU_OFFSET, - VIEWPORT_GUTTER - } from '$lib/constants/floating-ui-constraints'; + import { DialogModelInformation, SearchInput } from '$lib/components/app'; + import type { ModelOption } from '$lib/types/models'; interface Props { class?: string; @@ -145,185 +141,126 @@ return options.some((option) => option.model === currentModel); }); - let isOpen = $state(false); - let showModelDialog = $state(false); - let container: HTMLDivElement | null = null; - let menuRef = $state(null); - let triggerButton = $state(null); - let menuPosition = $state<{ - top: number; - left: number; - width: number; - placement: 'top' | 'bottom'; - maxHeight: number; - } | null>(null); + let searchTerm = $state(''); + let searchInputRef = $state(null); + let highlightedIndex = $state(-1); - onMount(async () => { - try { - await modelsStore.fetch(); - } catch (error) { - console.error('Unable to load models:', error); - } + let filteredOptions: ModelOption[] = $derived( + (() => { + const term = searchTerm.trim().toLowerCase(); + if (!term) return options; + + return options.filter( + (option) => + option.model.toLowerCase().includes(term) || option.name?.toLowerCase().includes(term) + ); + })() + ); + + // Get indices of compatible options for keyboard navigation + let compatibleIndices = $derived( + filteredOptions + .map((option, index) => (isModelCompatible(option) ? index : -1)) + .filter((i) => i !== -1) + ); + + // Reset highlighted index when search term changes + $effect(() => { + void searchTerm; + highlightedIndex = -1; }); - function toggleOpen() { + let isOpen = $state(false); + let showModelDialog = $state(false); + + onMount(() => { + modelsStore.fetch().catch((error) => { + console.error('Unable to load models:', error); + }); + }); + + function handleOpenChange(open: boolean) { if (loading || updating) return; - if (isRouter) { - // Router mode: show dropdown - if (isOpen) { - closeMenu(); - } else { - openMenu(); + if (open) { + isOpen = true; + searchTerm = ''; + highlightedIndex = -1; + + // Focus search input after popover opens + tick().then(() => { + requestAnimationFrame(() => searchInputRef?.focus()); + }); + + if (isRouter) { + modelsStore.fetchRouterModels().then(() => { + modelsStore.fetchModalitiesForLoadedModels(); + }); } } else { - // Single model mode: show dialog - showModelDialog = true; + isOpen = false; + searchTerm = ''; + highlightedIndex = -1; } } - async function openMenu() { + function handleTriggerClick() { if (loading || updating) return; - isOpen = true; - await tick(); - updateMenuPosition(); - requestAnimationFrame(() => updateMenuPosition()); - - if (isRouter) { - modelsStore.fetchRouterModels().then(() => { - modelsStore.fetchModalitiesForLoadedModels(); - }); + if (!isRouter) { + // Single model mode: show dialog instead of popover + showModelDialog = true; } + // For router mode, the Popover handles open/close } export function open() { if (isRouter) { - openMenu(); + handleOpenChange(true); } else { showModelDialog = true; } } function closeMenu() { - if (!isOpen) return; - - isOpen = false; - menuPosition = null; + handleOpenChange(false); } - function handlePointerDown(event: PointerEvent) { - if (!container) return; + function handleSearchKeyDown(event: KeyboardEvent) { + if (event.isComposing) return; - const target = event.target as Node | null; + if (event.key === 'ArrowDown') { + event.preventDefault(); + if (compatibleIndices.length === 0) return; - if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) { - closeMenu(); - } - } - - function handleKeydown(event: KeyboardEvent) { - if (event.key === 'Escape') { - closeMenu(); - } - } - - function handleResize() { - if (isOpen) { - updateMenuPosition(); - } - } - - function updateMenuPosition() { - if (!isOpen || !triggerButton || !menuRef) return; - - const triggerRect = triggerButton.getBoundingClientRect(); - const viewportWidth = window.innerWidth; - const viewportHeight = window.innerHeight; - - if (viewportWidth === 0 || viewportHeight === 0) return; - - const scrollWidth = menuRef.scrollWidth; - const scrollHeight = menuRef.scrollHeight; - - const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2); - const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH); - const safeMaxWidth = - constrainedMaxWidth > 0 ? constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth); - const desiredMinWidth = Math.min(160, safeMaxWidth || 160); - - let width = Math.min( - Math.max(triggerRect.width, scrollWidth, desiredMinWidth), - safeMaxWidth || 320 - ); - - const availableBelow = Math.max( - 0, - viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET - ); - const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET); - const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2); - const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight); - - function computePlacement(placement: 'top' | 'bottom') { - const available = placement === 'bottom' ? availableBelow : availableAbove; - const allowedHeight = - available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance; - const maxHeight = Math.min(scrollHeight, allowedHeight); - const height = Math.max(0, maxHeight); - - let top: number; - if (placement === 'bottom') { - const rawTop = triggerRect.bottom + MENU_OFFSET; - const minTop = VIEWPORT_GUTTER; - const maxTop = viewportHeight - VIEWPORT_GUTTER - height; - if (maxTop < minTop) { - top = minTop; - } else { - top = Math.min(Math.max(rawTop, minTop), maxTop); - } + const currentPos = compatibleIndices.indexOf(highlightedIndex); + if (currentPos === -1 || currentPos === compatibleIndices.length - 1) { + highlightedIndex = compatibleIndices[0]; } else { - const rawTop = triggerRect.top - MENU_OFFSET - height; - const minTop = VIEWPORT_GUTTER; - const maxTop = viewportHeight - VIEWPORT_GUTTER - height; - if (maxTop < minTop) { - top = minTop; - } else { - top = Math.max(Math.min(rawTop, maxTop), minTop); + highlightedIndex = compatibleIndices[currentPos + 1]; + } + } else if (event.key === 'ArrowUp') { + event.preventDefault(); + if (compatibleIndices.length === 0) return; + + const currentPos = compatibleIndices.indexOf(highlightedIndex); + if (currentPos === -1 || currentPos === 0) { + highlightedIndex = compatibleIndices[compatibleIndices.length - 1]; + } else { + highlightedIndex = compatibleIndices[currentPos - 1]; + } + } else if (event.key === 'Enter') { + event.preventDefault(); + if (highlightedIndex >= 0 && highlightedIndex < filteredOptions.length) { + const option = filteredOptions[highlightedIndex]; + if (isModelCompatible(option)) { + handleSelect(option.id); } - } - - return { placement, top, height, maxHeight }; - } - - const belowMetrics = computePlacement('bottom'); - const aboveMetrics = computePlacement('top'); - - let metrics = belowMetrics; - if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) { - metrics = aboveMetrics; - } - - let left = triggerRect.right - width; - const maxLeft = viewportWidth - VIEWPORT_GUTTER - width; - if (maxLeft < VIEWPORT_GUTTER) { - left = VIEWPORT_GUTTER; - } else { - if (left > maxLeft) { - left = maxLeft; - } - if (left < VIEWPORT_GUTTER) { - left = VIEWPORT_GUTTER; + } else if (compatibleIndices.length > 0) { + // No selection - highlight first compatible option + highlightedIndex = compatibleIndices[0]; } } - - menuPosition = { - top: Math.round(metrics.top), - left: Math.round(left), - width: Math.round(width), - placement: metrics.placement, - maxHeight: Math.round(metrics.maxHeight) - }; } async function handleSelect(modelId: string) { @@ -356,6 +293,14 @@ if (shouldCloseMenu) { closeMenu(); + + // Focus the chat textarea after model selection + requestAnimationFrame(() => { + const textarea = document.querySelector( + '[data-slot="chat-form"] textarea' + ); + textarea?.focus(); + }); } } @@ -404,10 +349,7 @@ } - - - -
+
{#if loading && options.length === 0 && isRouter}
@@ -418,9 +360,8 @@ {:else} {@const selectedOption = getDisplayOption()} -
- + - {#if isOpen && isRouter} -
+ +
0 - ? `${menuPosition.maxHeight}px` - : undefined} + class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0" + > + +
+
{#if !isCurrentModelInCache() && currentModel}
- {/if} -
+
+ {/if}
diff --git a/tools/server/webui/src/lib/components/ui/popover/index.ts b/tools/server/webui/src/lib/components/ui/popover/index.ts new file mode 100644 index 0000000000..c5937fb3a0 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/index.ts @@ -0,0 +1,19 @@ +import Root from './popover.svelte'; +import Close from './popover-close.svelte'; +import Content from './popover-content.svelte'; +import Trigger from './popover-trigger.svelte'; +import Portal from './popover-portal.svelte'; + +export { + Root, + Content, + Trigger, + Close, + Portal, + // + Root as Popover, + Content as PopoverContent, + Trigger as PopoverTrigger, + Close as PopoverClose, + Portal as PopoverPortal +}; diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte new file mode 100644 index 0000000000..dc4dec4b33 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte @@ -0,0 +1,7 @@ + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte new file mode 100644 index 0000000000..2d3513d347 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte @@ -0,0 +1,37 @@ + + + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte new file mode 100644 index 0000000000..25efb877b7 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte @@ -0,0 +1,7 @@ + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte new file mode 100644 index 0000000000..5ef3d0e932 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte @@ -0,0 +1,17 @@ + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover.svelte b/tools/server/webui/src/lib/components/ui/popover/popover.svelte new file mode 100644 index 0000000000..f39b867a69 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover.svelte @@ -0,0 +1,7 @@ + + + diff --git a/tools/server/webui/src/lib/constants/floating-ui-constraints.ts b/tools/server/webui/src/lib/constants/floating-ui-constraints.ts index c95d3f1841..003fc77acb 100644 --- a/tools/server/webui/src/lib/constants/floating-ui-constraints.ts +++ b/tools/server/webui/src/lib/constants/floating-ui-constraints.ts @@ -1,3 +1,2 @@ export const VIEWPORT_GUTTER = 8; export const MENU_OFFSET = 6; -export const MENU_MAX_WIDTH = 320; diff --git a/tools/server/webui/src/lib/stores/models.svelte.ts b/tools/server/webui/src/lib/stores/models.svelte.ts index 29416c2fe5..34b26403e4 100644 --- a/tools/server/webui/src/lib/stores/models.svelte.ts +++ b/tools/server/webui/src/lib/stores/models.svelte.ts @@ -295,14 +295,21 @@ class ModelsStore { * Fetch props for a specific model from /props endpoint * Uses caching to avoid redundant requests * + * In ROUTER mode, this will only fetch props if the model is loaded, + * since unloaded models return 400 from /props endpoint. + * * @param modelId - Model identifier to fetch props for - * @returns Props data or null if fetch failed + * @returns Props data or null if fetch failed or model not loaded */ async fetchModelProps(modelId: string): Promise { // Return cached props if available const cached = this.modelPropsCache.get(modelId); if (cached) return cached; + if (serverStore.isRouterMode && !this.isModelLoaded(modelId)) { + return null; + } + // Avoid duplicate fetches if (this.modelPropsFetching.has(modelId)) return null;