Merge branch 'ggml-org:master' into power-law-sampler

This commit is contained in:
ddh0 2025-12-11 12:52:53 -06:00 committed by GitHub
commit 66e2d17c7f
25 changed files with 501 additions and 268 deletions

View File

@@ -56,7 +56,7 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /model
or with a server image:
```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
```
## Docker With CUDA
@@ -91,7 +91,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
```
## Docker With MUSA
@@ -125,5 +125,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne
```bash
docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
```

View File

@@ -312,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
+static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
size = aligned_offset(NULL, size, alloc->alignment);
-AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
-__func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
-#ifdef GGML_ALLOCATOR_DEBUG
-remove_allocated_tensor(alloc, addr, tensor);
-#endif
struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
// see if we can merge with an existing block
@@ -357,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
}
// otherwise, add a new block
ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
-GGML_UNUSED(tensor);
}
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
@@ -616,13 +607,17 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten
GGML_ASSERT(parent_size >= node_size);
-if (parent_size > node_size) {
+// note: we want after the freeing the chunks to continue to be aligned
struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
+parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
+node_size = aligned_offset(NULL, node_size, p_alloc->alignment);
+if (parent_size > node_size) {
struct buffer_address p_addr = p_hn->addr;
p_addr.offset += node_size;
size_t extra_size = parent_size - node_size;
AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
-ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
+ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
}
}
@@ -706,7 +701,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
+AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+__func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
+#ifdef GGML_ALLOCATOR_DEBUG
+remove_allocated_tensor(alloc, hn->addr, node);
+#endif
+ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
hn->allocated = false;
}
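For readers following the allocator change: the extra-space hunk above now rounds both the parent and node sizes up to the allocator alignment before returning the tail of the parent's allocation to the free list, so the freed range starts and ends on aligned offsets. A minimal sketch of that arithmetic (hypothetical helper, not the ggml implementation; ggml's own aligned_offset() plays this role):

```cpp
#include <cstddef>

// Sketch only: round a size up to a power-of-two alignment.
static size_t align_up(size_t size, size_t alignment) {
    return (size + alignment - 1) & ~(alignment - 1);
}

// Example with a 32-byte alignment:
//   parent_size = 1000 -> align_up = 1024
//   node_size   =  900 -> align_up =  928
// The 96-byte range starting at offset 928 is freed, and both its start and
// end stay 32-byte aligned, so later allocations from that block remain aligned.
```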

View File

@@ -4630,9 +4630,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_CUMSUM:
case GGML_OP_TRI:
case GGML_OP_DIAG:
-return true;
case GGML_OP_SOLVE_TRI:
-return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
+return true;
default:
return false;
}

View File

@@ -3,6 +3,80 @@
#include "solve_tri.cuh"
#define MAX_N_FAST 64
#define MAX_K_FAST 32
static __global__ void get_batch_pointers(const float * A,
float * X,
const float ** A_ptrs,
float ** X_ptrs,
int64_t ne02,
int64_t total_batches,
size_t s02,
size_t s03,
size_t s2,
size_t s3) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= total_batches) {
return;
}
const int64_t i3 = idx / ne02;
const int64_t i2 = idx % ne02;
A_ptrs[idx] = A + i3 * s03 + i2 * s02;
X_ptrs[idx] = X + i3 * s3 + i2 * s2;
}
static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
const float * A,
const float * B,
float * X,
int n,
int k,
int64_t ne02,
int64_t ne03,
size_t s02,
size_t s03,
size_t s12,
size_t s13,
size_t s2,
size_t s3,
cudaStream_t stream) {
const float alpha = 1.0f;
const int64_t total_batches = ne02 * ne03;
if (total_batches == 0) {
return;
}
// Bulk copy B -> X (contiguous tensors)
if (X != B) {
const int64_t total_elements_BX = n * k * total_batches;
CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream));
}
const int id = ggml_cuda_get_device();
ggml_cuda_pool_alloc<const float *> A_ptrs_alloc(ctx.pool(id), total_batches);
ggml_cuda_pool_alloc<float *> X_ptrs_alloc(ctx.pool(id), total_batches);
const float ** A_ptrs_dev = A_ptrs_alloc.get();
float ** X_ptrs_dev = X_ptrs_alloc.get();
get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02,
total_batches, s02, s03, s2, s3);
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
// Yes, this is necessary, without this we get RMSE errors
CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH));
CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches));
// revert to standard mode from common.cuh
CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH));
GGML_UNUSED_VARS(s12, s13);
}
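The get_batch_pointers launch above uses one thread per (i2, i3) batch and a ceiling division for the grid size; threads whose index is at or past total_batches return immediately. A small arithmetic sketch (standalone C++, values made up for illustration):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t total_batches = 1000; // e.g. ne02 * ne03 = 250 * 4
    const int64_t block = 256;
    const int64_t grid = (total_batches + block - 1) / block; // ceiling division
    std::printf("%lld blocks x %lld threads = %lld threads for %lld batches\n",
                (long long) grid, (long long) block,
                (long long) (grid * block), (long long) total_batches);
    // prints: 4 blocks x 256 threads = 1024 threads for 1000 batches
    return 0;
}
```

The bulk cudaMemcpyAsync from B into X just above reflects the usual TRSM convention: the solver overwrites its right-hand-side argument in place, so the destination tensor is used as the in/out matrix.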
// ======================
// Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction
@@ -176,20 +250,26 @@ static void solve_tri_f32_cuda(const float * A,
}
void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-const ggml_tensor * src0 = dst->src[0]; // A (triangular n x x matrix)
-const ggml_tensor * src1 = dst->src[1]; // B (right hand side of n x k equation columns)
+const ggml_tensor * src0 = dst->src[0]; // A (n×n, lower triangular)
+const ggml_tensor * src1 = dst->src[1]; // B (n×k)
ggml_is_contiguous(src0);
ggml_is_contiguous(src1);
const int64_t n = src0->ne[0];
const int64_t k = src1->ne[0];
+const int64_t ne02 = src0->ne[2];
+const int64_t ne03 = src0->ne[3];
-GGML_ASSERT(n <= 64);
-GGML_ASSERT(k <= 32);
-solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2],
-src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
-src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
-dst->nb[3] / sizeof(float), ctx.stream());
+if (n <= MAX_N_FAST && k <= MAX_K_FAST) {
+solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
+src0->ne[2], src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
+src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
+dst->nb[3] / sizeof(float), ctx.stream());
+} else {
+solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
+ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
+src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
+dst->nb[3] / sizeof(float), ctx.stream());
+}
}
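A note on why the cuBLAS path uses CUBLAS_SIDE_RIGHT with CUBLAS_FILL_MODE_UPPER even though A is lower triangular: ggml stores tensors row-major while cuBLAS assumes column-major, so the same buffers read column-major are the transposed matrices. A reading of the call (an interpretation, not text from the commit):

```latex
% Row-major buffers reinterpreted by cuBLAS as column-major matrices:
%   A (n x n, lower triangular, row-major)  is read as  A^T (upper triangular)
%   B (n x k, row-major, row stride k)      is read as  B^T (k x n)
A X = B
\quad\Longleftrightarrow\quad
X^{\top} A^{\top} = B^{\top}
```

That is a right-side, upper-triangular solve on a k×n right-hand side, which matches the cublasStrsmBatched arguments m = k, n = n, lda = n, ldb = k, with X overwritten in place.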

View File

@@ -19,6 +19,9 @@
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_16BF HIPBLAS_R_16B
#define CUDA_R_32F HIPBLAS_R_32F
+#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT
+#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER
+#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
@@ -30,6 +33,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define __all_sync(mask, var) __all(var)
#define __any_sync(mask, var) __any(var)
+#define cublasStrsmBatched hipblasStrsmBatched
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx

View File

@@ -12,11 +12,16 @@
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
+#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH
+#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT
+#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER
+#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
#define CUDA_R_16F MUSA_R_16F
#define CUDA_R_16BF MUSA_R_16BF
#define CUDA_R_32F MUSA_R_32F
+#define cublasStrsmBatched mublasStrsmBatched
#define cublasComputeType_t cudaDataType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy

View File

@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
udata->output .resize(n_tokens);
+udata->seq_id_data.reserve(n_tokens);
seq_set_t seq_set_unq;
for (size_t i = 0; i < idxs.size(); ++i) {
@@ -716,11 +718,13 @@
}
udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
-udata->seq_id[i] = batch.seq_id[idxs[i]];
udata->output[i] = batch.logits[idxs[i]];
for (int s = 0; s < udata->n_seq_id[i]; ++s) {
-seq_set_unq.set(udata->seq_id[i][s]);
+const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
+udata->seq_id_data.push_back(seq_id);
+seq_set_unq.set(seq_id);
}
if (udata->output[i]) {
@@ -728,6 +732,12 @@
}
}
+llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
+for (size_t i = 0; i < idxs.size(); ++i) {
+udata->seq_id[i] = seq_id_ptr;
+seq_id_ptr += udata->n_seq_id[i];
+}
for (uint32_t s = 0; s < n_seq_max; ++s) {
if (seq_set_unq.test(s)) {
udata->seq_idx[s] = udata->seq_id_unq.size();
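The two-pass fill above is deliberate: all sequence ids are pushed into the owning seq_id_data vector first, and the per-token seq_id pointers are assigned only afterwards, because a growing std::vector may reallocate and invalidate earlier pointers. A minimal sketch of the pattern (generic types, not the llama.cpp structs):

```cpp
#include <cstdint>
#include <cstddef>
#include <vector>

int main() {
    std::vector<int32_t>   counts = {2, 1, 3};   // n_seq_id per token (example data)
    std::vector<int32_t>   flat;                 // plays the role of seq_id_data (owning)
    std::vector<int32_t *> ptrs(counts.size());  // plays the role of seq_id (views)

    // pass 1: fill the flat buffer completely
    flat.reserve(2 + 1 + 3);
    int32_t next = 0;
    for (int32_t c : counts) {
        for (int32_t s = 0; s < c; ++s) {
            flat.push_back(next++);
        }
    }

    // pass 2: only now hand out pointers - flat will not grow anymore
    int32_t * p = flat.data();
    for (size_t i = 0; i < counts.size(); ++i) {
        ptrs[i] = p;
        p += counts[i];
    }
    return 0;
}
```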

View File

@@ -56,13 +56,15 @@ struct llama_ubatch {
std::vector<float> embd;
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
-std::vector<llama_seq_id *> seq_id;
+std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
std::vector<llama_seq_id> seq_id_unq;
std::vector<int32_t> seq_idx;
std::vector<int8_t> output;
+std::vector<llama_seq_id> seq_id_data;
};
-// the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
+// the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
std::shared_ptr<data_t> data;
};

View File

@@ -7861,9 +7861,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
-test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 }));
for (bool v : {false, true}) {
for (bool circular : {false, true}) {
@@ -8064,12 +8079,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));
-test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
-test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
// qwen3next with CHUNK_SIZE 64
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
// qwen3next with CHUNK_SIZE 128
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
+test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 }));
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));

Binary file not shown.

View File

@@ -41,7 +41,7 @@
"@tailwindcss/vite": "^4.0.0",
"@types/node": "^22",
"@vitest/browser": "^3.2.3",
-"bits-ui": "^2.8.11",
+"bits-ui": "^2.14.4",
"clsx": "^2.1.1",
"dexie": "^4.0.11",
"eslint": "^9.18.0",
@@ -3343,17 +3343,17 @@
}
},
"node_modules/bits-ui": {
-"version": "2.8.11",
-"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.8.11.tgz",
-"integrity": "sha512-lKN9rAk69my6j7H1D4B87r8LrHuEtfEsf1xCixBj9yViql2BdI3f04HyyyT7T1GOCpgb9+8b0B+nm3LN81Konw==",
+"version": "2.14.4",
+"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.14.4.tgz",
+"integrity": "sha512-W6kenhnbd/YVvur+DKkaVJ6GldE53eLewur5AhUCqslYQ0vjZr8eWlOfwZnMiPB+PF5HMVqf61vXBvmyrAmPWg==",
"dev": true,
"license": "MIT",
"dependencies": {
"@floating-ui/core": "^1.7.1",
"@floating-ui/dom": "^1.7.1",
"esm-env": "^1.1.2",
-"runed": "^0.29.1",
-"svelte-toolbelt": "^0.9.3",
+"runed": "^0.35.1",
+"svelte-toolbelt": "^0.10.6",
"tabbable": "^6.2.0"
},
"engines": {
@@ -3368,9 +3368,9 @@
}
},
"node_modules/bits-ui/node_modules/runed": {
-"version": "0.29.2",
-"resolved": "https://registry.npmjs.org/runed/-/runed-0.29.2.tgz",
-"integrity": "sha512-0cq6cA6sYGZwl/FvVqjx9YN+1xEBu9sDDyuWdDW1yWX7JF2wmvmVKfH+hVCZs+csW+P3ARH92MjI3H9QTagOQA==",
+"version": "0.35.1",
+"resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz",
+"integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==",
"dev": true,
"funding": [
"https://github.com/sponsors/huntabyte",
@@ -3378,23 +3378,31 @@
],
"license": "MIT",
"dependencies": {
-"esm-env": "^1.0.0"
+"dequal": "^2.0.3",
+"esm-env": "^1.0.0",
+"lz-string": "^1.5.0"
},
"peerDependencies": {
+"@sveltejs/kit": "^2.21.0",
"svelte": "^5.7.0"
+},
+"peerDependenciesMeta": {
+"@sveltejs/kit": {
+"optional": true
+}
}
},
"node_modules/bits-ui/node_modules/svelte-toolbelt": {
-"version": "0.9.3",
-"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.9.3.tgz",
-"integrity": "sha512-HCSWxCtVmv+c6g1ACb8LTwHVbDqLKJvHpo6J8TaqwUme2hj9ATJCpjCPNISR1OCq2Q4U1KT41if9ON0isINQZw==",
+"version": "0.10.6",
+"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz",
+"integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==",
"dev": true,
"funding": [
"https://github.com/sponsors/huntabyte"
],
"dependencies": {
"clsx": "^2.1.1",
-"runed": "^0.29.0",
+"runed": "^0.35.1",
"style-to-object": "^1.0.8"
},
"engines": {

View File

@@ -43,7 +43,7 @@
"@tailwindcss/vite": "^4.0.0",
"@types/node": "^22",
"@vitest/browser": "^3.2.3",
-"bits-ui": "^2.8.11",
+"bits-ui": "^2.14.4",
"clsx": "^2.1.1",
"dexie": "^4.0.11",
"eslint": "^9.18.0",

View File

@@ -331,6 +331,7 @@
class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled
? 'cursor-not-allowed opacity-60'
: ''} {className}"
+data-slot="chat-form"
>
<ChatAttachmentsList
bind:uploadedFiles

View File

@@ -1,6 +1,5 @@
<script lang="ts">
-import { Input } from '$lib/components/ui/input';
-import { Search } from '@lucide/svelte';
+import { SearchInput } from '$lib/components/app';
interface Props {
value?: string;
@@ -15,19 +14,6 @@
onInput,
class: className
}: Props = $props();
-function handleInput(event: Event) {
-const target = event.target as HTMLInputElement;
-value = target.value;
-onInput?.(target.value);
-}
</script>
-<div class="relative mb-4 {className}">
-<Search
-class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
-/>
-<Input bind:value class="pl-10" oninput={handleInput} {placeholder} type="search" />
-</div>
+<SearchInput bind:value {placeholder} {onInput} class="mb-4 {className}" />

View File

@@ -64,6 +64,7 @@ export { default as CopyToClipboardIcon } from './misc/CopyToClipboardIcon.svelt
export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte';
export { default as MarkdownContent } from './misc/MarkdownContent.svelte';
export { default as RemoveButton } from './misc/RemoveButton.svelte';
+export { default as SearchInput } from './misc/SearchInput.svelte';
export { default as SyntaxHighlightedCode } from './misc/SyntaxHighlightedCode.svelte';
export { default as ModelsSelector } from './models/ModelsSelector.svelte';

View File

@@ -0,0 +1,73 @@
<script lang="ts">
import { Input } from '$lib/components/ui/input';
import { Search, X } from '@lucide/svelte';
interface Props {
value?: string;
placeholder?: string;
onInput?: (value: string) => void;
onClose?: () => void;
onKeyDown?: (event: KeyboardEvent) => void;
class?: string;
id?: string;
ref?: HTMLInputElement | null;
}
let {
value = $bindable(''),
placeholder = 'Search...',
onInput,
onClose,
onKeyDown,
class: className,
id,
ref = $bindable(null)
}: Props = $props();
let showClearButton = $derived(!!value || !!onClose);
function handleInput(event: Event) {
const target = event.target as HTMLInputElement;
value = target.value;
onInput?.(target.value);
}
function handleClear() {
if (value) {
value = '';
onInput?.('');
ref?.focus();
} else {
onClose?.();
}
}
</script>
<div class="relative {className}">
<Search
class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
/>
<Input
{id}
bind:value
bind:ref
class="pl-9 {showClearButton ? 'pr-9' : ''}"
oninput={handleInput}
onkeydown={onKeyDown}
{placeholder}
type="search"
/>
{#if showClearButton}
<button
type="button"
class="absolute top-1/2 right-3 -translate-y-1/2 transform text-muted-foreground transition-colors hover:text-foreground"
onclick={handleClear}
aria-label={value ? 'Clear search' : 'Close'}
>
<X class="h-4 w-4" />
</button>
{/if}
</div>

View File

@@ -2,8 +2,8 @@
import { onMount, tick } from 'svelte';
import { ChevronDown, EyeOff, Loader2, MicOff, Package, Power } from '@lucide/svelte';
import * as Tooltip from '$lib/components/ui/tooltip';
+import * as Popover from '$lib/components/ui/popover';
import { cn } from '$lib/components/ui/utils';
-import { portalToBody } from '$lib/utils';
import {
modelsStore,
modelOptions,
@@ -17,12 +17,8 @@
import { usedModalities, conversationsStore } from '$lib/stores/conversations.svelte';
import { ServerModelStatus } from '$lib/enums';
import { isRouterMode } from '$lib/stores/server.svelte';
-import { DialogModelInformation } from '$lib/components/app';
-import {
-MENU_MAX_WIDTH,
-MENU_OFFSET,
-VIEWPORT_GUTTER
-} from '$lib/constants/floating-ui-constraints';
+import { DialogModelInformation, SearchInput } from '$lib/components/app';
+import type { ModelOption } from '$lib/types/models';
interface Props {
class?: string;
@@ -145,185 +141,126 @@
return options.some((option) => option.model === currentModel);
});
-let isOpen = $state(false);
-let showModelDialog = $state(false);
-let container: HTMLDivElement | null = null;
-let menuRef = $state<HTMLDivElement | null>(null);
-let triggerButton = $state<HTMLButtonElement | null>(null);
-let menuPosition = $state<{
-top: number;
-left: number;
-width: number;
-placement: 'top' | 'bottom';
-maxHeight: number;
-} | null>(null);
-onMount(async () => {
-try {
-await modelsStore.fetch();
-} catch (error) {
-console.error('Unable to load models:', error);
-}
+let searchTerm = $state('');
+let searchInputRef = $state<HTMLInputElement | null>(null);
+let highlightedIndex = $state<number>(-1);
+let filteredOptions: ModelOption[] = $derived(
+(() => {
+const term = searchTerm.trim().toLowerCase();
+if (!term) return options;
+return options.filter(
+(option) =>
+option.model.toLowerCase().includes(term) || option.name?.toLowerCase().includes(term)
+);
+})()
+);
+// Get indices of compatible options for keyboard navigation
+let compatibleIndices = $derived(
+filteredOptions
+.map((option, index) => (isModelCompatible(option) ? index : -1))
+.filter((i) => i !== -1)
+);
+// Reset highlighted index when search term changes
+$effect(() => {
+void searchTerm;
+highlightedIndex = -1;
});
-function toggleOpen() {
-if (loading || updating) return;
-if (isRouter) {
-// Router mode: show dropdown
-if (isOpen) {
-closeMenu();
-} else {
-openMenu();
-}
-} else {
-// Single model mode: show dialog
-showModelDialog = true;
-}
-}
-async function openMenu() {
+let isOpen = $state(false);
+let showModelDialog = $state(false);
+onMount(() => {
+modelsStore.fetch().catch((error) => {
+console.error('Unable to load models:', error);
+});
+});
+function handleOpenChange(open: boolean) {
if (loading || updating) return;
+if (open) {
isOpen = true;
-await tick();
-updateMenuPosition();
-requestAnimationFrame(() => updateMenuPosition());
+searchTerm = '';
+highlightedIndex = -1;
+// Focus search input after popover opens
+tick().then(() => {
+requestAnimationFrame(() => searchInputRef?.focus());
+});
if (isRouter) {
modelsStore.fetchRouterModels().then(() => {
modelsStore.fetchModalitiesForLoadedModels();
});
}
+} else {
+isOpen = false;
+searchTerm = '';
+highlightedIndex = -1;
+}
+}
+function handleTriggerClick() {
+if (loading || updating) return;
+if (!isRouter) {
+// Single model mode: show dialog instead of popover
+showModelDialog = true;
+}
+// For router mode, the Popover handles open/close
}
export function open() {
if (isRouter) {
-openMenu();
+handleOpenChange(true);
} else {
showModelDialog = true;
}
}
function closeMenu() {
-if (!isOpen) return;
-isOpen = false;
-menuPosition = null;
+handleOpenChange(false);
}
-function handlePointerDown(event: PointerEvent) {
-if (!container) return;
-const target = event.target as Node | null;
-if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) {
-closeMenu();
-}
-}
-function handleKeydown(event: KeyboardEvent) {
-if (event.key === 'Escape') {
-closeMenu();
-}
-}
-function handleResize() {
-if (isOpen) {
-updateMenuPosition();
-}
-}
-function updateMenuPosition() {
-if (!isOpen || !triggerButton || !menuRef) return;
-const triggerRect = triggerButton.getBoundingClientRect();
-const viewportWidth = window.innerWidth;
-const viewportHeight = window.innerHeight;
-if (viewportWidth === 0 || viewportHeight === 0) return;
-const scrollWidth = menuRef.scrollWidth;
-const scrollHeight = menuRef.scrollHeight;
-const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2);
-const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH);
-const safeMaxWidth =
-constrainedMaxWidth > 0 ? constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth);
-const desiredMinWidth = Math.min(160, safeMaxWidth || 160);
-let width = Math.min(
-Math.max(triggerRect.width, scrollWidth, desiredMinWidth),
-safeMaxWidth || 320
-);
-const availableBelow = Math.max(
-0,
-viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET
-);
-const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET);
-const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2);
-const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight);
-function computePlacement(placement: 'top' | 'bottom') {
-const available = placement === 'bottom' ? availableBelow : availableAbove;
-const allowedHeight =
-available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance;
-const maxHeight = Math.min(scrollHeight, allowedHeight);
-const height = Math.max(0, maxHeight);
-let top: number;
-if (placement === 'bottom') {
-const rawTop = triggerRect.bottom + MENU_OFFSET;
-const minTop = VIEWPORT_GUTTER;
-const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
-if (maxTop < minTop) {
-top = minTop;
-} else {
-top = Math.min(Math.max(rawTop, minTop), maxTop);
-}
-} else {
-const rawTop = triggerRect.top - MENU_OFFSET - height;
-const minTop = VIEWPORT_GUTTER;
-const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
-if (maxTop < minTop) {
-top = minTop;
-} else {
-top = Math.max(Math.min(rawTop, maxTop), minTop);
-}
-}
-return { placement, top, height, maxHeight };
-}
-const belowMetrics = computePlacement('bottom');
-const aboveMetrics = computePlacement('top');
-let metrics = belowMetrics;
-if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) {
-metrics = aboveMetrics;
-}
-let left = triggerRect.right - width;
-const maxLeft = viewportWidth - VIEWPORT_GUTTER - width;
-if (maxLeft < VIEWPORT_GUTTER) {
-left = VIEWPORT_GUTTER;
-} else {
-if (left > maxLeft) {
-left = maxLeft;
-}
-if (left < VIEWPORT_GUTTER) {
-left = VIEWPORT_GUTTER;
-}
-}
-menuPosition = {
-top: Math.round(metrics.top),
-left: Math.round(left),
-width: Math.round(width),
-placement: metrics.placement,
-maxHeight: Math.round(metrics.maxHeight)
-};
-}
+function handleSearchKeyDown(event: KeyboardEvent) {
+if (event.isComposing) return;
+if (event.key === 'ArrowDown') {
+event.preventDefault();
+if (compatibleIndices.length === 0) return;
+const currentPos = compatibleIndices.indexOf(highlightedIndex);
+if (currentPos === -1 || currentPos === compatibleIndices.length - 1) {
+highlightedIndex = compatibleIndices[0];
+} else {
+highlightedIndex = compatibleIndices[currentPos + 1];
+}
+} else if (event.key === 'ArrowUp') {
+event.preventDefault();
+if (compatibleIndices.length === 0) return;
+const currentPos = compatibleIndices.indexOf(highlightedIndex);
+if (currentPos === -1 || currentPos === 0) {
+highlightedIndex = compatibleIndices[compatibleIndices.length - 1];
+} else {
+highlightedIndex = compatibleIndices[currentPos - 1];
+}
+} else if (event.key === 'Enter') {
+event.preventDefault();
+if (highlightedIndex >= 0 && highlightedIndex < filteredOptions.length) {
+const option = filteredOptions[highlightedIndex];
+if (isModelCompatible(option)) {
+handleSelect(option.id);
+}
+} else if (compatibleIndices.length > 0) {
+// No selection - highlight first compatible option
+highlightedIndex = compatibleIndices[0];
+}
+}
+}
@@ -356,6 +293,14 @@
async function handleSelect(modelId: string) {
if (shouldCloseMenu) {
closeMenu();
+// Focus the chat textarea after model selection
+requestAnimationFrame(() => {
+const textarea = document.querySelector<HTMLTextAreaElement>(
+'[data-slot="chat-form"] textarea'
+);
+textarea?.focus();
+});
}
}
@@ -404,10 +349,7 @@
}
</script>
-<svelte:window onresize={handleResize} />
-<svelte:document onpointerdown={handlePointerDown} onkeydown={handleKeydown} />
-<div class={cn('relative inline-flex flex-col items-end gap-1', className)} bind:this={container}>
+<div class={cn('relative inline-flex flex-col items-end gap-1', className)}>
{#if loading && options.length === 0 && isRouter}
<div class="flex items-center gap-2 text-xs text-muted-foreground">
<Loader2 class="h-3.5 w-3.5 animate-spin" />
@@ -418,9 +360,8 @@
{:else}
{@const selectedOption = getDisplayOption()}
-<div class="relative">
-<button
-type="button"
+<Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}>
+<Popover.Trigger
class={cn(
`inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
!isCurrentModelInCache()
@@ -430,15 +371,11 @@
: isHighlightedCurrentModelActive
? 'text-foreground'
: 'text-muted-foreground',
-isOpen ? 'text-foreground' : '',
-className
+isOpen ? 'text-foreground' : ''
)}
style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
-aria-haspopup={isRouter ? 'listbox' : undefined}
-aria-expanded={isRouter ? isOpen : undefined}
-onclick={toggleOpen}
-bind:this={triggerButton}
-disabled={disabled || updating}
+onclick={handleTriggerClick}
+disabled={disabled || updating || !isRouter}
>
<Package class="h-3.5 w-3.5" />
@@ -451,33 +388,35 @@
{:else if isRouter}
<ChevronDown class="h-3 w-3.5" />
{/if}
-</button>
-{#if isOpen && isRouter}
-<div
-bind:this={menuRef}
-use:portalToBody
-class={cn(
-'fixed z-[1000] overflow-hidden rounded-md border bg-popover shadow-lg transition-opacity',
-menuPosition ? 'opacity-100' : 'pointer-events-none opacity-0'
-)}
-role="listbox"
-style:top={menuPosition ? `${menuPosition.top}px` : undefined}
-style:left={menuPosition ? `${menuPosition.left}px` : undefined}
-style:width={menuPosition ? `${menuPosition.width}px` : undefined}
-data-placement={menuPosition?.placement ?? 'bottom'}
+</Popover.Trigger>
+<Popover.Content
+class="group/popover-content w-96 max-w-[calc(100vw-2rem)] p-0"
+align="end"
+sideOffset={8}
+collisionPadding={16}
>
+<div class="flex max-h-[50dvh] flex-col overflow-hidden">
<div
-class="overflow-y-auto py-1"
-style:max-height={menuPosition && menuPosition.maxHeight > 0
-? `${menuPosition.maxHeight}px`
-: undefined}
+class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0"
+>
+<SearchInput
+id="model-search"
+placeholder="Search models..."
+bind:value={searchTerm}
+bind:ref={searchInputRef}
+onClose={closeMenu}
+onKeyDown={handleSearchKeyDown}
+/>
+</div>
+<div
+class="models-list order-2 min-h-0 flex-1 overflow-y-auto group-data-[side=top]/popover-content:order-1"
>
{#if !isCurrentModelInCache() && currentModel}
<!-- Show unavailable model as first option (disabled) -->
<button
type="button"
-class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-3 py-2 text-left text-sm text-red-400"
+class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-4 py-2 text-left text-sm text-red-400"
role="option"
aria-selected="true"
aria-disabled="true"
@@ -488,20 +427,25 @@
</button>
<div class="my-1 h-px bg-border"></div>
{/if}
-{#each options as option (option.id)}
+{#if filteredOptions.length === 0}
+<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
+{/if}
+{#each filteredOptions as option, index (option.id)}
{@const status = getModelStatus(option.model)}
{@const isLoaded = status === ServerModelStatus.LOADED}
{@const isLoading = status === ServerModelStatus.LOADING}
{@const isSelected = currentModel === option.model || activeId === option.id}
{@const isCompatible = isModelCompatible(option)}
+{@const isHighlighted = index === highlightedIndex}
{@const missingModalities = getMissingModalities(option)}
<div
class={cn(
-'group flex w-full items-center gap-2 px-3 py-2 text-left text-sm transition focus:outline-none',
+'group flex w-full items-center gap-2 px-4 py-2 text-left text-sm transition focus:outline-none',
isCompatible
? 'cursor-pointer hover:bg-muted focus:bg-muted'
: 'cursor-not-allowed opacity-50',
-isSelected
+isSelected || isHighlighted
? 'bg-accent text-accent-foreground'
: isCompatible
? 'hover:bg-accent hover:text-accent-foreground'
@@ -509,10 +453,11 @@
isLoaded ? 'text-popover-foreground' : 'text-muted-foreground'
)}
role="option"
-aria-selected={isSelected}
+aria-selected={isSelected || isHighlighted}
aria-disabled={!isCompatible}
tabindex={isCompatible ? 0 : -1}
onclick={() => isCompatible && handleSelect(option.id)}
+onmouseenter={() => (highlightedIndex = index)}
onkeydown={(e) => {
if (isCompatible && (e.key === 'Enter' || e.key === ' ')) {
e.preventDefault();
@@ -586,8 +531,8 @@
{/each}
</div>
</div>
-{/if}
-</div>
+</Popover.Content>
+</Popover.Root>
{/if}
</div>

View File

@@ -0,0 +1,19 @@
import Root from './popover.svelte';
import Close from './popover-close.svelte';
import Content from './popover-content.svelte';
import Trigger from './popover-trigger.svelte';
import Portal from './popover-portal.svelte';
export {
Root,
Content,
Trigger,
Close,
Portal,
//
Root as Popover,
Content as PopoverContent,
Trigger as PopoverTrigger,
Close as PopoverClose,
Portal as PopoverPortal
};

View File

@@ -0,0 +1,7 @@
<script lang="ts">
import { Popover as PopoverPrimitive } from 'bits-ui';
let { ref = $bindable(null), ...restProps }: PopoverPrimitive.CloseProps = $props();
</script>
<PopoverPrimitive.Close bind:ref data-slot="popover-close" {...restProps} />

View File

@@ -0,0 +1,37 @@
<script lang="ts">
import { Popover as PopoverPrimitive } from 'bits-ui';
import PopoverPortal from './popover-portal.svelte';
import { cn, type WithoutChildrenOrChild } from '$lib/components/ui/utils.js';
import type { ComponentProps } from 'svelte';
let {
ref = $bindable(null),
class: className,
sideOffset = 4,
side,
align = 'center',
collisionPadding = 8,
avoidCollisions = true,
portalProps,
...restProps
}: PopoverPrimitive.ContentProps & {
portalProps?: WithoutChildrenOrChild<ComponentProps<typeof PopoverPortal>>;
} = $props();
</script>
<PopoverPortal {...portalProps}>
<PopoverPrimitive.Content
bind:ref
data-slot="popover-content"
{sideOffset}
{side}
{align}
{collisionPadding}
{avoidCollisions}
class={cn(
'z-50 w-72 origin-(--bits-popover-content-transform-origin) rounded-md border bg-popover p-4 text-popover-foreground shadow-md outline-hidden data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-end-2 data-[side=right]:slide-in-from-start-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95',
className
)}
{...restProps}
/>
</PopoverPortal>

View File

@@ -0,0 +1,7 @@
<script lang="ts">
import { Popover as PopoverPrimitive } from 'bits-ui';
let { ...restProps }: PopoverPrimitive.PortalProps = $props();
</script>
<PopoverPrimitive.Portal {...restProps} />

View File

@@ -0,0 +1,17 @@
<script lang="ts">
import { cn } from '$lib/components/ui/utils.js';
import { Popover as PopoverPrimitive } from 'bits-ui';
let {
ref = $bindable(null),
class: className,
...restProps
}: PopoverPrimitive.TriggerProps = $props();
</script>
<PopoverPrimitive.Trigger
bind:ref
data-slot="popover-trigger"
class={cn('', className)}
{...restProps}
/>

View File

@@ -0,0 +1,7 @@
<script lang="ts">
import { Popover as PopoverPrimitive } from 'bits-ui';
let { open = $bindable(false), ...restProps }: PopoverPrimitive.RootProps = $props();
</script>
<PopoverPrimitive.Root bind:open {...restProps} />

View File

@@ -1,3 +1,2 @@
export const VIEWPORT_GUTTER = 8;
export const MENU_OFFSET = 6;
-export const MENU_MAX_WIDTH = 320;

View File

@@ -295,14 +295,21 @@ class ModelsStore {
* Fetch props for a specific model from /props endpoint
* Uses caching to avoid redundant requests
*
+* In ROUTER mode, this will only fetch props if the model is loaded,
+* since unloaded models return 400 from /props endpoint.
+*
* @param modelId - Model identifier to fetch props for
-* @returns Props data or null if fetch failed
+* @returns Props data or null if fetch failed or model not loaded
*/
async fetchModelProps(modelId: string): Promise<ApiLlamaCppServerProps | null> {
// Return cached props if available
const cached = this.modelPropsCache.get(modelId);
if (cached) return cached;
+if (serverStore.isRouterMode && !this.isModelLoaded(modelId)) {
+return null;
+}
// Avoid duplicate fetches
if (this.modelPropsFetching.has(modelId)) return null;