Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into sf/deepseek-ocr

bluebread 2025-12-03 07:52:26 +00:00
commit 43dfc0c8d6
234 changed files with 13308 additions and 4623 deletions


@ -66,14 +66,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -C ./build/bin .
- name: Upload artifacts
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
name: llama-bin-macos-arm64.tar.gz
macOS-x64:
runs-on: macos-15-intel
@ -120,14 +127,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -C ./build/bin .
- name: Upload artifacts
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
name: llama-bin-macos-x64.tar.gz
ubuntu-22-cpu:
strategy:
matrix:
@ -182,14 +196,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz -C ./build/bin .
- name: Upload artifacts
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
name: llama-bin-ubuntu-${{ matrix.build }}.zip
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
ubuntu-22-vulkan:
runs-on: ubuntu-22.04
@ -235,14 +256,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz -C ./build/bin .
- name: Upload artifacts
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
name: llama-bin-ubuntu-vulkan-x64.zip
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
name: llama-bin-ubuntu-vulkan-x64.tar.gz
windows-cpu:
runs-on: windows-2025
@ -298,7 +326,7 @@ jobs:
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
@ -380,7 +408,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
7z a -snl llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
- name: Upload artifacts
uses: actions/upload-artifact@v4
@ -434,7 +462,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
7z a -snl llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
- name: Upload artifacts
uses: actions/upload-artifact@v4
@ -526,7 +554,7 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-bin-win-sycl-x64.zip ./build/bin/*
7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package
uses: actions/upload-artifact@v4
@ -632,7 +660,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
7z a -snl llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
@ -685,13 +713,20 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
zip -y -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework
- name: Upload artifacts
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
name: llama-${{ steps.tag.outputs.name }}-xcframework
name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
openEuler-cann:
strategy:
@ -730,14 +765,21 @@ jobs:
- name: Pack artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip ./build/bin/*
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz -C ./build/bin .
- name: Upload artifacts
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -814,6 +856,7 @@ jobs:
echo "Moving other artifacts..."
mv -v artifact/*.zip release
mv -v artifact/*.tar.gz release
- name: Create release
id: create_release
@ -822,6 +865,39 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.tag.outputs.name }}
body: |
> [!WARNING]
> **Release Format Update**: Linux releases will soon use .tar.gz archives instead of .zip. Please make the necessary changes to your deployment scripts.
**macOS/iOS:**
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz)
**Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
**Windows:**
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
- [Windows x64 (CUDA)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip)
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
**openEuler:**
- [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
- [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
- [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
- [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)
<details>
${{ github.event.head_commit.message }}
</details>
- name: Upload release
id: upload_release
@ -833,7 +909,7 @@ jobs:
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./release')) {
if (path.extname(file) === '.zip') {
if (path.extname(file) === '.zip' || file.endsWith('.tar.gz')) {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
owner: context.repo.owner,
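The updated upload filter needs the explicit endsWith('.tar.gz') check because a file's extension in the path.extname sense is only the last dot-segment: extname('x.tar.gz') is '.gz', so an extension comparison against '.zip' alone would silently skip the new tarballs. A minimal sketch of the same predicate in C++ (the suffix helper is redefined here so the sketch stands alone; common.h has an equivalent string_ends_with):

#include <string>

// suffix check; redefined locally so this compiles on its own
static bool ends_with(const std::string & s, const std::string & suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

// accept the same release assets as the workflow script:
// .zip by extension, .tar.gz by explicit suffix
static bool is_release_asset(const std::string & name) {
    return ends_with(name, ".zip") || ends_with(name, ".tar.gz");
}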


@ -9,6 +9,7 @@ jobs:
update:
name: Update Winget Package
runs-on: ubuntu-latest
if: ${{ github.repository.owner.login == 'ggml-org' }}
steps:
- name: Install cargo binstall


@ -7,7 +7,7 @@
/ci/ @ggerganov
/cmake/ @ggerganov
/common/CMakeLists.txt @ggerganov
/common/arg.* @ggerganov @ericcurtin
/common/arg.* @ggerganov
/common/base64.hpp.* @ggerganov
/common/build-info.* @ggerganov
/common/common.* @ggerganov
@ -87,8 +87,7 @@
/tools/perplexity/ @ggerganov
/tools/quantize/ @ggerganov
/tools/rpc/ @rgerganov
/tools/run/ @ericcurtin
/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
/tools/server/* @ngxson @ggerganov # no subdir
/tools/server/webui/ @allozaur
/tools/tokenize/ @ggerganov
/tools/tts/ @ggerganov


@ -19,6 +19,7 @@ The project differentiates between 3 levels of contributors:
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
- Using AI to generate PRs is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before publishing the PR. Note that trivial tab autocompletions do not require disclosure.
# Pull requests (for maintainers)


@ -613,3 +613,4 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain


@ -65,4 +65,6 @@ However, if you have discovered a security vulnerability in this project, please
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public disclosure.


@ -212,13 +212,13 @@ struct handle_model_result {
static handle_model_result common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
const std::string & model_path_default,
bool offline) {
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
{
if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
model.path = common_docker_resolve_model(model.docker_repo);
model.name = model.docker_repo; // set name for consistency
} else if (!model.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (model.hf_file.empty()) {
@ -227,7 +227,8 @@ static handle_model_result common_params_handle_model(
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1); // built without CURL, error message already printed
}
model.hf_repo = auto_detected.repo;
model.name = model.hf_repo; // repo name with tag
model.hf_repo = auto_detected.repo; // repo name without tag
model.hf_file = auto_detected.ggufFile;
if (!auto_detected.mmprojFile.empty()) {
result.found_mmproj = true;
@ -257,8 +258,6 @@ static handle_model_result common_params_handle_model(
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
} else if (model.path.empty()) {
model.path = model_path_default;
}
}
@ -405,7 +404,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// handle model and download
{
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
if (params.no_mmproj) {
params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@ -415,12 +414,18 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// only download mmproj if the current example is using it
for (auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
common_params_handle_model(params.mmproj, params.hf_token, params.offline);
break;
}
}
common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
}
// model is required (except for server)
// TODO @ngxson : maybe show a list of available models in CLI in this case
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER) {
throw std::invalid_argument("error: --model is required\n");
}
if (params.escape) {
@ -2105,11 +2110,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-m", "--model"}, "FNAME",
ex == LLAMA_EXAMPLE_EXPORT_LORA
? std::string("model path from which to load base model")
: string_format(
"model path (default: `models/$filename` with filename from `--hf-file` "
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
),
? "model path from which to load base model"
: "model path to load",
[](common_params & params, const std::string & value) {
params.model.path = value;
}
@ -2507,6 +2509,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--models-dir"}, "PATH",
"directory containing models for the router server (default: disabled)",
[](common_params & params, const std::string & value) {
params.models_dir = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
add_opt(common_arg(
{"--models-max"}, "N",
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
[](common_params & params, int value) {
params.models_max = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
add_opt(common_arg(
{"--no-models-autoload"},
"disables automatic loading of models (default: enabled)",
[](common_params & params) {
params.models_autoload = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
add_opt(common_arg(
{"--jinja"},
string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
@ -2654,7 +2677,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params &, const std::string & value) {
common_log_set_file(common_log_main(), value.c_str());
}
));
).set_env("LLAMA_LOG_FILE"));
add_opt(common_arg(
{"--log-colors"}, "[on|off|auto]",
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@ -2689,7 +2712,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_OFFLINE"));
add_opt(common_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
"Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
" - 0: generic output\n"
" - 1: error\n"
" - 2: warning\n"
" - 3: info\n"
" - 4: debug\n"
"(default: %d)\n", params.verbosity),
[](common_params & params, int value) {
params.verbosity = value;
common_log_set_verbosity_thold(value);
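For orientation, the three router-server options registered above (--models-dir, --models-max, --no-models-autoload) populate the new fields added to common_params in the common.h hunk later in this commit. An illustrative mirror of that configuration surface, not the real struct:

#include <string>

struct router_settings {                 // sketch of the corresponding common_params fields
    std::string models_dir;              // --models-dir         / LLAMA_ARG_MODELS_DIR (unset = disabled)
    int         models_max      = 4;     // --models-max         / LLAMA_ARG_MODELS_MAX (0 = unlimited)
    bool        models_autoload = true;  // --no-models-autoload / LLAMA_ARG_NO_MODELS_AUTOLOAD clears this
};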


@ -163,7 +163,7 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
if (tool_choice == "required") {
return COMMON_CHAT_TOOL_CHOICE_REQUIRED;
}
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
throw std::invalid_argument("Invalid tool_choice: " + tool_choice);
}
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
@ -186,17 +186,17 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
try {
if (!messages.is_array()) {
throw std::runtime_error("Expected 'messages' to be an array, got " + messages.dump());
throw std::invalid_argument("Expected 'messages' to be an array, got " + messages.dump());
}
for (const auto & message : messages) {
if (!message.is_object()) {
throw std::runtime_error("Expected 'message' to be an object, got " + message.dump());
throw std::invalid_argument("Expected 'message' to be an object, got " + message.dump());
}
common_chat_msg msg;
if (!message.contains("role")) {
throw std::runtime_error("Missing 'role' in message: " + message.dump());
throw std::invalid_argument("Missing 'role' in message: " + message.dump());
}
msg.role = message.at("role");
@ -209,11 +209,11 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
} else if (content.is_array()) {
for (const auto & part : content) {
if (!part.contains("type")) {
throw std::runtime_error("Missing content part type: " + part.dump());
throw std::invalid_argument("Missing content part type: " + part.dump());
}
const auto & type = part.at("type");
if (type != "text") {
throw std::runtime_error("Unsupported content part type: " + type.dump());
throw std::invalid_argument("Unsupported content part type: " + type.dump());
}
common_chat_msg_content_part msg_part;
msg_part.type = type;
@ -221,25 +221,25 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
msg.content_parts.push_back(msg_part);
}
} else if (!content.is_null()) {
throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
throw std::invalid_argument("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
}
}
if (has_tool_calls) {
for (const auto & tool_call : message.at("tool_calls")) {
common_chat_tool_call tc;
if (!tool_call.contains("type")) {
throw std::runtime_error("Missing tool call type: " + tool_call.dump());
throw std::invalid_argument("Missing tool call type: " + tool_call.dump());
}
const auto & type = tool_call.at("type");
if (type != "function") {
throw std::runtime_error("Unsupported tool call type: " + tool_call.dump());
throw std::invalid_argument("Unsupported tool call type: " + tool_call.dump());
}
if (!tool_call.contains("function")) {
throw std::runtime_error("Missing tool call function: " + tool_call.dump());
throw std::invalid_argument("Missing tool call function: " + tool_call.dump());
}
const auto & fc = tool_call.at("function");
if (!fc.contains("name")) {
throw std::runtime_error("Missing tool call name: " + tool_call.dump());
throw std::invalid_argument("Missing tool call name: " + tool_call.dump());
}
tc.name = fc.at("name");
tc.arguments = fc.at("arguments");
@ -250,7 +250,7 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
}
}
if (!has_content && !has_tool_calls) {
throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
throw std::invalid_argument("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
}
if (message.contains("reasoning_content")) {
msg.reasoning_content = message.at("reasoning_content");
@ -353,18 +353,18 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
try {
if (!tools.is_null()) {
if (!tools.is_array()) {
throw std::runtime_error("Expected 'tools' to be an array, got " + tools.dump());
throw std::invalid_argument("Expected 'tools' to be an array, got " + tools.dump());
}
for (const auto & tool : tools) {
if (!tool.contains("type")) {
throw std::runtime_error("Missing tool type: " + tool.dump());
throw std::invalid_argument("Missing tool type: " + tool.dump());
}
const auto & type = tool.at("type");
if (!type.is_string() || type != "function") {
throw std::runtime_error("Unsupported tool type: " + tool.dump());
throw std::invalid_argument("Unsupported tool type: " + tool.dump());
}
if (!tool.contains("function")) {
throw std::runtime_error("Missing tool function: " + tool.dump());
throw std::invalid_argument("Missing tool function: " + tool.dump());
}
const auto & function = tool.at("function");


@ -912,7 +912,7 @@ std::string fs_get_cache_file(const std::string & filename) {
return cache_directory + filename;
}
std::vector<common_file_info> fs_list_files(const std::string & path) {
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
std::vector<common_file_info> files;
if (path.empty()) return files;
@ -929,12 +929,20 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
common_file_info info;
info.path = p.string();
info.name = p.filename().string();
info.is_dir = false;
try {
info.size = static_cast<size_t>(std::filesystem::file_size(p));
} catch (const std::filesystem::filesystem_error &) {
info.size = 0;
}
files.push_back(std::move(info));
} else if (include_directories && std::filesystem::is_directory(p)) {
common_file_info info;
info.path = p.string();
info.name = p.filename().string();
info.size = 0; // Directories have no size
info.is_dir = true;
files.push_back(std::move(info));
}
} catch (const std::filesystem::filesystem_error &) {
// skip entries we cannot inspect
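The renamed helper keeps the previous file-listing behavior when include_directories is false; passing true additionally yields directory entries with is_dir set and size 0. A minimal usage sketch under those assumptions (the models path is illustrative):

#include "common.h"

#include <cstdio>

int main() {
    for (const auto & info : fs_list("./models", /*include_directories=*/true)) {
        // directory entries come back with is_dir = true and size = 0
        std::printf("%-40s %s %zu bytes\n",
                    info.name.c_str(), info.is_dir ? "dir " : "file", info.size);
    }
    return 0;
}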


@ -26,8 +26,6 @@
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
struct common_time_meas {
common_time_meas(int64_t & t_acc, bool disable = false);
~common_time_meas();
@ -223,6 +221,7 @@ struct common_params_model {
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string docker_repo = ""; // Docker repo // NOLINT
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
};
struct common_params_speculative {
@ -369,7 +368,7 @@ struct common_params {
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
int32_t verbosity = 0;
int32_t verbosity = 3; // LOG_LEVEL_INFO
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
bool offline = false;
@ -479,6 +478,11 @@ struct common_params {
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
// router server configs
std::string models_dir = ""; // directory containing models for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
bool log_json = false;
std::string slot_save_path;
@ -642,8 +646,9 @@ struct common_file_info {
std::string path;
std::string name;
size_t size = 0; // in bytes
bool is_dir = false;
};
std::vector<common_file_info> fs_list_files(const std::string & path);
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
//
// Model utils


@ -430,7 +430,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 1L);
curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 0L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
auto data_vec = static_cast<std::vector<char> *>(data);
@ -517,16 +517,18 @@ static bool common_pull_file(httplib::Client & cli,
headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
}
std::atomic<size_t> downloaded{existing_size};
const char * func = __func__; // avoid __func__ inside a lambda
size_t downloaded = existing_size;
size_t progress_step = 0;
auto res = cli.Get(resolve_path, headers,
[&](const httplib::Response &response) {
if (existing_size > 0 && response.status != 206) {
LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", __func__, response.status);
LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", func, response.status);
return false;
}
if (existing_size == 0 && response.status != 200) {
LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status);
LOG_WRN("%s: download received non-successful status code: %d\n", func, response.status);
return false;
}
if (total_size == 0 && response.has_header("Content-Length")) {
@ -534,7 +536,7 @@ static bool common_pull_file(httplib::Client & cli,
size_t content_length = std::stoull(response.get_header_value("Content-Length"));
total_size = existing_size + content_length;
} catch (const std::exception &e) {
LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what());
LOG_WRN("%s: invalid Content-Length header: %s\n", func, e.what());
}
}
return true;
@ -542,11 +544,16 @@ static bool common_pull_file(httplib::Client & cli,
[&](const char *data, size_t len) {
ofs.write(data, len);
if (!ofs) {
LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str());
LOG_ERR("%s: error writing to file: %s\n", func, path_tmp.c_str());
return false;
}
downloaded += len;
progress_step += len;
if (progress_step >= total_size / 1000 || downloaded == total_size) {
print_progress(downloaded, total_size);
progress_step = 0;
}
return true;
},
nullptr
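The rewritten body callback throttles progress reporting: bytes accumulate in progress_step and a report is printed only once at least total_size / 1000 bytes (about 0.1%) have arrived, or when the download completes, instead of once per received chunk. A standalone sketch of that rule (print_progress is a stand-in for the helper used above, which this hunk does not show):

#include <cstddef>
#include <cstdio>

// stand-in progress printer
static void print_progress(size_t downloaded, size_t total) {
    std::fprintf(stderr, "\rdownloaded %zu / %zu bytes", downloaded, total);
}

// same throttling rule as the diff: report at most ~1000 times per download
static void on_chunk(size_t len, size_t total_size, size_t & downloaded, size_t & progress_step) {
    downloaded    += len;
    progress_step += len;
    if (progress_step >= total_size / 1000 || downloaded == total_size) {
        print_progress(downloaded, total_size);
        progress_step = 0;
    }
}
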
@ -1047,7 +1054,7 @@ std::string common_docker_resolve_model(const std::string &) {
std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;
const std::string cache_dir = fs_get_cache_directory();
const std::vector<common_file_info> files = fs_list_files(cache_dir);
const std::vector<common_file_info> files = fs_list(cache_dir, false);
for (const auto & file : files) {
if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
common_cached_model_info model_info;


@ -14,8 +14,10 @@ struct common_cached_model_info {
std::string model;
std::string tag;
size_t size = 0; // GGUF size in bytes
// return string representation like "user/model:tag"
// if tag is "latest", it will be omitted
std::string to_string() const {
return user + "/" + model + ":" + tag;
return user + "/" + model + (tag == "latest" ? "" : ":" + tag);
}
};
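A quick illustration of the new formatting rule (the field values and include path are hypothetical): the implicit "latest" tag is dropped, any other tag is kept.

#include "common.h" // assumed: whichever header declares common_cached_model_info

#include <cstdio>

int main() {
    common_cached_model_info info;
    info.user  = "ggml-org"; // hypothetical cache entry
    info.model = "gemma-3";
    info.tag   = "latest";
    std::printf("%s\n", info.to_string().c_str()); // "ggml-org/gemma-3" (implicit "latest" omitted)
    info.tag   = "q4_0";
    std::printf("%s\n", info.to_string().c_str()); // "ggml-org/gemma-3:q4_0" (explicit tag kept)
    return 0;
}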


@ -974,7 +974,7 @@ public:
void check_errors() {
if (!_errors.empty()) {
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
}
if (!_warnings.empty()) {
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());


@ -443,8 +443,22 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}
static int common_get_verbosity(enum ggml_log_level level) {
switch (level) {
case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
case GGML_LOG_LEVEL_INFO: return LOG_LEVEL_INFO;
case GGML_LOG_LEVEL_WARN: return LOG_LEVEL_WARN;
case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
case GGML_LOG_LEVEL_CONT: return LOG_LEVEL_INFO; // same as INFO
case GGML_LOG_LEVEL_NONE:
default:
return LOG_LEVEL_OUTPUT;
}
}
void common_log_default_callback(enum ggml_log_level level, const char * text, void * /*user_data*/) {
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
auto verbosity = common_get_verbosity(level);
if (verbosity <= common_log_verbosity_thold) {
common_log_add(common_log_main(), level, "%s", text);
}
}


@ -21,8 +21,14 @@
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0
#define LOG_LEVEL_DEBUG 4
#define LOG_LEVEL_INFO 3
#define LOG_LEVEL_WARN 2
#define LOG_LEVEL_ERROR 1
#define LOG_LEVEL_OUTPUT 0 // output data from tools
#define LOG_DEFAULT_DEBUG LOG_LEVEL_DEBUG
#define LOG_DEFAULT_LLAMA LOG_LEVEL_INFO
enum log_colors {
LOG_COLORS_AUTO = -1,
@ -67,10 +73,11 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
//
// I - info (stdout, V = 0)
// W - warning (stderr, V = 0)
// E - error (stderr, V = 0)
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
// I - info (stdout, V = LOG_DEFAULT_INFO)
// W - warning (stderr, V = LOG_DEFAULT_WARN)
// E - error (stderr, V = LOG_DEFAULT_ERROR)
// O - output (stdout, V = LOG_DEFAULT_OUTPUT)
//
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
@ -95,14 +102,14 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps); // w
} \
} while (0)
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, LOG_LEVEL_OUTPUT, __VA_ARGS__)
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, LOG_LEVEL_WARN, __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, LOG_LEVEL_INFO, __VA_ARGS__) // same as INFO
#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
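Under this scheme a message is emitted only when its verbosity is at most the global threshold, and the default threshold becomes LOG_LEVEL_INFO (3) via the common.h change elsewhere in this commit: output, error, warning, and info are shown by default, while debug requires -lv 4. A minimal sketch of the filtering rule these macros expand to (the LOG_TMPL internals are elided in this hunk):

// levels from this header: OUTPUT=0, ERROR=1, WARN=2, INFO=3, DEBUG=4
static bool should_emit(int msg_verbosity, int thold) {
    return msg_verbosity <= thold;
}

// should_emit(LOG_LEVEL_ERROR, 3) -> true  (errors shown at the default threshold)
// should_emit(LOG_LEVEL_DEBUG, 3) -> false (debug hidden unless -lv 4 or higher)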


@ -1592,17 +1592,27 @@ class MmprojModel(ModelBase):
# load preprocessor config
self.preprocessor_config = {}
if not self.is_mistral_format:
# check if preprocessor_config.json exists
if (self.dir_model / "preprocessor_config.json").is_file():
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
else:
# try "processing_config" file if exists
if (self.dir_model / "processing_config.json").is_file():
with open(self.dir_model / "processing_config.json", "r", encoding="utf-8") as f:
# prefer preprocessor_config.json if possible
preprocessor_config_path = self.dir_model / "preprocessor_config.json"
if preprocessor_config_path.is_file():
with open(preprocessor_config_path, "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
# prefer processor_config.json if possible
processor_config_path = self.dir_model / "processor_config.json"
if processor_config_path.is_file():
with open(processor_config_path, "r", encoding="utf-8") as f:
cfg = json.load(f)
# move image_processor to root level for compat
if "image_processor" in cfg:
cfg = {
**cfg,
**cfg["image_processor"],
}
# merge configs
self.preprocessor_config = {**self.preprocessor_config, **cfg}
def get_vision_config(self) -> dict[str, Any] | None:
config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
return self.global_config.get(config_name)
@ -2815,9 +2825,38 @@ class Llama4VisionModel(MmprojModel):
@ModelBase.register("Mistral3ForConditionalGeneration")
class Mistral3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA
model_arch = gguf.MODEL_ARCH.MISTRAL3
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# for compatibility, we use LLAMA arch for older models
# TODO: remove this once everyone has migrated to newer version of llama.cpp
if self.hparams.get("model_type") != "ministral3":
self.model_arch = gguf.MODEL_ARCH.LLAMA
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
def set_gguf_parameters(self):
super().set_gguf_parameters()
rope_params = self.hparams.get("rope_parameters")
if self.hparams.get("model_type") == "ministral3":
assert rope_params is not None, "ministral3 must have 'rope_parameters' config"
assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
self.gguf_writer.add_rope_freq_base(rope_params["rope_theta"])
self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
# TODO: probably not worth supporting quantized weight, as official BF16 is also available
if name.endswith("weight_scale_inv"):
raise ValueError("This is a quantized weight, please use BF16 weight instead")
name = name.replace("language_model.", "")
if "multi_modal_projector" in name or "vision_tower" in name:
return []
@ -9941,12 +9980,22 @@ class ApertusModel(LlamaModel):
class MistralModel(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA
model_arch = gguf.MODEL_ARCH.MISTRAL3
model_name = "Mistral"
hf_arch = ""
is_mistral_format = True
undo_permute = False
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# for compatibility, we use LLAMA arch for older models
# TODO: remove this once everyone migrates to newer version of llama.cpp
if "llama_4_scaling" not in self.hparams:
self.model_arch = gguf.MODEL_ARCH.LLAMA
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@staticmethod
def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
@ -9986,6 +10035,20 @@ class MistralModel(LlamaModel):
return template
def set_gguf_parameters(self):
super().set_gguf_parameters()
if "yarn" in self.hparams:
yarn_params = self.hparams["yarn"]
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
self.gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
if "llama_4_scaling" in self.hparams:
self.gguf_writer.add_attn_temperature_scale(self.hparams["llama_4_scaling"]["beta"])
class PixtralModel(LlavaVisionModel):
model_name = "Pixtral"


@ -21,11 +21,11 @@ Legend:
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
@ -36,10 +36,10 @@ Legend:
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CUMSUM | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | | ❌ |
| CUMSUM | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
@ -102,7 +102,7 @@ Legend:
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
@ -115,7 +115,8 @@ Legend:
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | 🟡 | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ |
| TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |


@ -5005,8 +5005,8 @@
"Vulkan0","DUP","type=f16,ne=[10,10,5,1],permute=[0,2,1,3]","support","1","yes","Vulkan"
"Vulkan0","DUP","type=f32,ne=[10,10,5,1],permute=[1,0,2,3]","support","1","yes","Vulkan"
"Vulkan0","DUP","type=f16,ne=[10,10,5,1],permute=[1,0,2,3]","support","1","yes","Vulkan"
"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[0,2,1,3]","support","0","no","Vulkan"
"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[1,2,0,3]","support","0","no","Vulkan"
"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[0,2,1,3]","support","1","yes","Vulkan"
"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[1,2,0,3]","support","1","yes","Vulkan"
"Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=1","support","0","no","Vulkan"
"Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=2","support","0","no","Vulkan"
"Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=3","support","0","no","Vulkan"
@ -5032,14 +5032,14 @@
"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
@ -5271,7 +5271,7 @@
"Vulkan0","CPY","type_src=bf16,type_dst=f16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=f16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=q4_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
@ -5415,21 +5415,49 @@
"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,3,5,7]","support","0","no","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
"Vulkan0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
"Vulkan0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
@ -5655,6 +5683,7 @@
"Vulkan0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
"Vulkan0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
"Vulkan0","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","Vulkan"
"Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","Vulkan"
"Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","Vulkan"
@ -8644,9 +8673,13 @@
"Vulkan0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","0","no","Vulkan"
"Vulkan0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","0","no","Vulkan"
"Vulkan0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","Vulkan"
"Vulkan0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
@ -8666,9 +8699,13 @@
"Vulkan0","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","Vulkan"
"Vulkan0","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","Vulkan"
"Vulkan0","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","Vulkan"
"Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","Vulkan"
"Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","Vulkan"
@ -9411,28 +9448,405 @@
"Vulkan0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","Vulkan"
"Vulkan0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","Vulkan"
"Vulkan0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","Vulkan"
@ -9445,6 +9859,10 @@
"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","0","no","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","0","no","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
@ -9479,23 +9897,37 @@
"Vulkan0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","Vulkan"
"Vulkan0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","Vulkan"
"Vulkan0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","Vulkan"
"Vulkan0","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","Vulkan"
"Vulkan0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","Vulkan"
"Vulkan0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","Vulkan"
"Vulkan0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","Vulkan"
"Vulkan0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","Vulkan"
"Vulkan0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","Vulkan"
"Vulkan0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","Vulkan"
"Vulkan0","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","Vulkan"
"Vulkan0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","Vulkan"
"Vulkan0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","Vulkan"


@ -408,62 +408,67 @@ if (MSVC)
/wd4996 # Disable POSIX deprecation warnings
/wd4702 # Unreachable code warnings
)
function(disable_msvc_warnings target_name)
set(MSVC_COMPILE_OPTIONS
"$<$<COMPILE_LANGUAGE:C>:/utf-8>"
"$<$<COMPILE_LANGUAGE:CXX>:/utf-8>"
)
function(configure_msvc_target target_name)
if(TARGET ${target_name})
target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
target_compile_options(${target_name} PRIVATE ${MSVC_COMPILE_OPTIONS})
endif()
endfunction()
disable_msvc_warnings(ggml-base)
disable_msvc_warnings(ggml)
disable_msvc_warnings(ggml-cpu)
disable_msvc_warnings(ggml-cpu-x64)
disable_msvc_warnings(ggml-cpu-sse42)
disable_msvc_warnings(ggml-cpu-sandybridge)
disable_msvc_warnings(ggml-cpu-haswell)
disable_msvc_warnings(ggml-cpu-skylakex)
disable_msvc_warnings(ggml-cpu-icelake)
disable_msvc_warnings(ggml-cpu-alderlake)
configure_msvc_target(ggml-base)
configure_msvc_target(ggml)
configure_msvc_target(ggml-cpu)
configure_msvc_target(ggml-cpu-x64)
configure_msvc_target(ggml-cpu-sse42)
configure_msvc_target(ggml-cpu-sandybridge)
configure_msvc_target(ggml-cpu-haswell)
configure_msvc_target(ggml-cpu-skylakex)
configure_msvc_target(ggml-cpu-icelake)
configure_msvc_target(ggml-cpu-alderlake)
if (GGML_BUILD_EXAMPLES)
disable_msvc_warnings(common-ggml)
disable_msvc_warnings(common)
configure_msvc_target(common-ggml)
configure_msvc_target(common)
disable_msvc_warnings(mnist-common)
disable_msvc_warnings(mnist-eval)
disable_msvc_warnings(mnist-train)
configure_msvc_target(mnist-common)
configure_msvc_target(mnist-eval)
configure_msvc_target(mnist-train)
disable_msvc_warnings(gpt-2-ctx)
disable_msvc_warnings(gpt-2-alloc)
disable_msvc_warnings(gpt-2-backend)
disable_msvc_warnings(gpt-2-sched)
disable_msvc_warnings(gpt-2-quantize)
disable_msvc_warnings(gpt-2-batched)
configure_msvc_target(gpt-2-ctx)
configure_msvc_target(gpt-2-alloc)
configure_msvc_target(gpt-2-backend)
configure_msvc_target(gpt-2-sched)
configure_msvc_target(gpt-2-quantize)
configure_msvc_target(gpt-2-batched)
disable_msvc_warnings(gpt-j)
disable_msvc_warnings(gpt-j-quantize)
configure_msvc_target(gpt-j)
configure_msvc_target(gpt-j-quantize)
disable_msvc_warnings(magika)
disable_msvc_warnings(yolov3-tiny)
disable_msvc_warnings(sam)
configure_msvc_target(magika)
configure_msvc_target(yolov3-tiny)
configure_msvc_target(sam)
disable_msvc_warnings(simple-ctx)
disable_msvc_warnings(simple-backend)
configure_msvc_target(simple-ctx)
configure_msvc_target(simple-backend)
endif()
if (GGML_BUILD_TESTS)
disable_msvc_warnings(test-mul-mat)
disable_msvc_warnings(test-arange)
disable_msvc_warnings(test-backend-ops)
disable_msvc_warnings(test-cont)
disable_msvc_warnings(test-conv-transpose)
disable_msvc_warnings(test-conv-transpose-1d)
disable_msvc_warnings(test-conv1d)
disable_msvc_warnings(test-conv2d)
disable_msvc_warnings(test-conv2d-dw)
disable_msvc_warnings(test-customop)
disable_msvc_warnings(test-dup)
disable_msvc_warnings(test-opt)
disable_msvc_warnings(test-pool)
configure_msvc_target(test-mul-mat)
configure_msvc_target(test-arange)
configure_msvc_target(test-backend-ops)
configure_msvc_target(test-cont)
configure_msvc_target(test-conv-transpose)
configure_msvc_target(test-conv-transpose-1d)
configure_msvc_target(test-conv1d)
configure_msvc_target(test-conv2d)
configure_msvc_target(test-conv2d-dw)
configure_msvc_target(test-customop)
configure_msvc_target(test-dup)
configure_msvc_target(test-opt)
configure_msvc_target(test-pool)
endif ()
endif()


@ -2148,7 +2148,8 @@ extern "C" {
};
enum ggml_scale_flag {
GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8),
GGML_SCALE_FLAG_ANTIALIAS = (1 << 9),
};
// interpolate
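As a usage sketch (not part of this diff), the new flag is meant to be OR-ed into the mode word, assuming the ggml_interpolate() helper declared under the // interpolate comment, whose final argument packs a ggml_scale_mode value in the low bits and ggml_scale_flag bits above them:

#include "ggml.h"

// hypothetical helper showing the call shape for an antialiased 2x downscale
static struct ggml_tensor * downscale_2x_aa(struct ggml_context * ctx, struct ggml_tensor * img) {
    return ggml_interpolate(ctx, img,
            img->ne[0] / 2, img->ne[1] / 2, img->ne[2], img->ne[3],
            GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS);
}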


@ -723,6 +723,12 @@ struct ggml_backend_sched {
bool op_offload;
int debug;
// used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
// ref: https://github.com/ggml-org/llama.cpp/pull/17617
int debug_realloc;
int debug_graph_size;
int debug_prev_graph_size;
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@ -1234,10 +1240,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
}
if (sched->n_copies > 1) {
ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
SET_CAUSE(tensor_copy, "4.cpy");
}
@ -1289,6 +1293,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
}
int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
// remember the actual graph_size for performing reallocation checks later [GGML_SCHED_DEBUG_REALLOC]
sched->debug_prev_graph_size = sched->debug_graph_size;
sched->debug_graph_size = graph_size;
if (sched->graph.size < graph_size) {
sched->graph.size = graph_size;
sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
@ -1395,14 +1404,21 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
// allocate graph
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
#ifdef GGML_SCHED_NO_REALLOC
GGML_ABORT("%s: failed to allocate graph, but graph re-allocation is disabled by GGML_SCHED_NO_REALLOC\n", __func__);
#endif
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
if (sched->debug_realloc > 0) {
// we are interested only in situations where the graph was reallocated even though its size remained the same [GGML_SCHED_DEBUG_REALLOC]
// example: https://github.com/ggml-org/llama.cpp/pull/17143
const bool unexpected = !backend_ids_changed && sched->debug_prev_graph_size == sched->debug_graph_size;
if (unexpected || sched->debug_realloc > 1) {
GGML_ABORT("%s: unexpected graph reallocation (graph size = %d, nodes = %d, leafs = %d), debug_realloc = %d\n", __func__,
sched->debug_graph_size, sched->graph.n_nodes, sched->graph.n_leafs, sched->debug_realloc);
}
}
// the re-allocation may cause the split inputs to be moved to a different address
// synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
for (int i = 0; i < sched->n_backends; i++) {
@ -1620,6 +1636,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
sched->debug_realloc = 0;
#ifdef GGML_SCHED_NO_REALLOC
sched->debug_realloc = 1;
#endif
const char * GGML_SCHED_DEBUG_REALLOC = getenv("GGML_SCHED_DEBUG_REALLOC");
sched->debug_realloc = GGML_SCHED_DEBUG_REALLOC ? atoi(GGML_SCHED_DEBUG_REALLOC) : sched->debug_realloc;
sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
@ -1636,6 +1660,9 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
sched->debug_graph_size = 0;
sched->debug_prev_graph_size = 0;
sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
sched->context_buffer = (char *) malloc(sched->context_buffer_size);
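Taken together, the hunks above give GGML_SCHED_DEBUG_REALLOC three settings: unset/0 disables the check, 1 aborts only on unexpected reallocations (graph size and backend assignments unchanged since the previous split), and values above 1 abort on every reallocation; building with GGML_SCHED_NO_REALLOC defined makes 1 the default. A minimal sketch of forcing the strict mode from a test harness (plain POSIX setenv; must run before the scheduler reads the environment):

#include <cstdlib>

int main(void) {
    setenv("GGML_SCHED_DEBUG_REALLOC", "2", /*overwrite=*/1);
    // ... create the ggml_backend_sched and evaluate graphs as usual ...
    return 0;
}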


@ -2500,6 +2500,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
return false;
}
if (op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS) {
return false;
}
return true;
}
case GGML_OP_POOL_2D:
@ -2561,6 +2564,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
return true;
case GGML_OP_OUT_PROD:
{
#ifdef ASCEND_310P
// Ger is not supported on 310p device
return false;
#endif
switch (op->src[0]->type) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:


@ -8,6 +8,10 @@
#include <sys/sysctl.h>
#endif
#if !defined(HWCAP2_SVE2)
#define HWCAP2_SVE2 (1 << 1)
#endif
#if !defined(HWCAP2_I8MM)
#define HWCAP2_I8MM (1 << 13)
#endif
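These fallback definitions cover toolchains whose headers predate the SVE2 and I8MM hwcaps; the bit values match the Linux UAPI. A minimal sketch of the kind of runtime probe they enable, relying on the fallbacks above when the system headers lack the constants (illustrative only; the file's actual detection code is outside this hunk):

#include <sys/auxv.h>

static bool cpu_has_sve2(void) { return (getauxval(AT_HWCAP2) & HWCAP2_SVE2) != 0; }
static bool cpu_has_i8mm(void) { return (getauxval(AT_HWCAP2) & HWCAP2_I8MM) != 0; }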


@ -683,22 +683,14 @@ bool ggml_is_numa(void) {
}
#if defined(__ARM_ARCH)
#if defined(__linux__) && defined(__aarch64__)
#include <sys/auxv.h>
#endif
static void ggml_init_arm_arch_features(void) {
#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
#if defined(__linux__)
ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
#else
// TODO: add support of SVE for non-linux systems
#error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
#endif
#endif
#include <arm_sve.h>
static void ggml_init_arm_arch_features(void) {
ggml_arm_arch_features.sve_cnt = svcntb();
}
#else
static void ggml_init_arm_arch_features(void) {}
#endif
#endif // __ARM_ARCH
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
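The rewrite above drops the Linux-only prctl(PR_SVE_GET_VL) query in favor of the ACLE intrinsic: svcntb() from <arm_sve.h> returns the runtime SVE vector length in bytes, the same quantity the masked prctl result reported, and it is available on any platform whose compiler defines __ARM_FEATURE_SVE. For example:

#include <arm_sve.h>
#include <cstdint>

// vector length in bytes: 16 on 128-bit SVE hardware, 32 on 256-bit, and so on
uint64_t sve_vl_bytes = svcntb();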


@ -7420,6 +7420,65 @@ static void ggml_compute_forward_upscale_f32(
}
}
}
} else if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) {
// Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True)
// https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
auto triangle_filter = [](float x) -> float {
return std::max(1.0f - fabsf(x), 0.0f);
};
// support and invscale, minimum 1 pixel for bilinear
const float support1 = std::max(1.0f, 1.0f / sf1);
const float invscale1 = 1.0f / support1;
const float support0 = std::max(1.0f, 1.0f / sf0);
const float invscale0 = 1.0f / support0;
for (int64_t i3 = 0; i3 < ne3; i3++) {
const int64_t i03 = i3 / sf3;
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
const int64_t i02 = i2 / sf2;
for (int64_t i1 = 0; i1 < ne1; i1++) {
const float y = ((float) i1 + pixel_offset) / sf1;
for (int64_t i0 = 0; i0 < ne0; i0++) {
const float x = ((float) i0 + pixel_offset) / sf0;
// the range of source pixels that contribute
const int64_t x_min = std::max<int64_t>(x - support0 + pixel_offset, 0);
const int64_t x_max = std::min<int64_t>(x + support0 + pixel_offset, ne00);
const int64_t y_min = std::max<int64_t>(y - support1 + pixel_offset, 0);
const int64_t y_max = std::min<int64_t>(y + support1 + pixel_offset, ne01);
// bilinear filter with antialiasing
float val = 0.0f;
float total_weight = 0.0f;
for (int64_t sy = y_min; sy < y_max; sy++) {
const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1);
for (int64_t sx = x_min; sx < x_max; sx++) {
const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0);
const float weight = weight_x * weight_y;
if (weight <= 0.0f) {
continue;
}
const float pixel = *(const float *)((const char *)src0->data + sx*nb00 + sy*nb01 + i02*nb02 + i03*nb03);
val += pixel * weight;
total_weight += weight;
}
}
if (total_weight > 0.0f) {
val /= total_weight;
}
float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
*dst_ptr = val;
}
}
}
}
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
for (int64_t i3 = 0; i3 < ne3; i3++) {
const int64_t i03 = i3 / sf3;
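A self-contained sketch of the tent-filter weighting the new antialias branch implements, reduced to one dimension (hypothetical helper; names are illustrative, and pixel_offset is assumed to be 0.5 as in the align_corners=false path):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// weights of the source pixels contributing to destination pixel dst_i, for a
// scale factor sf (sf < 1 means downscale) over a source row of src_n pixels
static std::vector<float> tent_weights(int64_t dst_i, float sf, int64_t src_n) {
    const float pixel_offset = 0.5f; // align_corners=false convention
    const float support  = std::max(1.0f, 1.0f / sf); // filter radius widens when downscaling
    const float invscale = 1.0f / support;
    const float x = ((float) dst_i + pixel_offset) / sf; // pixel center in source space
    const int64_t x_min = std::max<int64_t>((int64_t) (x - support + pixel_offset), 0);
    const int64_t x_max = std::min<int64_t>((int64_t) (x + support + pixel_offset), src_n);
    std::vector<float> w;
    float total = 0.0f;
    for (int64_t sx = x_min; sx < x_max; sx++) {
        const float t = std::max(1.0f - std::fabs(((float) sx - x + pixel_offset) * invscale), 0.0f);
        w.push_back(t);
        total += t;
    }
    for (float & t : w) {
        if (total > 0.0f) {
            t /= total; // normalize, matching the total_weight division above
        }
    }
    return w;
}

For upscaling (sf >= 1) the support clamps to a single pixel and this reduces to ordinary bilinear weights; for downscaling it averages roughly 2/sf source pixels per output pixel, which is what suppresses aliasing.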


@ -989,6 +989,10 @@ struct ggml_cuda_concurrent_event {
int n_streams = 0;
std::unordered_map<const ggml_tensor *, int> stream_mapping;
// Original order of nodes in this concurrent region (before interleaving)
// Used to restore grouping for fusion within streams
std::vector<const ggml_tensor *> original_order;
const ggml_tensor * join_node;
ggml_cuda_concurrent_event() = default;
@ -1011,6 +1015,7 @@ struct ggml_cuda_concurrent_event {
, fork_event(other.fork_event)
, n_streams(other.n_streams)
, stream_mapping(std::move(other.stream_mapping))
, original_order(std::move(other.original_order))
, join_node(other.join_node) {
other.fork_event = nullptr;
}
@ -1121,11 +1126,9 @@ struct ggml_cuda_concurrent_event {
};
struct ggml_cuda_stream_context {
std::vector<const ggml_tensor *> original_nodes;
std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event> concurrent_events;
void reset() {
original_nodes.clear();
concurrent_events.clear();
}
};


@ -3238,9 +3238,56 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
}
if (should_launch_concurrent_events) {
//Restore the original graph to enable fusion within the streams
cgraph->nodes = const_cast<ggml_tensor **>(stream_ctx.original_nodes.data());
cgraph->n_nodes = (int) stream_ctx.original_nodes.size();
// Restore original node order within each concurrent region to enable fusion within streams
std::unordered_map<const ggml_tensor *, int> node_to_idx;
node_to_idx.reserve(cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; ++i) {
node_to_idx[cgraph->nodes[i]] = i;
}
for (auto & [fork_node, event] : stream_ctx.concurrent_events) {
// Find positions of all nodes from this event in the current graph
std::vector<int> positions;
positions.reserve(event.original_order.size());
bool all_found = true;
for (const ggml_tensor * orig_node : event.original_order) {
auto it = node_to_idx.find(orig_node);
if (it != node_to_idx.end()) {
positions.push_back(it->second);
} else {
all_found = false;
break;
}
}
if (!all_found || positions.size() != event.original_order.size()) {
continue;
}
// Sort positions to get contiguous range
std::vector<int> sorted_positions = positions;
std::sort(sorted_positions.begin(), sorted_positions.end());
bool is_contiguous = true;
for (size_t i = 1; i < sorted_positions.size(); ++i) {
if (sorted_positions[i] != sorted_positions[i-1] + 1) {
is_contiguous = false;
break;
}
}
if (!is_contiguous) {
continue;
}
// Restore original order at the sorted positions
int start_pos = sorted_positions[0];
for (size_t i = 0; i < event.original_order.size(); ++i) {
cgraph->nodes[start_pos + i] = const_cast<ggml_tensor *>(event.original_order[i]);
}
}
}
for (int i = 0; i < cgraph->n_nodes; i++) {
@ -3274,7 +3321,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
}
}
prev_i = i;
#ifdef GGML_CUDA_DEBUG
const int nodes_fused = i - prev_i - 1;
@ -3282,6 +3328,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
}
#endif
prev_i = i;
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
@ -3805,14 +3852,6 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
// store {fork_idx, join_idx}
std::vector<std::pair<int, int>> concurrent_node_ranges;
// save the original nodes
std::vector<const ggml_tensor *> original_nodes;
original_nodes.reserve(cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; ++i) {
original_nodes.push_back(cgraph->nodes[i]);
}
cuda_ctx->stream_context().original_nodes = std::move(original_nodes);
for (const auto & [root_node, count] : fan_out) {
if (count >= min_fan_out && count <= max_fan_out) {
const int root_node_idx = node_indices[root_node];
@ -3917,6 +3956,13 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
continue;
}
// Save the original order of nodes in this region before interleaving
// This is used later to restore grouping for fusion within streams
concurrent_event.original_order.reserve(total_branch_nodes);
for (int i = fork_node_idx + 1; i < join_node_idx; ++i) {
concurrent_event.original_order.push_back(cgraph->nodes[i]);
}
std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event> & concurrent_events = cuda_ctx->stream_context().concurrent_events;
GGML_ASSERT(concurrent_events.find(root_node) == concurrent_events.end());
concurrent_events.emplace(root_node, std::move(concurrent_event));
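Two details are worth noting in this file's hunks: the per-event original_order vector replaces the old whole-graph original_nodes snapshot, so node order is restored per concurrent region and only when all of that region's nodes are found at contiguous positions in the current graph (otherwise the region is skipped and only the fusion opportunity is lost); and moving prev_i = i; below the GGML_CUDA_DEBUG block fixes the nodes_fused statistic, which previously always evaluated to i - prev_i - 1 = -1 because prev_i had just been overwritten.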


@ -81,6 +81,76 @@ static __global__ void upscale_f32_bilinear(const float * x, float * dst,
dst[index] = result;
}
// Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True)
// https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
static __global__ void upscale_f32_bilinear_antialias(const float * src0, float * dst,
const int nb00, const int nb01, const int nb02, const int nb03,
const int ne00_src, const int ne01_src,
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
const float sf0, const float sf1, const float sf2, const float sf3,
const float pixel_offset) {
const int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
if (index >= dst_total_elements) {
return;
}
const int i10_dst = index % ne10_dst;
const int i11_dst = (index / ne10_dst) % ne11_dst;
const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);
const int i02_src = (int)(i12_dst / sf2);
const int i03_src = (int)(i13_dst / sf3);
const float y = ((float)i11_dst + pixel_offset) / sf1;
const float x = ((float)i10_dst + pixel_offset) / sf0;
// support and invscale, minimum 1 pixel for bilinear
const float support1 = max(1.0f / sf1, 1.0f);
const float invscale1 = 1.0f / support1;
const float support0 = max(1.0f / sf0, 1.0f);
const float invscale0 = 1.0f / support0;
// the range of source pixels that contribute
const int64_t x_min = max(int64_t(0), int64_t(x - support0 + pixel_offset));
const int64_t x_max = min(int64_t(ne00_src), int64_t(x + support0 + pixel_offset));
const int64_t y_min = max(int64_t(0), int64_t(y - support1 + pixel_offset));
const int64_t y_max = min(int64_t(ne01_src), int64_t(y + support1 + pixel_offset));
// bilinear filter with antialiasing
float val = 0.0f;
float total_weight = 0.0f;
auto triangle_filter = [](float x) -> float {
return max(1.0f - fabsf(x), 0.0f);
};
for (int64_t sy = y_min; sy < y_max; sy++) {
const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1);
for (int64_t sx = x_min; sx < x_max; sx++) {
const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0);
const float weight = weight_x * weight_y;
if (weight <= 0.0f) {
continue;
}
const float pixel = *(const float *)((const char *)src0 + sx*nb00 + sy*nb01 + i02_src*nb02 + i03_src*nb03);
val += pixel * weight;
total_weight += weight;
}
}
if (total_weight > 0.0f) {
val /= total_weight;
}
dst[index] = val;
}
namespace bicubic_interpolation {
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
__device__ const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
@ -161,12 +231,16 @@ static void upscale_f32_bilinear_cuda(const float * x, float * dst,
const int ne00_src, const int ne01_src,
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
const float sf0, const float sf1, const float sf2, const float sf3,
const float pixel_offset, cudaStream_t stream) {
const float pixel_offset, bool antialias, cudaStream_t stream) {
const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
if (antialias) {
upscale_f32_bilinear_antialias<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
} else {
upscale_f32_bilinear<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
}
}
static void upscale_f32_bicubic_cuda(const float * x, float * dst,
const int nb00, const int nb01, const int nb02, const int nb03,
@ -207,9 +281,10 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
if (mode == GGML_SCALE_MODE_NEAREST) {
upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
const bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS);
upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
sf0, sf1, sf2, sf3, pixel_offset, stream);
sf0, sf1, sf2, sf3, pixel_offset, antialias, stream);
} else if (mode == GGML_SCALE_MODE_BICUBIC) {
upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],

View File

@ -894,7 +894,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_POOL_1D:
return false;
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
case GGML_OP_POOL_2D:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_PAD:
@ -912,6 +912,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
// for new head sizes, add checks here
if (op->src[0]->ne[0] != 32 &&
op->src[0]->ne[0] != 40 &&
op->src[0]->ne[0] != 48 &&
op->src[0]->ne[0] != 64 &&
op->src[0]->ne[0] != 72 &&
op->src[0]->ne[0] != 80 &&

View File

@ -5757,6 +5757,7 @@ typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, hal
template [[host_name("kernel_flash_attn_ext_f32_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 32, 32>;
template [[host_name("kernel_flash_attn_ext_f32_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 40, 40>;
template [[host_name("kernel_flash_attn_ext_f32_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 48, 48>;
template [[host_name("kernel_flash_attn_ext_f32_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 64, 64>;
template [[host_name("kernel_flash_attn_ext_f32_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 72, 72>;
template [[host_name("kernel_flash_attn_ext_f32_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 80, 80>;
@ -5770,6 +5771,7 @@ template [[host_name("kernel_flash_attn_ext_f32_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_f16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 32, 32>;
template [[host_name("kernel_flash_attn_ext_f16_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 40, 40>;
template [[host_name("kernel_flash_attn_ext_f16_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 48, 48>;
template [[host_name("kernel_flash_attn_ext_f16_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64, 64>;
template [[host_name("kernel_flash_attn_ext_f16_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 72, 72>;
template [[host_name("kernel_flash_attn_ext_f16_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 80, 80>;
@ -5784,6 +5786,7 @@ template [[host_name("kernel_flash_attn_ext_f16_dk576_dv512")]] kernel flash_at
#if defined(GGML_METAL_HAS_BF16)
template [[host_name("kernel_flash_attn_ext_bf16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 32, 32>;
template [[host_name("kernel_flash_attn_ext_bf16_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 40, 40>;
template [[host_name("kernel_flash_attn_ext_bf16_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 48, 48>;
template [[host_name("kernel_flash_attn_ext_bf16_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 64, 64>;
template [[host_name("kernel_flash_attn_ext_bf16_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 72, 72>;
template [[host_name("kernel_flash_attn_ext_bf16_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 80, 80>;
@ -5798,6 +5801,7 @@ template [[host_name("kernel_flash_attn_ext_bf16_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q4_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 48, 48>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 80, 80>;
@ -5811,6 +5815,7 @@ template [[host_name("kernel_flash_attn_ext_q4_0_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q4_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 48, 48>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 80, 80>;
@ -5824,6 +5829,7 @@ template [[host_name("kernel_flash_attn_ext_q4_1_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q5_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 48, 48>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 80, 80>;
@ -5837,6 +5843,7 @@ template [[host_name("kernel_flash_attn_ext_q5_0_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q5_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 48, 48>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 80, 80>;
@ -5850,6 +5857,7 @@ template [[host_name("kernel_flash_attn_ext_q5_1_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q8_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 48, 48>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 80, 80>;

View File

@ -3086,8 +3086,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
case GGML_OP_UPSCALE: {
ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & 0xFF);
const bool antialias = (ggml_get_op_params_i32(op, 0) & GGML_SCALE_FLAG_ANTIALIAS) != 0;
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
(mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR);
(mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR) && !antialias;
}
case GGML_OP_CONV_2D:
return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) ||

View File

@ -1787,6 +1787,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
const sycl::range<3> block_dims(1, 1, nth);
const sycl::range<3> block_nums(1, nrows, 1);
const size_t shared_mem = ncols_pad * sizeof(int);
GGML_ASSERT(shared_mem <= ggml_sycl_info().devices[device].smpbo);
if (order == GGML_SORT_ORDER_ASC) {
stream->submit([&](sycl::handler &cgh) {
@ -4348,6 +4349,9 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_
}
static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_sycl_device_context *sycl_ctx =
(ggml_backend_sycl_device_context *)dev->context;
int device = sycl_ctx->device;
switch (op->op) {
case GGML_OP_CONV_TRANSPOSE_1D:
{
@ -4597,12 +4601,14 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_IM2COL:
return true;
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_ARGSORT:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_ARGSORT:
return op->src[0]->ne[0] * sizeof(int) <=
ggml_sycl_info().devices[device].smpbo;
case GGML_OP_POOL_2D:
case GGML_OP_ACC:
return true;

View File

@ -1227,6 +1227,7 @@ struct vk_op_topk_push_constants {
uint32_t orig_ncols;
uint32_t ncols_input;
uint32_t ncols_output;
uint32_t k;
uint32_t nrows;
uint32_t first_pass;
uint32_t last_pass;
@ -1673,6 +1674,14 @@ class vk_perf_logger {
timings[name.str()].push_back(time);
return;
}
if (node->op == GGML_OP_TOP_K) {
std::stringstream name;
name << ggml_op_name(node->op) <<
" K=" << node->ne[0] <<
" (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
timings[name.str()].push_back(time);
return;
}
timings[ggml_op_name(node->op)].push_back(time);
}
private:
@ -10345,17 +10354,8 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
uint32_t nrows = ggml_nrows(src0);
uint32_t k = dst->ne[0];
vk_op_topk_push_constants pc { ncols, ncols, k, nrows, 0, 0 };
vk_op_topk_push_constants pc { ncols, ncols, ncols, k, nrows, 0, 0 };
// Reserve space for ivec2 per element, double buffered
const size_t dbl_buf_size = size_t{ncols} * nrows * 2 * sizeof(int);
const size_t x_sz = dbl_buf_size * 2;
uint32_t dbl_buf_index = 0;
if (ctx->prealloc_size_x < x_sz) {
ctx->prealloc_size_x = x_sz;
ggml_vk_preallocate_buffers(ctx, subctx);
}
if (ctx->prealloc_x_need_sync) {
ggml_vk_sync_buffers(ctx, subctx);
}
@ -10370,8 +10370,9 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
// largest elements. Repeat until we have the top K elements.
// Need to do at least one iteration to write out the results.
bool done_one_iter = false;
uint32_t dbl_buf_index = 0;
size_t dbl_buf_size = 0;
while (num_elements > k || !done_one_iter) {
done_one_iter = true;
// Prefer going as small as num_topk_pipelines - 3 for perf reasons.
// But if K is larger, then we need a larger workgroup
@ -10411,6 +10412,21 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
// Number of elements remaining after this pass
uint32_t num_dst_elements = (num_elements / pipeline->wg_denoms[0]) * k + std::min(k, num_elements % pipeline->wg_denoms[0]);
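// e.g. num_elements = 4096, workgroup covers 1024 elements, k = 16:
// 4 full workgroups each keep their top k -> 64 candidates for the next pass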
pc2.ncols_output = num_dst_elements;
if (!done_one_iter) {
// Reserve space for ivec2 per element, double buffered
// K per workgroup per row
dbl_buf_size = num_dst_elements * nrows * 2 * sizeof(int);
dbl_buf_size = ROUNDUP_POW2(dbl_buf_size, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
const size_t x_sz = dbl_buf_size * 2;
if (ctx->prealloc_size_x < x_sz) {
ctx->prealloc_size_x = x_sz;
ggml_vk_preallocate_buffers(ctx, subctx);
}
}
vk_subbuffer src_buf;
vk_subbuffer dst_buf;
@ -10436,6 +10452,7 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
if (num_elements > k) {
ggml_vk_sync_buffers(ctx, subctx);
}
done_one_iter = true;
}
ctx->prealloc_x_need_sync = true;
}
@ -14113,6 +14130,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
}
return true;
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
case GGML_OP_ACC:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_CONCAT:

View File

@ -19,6 +19,7 @@ layout (push_constant) uniform parameter {
uint orig_ncols;
uint ncols_input;
uint ncols_output;
uint k;
uint nrows;
uint first_pass;
uint last_pass;
@ -36,7 +37,7 @@ void topk(bool needs_bounds_check, const uint row) {
const uint row_offset = row * p.ncols_input;
dst_row[col] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
} else {
const uint row_offset = row * p.orig_ncols;
const uint row_offset = row * p.ncols_input;
dst_row[col] = data_s[row_offset + gl_GlobalInvocationID.x];
}
} else {
@ -44,7 +45,7 @@ void topk(bool needs_bounds_check, const uint row) {
}
barrier();
if (p.ncols_output == 1) {
if (p.k == 1) {
// Fast path for single output - just do a max reduction
[[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
if (col < s) {
@ -84,16 +85,20 @@ void topk(bool needs_bounds_check, const uint row) {
}
}
if (col < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
if (col < p.k) {
if (p.last_pass != 0) {
const uint row_offset = row * p.ncols_output;
if (gl_GlobalInvocationID.x < p.ncols_input) {
const uint row_offset = row * p.k;
data_d[row_offset + col] = dst_row[col].x;
}
} else {
const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
if (gl_WorkGroupID.x * p.k + col < p.ncols_output) {
const uint row_offset = row * p.ncols_output + gl_WorkGroupID.x * p.k;
data_t[row_offset + col] = dst_row[col];
}
}
}
}
void main() {
// Fast path for fully occupied workgroups

View File

@ -25,6 +25,7 @@ layout (push_constant) uniform parameter {
uint orig_ncols;
uint ncols_input;
uint ncols_output;
uint k;
uint nrows;
uint first_pass;
uint last_pass;
@ -60,7 +61,7 @@ void topk(const uint row) {
const uint row_offset = row * p.ncols_input;
dst_row[tid] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
} else {
const uint row_offset = row * p.orig_ncols;
const uint row_offset = row * p.ncols_input;
dst_row[tid] = data_s[row_offset + gl_GlobalInvocationID.x];
}
} else {
@ -68,7 +69,7 @@ void topk(const uint row) {
}
barrier();
if (p.ncols_output == 1) {
if (p.k == 1) {
// Fast path for single output - just do a max reduction
[[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
if (tid < s) {
@ -98,7 +99,7 @@ void topk(const uint row) {
uint range_max = 0xFF800000;
// How many are above the current range, and how many we need to find.
uint total = 0;
uint limit = min(p.ncols_output, p.ncols_input - gl_WorkGroupID.x * BLOCK_SIZE);
uint limit = min(p.k, p.ncols_input - gl_WorkGroupID.x * BLOCK_SIZE);
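// each workgroup keeps at most k elements, fewer if its BLOCK_SIZE slice extends past the end of the row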
while (mask != 0) {
barrier();
@ -139,7 +140,7 @@ void topk(const uint row) {
range_max = range_min + ((min_idx + 1) << shift);
range_min = range_min + (min_idx << shift);
if (total == p.ncols_output) {
if (total == p.k) {
break;
}
total -= counts[min_idx];
@ -179,16 +180,20 @@ void topk(const uint row) {
barrier();
}
if (tid < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
if (tid < p.k) {
if (p.last_pass != 0) {
const uint row_offset = row * p.ncols_output;
if (gl_GlobalInvocationID.x < p.ncols_input) {
const uint row_offset = row * p.k;
data_d[row_offset + tid] = dst_row[tid].x;
}
} else {
const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
if (gl_WorkGroupID.x * p.k + tid < p.ncols_output) {
const uint row_offset = row * p.ncols_output + gl_WorkGroupID.x * p.k;
data_t[row_offset + tid] = dst_row[tid];
}
}
}
}
void main() {
uint row = gl_WorkGroupID.y;

View File

@ -4891,6 +4891,8 @@ static struct ggml_tensor * ggml_interpolate_impl(
int64_t ne3,
uint32_t mode) {
GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT);
// TODO: implement antialias for modes other than bilinear
GGML_ASSERT(!(mode & GGML_SCALE_FLAG_ANTIALIAS) || (mode & 0xFF) == GGML_SCALE_MODE_BILINEAR);
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);

View File

@ -175,6 +175,7 @@ class Keys:
VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
TEMPERATURE_SCALE = "{arch}.attention.temperature_scale"
class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
@ -448,6 +449,7 @@ class MODEL_ARCH(IntEnum):
MINIMAXM2 = auto()
RND1 = auto()
PANGU_EMBED = auto()
MISTRAL3 = auto()
class VISION_PROJECTOR_TYPE(IntEnum):
@ -837,6 +839,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.COGVLM: "cogvlm",
MODEL_ARCH.RND1: "rnd1",
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
MODEL_ARCH.MISTRAL3: "mistral3",
}
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@ -3124,6 +3127,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.MISTRAL3: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
# TODO
}

View File

@ -904,6 +904,9 @@ class GGUFWriter:
def add_attn_temperature_length(self, value: int) -> None:
self.add_uint32(Keys.Attention.TEMPERATURE_LENGTH.format(arch=self.arch), value)
def add_attn_temperature_scale(self, value: float) -> None:
self.add_float32(Keys.Attention.TEMPERATURE_SCALE.format(arch=self.arch), value)
def add_pooling_type(self, value: PoolingType) -> None:
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

View File

@ -17,6 +17,8 @@ vendor = {
"https://github.com/mackron/miniaudio/raw/669ed3e844524fcd883231b13095baee9f6de304/miniaudio.h": "vendor/miniaudio/miniaudio.h",
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.28.0/httplib.h": "vendor/cpp-httplib/httplib.h",
"https://raw.githubusercontent.com/sheredom/subprocess.h/b49c56e9fe214488493021017bf3954b91c7c1f5/subprocess.h": "vendor/sheredom/subprocess.h",
}
for url, filename in vendor.items():

View File

@ -132,6 +132,7 @@ add_library(llama
models/t5-enc.cpp
models/wavtokenizer-dec.cpp
models/xverse.cpp
models/mistral3.cpp
models/graph-context-mamba.cpp
)

View File

@ -111,6 +111,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_COGVLM, "cogvlm" },
{ LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@ -204,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@ -853,7 +855,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
{ LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
{ LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
@ -2514,6 +2516,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_MISTRAL3,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_UNKNOWN,
{
@ -2613,6 +2641,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
{LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
{LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
{LLM_TENSOR_SSM_A_NOSCAN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // a version of SSM_A used for MUL instead of SSM_SCAN
{LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},

View File

@ -115,6 +115,7 @@ enum llm_arch {
LLM_ARCH_COGVLM,
LLM_ARCH_RND1,
LLM_ARCH_PANGU_EMBED,
LLM_ARCH_MISTRAL3,
LLM_ARCH_UNKNOWN,
};
@ -208,6 +209,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@ -377,6 +379,7 @@ enum llm_tensor {
LLM_TENSOR_SSM_DT,
LLM_TENSOR_SSM_DT_NORM,
LLM_TENSOR_SSM_A,
LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
LLM_TENSOR_SSM_B_NORM,
LLM_TENSOR_SSM_C_NORM,
LLM_TENSOR_SSM_D,

View File

@ -71,6 +71,9 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && attn_scale) {
const int64_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(f_attn_temp_scale != 0.0f);
GGML_ASSERT(n_attn_temp_floor_scale != 0);
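// both hparams default to 0 (disabled) and must be set explicitly by any arch that enables temperature scaling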
std::vector<float> attn_scale_data(n_tokens, 0.0f);
for (int i = 0; i < n_tokens; ++i) {
const float pos = ubatch->pos[i];
@ -810,9 +813,6 @@ ggml_tensor * llm_graph_context::build_ffn(
GGML_ABORT("fatal error");
}
//expand here so that we can fuse ffn gate
ggml_build_forward_expand(gf, cur);
if (gate && type_gate == LLM_FFN_PAR) {
cur = ggml_mul(ctx0, cur, tmp);
cb(cur, "ffn_gate_par", il);
@ -1093,9 +1093,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
GGML_ABORT("fatal error");
}
//expand here so that we can fuse ffn gate
ggml_build_forward_expand(gf, cur);
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il);

View File

@ -162,8 +162,8 @@ struct llama_hparams {
// llama4 smallthinker
uint32_t n_moe_layer_step = 0;
uint32_t n_no_rope_layer_step = 4;
uint32_t n_attn_temp_floor_scale = 8192;
float f_attn_temp_scale = 0.1;
uint32_t n_attn_temp_floor_scale = 0;
float f_attn_temp_scale = 0.0f;
// gemma3n altup
uint32_t n_altup = 4; // altup_num_inputs

View File

@ -485,7 +485,7 @@ struct llama_mlock::impl {
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
suggest = false;
}
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
suggest = false;
}
#endif

View File

@ -665,6 +665,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} else {
hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
hparams.n_swa = 8192;
hparams.n_attn_temp_floor_scale = 8192;
hparams.f_attn_temp_scale = 0.1f;
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
}
@ -2251,6 +2253,42 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MISTRAL3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
// TODO: maybe add n_attn_temp_floor_scale as a separate KV?
if (hparams.f_attn_temp_scale != 0.0f) {
hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
if (hparams.n_attn_temp_floor_scale == 0) {
throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
}
}
// TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
// but may need further verification with other values
if (hparams.rope_yarn_log_mul != 0.0f) {
float factor = 1.0f / hparams.rope_freq_scale_train;
float mscale = 1.0f;
float mscale_all_dims = hparams.rope_yarn_log_mul;
static auto get_mscale = [](float scale, float mscale) {
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
};
hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
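// e.g. factor = 8, mscale = mscale_all_dims = 1.0f: both terms equal 0.1f*logf(8) + 1 and the ratio is exactly 1.0f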
}
switch (hparams.n_layer) {
case 26: type = LLM_TYPE_3B; break;
case 34: type = LLM_TYPE_8B; break;
case 40: type = LLM_TYPE_14B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
default: throw std::runtime_error("unsupported model architecture");
}
@ -2564,6 +2602,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
case LLM_ARCH_MINICPM:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_MISTRAL3:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -6521,7 +6560,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
@ -7556,6 +7595,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_qwen3next>(*this, params);
} break;
case LLM_ARCH_MISTRAL3:
{
llm = std::make_unique<llm_build_mistral3>(*this, params);
} break;
default:
GGML_ABORT("fatal error");
}
@ -7724,6 +7767,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_ARCEE:
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
case LLM_ARCH_MISTRAL3:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2

src/models/mistral3.cpp (new file, 160 lines)
View File

@ -0,0 +1,160 @@
#include "models.h"
llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
// (optional) temperature tuning
ggml_tensor * inp_attn_scale = nullptr;
if (hparams.f_attn_temp_scale != 0.0f) {
inp_attn_scale = build_inp_attn_scale();
}
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
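// fall back to the standard 1/sqrt(head_dim) attention scale when no explicit scale is specified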
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
if (inp_attn_scale) {
// apply llama 4 temperature scaling
Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
cb(Qcur, "Qcur_attn_temp_scaled", il);
}
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network (non-MoE)
if (model.layers[il].ffn_gate_inp == nullptr) {
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE branch
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@ -322,6 +322,10 @@ struct llm_build_minimax_m2 : public llm_graph_context {
llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mistral3 : public llm_graph_context {
llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mpt : public llm_graph_context {
llm_build_mpt(const llama_model & model, const llm_graph_params & params);
};

View File

@ -7660,7 +7660,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
// test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {i, 2, 1, 3}, rand() % i + 1));
//}
for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC}) {
for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC, ggml_scale_mode(GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)}) {
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode));
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode, true));
test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, mode));

View File

@ -1375,7 +1375,7 @@ int main() {
try {
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
tc.verify_status(SUCCESS);
} catch (const std::runtime_error & ex) {
} catch (const std::invalid_argument & ex) {
fprintf(stderr, "Error: %s\n", ex.what());
tc.verify_status(FAILURE);
}

View File

@ -23,7 +23,7 @@
#endif
struct quantize_stats_params {
std::string model = DEFAULT_MODEL_PATH;
std::string model = "models/7B/ggml-model-f16.gguf";
bool verbose = false;
bool per_layer_stats = false;
bool print_histogram = false;

View File

@ -521,6 +521,12 @@ int main(int argc, char ** argv) {
is_interacting = params.interactive_first;
}
LOG_WRN("*****************************\n");
LOG_WRN("IMPORTANT: The current llama-cli will be moved to llama-completion in the near future\n");
LOG_WRN(" New llama-cli will have enhanced features and improved user experience\n");
LOG_WRN(" More info: https://github.com/ggml-org/llama.cpp/discussions/17618\n");
LOG_WRN("*****************************\n");
bool is_antiprompt = false;
bool input_echo = true;
bool display = true;

View File

@ -1263,12 +1263,20 @@ struct clip_graph {
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], 0);
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], n_embd * sizeof(float));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], 2 * n_embd * sizeof(float));
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ 0);
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, n_embd));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, 2 * n_embd));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
@ -2344,7 +2352,7 @@ private:
ggml_tensor * pos_embd = model.position_embeddings;
const int height = img.ny / patch_size;
const int width = img.nx / patch_size;
const uint32_t mode = GGML_SCALE_MODE_BILINEAR;
const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS;
const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
GGML_ASSERT(pos_embd);
@ -3331,7 +3339,8 @@ struct clip_model_loader {
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
// ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
hparams.set_limit_image_tokens(64, 256);
// the config above specifies the number of tokens after downsampling, while here it is counted before downsampling; scale the upper bound to 1024 and relax the lower bound to 64
hparams.set_limit_image_tokens(64, 1024);
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
@ -4106,15 +4115,19 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
ctx_vision = new clip_ctx(ctx_params);
loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
loader.load_tensors(*ctx_vision);
if (ctx_params.warmup) {
loader.warmup(*ctx_vision);
}
}
if (loader.has_audio) {
ctx_audio = new clip_ctx(ctx_params);
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
loader.load_tensors(*ctx_audio);
if (ctx_params.warmup) {
loader.warmup(*ctx_audio);
}
}
} catch (const std::exception & e) {
LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
@ -4333,12 +4346,13 @@ struct img_tool {
const int width = inp_size.width;
const int height = inp_size.height;
auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::round(x / static_cast<float>(f))) * f; };
auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
// always align up first
int h_bar = std::max(align_size, ceil_by_factor(height));
int w_bar = std::max(align_size, ceil_by_factor(width));
int h_bar = std::max(align_size, round_by_factor(height));
int w_bar = std::max(align_size, round_by_factor(width));
if (h_bar * w_bar > max_pixels) {
const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
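// beta > 1 here; scaling both sides down by beta brings the area back to max_pixels while preserving the aspect ratio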
@ -5209,7 +5223,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
clip_image_u8 resized_img;
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
const bool pad = (ctx->proj_type() != PROJECTOR_TYPE_LFM2);
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, pad, pad_color);
clip_image_f32_ptr res(clip_image_f32_init());
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(res));

View File

@ -44,6 +44,7 @@ struct clip_context_params {
enum clip_flash_attn_type flash_attn_type;
int image_min_tokens;
int image_max_tokens;
bool warmup;
enum clip_dsocr_mode dsocr_mode;
};

View File

@ -136,6 +136,7 @@ struct mtmd_cli_context {
mparams.print_timings = true;
mparams.n_threads = params.cpuparams.n_threads;
mparams.flash_attn_type = params.flash_attn_type;
mparams.warmup = params.warmup;
mparams.image_min_tokens = params.image_min_tokens;
mparams.image_max_tokens = params.image_max_tokens;
mparams.dsocr_mode = params.dsocr_mode.c_str();

View File

@ -108,6 +108,7 @@ mtmd_context_params mtmd_context_params_default() {
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
/* media_marker */ mtmd_default_marker(),
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
/* warmup */ true,
/* image_min_tokens */ -1,
/* image_max_tokens */ -1,
/* dsocr_mode */ "auto",
@ -198,6 +199,7 @@ struct mtmd_context {
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
/* image_min_tokens */ ctx_params.image_min_tokens,
/* image_max_tokens */ ctx_params.image_max_tokens,
/* warmup */ ctx_params.warmup,
/* dsocr_mode */ dsocr_mode,
};
@ -326,6 +328,10 @@ struct mtmd_context {
img_beg = "<|im_start|>";
img_end = "<|im_end|>";
} else if (proj == PROJECTOR_TYPE_LFM2) {
img_beg = "<|image_start|>";
img_end = "<|image_end|>";
}
}

View File

@ -82,6 +82,7 @@ struct mtmd_context_params {
const char * image_marker; // deprecated, use media_marker instead
const char * media_marker;
enum llama_flash_attn_type flash_attn_type;
bool warmup; // whether to run a warmup encode pass after initialization
// limit number of image tokens, only for vision models with dynamic resolution
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)

View File

@ -15,6 +15,8 @@ set(TARGET_SRCS
server.cpp
server-http.cpp
server-http.h
server-models.cpp
server-models.h
server-task.cpp
server-task.h
server-queue.cpp

View File

@ -52,7 +52,7 @@ The project is under active development, and we are [looking for feedback and co
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_SPLIT) |
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
| `-e, --escape` | process escape sequences (\n, \r, \t, \', \", \\) (default: true) |
@ -93,7 +93,7 @@ The project is under active development, and we are [looking for feedback and co
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
@ -103,11 +103,11 @@ The project is under active development, and we are [looking for feedback and co
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
| `--log-disable` | Log disable |
| `--log-file FNAME` | Log to file |
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
| `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
@ -196,6 +196,10 @@ The project is under active development, and we are [looking for feedback and co
| `--slots` | enable slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)<br/>(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) |
| `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) |
| `--jinja` | use jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_JINJA) |
| `--no-jinja` | disable jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_NO_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
@ -287,38 +291,66 @@ For more details, please refer to [multimodal documentation](../../docs/multimod
## Web UI
The project includes a web-based user interface that enables interaction with the model through the `/v1/chat/completions` endpoint.
The project includes a web-based user interface for interacting with `llama-server`. It supports both single-model (`MODEL` mode) and multi-model (`ROUTER` mode) operation.
The web UI is developed using:
- `react` framework for frontend development
- `tailwindcss` and `daisyui` for styling
- `vite` for build tooling
### Features
A pre-built version is available as a single HTML file under `/public` directory.
- **Chat interface** with streaming responses
- **Multi-model support** (ROUTER mode) - switch between models, auto-load on selection
- **Modality validation** - ensures selected model supports conversation's attachments (images, audio)
- **Conversation management** - branching, regeneration, editing with history preservation
- **Attachment support** - images, audio, PDFs (with vision/text fallback)
- **Configurable parameters** - temperature, top_p, etc. synced with server defaults
- **Dark/light theme**
To build or to run the dev server (with hot reload):
### Tech Stack
- **SvelteKit** - frontend framework with Svelte 5 runes for reactive state
- **TailwindCSS** + **shadcn-svelte** - styling and UI components
- **Vite** - build tooling
- **IndexedDB** (Dexie) - local storage for conversations
- **LocalStorage** - user settings persistence
### Architecture
The WebUI follows a layered architecture:
```
Routes → Components → Hooks → Stores → Services → Storage/API
```
- **Stores** - reactive state management (`chatStore`, `conversationsStore`, `modelsStore`, `serverStore`, `settingsStore`)
- **Services** - stateless API/database communication (`ChatService`, `ModelsService`, `PropsService`, `DatabaseService`)
- **Hooks** - reusable logic (`useModelChangeValidation`, `useProcessingState`)
For detailed architecture diagrams, see [`tools/server/webui/docs/`](webui/docs/):
- `high-level-architecture.mmd` - full architecture with all modules
- `high-level-architecture-simplified.mmd` - simplified overview
- `data-flow-simplified-model-mode.mmd` - data flow for single-model mode
- `data-flow-simplified-router-mode.mmd` - data flow for multi-model mode
- `flows/*.mmd` - detailed per-domain flows (chat, conversations, models, etc.)
### Development
```sh
# make sure you have nodejs installed
# make sure you have Node.js installed
cd tools/server/webui
npm i
# to run the dev server
# run dev server (with hot reload)
npm run dev
# to build the public/index.html.gz
# run tests
npm run test
# build production bundle
npm run build
```
After `public/index.html.gz` has been generated we need to generate the c++
headers (like build/tools/server/index.html.gz.hpp) that will be included
by server.cpp. This is done by building `llama-server` as described in the
[build](#build) section above.
After `public/index.html.gz` has been generated, rebuild `llama-server` as described in the [build](#build) section to include the updated UI.
NOTE: if you are using the Vite dev server, you can change the API base URL to point at llama.cpp. To do that, run this code snippet in the browser's console:
```js
localStorage.setItem('base', 'http://localhost:8080')
```
**Note:** The Vite dev server automatically proxies API requests to `http://localhost:8080`. Make sure `llama-server` is running on that port during development.
## Quick Start
@ -1424,6 +1456,184 @@ curl http://localhost:8080/v1/messages/count_tokens \
{"input_tokens": 10}
```
## Using multiple models
`llama-server` can be launched in a **router mode** that exposes an API for dynamically loading and unloading models. The main process (the "router") automatically forwards each request to the appropriate model instance.
To start in router mode, launch `llama-server` **without specifying any model**:
```sh
llama-server
```
### Model sources
By default, the router looks for models in the cache. You can add Hugging Face models to the cache with:
```sh
llama-server -hf <user>/<model>:<tag>
```
*The server must be restarted after adding a new model.*
Alternatively, you can point the router to a local directory containing your GGUF files using `--models-dir`. Example command:
```sh
llama-server --models-dir ./models_directory
```
If a model consists of multiple GGUF files (for multimodal or multi-shard models), the files should be placed in a subdirectory. The directory structure should look like this:
```sh
models_directory
│ # single file
├─ llama-3.2-1b-Q4_K_M.gguf
├─ Qwen3-8B-Q4_K_M.gguf
│ # multimodal
├─ gemma-3-4b-it-Q8_0
│ ├─ gemma-3-4b-it-Q8_0.gguf
│ └─ mmproj-F16.gguf # file name must start with "mmproj"
│ # multi-shard
├─ Kimi-K2-Thinking-UD-IQ1_S
│ ├─ Kimi-K2-Thinking-UD-IQ1_S-00001-of-00006.gguf
│ ├─ Kimi-K2-Thinking-UD-IQ1_S-00002-of-00006.gguf
│ ├─ ...
│ └─ Kimi-K2-Thinking-UD-IQ1_S-00006-of-00006.gguf
```
You may also specify default arguments that will be passed to every model instance:
```sh
llama-server -c 8192 -n 1024 -np 2
```
Note: model instances inherit both command line arguments and environment variables from the router server.
### Routing requests
Requests are routed according to the requested model name.
For **POST** endpoints (`/v1/chat/completions`, `/v1/completions`, `/infill`, etc.), the router uses the `"model"` field in the JSON body:
```json
{
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
"messages": [
{
"role": "user",
"content": "hello"
}
]
}
```
For **GET** endpoints (`/props`, `/metrics`, etc.), the router uses the `model` query parameter (URL-encoded):
```
GET /props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M
```
By default, the model will be loaded automatically if it is not already loaded. To disable this, add `--no-models-autoload` when starting the server. Additionally, you can pass `?autoload=true|false` as a query parameter to control this behavior per request.
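For example, the following (a sketch assuming the router runs on the default port 8080) queries a model's `/props` without triggering a load if it is currently unloaded:
```sh
# ask for the props of a specific model, but do not auto-load it
curl 'http://localhost:8080/props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M&autoload=false'
```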
### GET `/models`: List available models
Lists all models in the cache. Each model's metadata also includes a field indicating the status of the model:
```json
{
"data": [{
"id": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
"in_cache": true,
"path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf",
"status": {
"value": "loaded",
"args": ["llama-server", "-ctx", "4096"]
},
...
}]
}
```
Note: For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
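A quick way to inspect this endpoint from the shell (a sketch; `jq` is an extra dependency, not part of llama.cpp):
```sh
# list the names of all models known to the router
curl -s http://localhost:8080/models | jq '.data[].id'
# show only the ones currently loaded
curl -s http://localhost:8080/models | jq '.data[] | select(.status.value == "loaded") | .id'
```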
The `status` object can take one of the following forms. Not yet loaded (or unloaded cleanly):
```json
"status": {
  "value": "unloaded"
}
```
Currently loading:
```json
"status": {
  "value": "loading",
  "args": ["llama-server", "-c", "4096"]
}
```
Failed to load (back to `unloaded` with a non-zero exit code):
```json
"status": {
  "value": "unloaded",
  "args": ["llama-server", "-c", "4096"],
  "failed": true,
  "exit_code": 1
}
```
Loaded and ready to serve requests:
```json
"status": {
  "value": "loaded",
  "args": ["llama-server", "-c", "4096"]
}
```
### POST `/models/load`: Load a model
Spawns a new server instance for the specified model.
Payload:
- `model`: name of the model to be loaded.
- `extra_args`: (optional) an array of additional arguments to be passed to the model instance. Note: you must start the server with `--models-allow-extra-args` to enable this feature.
```json
{
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
"extra_args": ["-n", "128", "--top-k", "4"]
}
```
Response:
```json
{
"success": true
}
```
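As a concrete call (assuming the router was started with `--models-allow-extra-args`):
```sh
curl http://localhost:8080/models/load \
  -H "Content-Type: application/json" \
  -d '{
    "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
    "extra_args": ["-n", "128", "--top-k", "4"]
  }'
```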
### POST `/models/unload`: Unload a model
Stops the server instance for the specified model.
Payload:
```json
{
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
}
```
Response:
```json
{
"success": true
}
```
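And the matching unload call:
```sh
curl http://localhost:8080/models/unload \
  -H "Content-Type: application/json" \
  -d '{"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"}'
```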
## More examples
### Interactive mode
Binary file not shown.
View File
@ -819,26 +819,26 @@ json oaicompat_chat_params_parse(
auto schema_wrapper = json_value(response_format, "json_schema", json::object());
json_schema = json_value(schema_wrapper, "schema", json::object());
} else if (!response_type.empty() && response_type != "text") {
throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
throw std::invalid_argument("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
}
}
// get input files
if (!body.contains("messages")) {
throw std::runtime_error("'messages' is required");
throw std::invalid_argument("'messages' is required");
}
json & messages = body.at("messages");
if (!messages.is_array()) {
throw std::runtime_error("Expected 'messages' to be an array");
throw std::invalid_argument("Expected 'messages' to be an array");
}
for (auto & msg : messages) {
std::string role = json_value(msg, "role", std::string());
if (role != "assistant" && !msg.contains("content")) {
throw std::runtime_error("All non-assistant messages must contain 'content'");
throw std::invalid_argument("All non-assistant messages must contain 'content'");
}
if (role == "assistant") {
if (!msg.contains("content") && !msg.contains("tool_calls")) {
throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
throw std::invalid_argument("Assistant message must contain either 'content' or 'tool_calls'!");
}
if (!msg.contains("content")) {
continue; // avoid errors with no content
@ -850,7 +850,7 @@ json oaicompat_chat_params_parse(
}
if (!content.is_array()) {
throw std::runtime_error("Expected 'content' to be a string or an array");
throw std::invalid_argument("Expected 'content' to be a string or an array");
}
for (auto & p : content) {
@ -884,11 +884,11 @@ json oaicompat_chat_params_parse(
// try to decode base64 image
std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
if (parts.size() != 2) {
throw std::runtime_error("Invalid image_url.url value");
throw std::invalid_argument("Invalid image_url.url value");
} else if (!string_starts_with(parts[0], "data:image/")) {
throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
throw std::invalid_argument("Invalid image_url.url format: " + parts[0]);
} else if (!string_ends_with(parts[0], "base64")) {
throw std::runtime_error("image_url.url must be base64 encoded");
throw std::invalid_argument("image_url.url must be base64 encoded");
} else {
auto base64_data = parts[1];
auto decoded_data = base64_decode(base64_data);
@ -911,7 +911,7 @@ json oaicompat_chat_params_parse(
std::string format = json_value(input_audio, "format", std::string());
// while we also support flac, we don't allow it here so we match the OAI spec
if (format != "wav" && format != "mp3") {
throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
throw std::invalid_argument("input_audio.format must be either 'wav' or 'mp3'");
}
auto decoded_data = base64_decode(data); // expected to be base64 encoded
out_files.push_back(decoded_data);
@ -922,7 +922,7 @@ json oaicompat_chat_params_parse(
p.erase("input_audio");
} else if (type != "text") {
throw std::runtime_error("unsupported content[].type");
throw std::invalid_argument("unsupported content[].type");
}
}
}
@ -940,7 +940,7 @@ json oaicompat_chat_params_parse(
inputs.enable_thinking = opt.enable_thinking;
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
if (body.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
throw std::invalid_argument("Cannot use custom grammar constraints with tools.");
}
llama_params["parse_tool_calls"] = true;
}
@ -959,7 +959,7 @@ json oaicompat_chat_params_parse(
} else if (enable_thinking_kwarg == "false") {
inputs.enable_thinking = false;
} else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
throw std::invalid_argument("invalid type for \"enable_thinking\" (expected boolean, got string)");
}
// if the assistant message appears at the end of list, we do not add end-of-turn token
@ -972,14 +972,14 @@ json oaicompat_chat_params_parse(
/* sanity check, max one assistant message at the end of the list */
if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
throw std::invalid_argument("Cannot have 2 or more assistant messages at the end of the list.");
}
/* TODO: test this properly */
inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
if ( inputs.enable_thinking ) {
throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
throw std::invalid_argument("Assistant response prefill is incompatible with enable_thinking.");
}
inputs.add_generation_prompt = true;
@ -1020,18 +1020,18 @@ json oaicompat_chat_params_parse(
// Handle "n" field
int n_choices = json_value(body, "n", 1);
if (n_choices != 1) {
throw std::runtime_error("Only one completion choice is allowed");
throw std::invalid_argument("Only one completion choice is allowed");
}
// Handle "logprobs" field
// TODO: The response format of this option is not yet OAI-compatible, but it seems like no one is really using it; we may need to fix it in the future
if (json_value(body, "logprobs", false)) {
if (has_tools && stream) {
throw std::runtime_error("logprobs is not supported with tools + stream");
throw std::invalid_argument("logprobs is not supported with tools + stream");
}
llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
} else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
throw std::invalid_argument("top_logprobs requires logprobs to be set to true");
}
// Copy remaining properties to llama_params
@ -1263,7 +1263,11 @@ json convert_anthropic_to_oai(const json & body) {
return oai_body;
}
json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64) {
json format_embeddings_response_oaicompat(
const json & request,
const std::string & model_name,
const json & embeddings,
bool use_base64) {
json data = json::array();
int32_t n_tokens = 0;
int i = 0;
@ -1293,7 +1297,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
}
json res = json {
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"model", json_value(request, "model", model_name)},
{"object", "list"},
{"usage", json {
{"prompt_tokens", n_tokens},
@ -1307,6 +1311,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
json format_response_rerank(
const json & request,
const std::string & model_name,
const json & ranks,
bool is_tei_format,
std::vector<std::string> & texts,
@ -1338,7 +1343,7 @@ json format_response_rerank(
if (is_tei_format) return results;
json res = json{
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"model", json_value(request, "model", model_name)},
{"object", "list"},
{"usage", json{
{"prompt_tokens", n_tokens},
View File
@ -13,8 +13,6 @@
#include <vector>
#include <cinttypes>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
using json = nlohmann::ordered_json;
@ -298,11 +296,16 @@ json oaicompat_chat_params_parse(
json convert_anthropic_to_oai(const json & body);
// TODO: move it to server-task.cpp
json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);
json format_embeddings_response_oaicompat(
const json & request,
const std::string & model_name,
const json & embeddings,
bool use_base64 = false);
// TODO: move it to server-task.cpp
json format_response_rerank(
const json & request,
const std::string & model_name,
const json & ranks,
bool is_tei_format,
std::vector<std::string> & texts,
View File
@ -17,6 +17,7 @@
#include <cinttypes>
#include <memory>
#include <unordered_set>
#include <filesystem>
// fix problem with std::min and std::max
#if defined(_WIN32)
@ -518,6 +519,8 @@ struct server_context_impl {
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
std::string model_name; // name of the loaded model, to be used by API
common_chat_templates_ptr chat_templates;
oaicompat_parser_options oai_parser_opt;
@ -621,6 +624,7 @@ struct server_context_impl {
mparams.print_timings = false;
mparams.n_threads = params_base.cpuparams.n_threads;
mparams.flash_attn_type = params_base.flash_attn_type;
mparams.warmup = params_base.warmup;
mparams.image_min_tokens = params_base.image_min_tokens;
mparams.image_max_tokens = params_base.image_max_tokens;
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
@ -757,6 +761,18 @@ struct server_context_impl {
}
SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
if (!params_base.model_alias.empty()) {
// user explicitly specified model name
model_name = params_base.model_alias;
} else if (!params_base.model.name.empty()) {
// use model name in registry format (for models in cache)
model_name = params_base.model.name;
} else {
// fallback: derive model name from file name
auto model_path = std::filesystem::path(params_base.model.path);
model_name = model_path.filename().string();
}
// thinking is enabled if:
// 1. It's not explicitly disabled (reasoning_budget == 0)
// 2. The chat template supports it
@ -2610,7 +2626,7 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
// OAI-compat
task.params.res_type = res_type;
task.params.oaicompat_cmpl_id = completion_id;
// oaicompat_model is already populated by params_from_json_cmpl
task.params.oaicompat_model = ctx_server.model_name;
tasks.push_back(std::move(task));
}
@ -2938,7 +2954,7 @@ void server_routes::init_routes() {
json data = {
{ "default_generation_settings", default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_alias", ctx_server.params_base.model_alias },
{ "model_alias", ctx_server.model_name },
{ "model_path", ctx_server.params_base.model.path },
{ "modalities", json {
{"vision", ctx_server.oai_parser_opt.allow_image},
@ -3180,8 +3196,8 @@ void server_routes::init_routes() {
json models = {
{"models", {
{
{"name", params.model_alias.empty() ? params.model.path : params.model_alias},
{"model", params.model_alias.empty() ? params.model.path : params.model_alias},
{"name", ctx_server.model_name},
{"model", ctx_server.model_name},
{"modified_at", ""},
{"size", ""},
{"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
@ -3203,7 +3219,7 @@ void server_routes::init_routes() {
{"object", "list"},
{"data", {
{
{"id", params.model_alias.empty() ? params.model.path : params.model_alias},
{"id", ctx_server.model_name},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
@ -3350,6 +3366,7 @@ void server_routes::init_routes() {
// write JSON response
json root = format_response_rerank(
body,
ctx_server.model_name,
responses,
is_tei_format,
documents,
@ -3612,7 +3629,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
// write JSON response
json root = res_type == TASK_RESPONSE_TYPE_OAI_EMBD
? format_embeddings_response_oaicompat(body, responses, use_base64)
? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64)
: json(responses);
res->ok(root);
return res;
View File
@ -0,0 +1,975 @@
#include "server-common.h"
#include "server-models.h"
#include "download.h"
#include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
#include <sheredom/subprocess.h>
#include <functional>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <cstring>
#include <atomic>
#include <chrono>
#include <queue>
#ifdef _WIN32
#include <winsock2.h>
#else
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#endif
#if defined(__APPLE__) && defined(__MACH__)
// macOS: use _NSGetExecutablePath to get the executable path
#include <mach-o/dyld.h>
#include <limits.h>
#endif
#define CMD_EXIT "exit"
static std::filesystem::path get_server_exec_path() {
#if defined(_WIN32)
wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
DWORD len = GetModuleFileNameW(nullptr, buf, _countof(buf));
if (len == 0 || len >= _countof(buf)) {
throw std::runtime_error("GetModuleFileNameW failed or path too long");
}
return std::filesystem::path(buf);
#elif defined(__APPLE__) && defined(__MACH__)
char small_path[PATH_MAX];
uint32_t size = sizeof(small_path);
if (_NSGetExecutablePath(small_path, &size) == 0) {
// resolve any symlinks to get absolute path
try {
return std::filesystem::canonical(std::filesystem::path(small_path));
} catch (...) {
return std::filesystem::path(small_path);
}
} else {
// buffer was too small, allocate required size and call again
std::vector<char> buf(size);
if (_NSGetExecutablePath(buf.data(), &size) == 0) {
try {
View File
}
}
throw std::runtime_error("_NSGetExecutablePath failed after buffer resize");
}
#else
char path[FILENAME_MAX];
ssize_t count = readlink("/proc/self/exe", path, FILENAME_MAX);
if (count <= 0) {
throw std::runtime_error("failed to resolve /proc/self/exe");
}
return std::filesystem::path(std::string(path, count));
#endif
}
struct local_model {
std::string name;
std::string path;
std::string path_mmproj;
};
static std::vector<local_model> list_local_models(const std::string & dir) {
if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str()));
}
std::vector<local_model> models;
auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) {
auto files = fs_list(subdir_path, false);
common_file_info model_file;
common_file_info first_shard_file;
common_file_info mmproj_file;
for (const auto & file : files) {
if (string_ends_with(file.name, ".gguf")) {
if (file.name.find("mmproj") != std::string::npos) {
mmproj_file = file;
} else if (file.name.find("-00001-of-") != std::string::npos) {
first_shard_file = file;
} else {
model_file = file;
}
}
}
// single file model
local_model model{
/* name */ name,
/* path */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
/* path_mmproj */ mmproj_file.path // can be empty
};
if (!model.path.empty()) {
models.push_back(model);
}
};
auto files = fs_list(dir, true);
for (const auto & file : files) {
if (file.is_dir) {
scan_subdir(file.path, file.name);
} else if (string_ends_with(file.name, ".gguf")) {
// single file model
std::string name = file.name;
string_replace_all(name, ".gguf", "");
local_model model{
/* name */ name,
/* path */ file.path,
/* path_mmproj */ ""
};
models.push_back(model);
}
}
return models;
}
//
// server_models
//
server_models::server_models(
const common_params & params,
int argc,
char ** argv,
char ** envp) : base_params(params) {
for (int i = 0; i < argc; i++) {
base_args.push_back(std::string(argv[i]));
}
for (char ** env = envp; *env != nullptr; env++) {
base_env.push_back(std::string(*env));
}
GGML_ASSERT(!base_args.empty());
// set binary path
try {
base_args[0] = get_server_exec_path().string();
} catch (const std::exception & e) {
LOG_WRN("failed to get server executable path: %s\n", e.what());
LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
}
// TODO: allow refreshing cached model list
// add cached models
auto cached_models = common_list_cached_models();
for (const auto & model : cached_models) {
server_model_meta meta{
/* name */ model.to_string(),
/* path */ model.manifest_path,
/* path_mmproj */ "", // auto-detected when loading
/* in_cache */ true,
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0
};
mapping[meta.name] = instance_t{
/* subproc */ std::make_shared<subprocess_s>(),
/* th */ std::thread(),
/* meta */ meta
};
}
// add local models specified via --models-dir
if (!params.models_dir.empty()) {
auto local_models = list_local_models(params.models_dir);
for (const auto & model : local_models) {
if (mapping.find(model.name) != mapping.end()) {
// already exists in cached models, skip
continue;
}
server_model_meta meta{
/* name */ model.name,
/* path */ model.path,
/* path_mmproj */ model.path_mmproj,
/* in_cache */ false,
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0
};
mapping[meta.name] = instance_t{
/* subproc */ std::make_shared<subprocess_s>(),
/* th */ std::thread(),
/* meta */ meta
};
}
}
}
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
it->second.meta = meta;
}
cv.notify_all(); // notify wait_until_loaded
}
bool server_models::has_model(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);
return mapping.find(name) != mapping.end();
}
std::optional<server_model_meta> server_models::get_meta(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
return it->second.meta;
}
return std::nullopt;
}
static int get_free_port() {
#ifdef _WIN32
WSADATA wsaData;
if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
return -1;
}
typedef SOCKET native_socket_t;
#define INVALID_SOCKET_VAL INVALID_SOCKET
#define CLOSE_SOCKET(s) closesocket(s)
#else
typedef int native_socket_t;
#define INVALID_SOCKET_VAL -1
#define CLOSE_SOCKET(s) close(s)
#endif
native_socket_t sock = socket(AF_INET, SOCK_STREAM, 0);
if (sock == INVALID_SOCKET_VAL) {
#ifdef _WIN32
WSACleanup();
#endif
return -1;
}
struct sockaddr_in serv_addr;
std::memset(&serv_addr, 0, sizeof(serv_addr));
serv_addr.sin_family = AF_INET;
serv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
serv_addr.sin_port = htons(0);
if (bind(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) != 0) {
CLOSE_SOCKET(sock);
#ifdef _WIN32
WSACleanup();
#endif
return -1;
}
#ifdef _WIN32
int namelen = sizeof(serv_addr);
#else
socklen_t namelen = sizeof(serv_addr);
#endif
if (getsockname(sock, (struct sockaddr*)&serv_addr, &namelen) != 0) {
CLOSE_SOCKET(sock);
#ifdef _WIN32
WSACleanup();
#endif
return -1;
}
int port = ntohs(serv_addr.sin_port);
CLOSE_SOCKET(sock);
#ifdef _WIN32
WSACleanup();
#endif
return port;
}
// helper to convert vector<string> to char **
// pointers are only valid as long as the original vector is valid
static std::vector<char *> to_char_ptr_array(const std::vector<std::string> & vec) {
std::vector<char *> result;
result.reserve(vec.size() + 1);
for (const auto & s : vec) {
result.push_back(const_cast<char*>(s.c_str()));
}
result.push_back(nullptr);
return result;
}
std::vector<server_model_meta> server_models::get_all_meta() {
std::lock_guard<std::mutex> lk(mutex);
std::vector<server_model_meta> result;
result.reserve(mapping.size());
for (const auto & [name, inst] : mapping) {
result.push_back(inst.meta);
}
return result;
}
void server_models::unload_lru() {
if (base_params.models_max <= 0) {
return; // no limit
}
// remove one of the servers if we passed the models_max (least recently used - LRU)
std::string lru_model_name = "";
int64_t lru_last_used = ggml_time_ms();
size_t count_active = 0;
{
std::lock_guard<std::mutex> lk(mutex);
for (const auto & m : mapping) {
if (m.second.meta.is_active()) {
count_active++;
if (m.second.meta.last_used < lru_last_used) {
lru_model_name = m.first;
lru_last_used = m.second.meta.last_used;
}
}
}
}
if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
unload(lru_model_name);
}
}
static void add_or_replace_arg(std::vector<std::string> & args, const std::string & key, const std::string & value) {
for (size_t i = 0; i < args.size(); i++) {
if (args[i] == key && i + 1 < args.size()) {
args[i + 1] = value;
return;
}
}
// not found, append
args.push_back(key);
args.push_back(value);
}
void server_models::load(const std::string & name, bool auto_load) {
if (!has_model(name)) {
throw std::runtime_error("model name=" + name + " is not found");
}
unload_lru();
std::lock_guard<std::mutex> lk(mutex);
auto meta = mapping[name].meta;
if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
SRV_INF("model %s is not ready\n", name.c_str());
return;
}
// prepare new instance info
instance_t inst;
inst.meta = meta;
inst.meta.port = get_free_port();
inst.meta.status = SERVER_MODEL_STATUS_LOADING;
inst.meta.last_used = ggml_time_ms();
if (inst.meta.port <= 0) {
throw std::runtime_error("failed to get a port number");
}
inst.subproc = std::make_shared<subprocess_s>();
{
SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);
std::vector<std::string> child_args;
if (auto_load && !meta.args.empty()) {
child_args = meta.args; // copy previous args
} else {
child_args = base_args; // copy
if (inst.meta.in_cache) {
add_or_replace_arg(child_args, "-hf", inst.meta.name);
} else {
add_or_replace_arg(child_args, "-m", inst.meta.path);
if (!inst.meta.path_mmproj.empty()) {
add_or_replace_arg(child_args, "--mmproj", inst.meta.path_mmproj);
}
}
}
// set model args
add_or_replace_arg(child_args, "--port", std::to_string(inst.meta.port));
add_or_replace_arg(child_args, "--alias", inst.meta.name);
std::vector<std::string> child_env = base_env; // copy
child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port));
SRV_INF("%s", "spawning server instance with args:\n");
for (const auto & arg : child_args) {
SRV_INF(" %s\n", arg.c_str());
}
inst.meta.args = child_args; // save for debugging
std::vector<char *> argv = to_char_ptr_array(child_args);
std::vector<char *> envp = to_char_ptr_array(child_env);
int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr;
int result = subprocess_create_ex(argv.data(), options, envp.data(), inst.subproc.get());
if (result != 0) {
throw std::runtime_error("failed to spawn server instance");
}
inst.stdin_file = subprocess_stdin(inst.subproc.get());
}
// start a thread to manage the child process
// captured variables are guaranteed to be destroyed only after the thread is joined
inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port]() {
// read stdout/stderr and forward to main server log
FILE * p_stdout_stderr = subprocess_stdout(child_proc.get());
if (p_stdout_stderr) {
char buffer[4096];
while (fgets(buffer, sizeof(buffer), p_stdout_stderr) != nullptr) {
LOG("[%5d] %s", port, buffer);
}
} else {
SRV_ERR("failed to get stdout/stderr of child process for name=%s\n", name.c_str());
}
// we reach here when the child process exits
int exit_code = 0;
subprocess_join(child_proc.get(), &exit_code);
subprocess_destroy(child_proc.get());
// update PID and status
{
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
auto & meta = it->second.meta;
meta.exit_code = exit_code;
meta.status = SERVER_MODEL_STATUS_UNLOADED;
}
cv.notify_all();
}
SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code);
});
// clean up old process/thread if exists
{
auto & old_instance = mapping[name];
// old process should have exited already, but just in case, we clean it up here
if (subprocess_alive(old_instance.subproc.get())) {
SRV_WRN("old process for model name=%s is still alive, this is unexpected\n", name.c_str());
subprocess_terminate(old_instance.subproc.get()); // force kill
}
if (old_instance.th.joinable()) {
old_instance.th.join();
}
}
mapping[name] = std::move(inst);
cv.notify_all();
}
static void interrupt_subprocess(FILE * stdin_file) {
// because subprocess.h does not provide a way to send SIGINT,
// we will send a command to the child process to exit gracefully
if (stdin_file) {
fprintf(stdin_file, "%s\n", CMD_EXIT);
fflush(stdin_file);
}
}
void server_models::unload(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
if (it->second.meta.is_active()) {
SRV_INF("unloading model instance name=%s\n", name.c_str());
interrupt_subprocess(it->second.stdin_file);
// status change will be handled by the managing thread
} else {
SRV_WRN("model instance name=%s is not loaded\n", name.c_str());
}
}
}
void server_models::unload_all() {
std::vector<std::thread> to_join;
{
std::lock_guard<std::mutex> lk(mutex);
for (auto & [name, inst] : mapping) {
if (inst.meta.is_active()) {
SRV_INF("unloading model instance name=%s\n", name.c_str());
interrupt_subprocess(inst.stdin_file);
// status change will be handled by the managing thread
}
// moving the thread to join list to avoid deadlock
to_join.push_back(std::move(inst.th));
}
}
for (auto & th : to_join) {
if (th.joinable()) {
th.join();
}
}
}
void server_models::update_status(const std::string & name, server_model_status status) {
// for now, we only allow updating to LOADED status
if (status != SERVER_MODEL_STATUS_LOADED) {
throw std::runtime_error("invalid status value");
}
auto meta = get_meta(name);
if (meta.has_value()) {
meta->status = status;
update_meta(name, meta.value());
}
}
void server_models::wait_until_loaded(const std::string & name) {
std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [this, &name]() {
auto it = mapping.find(name);
if (it != mapping.end()) {
return it->second.meta.status != SERVER_MODEL_STATUS_LOADING;
}
return false;
});
}
bool server_models::ensure_model_loaded(const std::string & name) {
auto meta = get_meta(name);
if (!meta.has_value()) {
throw std::runtime_error("model name=" + name + " is not found");
}
if (meta->status == SERVER_MODEL_STATUS_LOADED) {
return false; // already loaded
}
if (meta->status == SERVER_MODEL_STATUS_UNLOADED) {
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
load(name, true);
}
SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
wait_until_loaded(name);
// check final status
meta = get_meta(name);
if (!meta.has_value() || meta->is_failed()) {
throw std::runtime_error("model name=" + name + " failed to load");
}
return true;
}
server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used) {
auto meta = get_meta(name);
if (!meta.has_value()) {
throw std::runtime_error("model name=" + name + " is not found");
}
if (meta->status != SERVER_MODEL_STATUS_LOADED) {
throw std::invalid_argument("model name=" + name + " is not loaded");
}
if (update_last_used) {
std::unique_lock<std::mutex> lk(mutex);
mapping[name].meta.last_used = ggml_time_ms();
}
SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port);
auto proxy = std::make_unique<server_http_proxy>(
method,
base_params.hostname,
meta->port,
req.path,
req.headers,
req.body,
req.should_stop);
return proxy;
}
std::thread server_models::setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler) {
// send a notification to the router server that a model instance is ready
// TODO @ngxson : use HTTP client from libcommon
httplib::Client cli(base_params.hostname, router_port);
cli.set_connection_timeout(0, 200000); // 200 milliseconds
httplib::Request req;
req.method = "POST";
req.path = "/models/status";
req.set_header("Content-Type", "application/json");
if (!base_params.api_keys.empty()) {
req.set_header("Authorization", "Bearer " + base_params.api_keys[0]);
}
json body;
body["model"] = name;
body["value"] = server_model_status_to_string(SERVER_MODEL_STATUS_LOADED);
req.body = body.dump();
SRV_INF("notifying router server (port=%d) that model %s is ready\n", router_port, name.c_str());
auto result = cli.send(std::move(req));
if (result.error() != httplib::Error::Success) {
auto err_str = httplib::to_string(result.error());
SRV_ERR("failed to notify router server: %s\n", err_str.c_str());
exit(1); // force exit
}
// setup thread for monitoring stdin
return std::thread([shutdown_handler]() {
// wait for EOF on stdin
SRV_INF("%s", "child server monitoring thread started, waiting for EOF on stdin...\n");
bool eof = false;
while (true) {
std::string line;
if (!std::getline(std::cin, line)) {
// EOF detected; this means the router server unexpectedly exited or was killed
eof = true;
break;
}
if (line.find(CMD_EXIT) != std::string::npos) {
SRV_INF("%s", "exit command received, exiting...\n");
shutdown_handler(0);
break;
}
}
if (eof) {
SRV_INF("%s", "EOF on stdin detected, forcing shutdown...\n");
exit(1);
}
});
}
//
// server_models_routes
//
static void res_ok(std::unique_ptr<server_http_res> & res, const json & response_data) {
res->status = 200;
res->data = safe_json_to_str(response_data);
}
static void res_err(std::unique_ptr<server_http_res> & res, const json & error_data) {
res->status = json_value(error_data, "code", 500);
res->data = safe_json_to_str({{ "error", error_data }});
}
static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
if (name.empty()) {
res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
return false;
}
auto meta = models.get_meta(name);
if (!meta.has_value()) {
res_err(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
return false;
}
if (models_autoload) {
models.ensure_model_loaded(name);
} else {
if (meta->status != SERVER_MODEL_STATUS_LOADED) {
res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
return false;
}
}
return true;
}
static bool is_autoload(const common_params & params, const server_http_req & req) {
std::string autoload = req.get_param("autoload");
if (autoload.empty()) {
return params.models_autoload;
} else {
return autoload == "true" || autoload == "1";
}
}
void server_models_routes::init_routes() {
this->get_router_props = [this](const server_http_req & req) {
std::string name = req.get_param("model");
if (name.empty()) {
// main instance
auto res = std::make_unique<server_http_res>();
res_ok(res, {
// TODO: add support for this on web UI
{"role", "router"},
{"max_instances", 4}, // dummy value for testing
// this is a dummy response to make sure webui doesn't break
{"model_alias", "llama-server"},
{"model_path", "none"},
{"default_generation_settings", {
{"params", json{}},
{"n_ctx", 0},
}},
});
return res;
}
return proxy_get(req);
};
this->proxy_get = [this](const server_http_req & req) {
std::string method = "GET";
std::string name = req.get_param("model");
bool autoload = is_autoload(params, req);
auto error_res = std::make_unique<server_http_res>();
if (!router_validate_model(name, models, autoload, error_res)) {
return error_res;
}
return models.proxy_request(req, method, name, false);
};
this->proxy_post = [this](const server_http_req & req) {
std::string method = "POST";
json body = json::parse(req.body);
std::string name = json_value(body, "model", std::string());
bool autoload = is_autoload(params, req);
auto error_res = std::make_unique<server_http_res>();
if (!router_validate_model(name, models, autoload, error_res)) {
return error_res;
}
return models.proxy_request(req, method, name, true); // update last usage for POST request only
};
this->get_router_models = [this](const server_http_req &) {
auto res = std::make_unique<server_http_res>();
json models_json = json::array();
auto all_models = models.get_all_meta();
std::time_t t = std::time(0);
for (const auto & meta : all_models) {
json status {
{"value", server_model_status_to_string(meta.status)},
{"args", meta.args},
};
if (meta.is_failed()) {
status["exit_code"] = meta.exit_code;
status["failed"] = true;
}
models_json.push_back(json {
{"id", meta.name},
{"object", "model"}, // for OAI-compat
{"owned_by", "llamacpp"}, // for OAI-compat
{"created", t}, // for OAI-compat
{"in_cache", meta.in_cache},
{"path", meta.path},
{"status", status},
// TODO: add other fields, may require reading GGUF metadata
});
}
res_ok(res, {
{"data", models_json},
{"object", "list"},
});
return res;
};
this->post_router_models_load = [this](const server_http_req & req) {
auto res = std::make_unique<server_http_res>();
json body = json::parse(req.body);
std::string name = json_value(body, "model", std::string());
auto model = models.get_meta(name);
if (!model.has_value()) {
res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
return res;
}
if (model->status == SERVER_MODEL_STATUS_LOADED) {
res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
return res;
}
models.load(name, false);
res_ok(res, {{"success", true}});
return res;
};
// used by child process to notify the router about status change
// TODO @ngxson : maybe implement authentication for this endpoint in the future
this->post_router_models_status = [this](const server_http_req & req) {
auto res = std::make_unique<server_http_res>();
json body = json::parse(req.body);
std::string model = json_value(body, "model", std::string());
std::string value = json_value(body, "value", std::string());
models.update_status(model, server_model_status_from_string(value));
res_ok(res, {{"success", true}});
return res;
};
this->post_router_models_unload = [this](const server_http_req & req) {
auto res = std::make_unique<server_http_res>();
json body = json::parse(req.body);
std::string name = json_value(body, "model", std::string());
auto model = models.get_meta(name);
if (!model.has_value()) {
res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
return res;
}
if (model->status != SERVER_MODEL_STATUS_LOADED) {
res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
return res;
}
models.unload(name);
res_ok(res, {{"success", true}});
return res;
};
}
//
// server_http_proxy
//
// simple implementation of a pipe
// used for streaming data between threads
template<typename T>
struct pipe_t {
std::mutex mutex;
std::condition_variable cv;
std::queue<T> queue;
std::atomic<bool> writer_closed{false};
std::atomic<bool> reader_closed{false};
void close_write() {
writer_closed.store(true, std::memory_order_relaxed);
cv.notify_all();
}
void close_read() {
reader_closed.store(true, std::memory_order_relaxed);
cv.notify_all();
}
bool read(T & output, const std::function<bool()> & should_stop) {
std::unique_lock<std::mutex> lk(mutex);
constexpr auto poll_interval = std::chrono::milliseconds(500);
while (true) {
if (!queue.empty()) {
output = std::move(queue.front());
queue.pop();
return true;
}
if (writer_closed.load()) {
return false; // clean EOF
}
if (should_stop()) {
close_read(); // signal broken pipe to writer
return false; // cancelled / reader no longer alive
}
cv.wait_for(lk, poll_interval);
}
}
bool write(T && data) {
std::lock_guard<std::mutex> lk(mutex);
if (reader_closed.load()) {
return false; // broken pipe
}
queue.push(std::move(data));
cv.notify_one();
return true;
}
};
server_http_proxy::server_http_proxy(
const std::string & method,
const std::string & host,
int port,
const std::string & path,
const std::map<std::string, std::string> & headers,
const std::string & body,
const std::function<bool()> should_stop) {
// shared between reader and writer threads
auto cli = std::make_shared<httplib::Client>(host, port);
auto pipe = std::make_shared<pipe_t<msg_t>>();
// setup Client
cli->set_connection_timeout(0, 200000); // 200 milliseconds
this->status = 500; // to be overwritten upon response
this->cleanup = [pipe]() {
pipe->close_read();
pipe->close_write();
};
// wire up the receive end of the pipe
this->next = [pipe, should_stop](std::string & out) -> bool {
msg_t msg;
bool has_next = pipe->read(msg, should_stop);
if (!msg.data.empty()) {
out = std::move(msg.data);
}
return has_next; // false if EOF or pipe broken
};
// wire up the HTTP client
// note: do NOT capture `this` pointer, as it may be destroyed before the thread ends
httplib::ResponseHandler response_handler = [pipe, cli](const httplib::Response & response) {
msg_t msg;
msg.status = response.status;
for (const auto & [key, value] : response.headers) {
msg.headers[key] = value;
}
return pipe->write(std::move(msg)); // send headers first
};
httplib::ContentReceiverWithProgress content_receiver = [pipe](const char * data, size_t data_length, size_t, size_t) {
// send data chunks
// returns false if pipe is closed / broken (signal to stop receiving)
return pipe->write({{}, 0, std::string(data, data_length)});
};
// prepare the request to destination server
httplib::Request req;
{
req.method = method;
req.path = path;
for (const auto & [key, value] : headers) {
req.set_header(key, value);
}
req.body = body;
req.response_handler = response_handler;
req.content_receiver = content_receiver;
}
// start the proxy thread
SRV_DBG("start proxy thread %s %s\n", req.method.c_str(), req.path.c_str());
this->thread = std::thread([cli, pipe, req]() {
auto result = cli->send(std::move(req));
if (result.error() != httplib::Error::Success) {
auto err_str = httplib::to_string(result.error());
SRV_ERR("http client error: %s\n", err_str.c_str());
pipe->write({{}, 500, ""}); // header
pipe->write({{}, 0, "proxy error: " + err_str}); // body
}
pipe->close_write(); // signal EOF to reader
SRV_DBG("%s", "client request thread ended\n");
});
this->thread.detach();
// wait for the first chunk (headers)
msg_t header;
if (pipe->read(header, should_stop)) {
SRV_DBG("%s", "received response headers\n");
this->status = header.status;
this->headers = header.headers;
} else {
SRV_DBG("%s", "no response headers received (request cancelled?)\n");
}
}
View File
@ -0,0 +1,174 @@
#pragma once
#include "common.h"
#include "server-http.h"
#include <mutex>
#include <condition_variable>
#include <functional>
#include <memory>
/**
 * state diagram:
 *
 *    UNLOADED ------> LOADING ------> LOADED
 *       ^                |               |
 *       |     failed     |               |
 *       +----------------+               |
 *       |              unloaded          |
 *       +--------------------------------+
 */
enum server_model_status {
// TODO: also add downloading state when the logic is added
SERVER_MODEL_STATUS_UNLOADED,
SERVER_MODEL_STATUS_LOADING,
SERVER_MODEL_STATUS_LOADED
};
static server_model_status server_model_status_from_string(const std::string & status_str) {
if (status_str == "unloaded") {
return SERVER_MODEL_STATUS_UNLOADED;
}
if (status_str == "loading") {
return SERVER_MODEL_STATUS_LOADING;
}
if (status_str == "loaded") {
return SERVER_MODEL_STATUS_LOADED;
}
throw std::runtime_error("invalid server model status");
}
static std::string server_model_status_to_string(server_model_status status) {
switch (status) {
case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
case SERVER_MODEL_STATUS_LOADING: return "loading";
case SERVER_MODEL_STATUS_LOADED: return "loaded";
default: return "unknown";
}
}
struct server_model_meta {
std::string name;
std::string path;
std::string path_mmproj; // only available if in_cache=false
bool in_cache = false; // if true, use -hf; use -m otherwise
int port = 0;
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0; // for LRU unloading
std::vector<std::string> args; // additional args passed to the model instance (used for debugging)
int exit_code = 0; // exit code of the model instance process (meaningful only when the instance failed, see is_failed())
bool is_active() const {
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
}
bool is_failed() const {
return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0;
}
};
struct subprocess_s;
struct server_models {
private:
struct instance_t {
std::shared_ptr<subprocess_s> subproc; // shared between main thread and monitoring thread
std::thread th;
server_model_meta meta;
FILE * stdin_file = nullptr;
};
std::mutex mutex;
std::condition_variable cv;
std::map<std::string, instance_t> mapping;
common_params base_params;
std::vector<std::string> base_args;
std::vector<std::string> base_env;
void update_meta(const std::string & name, const server_model_meta & meta);
// unload least recently used models if the limit is reached
void unload_lru();
public:
server_models(const common_params & params, int argc, char ** argv, char ** envp);
// check if a model instance exists
bool has_model(const std::string & name);
// return a copy of model metadata
std::optional<server_model_meta> get_meta(const std::string & name);
// return a copy of all model metadata
std::vector<server_model_meta> get_all_meta();
// if auto_load is true, load the model with previous args if any
void load(const std::string & name, bool auto_load);
void unload(const std::string & name);
void unload_all();
// update the status of a model instance
void update_status(const std::string & name, server_model_status status);
// wait until the model instance is fully loaded
// return when the model is loaded or failed to load
void wait_until_loaded(const std::string & name);
// load the model if not loaded, otherwise do nothing
// return false if model is already loaded; return true otherwise (meta may need to be refreshed)
bool ensure_model_loaded(const std::string & name);
// proxy an HTTP request to the model instance
server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);
// notify the router server that a model instance is ready
// return the monitoring thread (to be joined by the caller)
static std::thread setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler);
};
struct server_models_routes {
common_params params;
server_models models;
server_models_routes(const common_params & params, int argc, char ** argv, char ** envp)
: params(params), models(params, argc, argv, envp) {
init_routes();
}
void init_routes();
// handlers using lambda function, so that they can capture `this` without `std::bind`
server_http_context::handler_t get_router_props;
server_http_context::handler_t proxy_get;
server_http_context::handler_t proxy_post;
server_http_context::handler_t get_router_models;
server_http_context::handler_t post_router_models_load;
server_http_context::handler_t post_router_models_status;
server_http_context::handler_t post_router_models_unload;
};
/**
* A simple HTTP proxy that forwards requests to another server
* and relays the responses back.
*/
struct server_http_proxy : server_http_res {
std::function<void()> cleanup = nullptr;
public:
server_http_proxy(const std::string & method,
const std::string & host,
int port,
const std::string & path,
const std::map<std::string, std::string> & headers,
const std::string & body,
const std::function<bool()> should_stop);
~server_http_proxy() {
if (cleanup) {
cleanup();
}
}
private:
std::thread thread;
struct msg_t {
std::map<std::string, std::string> headers;
int status = 0;
std::string data;
};
};
View File
@ -450,9 +450,6 @@ task_params server_task::params_from_json_cmpl(
}
}
std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias;
params.oaicompat_model = json_value(data, "model", model_name);
return params;
}
View File
@ -1,5 +1,6 @@
#include "server-context.h"
#include "server-http.h"
#include "server-models.h"
#include "arg.h"
#include "common.h"
@ -33,30 +34,36 @@ static inline void signal_handler(int signal) {
static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr {
std::string message;
error_type error;
try {
return func(req);
} catch (const std::invalid_argument & e) {
error = ERROR_TYPE_INVALID_REQUEST;
message = e.what();
} catch (const std::exception & e) {
error = ERROR_TYPE_SERVER;
message = e.what();
} catch (...) {
error = ERROR_TYPE_SERVER;
message = "unknown error";
}
auto res = std::make_unique<server_http_res>();
res->status = 500;
try {
json error_data = format_error_response(message, ERROR_TYPE_SERVER);
json error_data = format_error_response(message, error);
res->status = json_value(error_data, "code", 500);
res->data = safe_json_to_str({{ "error", error_data }});
LOG_WRN("got exception: %s\n", res->data.c_str());
SRV_WRN("got exception: %s\n", res->data.c_str());
} catch (const std::exception & e) {
LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str());
SRV_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str());
res->data = "Internal Server Error";
}
return res;
};
}
int main(int argc, char ** argv) {
int main(int argc, char ** argv, char ** envp) {
// own arguments required by this example
common_params params;
@ -75,6 +82,11 @@ int main(int argc, char ** argv) {
params.kv_unified = true;
}
// for consistency between server router mode and single-model mode, we set the same model name as alias
if (params.model_alias.empty() && !params.model.name.empty()) {
params.model_alias = params.model.name;
}
common_init();
// struct that contains llama context and inference
@ -101,6 +113,42 @@ int main(int argc, char ** argv) {
// register API routes
server_routes routes(params, ctx_server, [&ctx_http]() { return ctx_http.is_ready.load(); });
bool is_router_server = params.model.path.empty();
std::optional<server_models_routes> models_routes{};
if (is_router_server) {
// setup server instances manager
models_routes.emplace(params, argc, argv, envp);
// proxy handlers
// note: routes.get_health stays the same
routes.get_metrics = models_routes->proxy_get;
routes.post_props = models_routes->proxy_post;
routes.get_api_show = models_routes->proxy_get;
routes.post_completions = models_routes->proxy_post;
routes.post_completions_oai = models_routes->proxy_post;
routes.post_chat_completions = models_routes->proxy_post;
routes.post_anthropic_messages = models_routes->proxy_post;
routes.post_anthropic_count_tokens = models_routes->proxy_post;
routes.post_infill = models_routes->proxy_post;
routes.post_embeddings = models_routes->proxy_post;
routes.post_embeddings_oai = models_routes->proxy_post;
routes.post_rerank = models_routes->proxy_post;
routes.post_tokenize = models_routes->proxy_post;
routes.post_detokenize = models_routes->proxy_post;
routes.post_apply_template = models_routes->proxy_post;
routes.get_lora_adapters = models_routes->proxy_get;
routes.post_lora_adapters = models_routes->proxy_post;
routes.get_slots = models_routes->proxy_get;
routes.post_slots = models_routes->proxy_post;
// custom routes for router
routes.get_props = models_routes->get_router_props;
routes.get_models = models_routes->get_router_models;
ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load));
ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
ctx_http.post("/models/status", ex_wrapper(models_routes->post_router_models_status));
}
ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics));
@ -140,8 +188,33 @@ int main(int argc, char ** argv) {
// Start the server
//
std::function<void()> clean_up;
if (is_router_server) {
LOG_INF("%s: starting router server, no model will be loaded in this process\n", __func__);
clean_up = [&models_routes]() {
SRV_INF("%s: cleaning up before exit...\n", __func__);
if (models_routes.has_value()) {
models_routes->models.unload_all();
}
llama_backend_free();
};
if (!ctx_http.start()) {
clean_up();
LOG_ERR("%s: exiting due to HTTP server error\n", __func__);
return 1;
}
ctx_http.is_ready.store(true);
shutdown_handler = [&](int) {
ctx_http.stop();
};
} else {
// setup clean up function, to be called before exit
auto clean_up = [&ctx_http, &ctx_server]() {
clean_up = [&ctx_http, &ctx_server]() {
SRV_INF("%s: cleaning up before exit...\n", __func__);
ctx_http.stop();
ctx_server.terminate();
@ -176,6 +249,7 @@ int main(int argc, char ** argv) {
// this will unblock start_loop()
ctx_server.terminate();
};
}
// TODO: refactor in common/console
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
@ -192,16 +266,39 @@ int main(int argc, char ** argv) {
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
if (is_router_server) {
LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
LOG_INF("%s: NOTE: router mode is experimental\n", __func__);
LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__);
if (ctx_http.thread.joinable()) {
ctx_http.thread.join(); // keep the main thread alive
}
// when the HTTP server stops, clean up and exit
clean_up();
} else {
LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
LOG_INF("%s: starting the main loop...\n", __func__);
// this call blocks the main thread until ctx_server.terminate() is called
// optionally, notify router server that this instance is ready
const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT");
std::thread monitor_thread;
if (router_port != nullptr) {
monitor_thread = server_models::setup_child_server(params, std::atoi(router_port), params.model_alias, shutdown_handler);
}
// this call blocks the main thread until queue_tasks.terminate() is called
ctx_server.start_loop();
clean_up();
if (ctx_http.thread.joinable()) {
ctx_http.thread.join();
}
if (monitor_thread.joinable()) {
monitor_thread.join();
}
llama_memory_breakdown_print(ctx_server.get_llama_context());
}
return 0;
}
View File
@ -41,7 +41,8 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
assert res.status_code == 200
assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
assert res.body["system_fingerprint"].startswith("b")
assert res.body["model"] == model if model is not None else server.model_alias
# we no longer reflect back the model name, see https://github.com/ggml-org/llama.cpp/pull/17668
# assert res.body["model"] == model if model is not None else server.model_alias
assert res.body["usage"]["prompt_tokens"] == n_prompt
assert res.body["usage"]["completion_tokens"] == n_predicted
choice = res.body["choices"][0]
@ -59,7 +60,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
)
def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
global server
server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL
server.model_alias = "llama-test-model"
server.start()
res = server.make_stream_request("POST", "/chat/completions", data={
"max_tokens": max_tokens,
@ -81,7 +82,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
else:
assert "role" not in choice["delta"]
assert data["system_fingerprint"].startswith("b")
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
assert data["model"] == "llama-test-model"
if last_cmpl_id is None:
last_cmpl_id = data["id"]
assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream
@ -198,7 +199,7 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
choice = res.body["choices"][0]
assert match_regex(re_content, choice["message"]["content"])
else:
assert res.status_code != 200
assert res.status_code == 400
assert "error" in res.body
View File
@ -0,0 +1,50 @@
import pytest
from utils import *
server: ServerProcess
@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.router()
@pytest.mark.parametrize(
"model,success",
[
("ggml-org/tinygemma3-GGUF:Q8_0", True),
("non-existent/model", False),
]
)
def test_router_chat_completion_stream(model: str, success: bool):
# TODO: make sure the model is in cache (ie. ServerProcess.load_all()) before starting the router server
global server
server.start()
content = ""
ex: ServerError | None = None
try:
res = server.make_stream_request("POST", "/chat/completions", data={
"model": model,
"max_tokens": 16,
"messages": [
{"role": "user", "content": "hello"},
],
"stream": True,
})
for data in res:
if data["choices"]:
choice = data["choices"][0]
if choice["finish_reason"] in ["stop", "length"]:
assert "content" not in choice["delta"]
else:
assert choice["finish_reason"] is None
content += choice["delta"]["content"] or ''
except ServerError as e:
ex = e
if success:
assert ex is None
assert len(content) > 0
else:
assert ex is not None
assert content == ""
View File
@ -46,7 +46,7 @@ class ServerProcess:
debug: bool = False
server_port: int = 8080
server_host: str = "127.0.0.1"
model_hf_repo: str = "ggml-org/models"
model_hf_repo: str | None = "ggml-org/models"
model_hf_file: str | None = "tinyllamas/stories260K.gguf"
model_alias: str = "tinyllama-2"
temperature: float = 0.8
@ -521,9 +521,8 @@ class ServerPreset:
server = ServerProcess()
server.offline = True # will be downloaded by load_all()
# mmproj is already provided by HF registry API
server.model_hf_repo = "ggml-org/tinygemma3-GGUF"
server.model_hf_file = "tinygemma3-Q8_0.gguf"
server.mmproj_url = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/mmproj-tinygemma3.gguf"
server.model_hf_file = None
server.model_hf_repo = "ggml-org/tinygemma3-GGUF:Q8_0"
server.model_alias = "tinygemma3"
server.n_ctx = 1024
server.n_batch = 32
@ -532,6 +531,21 @@ class ServerPreset:
server.seed = 42
return server
@staticmethod
def router() -> ServerProcess:
server = ServerProcess()
# router server has no models
server.model_file = None
server.model_alias = None
server.model_hf_repo = None
server.model_hf_file = None
server.n_ctx = 1024
server.n_batch = 16
server.n_slots = 1
server.n_predict = 16
server.seed = 42
return server
def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
"""
View File
@ -1,7 +1,7 @@
import type { StorybookConfig } from '@storybook/sveltekit';
const config: StorybookConfig = {
stories: ['../src/**/*.mdx', '../src/**/*.stories.@(js|ts|svelte)'],
stories: ['../tests/stories/**/*.mdx', '../tests/stories/**/*.stories.@(js|ts|svelte)'],
addons: [
'@storybook/addon-svelte-csf',
'@chromatic-com/storybook',

View File

@ -2,65 +2,685 @@
A modern, feature-rich web interface for llama.cpp built with SvelteKit. This UI provides an intuitive chat interface with advanced file handling, conversation management, and comprehensive model interaction capabilities.
The WebUI supports two server operation modes:
- **MODEL mode** - Single model operation (standard llama-server)
- **ROUTER mode** - Multi-model operation with dynamic model loading/unloading
---
## Table of Contents
- [Features](#features)
- [Getting Started](#getting-started)
- [Tech Stack](#tech-stack)
- [Build Pipeline](#build-pipeline)
- [Architecture](#architecture)
- [Data Flows](#data-flows)
- [Architectural Patterns](#architectural-patterns)
- [Testing](#testing)
---
## Features
- **Modern Chat Interface** - Clean, responsive design with dark/light mode
- **File Attachments** - Support for images, text files, PDFs, and audio with rich previews and drag-and-drop support
- **Conversation Management** - Create, edit, branch, and search conversations
- **Advanced Markdown** - Code highlighting, math formulas (KaTeX), and content blocks
- **Reasoning Content** - Support for models with thinking blocks
- **Keyboard Shortcuts** - Keyboard navigation (Shift+Ctrl/Cmd+O for new chat, Shift+Ctrl/Cmd+E for edit conversation, Shift+Ctrl/Cmd+D for delete conversation, Ctrl/Cmd+K for search, Ctrl/Cmd+V for paste, Ctrl/Cmd+B for opening/collapsing sidebar)
- **Request Tracking** - Monitor processing with slots endpoint integration
- **UI Testing** - Storybook component library with automated tests
### Chat Interface
- **Streaming responses** with real-time updates
- **Reasoning content** - Support for models with thinking/reasoning blocks
- **Dark/light theme** with system preference detection
- **Responsive design** for desktop and mobile
### File Attachments
- **Images** - JPEG, PNG, GIF, WebP, SVG (with PNG conversion)
- **Documents** - PDF (text extraction or image conversion for vision models)
- **Audio** - MP3, WAV for audio-capable models
- **Text files** - Source code, markdown, and other text formats
- **Drag-and-drop** and paste support with rich previews
### Conversation Management
- **Branching** - Branch conversations at any point by editing messages or regenerating responses, and navigate between branches
- **Regeneration** - Regenerate responses with optional model switching (ROUTER mode)
- **Import/Export** - JSON format for backup and sharing
- **Search** - Find conversations by title or content
### Advanced Rendering
- **Syntax highlighting** - Code blocks with language detection
- **Math formulas** - KaTeX rendering for LaTeX expressions
- **Markdown** - Full GFM support with tables, lists, and more
### Multi-Model Support (ROUTER mode)
- **Model selector** with Loaded/Available groups
- **Automatic loading** - Models load on selection
- **Modality validation** - Prevents sending images to non-vision models
- **LRU unloading** - Server auto-manages model cache
### Keyboard Shortcuts
| Shortcut | Action |
| ------------------ | -------------------- |
| `Shift+Ctrl/Cmd+O` | New chat |
| `Shift+Ctrl/Cmd+E` | Edit conversation |
| `Shift+Ctrl/Cmd+D` | Delete conversation |
| `Ctrl/Cmd+K` | Search conversations |
| `Ctrl/Cmd+B` | Toggle sidebar |
### Developer Experience
- **Request tracking** - Monitor token generation with `/slots` endpoint
- **Storybook** - Component library with visual testing
- **Hot reload** - Instant updates during development
---
## Getting Started
### Prerequisites
- **Node.js** 18+ (20+ recommended)
- **npm** 9+
- **llama-server** running locally (for API access)
### 1. Install Dependencies
```bash
cd tools/server/webui
npm install
```
### 2. Start llama-server
In a separate terminal, start the backend server:
```bash
# Single model (MODEL mode)
./llama-server -m model.gguf
# Multi-model (ROUTER mode)
./llama-server --model-store /path/to/models
```
### 3. Start Development Servers
```bash
npm run dev
```
This starts:
- **Vite dev server** at `http://localhost:5173` - The main WebUI
- **Storybook** at `http://localhost:6006` - Component documentation
The Vite dev server proxies API requests to `http://localhost:8080` (default llama-server port):
```typescript
// vite.config.ts proxy configuration
proxy: {
'/v1': 'http://localhost:8080',
'/props': 'http://localhost:8080',
'/slots': 'http://localhost:8080',
'/models': 'http://localhost:8080'
}
```
### Development Workflow
1. Open `http://localhost:5173` in your browser
2. Make changes to `.svelte`, `.ts`, or `.css` files
3. Changes hot-reload instantly
4. Use Storybook at `http://localhost:6006` for isolated component development
---
## Tech Stack
| Layer | Technology | Purpose |
| ----------------- | ------------------------------- | -------------------------------------------------------- |
| **Framework** | SvelteKit + Svelte 5 | Reactive UI with runes (`$state`, `$derived`, `$effect`) |
| **UI Components** | shadcn-svelte + bits-ui | Accessible, customizable component library |
| **Styling** | TailwindCSS 4 | Utility-first CSS with design tokens |
| **Database** | IndexedDB (Dexie) | Client-side storage for conversations and messages |
| **Build** | Vite | Fast bundling with static adapter |
| **Testing** | Playwright + Vitest + Storybook | E2E, unit, and visual testing |
| **Markdown** | remark + rehype | Markdown processing with KaTeX and syntax highlighting |
### Key Dependencies
```json
{
"svelte": "^5.0.0",
"bits-ui": "^2.8.11",
"dexie": "^4.0.11",
"pdfjs-dist": "^5.4.54",
"highlight.js": "^11.11.1",
"rehype-katex": "^7.0.1"
}
```
---
## Build Pipeline
### Development Build
```bash
npm run dev
```
Runs Vite in development mode with:
- Hot Module Replacement (HMR)
- Source maps
- Proxy to llama-server
### Production Build
```bash
npm run build
```
The build process:
1. **Vite Build** - Bundles all TypeScript, Svelte, and CSS
2. **Static Adapter** - Outputs to `../public` (llama-server's static file directory)
3. **Post-Build Script** - Cleans up intermediate files
4. **Custom Plugin** - Creates `index.html.gz` with:
- Inlined favicon as base64
- GZIP compression (level 9)
- Deterministic output (zeroed timestamps)
```text
tools/server/webui/          → build →    tools/server/public/
├── src/                                  ├── index.html.gz (served by llama-server)
├── static/                               └── (favicon inlined)
└── ...
```
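
The deterministic-gzip step can be approximated in a few lines of Node. This is a sketch of the idea only, not the actual Vite plugin; the input/output paths are assumptions:

```typescript
// Sketch only: reproduce "level 9 + zeroed timestamp" with node:zlib
import { gzipSync } from 'node:zlib';
import { readFileSync, writeFileSync } from 'node:fs';

const html = readFileSync('../public/index.html'); // path assumed
const gz = gzipSync(html, { level: 9 }); // maximum compression
gz.writeUInt32LE(0, 4); // zero the gzip MTIME field (bytes 4-7) for deterministic output
writeFileSync('../public/index.html.gz', gz);
```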
### SvelteKit Configuration
```javascript
// svelte.config.js
adapter: adapter({
pages: '../public', // Output directory
assets: '../public', // Static assets
fallback: 'index.html', // SPA fallback
strict: true
}),
output: {
bundleStrategy: 'inline' // Single-file bundle
}
```
### Integration with llama-server
The WebUI is embedded directly into the llama-server binary:
1. `npm run build` outputs `index.html.gz` to `tools/server/public/`
2. llama-server compiles this into the binary at build time
3. When accessing `/`, llama-server serves the gzipped HTML
4. All assets are inlined (CSS, JS, fonts, favicon)
This results in a **single portable binary** with the full WebUI included.
---
## Architecture
The WebUI follows a layered architecture with unidirectional data flow:
```text
Routes → Components → Hooks → Stores → Services → Storage/API
```
### High-Level Architecture
See: [`docs/architecture/high-level-architecture-simplified.md`](docs/architecture/high-level-architecture-simplified.md)
```mermaid
flowchart TB
subgraph Routes["📍 Routes"]
R1["/ (Welcome)"]
R2["/chat/[id]"]
RL["+layout.svelte"]
end
subgraph Components["🧩 Components"]
C_Sidebar["ChatSidebar"]
C_Screen["ChatScreen"]
C_Form["ChatForm"]
C_Messages["ChatMessages"]
C_ModelsSelector["ModelsSelector"]
C_Settings["ChatSettings"]
end
subgraph Stores["🗄️ Stores"]
S1["chatStore"]
S2["conversationsStore"]
S3["modelsStore"]
S4["serverStore"]
S5["settingsStore"]
end
subgraph Services["⚙️ Services"]
SV1["ChatService"]
SV2["ModelsService"]
SV3["PropsService"]
SV4["DatabaseService"]
end
subgraph Storage["💾 Storage"]
ST1["IndexedDB"]
ST2["LocalStorage"]
end
subgraph APIs["🌐 llama-server"]
API1["/v1/chat/completions"]
API2["/props"]
API3["/models/*"]
end
R1 & R2 --> C_Screen
RL --> C_Sidebar
C_Screen --> C_Form & C_Messages & C_Settings
C_Screen --> S1 & S2
C_ModelsSelector --> S3 & S4
S1 --> SV1 & SV4
S3 --> SV2 & SV3
SV4 --> ST1
SV1 --> API1
SV2 --> API3
SV3 --> API2
```
### Layer Breakdown
#### Routes (`src/routes/`)
- **`/`** - Welcome screen, creates new conversation
- **`/chat/[id]`** - Active chat interface
- **`+layout.svelte`** - Sidebar, navigation, global initialization
#### Components (`src/lib/components/`)
Components are organized in `app/` (application-specific) and `ui/` (shadcn-svelte primitives).
**Chat Components** (`app/chat/`):
| Component | Responsibility |
| ------------------ | --------------------------------------------------------------------------- |
| `ChatScreen/` | Main chat container, coordinates message list, input form, and attachments |
| `ChatForm/` | Message input textarea with file upload, paste handling, keyboard shortcuts |
| `ChatMessages/` | Message list with branch navigation, regenerate/continue/edit actions |
| `ChatAttachments/` | File attachment previews, drag-and-drop, PDF/image/audio handling |
| `ChatSettings/` | Parameter sliders (temperature, top-p, etc.) with server default sync |
| `ChatSidebar/` | Conversation list, search, import/export, navigation |
**Dialog Components** (`app/dialogs/`):
| Component | Responsibility |
| ------------------------------- | -------------------------------------------------------- |
| `DialogChatSettings` | Full-screen settings configuration |
| `DialogModelInformation` | Model details (context size, modalities, parallel slots) |
| `DialogChatAttachmentPreview` | Full preview for images, PDFs (text or page view), code |
| `DialogConfirmation` | Generic confirmation for destructive actions |
| `DialogConversationTitleUpdate` | Edit conversation title |
**Server/Model Components** (`app/server/`, `app/models/`):
| Component | Responsibility |
| ------------------- | --------------------------------------------------------- |
| `ServerErrorSplash` | Error display when server is unreachable |
| `ModelsSelector` | Model dropdown with Loaded/Available groups (ROUTER mode) |
**Shared UI Components** (`app/misc/`):
| Component | Responsibility |
| -------------------------------- | ---------------------------------------------------------------- |
| `MarkdownContent` | Markdown rendering with KaTeX, syntax highlighting, copy buttons |
| `SyntaxHighlightedCode` | Code blocks with language detection and highlighting |
| `ActionButton`, `ActionDropdown` | Reusable action buttons and menus |
| `BadgeModality`, `BadgeInfo` | Status and capability badges |
#### Hooks (`src/lib/hooks/`)
- **`useModelChangeValidation`** - Validates model switch against conversation modalities
- **`useProcessingState`** - Tracks streaming progress and token generation
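
A hedged usage sketch of `useModelChangeValidation`, following the hook configuration shown in [`models-flow.md`](docs/flows/models-flow.md); the destructured return shape and the toast message are assumptions:

```typescript
// Hypothetical wiring; option names follow docs/flows/models-flow.md
const { handleModelChange } = useModelChangeValidation({
	getRequiredModalities: () => conversationsStore.usedModalities,
	onValidationFailure: () => toast.error('Model does not support the required modalities')
});

// Returns true when the target model passed modality validation
const switched = await handleModelChange(modelId, modelName);
```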
#### Stores (`src/lib/stores/`)
| Store | Responsibility |
| -------------------- | --------------------------------------------------------- |
| `chatStore` | Message sending, streaming, abort control, error handling |
| `conversationsStore` | CRUD for conversations, message branching, navigation |
| `modelsStore` | Model list, selection, loading/unloading (ROUTER) |
| `serverStore` | Server properties, role detection, modalities |
| `settingsStore` | User preferences, parameter sync with server defaults |
#### Services (`src/lib/services/`)
| Service | Responsibility |
| ---------------------- | ----------------------------------------------- |
| `ChatService`          | API calls to `/v1/chat/completions`, SSE parsing |
| `ModelsService` | `/models`, `/models/load`, `/models/unload` |
| `PropsService` | `/props`, `/props?model=` |
| `DatabaseService` | IndexedDB operations via Dexie |
| `ParameterSyncService` | Syncs settings with server defaults |
---
## Data Flows
### MODEL Mode (Single Model)
See: [`docs/flows/data-flow-simplified-model-mode.md`](docs/flows/data-flow-simplified-model-mode.md)
```mermaid
sequenceDiagram
participant User
participant UI
participant Stores
participant DB as IndexedDB
participant API as llama-server
Note over User,API: Initialization
UI->>Stores: initialize()
Stores->>DB: load conversations
Stores->>API: GET /props
API-->>Stores: server config
Stores->>API: GET /v1/models
API-->>Stores: single model (auto-selected)
Note over User,API: Chat Flow
User->>UI: send message
Stores->>DB: save user message
Stores->>API: POST /v1/chat/completions (stream)
loop streaming
API-->>Stores: SSE chunks
Stores-->>UI: reactive update
end
Stores->>DB: save assistant message
```
### ROUTER Mode (Multi-Model)
See: [`docs/flows/data-flow-simplified-router-mode.md`](docs/flows/data-flow-simplified-router-mode.md)
```mermaid
sequenceDiagram
participant User
participant UI
participant Stores
participant API as llama-server
Note over User,API: Initialization
Stores->>API: GET /props
API-->>Stores: {role: "router"}
Stores->>API: GET /models
API-->>Stores: models[] with status
Note over User,API: Model Selection
User->>UI: select model
alt model not loaded
Stores->>API: POST /models/load
loop poll status
Stores->>API: GET /models
end
Stores->>API: GET /props?model=X
end
Stores->>Stores: validate modalities
Note over User,API: Chat Flow
Stores->>API: POST /v1/chat/completions {model: X}
loop streaming
API-->>Stores: SSE chunks + model info
end
```
### Detailed Flow Diagrams
| Flow | Description | File |
| ------------- | ------------------------------------------ | ----------------------------------------------------------- |
| Chat | Message lifecycle, streaming, regeneration | [`chat-flow.md`](docs/flows/chat-flow.md) |
| Models | Loading, unloading, modality caching | [`models-flow.md`](docs/flows/models-flow.md) |
| Server | Props fetching, role detection | [`server-flow.md`](docs/flows/server-flow.md) |
| Conversations | CRUD, branching, import/export | [`conversations-flow.md`](docs/flows/conversations-flow.md) |
| Database | IndexedDB schema, operations | [`database-flow.md`](docs/flows/database-flow.md) |
| Settings | Parameter sync, user overrides | [`settings-flow.md`](docs/flows/settings-flow.md) |
---
## Architectural Patterns
### 1. Reactive State with Svelte 5 Runes
All stores use Svelte 5's fine-grained reactivity:
```typescript
// Store with reactive state
class ChatStore {
#isLoading = $state(false);
#currentResponse = $state('');
// Derived values auto-update (reading $state in a getter stays reactive;
// $derived is only valid as a declaration initializer)
get isStreaming() {
return this.#isLoading && this.#currentResponse.length > 0;
}
}
// Exported reactive accessors
export const isLoading = () => chatStore.isLoading;
export const currentResponse = () => chatStore.currentResponse;
```
### 2. Unidirectional Data Flow
Data flows in one direction, making state predictable:
```mermaid
flowchart LR
subgraph UI["UI Layer"]
A[User Action] --> B[Component]
end
subgraph State["State Layer"]
B --> C[Store Method]
C --> D[State Update]
end
subgraph IO["I/O Layer"]
C --> E[Service]
E --> F[API / IndexedDB]
F -.->|Response| D
end
D -->|Reactive| B
```
Components dispatch actions to stores, stores coordinate with services for I/O, and state updates reactively propagate back to the UI.
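
For instance, deleting a conversation walks this loop end to end. A minimal sketch, with method names taken from the store and service tables above (bodies assumed, types as defined elsewhere in the codebase):

```typescript
class ConversationsStore {
	conversations = $state<DatabaseConversation[]>([]);

	async deleteConversation(id: string): Promise<void> {
		await DatabaseService.deleteConversation(id); // I/O goes through the service layer
		// the reactive state update propagates back to every subscribed component
		this.conversations = this.conversations.filter((c) => c.id !== id);
	}
}
```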
### 3. Per-Conversation State
Enables concurrent streaming across multiple conversations:
```typescript
class ChatStore {
chatLoadingStates = new Map<string, boolean>();
chatStreamingStates = new Map<string, { response: string; messageId: string }>();
abortControllers = new Map<string, AbortController>();
}
```
### 4. Message Branching with Tree Structure
Conversations are stored as a tree, not a linear list:
```typescript
interface DatabaseMessage {
id: string;
parent: string | null; // Points to parent message
children: string[]; // List of child message IDs
// ...
}
interface DatabaseConversation {
currentNode: string; // Currently viewed branch tip
// ...
}
```
Navigation between branches updates `currentNode` without losing history.
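
The current branch is resolved by walking parent links from `currentNode` back to the root. A minimal sketch of that filter, matching the behavior of the `filterByLeafNodeId` helper described in the database flow docs (implementation details assumed):

```typescript
function filterByLeafNodeId(messages: DatabaseMessage[], leafId: string): DatabaseMessage[] {
	const byId = new Map<string, DatabaseMessage>(messages.map((m) => [m.id, m]));
	const path: DatabaseMessage[] = [];
	let node = byId.get(leafId);
	while (node) {
		path.unshift(node); // walk parent links from the leaf back to the root
		node = node.parent ? byId.get(node.parent) : undefined;
	}
	return path; // root → … → leaf
}
```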
### 5. Layered Service Architecture
Stores handle state; services handle I/O:
```text
┌─────────────────┐
│ Stores │ Business logic, state management
├─────────────────┤
│ Services │ API calls, database operations
├─────────────────┤
│ Storage/API │ IndexedDB, LocalStorage, HTTP
└─────────────────┘
```
### 6. Server Role Abstraction
Single codebase handles both MODEL and ROUTER modes:
```typescript
// serverStore.ts
get isRouterMode() {
return this.role === ServerRole.ROUTER;
}
// Components conditionally render based on mode
{#if isRouterMode()}
<ModelsSelector />
{/if}
```
### 7. Modality Validation
Prevents sending attachments to incompatible models:
```typescript
// useModelChangeValidation hook
const validate = (modelId: string) => {
const modelModalities = modelsStore.getModelModalities(modelId);
const conversationModalities = conversationsStore.usedModalities;
// Check if model supports all used modalities
if (conversationModalities.hasImages && !modelModalities.vision) {
return { valid: false, reason: 'Model does not support images' };
}
// ...
};
```
### 8. Persistent Storage Strategy
Data is persisted across sessions using two storage mechanisms:
```mermaid
flowchart TB
subgraph Browser["Browser Storage"]
subgraph IDB["IndexedDB (Dexie)"]
C[Conversations]
M[Messages]
end
subgraph LS["LocalStorage"]
S[Settings Config]
O[User Overrides]
T[Theme Preference]
end
end
subgraph Stores["Svelte Stores"]
CS[conversationsStore] --> C
CS --> M
SS[settingsStore] --> S
SS --> O
SS --> T
end
```
- **IndexedDB**: Conversations and messages (large, structured data)
- **LocalStorage**: Settings, user parameter overrides, theme (small key-value data)
- **Memory only**: Server props, model list (fetched fresh on each session)
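
A minimal sketch of the LocalStorage side; the storage key and merge behavior are assumptions:

```typescript
// Hypothetical helpers around a single 'config' key
function saveConfig(config: Record<string, unknown>): void {
	localStorage.setItem('config', JSON.stringify(config));
}

function loadConfig<T extends Record<string, unknown>>(fallback: T): T {
	const raw = localStorage.getItem('config');
	// merge stored values over defaults so new settings keys still appear
	return raw ? { ...fallback, ...JSON.parse(raw) } : fallback;
}
```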
---
## Testing
### Test Types
| Type | Tool | Location | Command |
| ------------- | ------------------ | -------------------------------- | ------------------- |
| **E2E** | Playwright | `tests/e2e/` | `npm run test:e2e` |
| **Unit** | Vitest | `tests/client/`, `tests/server/` | `npm run test:unit` |
| **UI/Visual** | Storybook + Vitest | `tests/stories/` | `npm run test:ui` |
### Running Tests
```bash
# All tests
npm run test
# Individual test suites
npm run test:e2e # End-to-end (requires llama-server)
npm run test:client # Client-side unit tests
npm run test:server # Server-side unit tests
npm run test:ui # Storybook visual tests
```
### Storybook Development
```bash
npm run storybook # Start Storybook dev server on :6006
npm run build-storybook # Build static Storybook
```
### Linting and Formatting
```bash
npm run lint # Check code style
npm run format # Auto-format with Prettier
npm run check # TypeScript type checking
```
---
## Project Structure
```text
tools/server/webui/
├── src/
│ ├── lib/
│ │ ├── components/ # UI components (app/, ui/)
│ │ ├── hooks/ # Svelte hooks
│ │ ├── stores/ # State management
│ │ ├── services/ # API and database services
│ │ ├── types/ # TypeScript interfaces
│ │ └── utils/ # Utility functions
│ ├── routes/ # SvelteKit routes
│ └── styles/ # Global styles
├── static/ # Static assets
├── tests/ # Test files
├── docs/ # Architecture diagrams
│ ├── architecture/ # High-level architecture
│ └── flows/ # Feature-specific flows
└── .storybook/ # Storybook configuration
```
---
## Related Documentation
- [llama.cpp Server README](../README.md) - Full server documentation
- [Multimodal Documentation](../../../docs/multimodal.md) - Image and audio support
- [Function Calling](../../../docs/function-calling.md) - Tool use capabilities

View File

@ -0,0 +1,102 @@
```mermaid
flowchart TB
subgraph Routes["📍 Routes"]
R1["/ (Welcome)"]
R2["/chat/[id]"]
RL["+layout.svelte"]
end
subgraph Components["🧩 Components"]
C_Sidebar["ChatSidebar"]
C_Screen["ChatScreen"]
C_Form["ChatForm"]
C_Messages["ChatMessages"]
C_ModelsSelector["ModelsSelector"]
C_Settings["ChatSettings"]
end
subgraph Hooks["🪝 Hooks"]
H1["useModelChangeValidation"]
H2["useProcessingState"]
end
subgraph Stores["🗄️ Stores"]
S1["chatStore<br/><i>Chat interactions & streaming</i>"]
S2["conversationsStore<br/><i>Conversation data & messages</i>"]
S3["modelsStore<br/><i>Model selection & loading</i>"]
S4["serverStore<br/><i>Server props & role detection</i>"]
S5["settingsStore<br/><i>User configuration</i>"]
end
subgraph Services["⚙️ Services"]
SV1["ChatService"]
SV2["ModelsService"]
SV3["PropsService"]
SV4["DatabaseService"]
SV5["ParameterSyncService"]
end
subgraph Storage["💾 Storage"]
ST1["IndexedDB<br/><i>conversations, messages</i>"]
ST2["LocalStorage<br/><i>config, userOverrides</i>"]
end
subgraph APIs["🌐 llama-server API"]
API1["/v1/chat/completions"]
API2["/props"]
API3["/models/*"]
API4["/v1/models"]
end
%% Routes → Components
R1 & R2 --> C_Screen
RL --> C_Sidebar
%% Component hierarchy
C_Screen --> C_Form & C_Messages & C_Settings
C_Form & C_Messages --> C_ModelsSelector
%% Components → Hooks → Stores
C_Form & C_Messages --> H1 & H2
H1 --> S3 & S4
H2 --> S1 & S5
%% Components → Stores
C_Screen --> S1 & S2
C_Sidebar --> S2
C_ModelsSelector --> S3 & S4
C_Settings --> S5
%% Stores → Services
S1 --> SV1 & SV4
S2 --> SV4
S3 --> SV2 & SV3
S4 --> SV3
S5 --> SV5
%% Services → Storage
SV4 --> ST1
SV5 --> ST2
%% Services → APIs
SV1 --> API1
SV2 --> API3 & API4
SV3 --> API2
%% Styling
classDef routeStyle fill:#e1f5fe,stroke:#01579b,stroke-width:2px
classDef componentStyle fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
classDef hookStyle fill:#fff8e1,stroke:#ff8f00,stroke-width:2px
classDef storeStyle fill:#fff3e0,stroke:#e65100,stroke-width:2px
classDef serviceStyle fill:#e8f5e9,stroke:#2e7d32,stroke-width:2px
classDef storageStyle fill:#fce4ec,stroke:#c2185b,stroke-width:2px
classDef apiStyle fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
class R1,R2,RL routeStyle
class C_Sidebar,C_Screen,C_Form,C_Messages,C_ModelsSelector,C_Settings componentStyle
class H1,H2 hookStyle
class S1,S2,S3,S4,S5 storeStyle
class SV1,SV2,SV3,SV4,SV5 serviceStyle
class ST1,ST2 storageStyle
class API1,API2,API3,API4 apiStyle
```

View File

@ -0,0 +1,269 @@
```mermaid
flowchart TB
subgraph Routes["📍 Routes"]
R1["/ (+page.svelte)"]
R2["/chat/[id]"]
RL["+layout.svelte"]
end
subgraph Components["🧩 Components"]
direction TB
subgraph LayoutComponents["Layout"]
C_Sidebar["ChatSidebar"]
C_Screen["ChatScreen"]
end
subgraph ChatUIComponents["Chat UI"]
C_Form["ChatForm"]
C_Messages["ChatMessages"]
C_Message["ChatMessage"]
C_Attach["ChatAttachments"]
C_ModelsSelector["ModelsSelector"]
C_Settings["ChatSettings"]
end
end
subgraph Hooks["🪝 Hooks"]
H1["useModelChangeValidation"]
H2["useProcessingState"]
H3["isMobile"]
end
subgraph Stores["🗄️ Stores"]
direction TB
subgraph S1["chatStore"]
S1State["<b>State:</b><br/>isLoading, currentResponse<br/>errorDialogState<br/>activeProcessingState<br/>chatLoadingStates<br/>chatStreamingStates<br/>abortControllers<br/>processingStates<br/>activeConversationId<br/>isStreamingActive"]
S1LoadState["<b>Loading State:</b><br/>setChatLoading()<br/>isChatLoading()<br/>syncLoadingStateForChat()<br/>clearUIState()<br/>isChatLoadingPublic()<br/>getAllLoadingChats()<br/>getAllStreamingChats()"]
S1ProcState["<b>Processing State:</b><br/>setActiveProcessingConversation()<br/>getProcessingState()<br/>clearProcessingState()<br/>getActiveProcessingState()<br/>updateProcessingStateFromTimings()<br/>getCurrentProcessingStateSync()<br/>restoreProcessingStateFromMessages()"]
S1Stream["<b>Streaming:</b><br/>streamChatCompletion()<br/>startStreaming()<br/>stopStreaming()<br/>stopGeneration()<br/>isStreaming()"]
S1Error["<b>Error Handling:</b><br/>showErrorDialog()<br/>dismissErrorDialog()<br/>isAbortError()"]
S1Msg["<b>Message Operations:</b><br/>addMessage()<br/>sendMessage()<br/>updateMessage()<br/>deleteMessage()<br/>getDeletionInfo()"]
S1Regen["<b>Regeneration:</b><br/>regenerateMessage()<br/>regenerateMessageWithBranching()<br/>continueAssistantMessage()"]
S1Edit["<b>Editing:</b><br/>editAssistantMessage()<br/>editUserMessagePreserveResponses()<br/>editMessageWithBranching()"]
S1Utils["<b>Utilities:</b><br/>getApiOptions()<br/>parseTimingData()<br/>getOrCreateAbortController()<br/>getConversationModel()"]
end
subgraph S2["conversationsStore"]
S2State["<b>State:</b><br/>conversations<br/>activeConversation<br/>activeMessages<br/>usedModalities<br/>isInitialized<br/>titleUpdateConfirmationCallback"]
S2Modal["<b>Modalities:</b><br/>getModalitiesUpToMessage()<br/>calculateModalitiesFromMessages()"]
S2Lifecycle["<b>Lifecycle:</b><br/>initialize()<br/>loadConversations()<br/>clearActiveConversation()"]
S2ConvCRUD["<b>Conversation CRUD:</b><br/>createConversation()<br/>loadConversation()<br/>deleteConversation()<br/>updateConversationName()<br/>updateConversationTitleWithConfirmation()"]
S2MsgMgmt["<b>Message Management:</b><br/>refreshActiveMessages()<br/>addMessageToActive()<br/>updateMessageAtIndex()<br/>findMessageIndex()<br/>sliceActiveMessages()<br/>removeMessageAtIndex()<br/>getConversationMessages()"]
S2Nav["<b>Navigation:</b><br/>navigateToSibling()<br/>updateCurrentNode()<br/>updateConversationTimestamp()"]
S2Export["<b>Import/Export:</b><br/>downloadConversation()<br/>exportAllConversations()<br/>importConversations()<br/>triggerDownload()"]
S2Utils["<b>Utilities:</b><br/>setTitleUpdateConfirmationCallback()"]
end
subgraph S3["modelsStore"]
S3State["<b>State:</b><br/>models, routerModels<br/>selectedModelId<br/>selectedModelName<br/>loading, updating, error<br/>modelLoadingStates<br/>modelPropsCache<br/>modelPropsFetching<br/>propsCacheVersion"]
S3Getters["<b>Computed Getters:</b><br/>selectedModel<br/>loadedModelIds<br/>loadingModelIds<br/>singleModelName"]
S3Modal["<b>Modalities:</b><br/>getModelModalities()<br/>modelSupportsVision()<br/>modelSupportsAudio()<br/>getModelModalitiesArray()<br/>getModelProps()<br/>updateModelModalities()"]
S3Status["<b>Status Queries:</b><br/>isModelLoaded()<br/>isModelOperationInProgress()<br/>getModelStatus()<br/>isModelPropsFetching()"]
S3Fetch["<b>Data Fetching:</b><br/>fetch()<br/>fetchRouterModels()<br/>fetchModelProps()<br/>fetchModalitiesForLoadedModels()"]
S3Select["<b>Model Selection:</b><br/>selectModelById()<br/>selectModelByName()<br/>clearSelection()<br/>findModelByName()<br/>findModelById()<br/>hasModel()"]
S3LoadUnload["<b>Loading/Unloading Models:</b><br/>loadModel()<br/>unloadModel()<br/>ensureModelLoaded()<br/>waitForModelStatus()<br/>pollForModelStatus()"]
S3Utils["<b>Utilities:</b><br/>toDisplayName()<br/>clear()"]
end
subgraph S4["serverStore"]
S4State["<b>State:</b><br/>props<br/>loading, error<br/>role<br/>fetchPromise"]
S4Getters["<b>Getters:</b><br/>defaultParams<br/>contextSize<br/>isRouterMode<br/>isModelMode"]
S4Data["<b>Data Handling:</b><br/>fetch()<br/>getErrorMessage()<br/>clear()"]
S4Utils["<b>Utilities:</b><br/>detectRole()"]
end
subgraph S5["settingsStore"]
S5State["<b>State:</b><br/>config<br/>theme<br/>isInitialized<br/>userOverrides"]
S5Lifecycle["<b>Lifecycle:</b><br/>initialize()<br/>loadConfig()<br/>saveConfig()<br/>loadTheme()<br/>saveTheme()"]
S5Update["<b>Config Updates:</b><br/>updateConfig()<br/>updateMultipleConfig()<br/>updateTheme()"]
S5Reset["<b>Reset:</b><br/>resetConfig()<br/>resetTheme()<br/>resetAll()<br/>resetParameterToServerDefault()"]
S5Sync["<b>Server Sync:</b><br/>syncWithServerDefaults()<br/>forceSyncWithServerDefaults()"]
S5Utils["<b>Utilities:</b><br/>getConfig()<br/>getAllConfig()<br/>getParameterInfo()<br/>getParameterDiff()<br/>getServerDefaults()<br/>clearAllUserOverrides()"]
end
subgraph ReactiveExports["⚡ Reactive Exports"]
direction LR
subgraph ChatExports["chatStore"]
RE1["isLoading()"]
RE2["currentResponse()"]
RE3["errorDialog()"]
RE4["activeProcessingState()"]
RE5["isChatStreaming()"]
RE6["isChatLoading()"]
RE7["getChatStreaming()"]
RE8["getAllLoadingChats()"]
RE9["getAllStreamingChats()"]
end
subgraph ConvExports["conversationsStore"]
RE10["conversations()"]
RE11["activeConversation()"]
RE12["activeMessages()"]
RE13["isConversationsInitialized()"]
RE14["usedModalities()"]
end
subgraph ModelsExports["modelsStore"]
RE15["modelOptions()"]
RE16["routerModels()"]
RE17["modelsLoading()"]
RE18["modelsUpdating()"]
RE19["modelsError()"]
RE20["selectedModelId()"]
RE21["selectedModelName()"]
RE22["selectedModelOption()"]
RE23["loadedModelIds()"]
RE24["loadingModelIds()"]
RE25["propsCacheVersion()"]
RE26["singleModelName()"]
end
subgraph ServerExports["serverStore"]
RE27["serverProps()"]
RE28["serverLoading()"]
RE29["serverError()"]
RE30["serverRole()"]
RE31["defaultParams()"]
RE32["contextSize()"]
RE33["isRouterMode()"]
RE34["isModelMode()"]
end
subgraph SettingsExports["settingsStore"]
RE35["config()"]
RE36["theme()"]
RE37["isInitialized()"]
end
end
end
subgraph Services["⚙️ Services"]
direction TB
subgraph SV1["ChatService"]
SV1Msg["<b>Messaging:</b><br/>sendMessage()"]
SV1Stream["<b>Streaming:</b><br/>handleStreamResponse()<br/>parseSSEChunk()"]
SV1Convert["<b>Conversion:</b><br/>convertMessageToChatData()<br/>convertExtraToApiFormat()"]
SV1Utils["<b>Utilities:</b><br/>extractReasoningContent()<br/>getServerProps()<br/>getModels()"]
end
subgraph SV2["ModelsService"]
SV2List["<b>Listing:</b><br/>list()<br/>listRouter()"]
SV2LoadUnload["<b>Load/Unload:</b><br/>load()<br/>unload()"]
SV2Status["<b>Status:</b><br/>isModelLoaded()<br/>isModelLoading()"]
end
subgraph SV3["PropsService"]
SV3Fetch["<b>Fetching:</b><br/>fetch()<br/>fetchForModel()"]
end
subgraph SV4["DatabaseService"]
SV4Conv["<b>Conversations:</b><br/>createConversation()<br/>getConversation()<br/>getAllConversations()<br/>updateConversation()<br/>deleteConversation()"]
SV4Msg["<b>Messages:</b><br/>createMessageBranch()<br/>createRootMessage()<br/>getConversationMessages()<br/>updateMessage()<br/>deleteMessage()<br/>deleteMessageCascading()"]
SV4Node["<b>Navigation:</b><br/>updateCurrentNode()"]
SV4Import["<b>Import:</b><br/>importConversations()"]
end
subgraph SV5["ParameterSyncService"]
SV5Extract["<b>Extraction:</b><br/>extractServerDefaults()"]
SV5Merge["<b>Merging:</b><br/>mergeWithServerDefaults()"]
SV5Info["<b>Info:</b><br/>getParameterInfo()<br/>canSyncParameter()<br/>getSyncableParameterKeys()<br/>validateServerParameter()"]
SV5Diff["<b>Diff:</b><br/>createParameterDiff()"]
end
end
subgraph Storage["💾 Storage"]
ST1["IndexedDB"]
ST2["conversations"]
ST3["messages"]
ST5["LocalStorage"]
ST6["config"]
ST7["userOverrides"]
end
subgraph APIs["🌐 llama-server API"]
API1["/v1/chat/completions"]
API2["/props<br/>/props?model="]
API3["/models<br/>/models/load<br/>/models/unload"]
API4["/v1/models"]
end
%% Routes render Components
R1 --> C_Screen
R2 --> C_Screen
RL --> C_Sidebar
%% Component hierarchy
C_Screen --> C_Form & C_Messages & C_Settings
C_Messages --> C_Message
C_Message --> C_ModelsSelector
C_Form --> C_ModelsSelector
C_Form --> C_Attach
C_Message --> C_Attach
%% Components use Hooks
C_Form --> H1
C_Message --> H1 & H2
C_Screen --> H2
%% Hooks use Stores
H1 --> S3 & S4
H2 --> S1 & S5
%% Components use Stores
C_Screen --> S1 & S2
C_Messages --> S2
C_Message --> S1 & S2 & S3
C_Form --> S1 & S3
C_Sidebar --> S2
C_ModelsSelector --> S3 & S4
C_Settings --> S5
%% Stores export Reactive State
S1 -. exports .-> ChatExports
S2 -. exports .-> ConvExports
S3 -. exports .-> ModelsExports
S4 -. exports .-> ServerExports
S5 -. exports .-> SettingsExports
%% Stores use Services
S1 --> SV1 & SV4
S2 --> SV4
S3 --> SV2 & SV3
S4 --> SV3
S5 --> SV5
%% Services to Storage
SV4 --> ST1
ST1 --> ST2 & ST3
SV5 --> ST5
ST5 --> ST6 & ST7
%% Services to APIs
SV1 --> API1
SV2 --> API3 & API4
SV3 --> API2
%% Styling
classDef routeStyle fill:#e1f5fe,stroke:#01579b,stroke-width:2px
classDef componentStyle fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
classDef componentGroupStyle fill:#e1bee7,stroke:#7b1fa2,stroke-width:1px
classDef storeStyle fill:#fff3e0,stroke:#e65100,stroke-width:2px
classDef stateStyle fill:#ffe0b2,stroke:#e65100,stroke-width:1px
classDef methodStyle fill:#ffecb3,stroke:#e65100,stroke-width:1px
classDef reactiveStyle fill:#fffde7,stroke:#f9a825,stroke-width:1px
classDef serviceStyle fill:#e8f5e9,stroke:#2e7d32,stroke-width:2px
classDef serviceMStyle fill:#c8e6c9,stroke:#2e7d32,stroke-width:1px
classDef storageStyle fill:#fce4ec,stroke:#c2185b,stroke-width:2px
classDef apiStyle fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
class R1,R2,RL routeStyle
class C_Sidebar,C_Screen,C_Form,C_Messages,C_Message componentStyle
class C_ModelsSelector,C_Settings componentStyle
class C_Attach componentStyle
class H1,H2,H3 methodStyle
class LayoutComponents,ChatUIComponents componentGroupStyle
class Hooks storeStyle
class S1,S2,S3,S4,S5 storeStyle
class S1State,S2State,S3State,S4State,S5State stateStyle
class S1Msg,S1Regen,S1Edit,S1Stream,S1LoadState,S1ProcState,S1Error,S1Utils methodStyle
class S2Lifecycle,S2ConvCRUD,S2MsgMgmt,S2Nav,S2Modal,S2Export,S2Utils methodStyle
class S3Getters,S3Modal,S3Status,S3Fetch,S3Select,S3LoadUnload,S3Utils methodStyle
class S4Getters,S4Data,S4Utils methodStyle
class S5Lifecycle,S5Update,S5Reset,S5Sync,S5Utils methodStyle
class ChatExports,ConvExports,ModelsExports,ServerExports,SettingsExports reactiveStyle
class SV1,SV2,SV3,SV4,SV5 serviceStyle
class SV1Msg,SV1Stream,SV1Convert,SV1Utils serviceMStyle
class SV2List,SV2LoadUnload,SV2Status serviceMStyle
class SV3Fetch serviceMStyle
class SV4Conv,SV4Msg,SV4Node,SV4Import serviceMStyle
class SV5Extract,SV5Merge,SV5Info,SV5Diff serviceMStyle
class ST1,ST2,ST3,ST5,ST6,ST7 storageStyle
class API1,API2,API3,API4 apiStyle
```

View File

@ -0,0 +1,174 @@
```mermaid
sequenceDiagram
participant UI as 🧩 ChatForm / ChatMessage
participant chatStore as 🗄️ chatStore
participant convStore as 🗄️ conversationsStore
participant settingsStore as 🗄️ settingsStore
participant ChatSvc as ⚙️ ChatService
participant DbSvc as ⚙️ DatabaseService
participant API as 🌐 /v1/chat/completions
Note over chatStore: State:<br/>isLoading, currentResponse<br/>errorDialogState, activeProcessingState<br/>chatLoadingStates (Map)<br/>chatStreamingStates (Map)<br/>abortControllers (Map)<br/>processingStates (Map)
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: 💬 SEND MESSAGE
%% ═══════════════════════════════════════════════════════════════════════════
UI->>chatStore: sendMessage(content, extras)
activate chatStore
chatStore->>chatStore: setChatLoading(convId, true)
chatStore->>chatStore: clearChatStreaming(convId)
alt no active conversation
chatStore->>convStore: createConversation()
Note over convStore: → see conversations-flow.mmd
end
chatStore->>chatStore: addMessage("user", content, extras)
chatStore->>DbSvc: createMessageBranch(userMsg, parentId)
chatStore->>convStore: addMessageToActive(userMsg)
chatStore->>convStore: updateCurrentNode(userMsg.id)
chatStore->>chatStore: createAssistantMessage(userMsg.id)
chatStore->>DbSvc: createMessageBranch(assistantMsg, userMsg.id)
chatStore->>convStore: addMessageToActive(assistantMsg)
chatStore->>chatStore: streamChatCompletion(messages, assistantMsg)
deactivate chatStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: 🌊 STREAMING
%% ═══════════════════════════════════════════════════════════════════════════
activate chatStore
chatStore->>chatStore: startStreaming()
Note right of chatStore: isStreamingActive = true
chatStore->>chatStore: setActiveProcessingConversation(convId)
chatStore->>chatStore: getOrCreateAbortController(convId)
Note right of chatStore: abortControllers.set(convId, new AbortController())
chatStore->>chatStore: getApiOptions()
Note right of chatStore: Merge from settingsStore.config:<br/>temperature, max_tokens, top_p, etc.
chatStore->>ChatSvc: sendMessage(messages, options, signal)
activate ChatSvc
ChatSvc->>ChatSvc: convertMessageToChatData(messages)
Note right of ChatSvc: DatabaseMessage[] → ApiChatMessageData[]<br/>Process attachments (images, PDFs, audio)
ChatSvc->>API: POST /v1/chat/completions
Note right of API: {messages, model?, stream: true, ...params}
loop SSE chunks
API-->>ChatSvc: data: {"choices":[{"delta":{...}}]}
ChatSvc->>ChatSvc: parseSSEChunk(line)
alt content chunk
ChatSvc-->>chatStore: onChunk(content)
chatStore->>chatStore: setChatStreaming(convId, response, msgId)
Note right of chatStore: currentResponse = $state(accumulated)
chatStore->>convStore: updateMessageAtIndex(idx, {content})
end
alt reasoning chunk
ChatSvc-->>chatStore: onReasoningChunk(reasoning)
chatStore->>convStore: updateMessageAtIndex(idx, {thinking})
end
alt tool_calls chunk
ChatSvc-->>chatStore: onToolCallChunk(toolCalls)
chatStore->>convStore: updateMessageAtIndex(idx, {toolCalls})
end
alt model info
ChatSvc-->>chatStore: onModel(modelName)
chatStore->>chatStore: recordModel(modelName)
chatStore->>DbSvc: updateMessage(msgId, {model})
end
alt timings (during stream)
ChatSvc-->>chatStore: onTimings(timings, promptProgress)
chatStore->>chatStore: updateProcessingStateFromTimings()
end
chatStore-->>UI: reactive $state update
end
API-->>ChatSvc: data: [DONE]
ChatSvc-->>chatStore: onComplete(content, reasoning, timings, toolCalls)
deactivate ChatSvc
chatStore->>chatStore: stopStreaming()
chatStore->>DbSvc: updateMessage(msgId, {content, timings, model})
chatStore->>convStore: updateCurrentNode(msgId)
chatStore->>chatStore: setChatLoading(convId, false)
chatStore->>chatStore: clearChatStreaming(convId)
chatStore->>chatStore: clearProcessingState(convId)
deactivate chatStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: ⏹️ STOP GENERATION
%% ═══════════════════════════════════════════════════════════════════════════
UI->>chatStore: stopGeneration()
activate chatStore
chatStore->>chatStore: savePartialResponseIfNeeded(convId)
Note right of chatStore: Save currentResponse to DB if non-empty
chatStore->>chatStore: abortControllers.get(convId).abort()
Note right of chatStore: fetch throws AbortError → caught by isAbortError()
chatStore->>chatStore: stopStreaming()
chatStore->>chatStore: setChatLoading(convId, false)
chatStore->>chatStore: clearChatStreaming(convId)
chatStore->>chatStore: clearProcessingState(convId)
deactivate chatStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: 🔁 REGENERATE
%% ═══════════════════════════════════════════════════════════════════════════
UI->>chatStore: regenerateMessageWithBranching(msgId, model?)
activate chatStore
chatStore->>convStore: findMessageIndex(msgId)
chatStore->>chatStore: Get parent of target message
chatStore->>chatStore: createAssistantMessage(parentId)
chatStore->>DbSvc: createMessageBranch(newAssistantMsg, parentId)
chatStore->>convStore: refreshActiveMessages()
Note right of chatStore: Same streaming flow
chatStore->>chatStore: streamChatCompletion(...)
deactivate chatStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: ➡️ CONTINUE
%% ═══════════════════════════════════════════════════════════════════════════
UI->>chatStore: continueAssistantMessage(msgId)
activate chatStore
chatStore->>chatStore: Get existing content from message
chatStore->>chatStore: streamChatCompletion(..., existingContent)
Note right of chatStore: Appends to existing message content
deactivate chatStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: ✏️ EDIT USER MESSAGE
%% ═══════════════════════════════════════════════════════════════════════════
UI->>chatStore: editUserMessagePreserveResponses(msgId, newContent)
activate chatStore
chatStore->>chatStore: Get parent of target message
chatStore->>DbSvc: createMessageBranch(editedMsg, parentId)
chatStore->>convStore: refreshActiveMessages()
Note right of chatStore: Creates new branch, original preserved
deactivate chatStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: ❌ ERROR HANDLING
%% ═══════════════════════════════════════════════════════════════════════════
Note over chatStore: On stream error (non-abort):
chatStore->>chatStore: showErrorDialog(type, message)
Note right of chatStore: errorDialogState = {type: 'timeout'|'server', message}
chatStore->>convStore: removeMessageAtIndex(failedMsgIdx)
chatStore->>DbSvc: deleteMessage(failedMsgId)
```
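
A minimal TypeScript sketch of the streaming leg above, assuming `fetch` plus `ReadableStream`. The endpoint and the `data: …` / `data: [DONE]` framing come from the diagram; the helper itself is illustrative, not ChatService's actual code:

```typescript
async function streamChat(
	body: unknown,
	onChunk: (delta: string) => void,
	signal: AbortSignal
): Promise<void> {
	const res = await fetch('/v1/chat/completions', {
		method: 'POST',
		headers: { 'Content-Type': 'application/json' },
		body: JSON.stringify(body),
		signal
	});
	const reader = res.body!.getReader();
	const decoder = new TextDecoder();
	let buffer = '';
	for (;;) {
		const { done, value } = await reader.read();
		if (done) break;
		buffer += decoder.decode(value, { stream: true });
		const lines = buffer.split('\n');
		buffer = lines.pop()!; // keep a partial line for the next read
		for (const line of lines) {
			if (!line.startsWith('data: ')) continue;
			const payload = line.slice('data: '.length).trim();
			if (payload === '[DONE]') return; // end of stream
			const delta = JSON.parse(payload).choices?.[0]?.delta?.content;
			if (delta) onChunk(delta); // accumulate into streaming state
		}
	}
}
```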

View File

@ -0,0 +1,155 @@
```mermaid
sequenceDiagram
participant UI as 🧩 ChatSidebar / ChatScreen
participant convStore as 🗄️ conversationsStore
participant chatStore as 🗄️ chatStore
participant DbSvc as ⚙️ DatabaseService
participant IDB as 💾 IndexedDB
Note over convStore: State:<br/>conversations: DatabaseConversation[]<br/>activeConversation: DatabaseConversation | null<br/>activeMessages: DatabaseMessage[]<br/>isInitialized: boolean<br/>usedModalities: $derived({vision, audio})
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,IDB: 🚀 INITIALIZATION
%% ═══════════════════════════════════════════════════════════════════════════
Note over convStore: Auto-initialized in constructor (browser only)
convStore->>convStore: initialize()
activate convStore
convStore->>convStore: loadConversations()
convStore->>DbSvc: getAllConversations()
DbSvc->>IDB: SELECT * FROM conversations ORDER BY lastModified DESC
IDB-->>DbSvc: Conversation[]
DbSvc-->>convStore: conversations
convStore->>convStore: conversations = $state(data)
convStore->>convStore: isInitialized = true
deactivate convStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,IDB: CREATE CONVERSATION
%% ═══════════════════════════════════════════════════════════════════════════
UI->>convStore: createConversation(name?)
activate convStore
convStore->>DbSvc: createConversation(name || "New Chat")
DbSvc->>IDB: INSERT INTO conversations
IDB-->>DbSvc: conversation {id, name, lastModified, currNode: ""}
DbSvc-->>convStore: conversation
convStore->>convStore: conversations.unshift(conversation)
convStore->>convStore: activeConversation = $state(conversation)
convStore->>convStore: activeMessages = $state([])
deactivate convStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,IDB: 📂 LOAD CONVERSATION
%% ═══════════════════════════════════════════════════════════════════════════
UI->>convStore: loadConversation(convId)
activate convStore
convStore->>DbSvc: getConversation(convId)
DbSvc->>IDB: SELECT * FROM conversations WHERE id = ?
IDB-->>DbSvc: conversation
convStore->>convStore: activeConversation = $state(conversation)
convStore->>convStore: refreshActiveMessages()
convStore->>DbSvc: getConversationMessages(convId)
DbSvc->>IDB: SELECT * FROM messages WHERE convId = ?
IDB-->>DbSvc: allMessages[]
convStore->>convStore: filterByLeafNodeId(allMessages, currNode)
Note right of convStore: Filter to show only current branch path
convStore->>convStore: activeMessages = $state(filtered)
convStore->>chatStore: syncLoadingStateForChat(convId)
Note right of chatStore: Sync isLoading/currentResponse if streaming
deactivate convStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,IDB: 🌳 MESSAGE BRANCHING MODEL
%% ═══════════════════════════════════════════════════════════════════════════
Note over IDB: Message Tree Structure:<br/>- Each message has parent (null for root)<br/>- Each message has children[] array<br/>- Conversation.currNode points to active leaf<br/>- filterByLeafNodeId() traverses from root to currNode
rect rgb(240, 240, 255)
Note over convStore: Example Branch Structure:
Note over convStore: root → user1 → assistant1 → user2 → assistant2a (currNode)<br/> ↘ assistant2b (alt branch)
end
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,IDB: ↔️ BRANCH NAVIGATION
%% ═══════════════════════════════════════════════════════════════════════════
UI->>convStore: navigateToSibling(msgId, direction)
activate convStore
convStore->>convStore: Find message in activeMessages
convStore->>convStore: Get parent message
convStore->>convStore: Find sibling in parent.children[]
convStore->>convStore: findLeafNode(siblingId, allMessages)
Note right of convStore: Navigate to leaf of sibling branch
convStore->>convStore: updateCurrentNode(leafId)
convStore->>DbSvc: updateCurrentNode(convId, leafId)
DbSvc->>IDB: UPDATE conversations SET currNode = ?
convStore->>convStore: refreshActiveMessages()
deactivate convStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,IDB: 📝 UPDATE CONVERSATION
%% ═══════════════════════════════════════════════════════════════════════════
UI->>convStore: updateConversationName(convId, newName)
activate convStore
convStore->>DbSvc: updateConversation(convId, {name: newName})
DbSvc->>IDB: UPDATE conversations SET name = ?
convStore->>convStore: Update in conversations array
deactivate convStore
Note over convStore: Auto-title update (after first response):
convStore->>convStore: updateConversationTitleWithConfirmation()
convStore->>convStore: titleUpdateConfirmationCallback?()
Note right of convStore: Shows dialog if title would change
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,IDB: 🗑️ DELETE CONVERSATION
%% ═══════════════════════════════════════════════════════════════════════════
UI->>convStore: deleteConversation(convId)
activate convStore
convStore->>DbSvc: deleteConversation(convId)
DbSvc->>IDB: DELETE FROM conversations WHERE id = ?
DbSvc->>IDB: DELETE FROM messages WHERE convId = ?
convStore->>convStore: conversations.filter(c => c.id !== convId)
alt deleted active conversation
convStore->>convStore: clearActiveConversation()
end
deactivate convStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,IDB: 📊 MODALITY TRACKING
%% ═══════════════════════════════════════════════════════════════════════════
Note over convStore: usedModalities = $derived.by(() => {<br/> calculateModalitiesFromMessages(activeMessages)<br/>})
Note over convStore: Scans activeMessages for attachments:<br/>- IMAGE → vision: true<br/>- PDF (processedAsImages) → vision: true<br/>- AUDIO → audio: true
UI->>convStore: getModalitiesUpToMessage(msgId)
Note right of convStore: Used for regeneration validation<br/>Only checks messages BEFORE target
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,IDB: 📤 EXPORT / 📥 IMPORT
%% ═══════════════════════════════════════════════════════════════════════════
UI->>convStore: exportAllConversations()
activate convStore
convStore->>DbSvc: getAllConversations()
loop each conversation
convStore->>DbSvc: getConversationMessages(convId)
end
convStore->>convStore: triggerDownload(JSON blob)
deactivate convStore
UI->>convStore: importConversations(file)
activate convStore
convStore->>convStore: Parse JSON file
convStore->>DbSvc: importConversations(parsed)
DbSvc->>IDB: Bulk INSERT conversations + messages
convStore->>convStore: loadConversations()
deactivate convStore
```
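
The modality scan in the diagram can be reconstructed roughly as follows. The attachment field names (`type`, `processedAsImages`) are assumptions based on the notes above:

```typescript
type Attachment = { type: 'image' | 'pdf' | 'audio' | 'text'; processedAsImages?: boolean };
type Message = { extra?: Attachment[] };

function calculateModalitiesFromMessages(messages: Message[]): { vision: boolean; audio: boolean } {
	let vision = false;
	let audio = false;
	for (const msg of messages) {
		for (const att of msg.extra ?? []) {
			if (att.type === 'image') vision = true; // IMAGE → vision
			if (att.type === 'pdf' && att.processedAsImages) vision = true; // PDF rendered to pages → vision
			if (att.type === 'audio') audio = true; // AUDIO → audio
		}
	}
	return { vision, audio };
}
```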

View File

@ -0,0 +1,45 @@
```mermaid
%% MODEL Mode Data Flow (single model)
%% Detailed flows: ./flows/server-flow.mmd, ./flows/models-flow.mmd, ./flows/chat-flow.mmd
sequenceDiagram
participant User as 👤 User
participant UI as 🧩 UI
participant Stores as 🗄️ Stores
participant DB as 💾 IndexedDB
participant API as 🌐 llama-server
Note over User,API: 🚀 Initialization (see: server-flow.mmd, models-flow.mmd)
UI->>Stores: initialize()
Stores->>DB: load conversations
Stores->>API: GET /props
API-->>Stores: server config + modalities
Stores->>API: GET /v1/models
API-->>Stores: single model (auto-selected)
Note over User,API: 💬 Chat Flow (see: chat-flow.mmd)
User->>UI: send message
UI->>Stores: sendMessage()
Stores->>DB: save user message
Stores->>API: POST /v1/chat/completions (stream)
loop streaming
API-->>Stores: SSE chunks
Stores-->>UI: reactive update
end
API-->>Stores: done + timings
Stores->>DB: save assistant message
Note over User,API: 🔁 Regenerate
User->>UI: regenerate
Stores->>DB: create message branch
Note right of Stores: same streaming flow
Note over User,API: ⏹️ Stop
User->>UI: stop
Stores->>Stores: abort stream
Stores->>DB: save partial response
```
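
The stop step hinges on one `AbortController` per conversation. A minimal sketch; the map and method names follow chatStore, but the bodies are assumed:

```typescript
const abortControllers = new Map<string, AbortController>();

function getOrCreateAbortController(convId: string): AbortController {
	let ctrl = abortControllers.get(convId);
	if (!ctrl) {
		ctrl = new AbortController();
		abortControllers.set(convId, ctrl);
	}
	return ctrl; // its .signal is passed to fetch()
}

function stopGeneration(convId: string): void {
	abortControllers.get(convId)?.abort(); // fetch rejects with an AbortError
	abortControllers.delete(convId);
}
```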

View File

@ -0,0 +1,77 @@
```mermaid
%% ROUTER Mode Data Flow (multi-model)
%% Detailed flows: ./flows/server-flow.mmd, ./flows/models-flow.mmd, ./flows/chat-flow.mmd
sequenceDiagram
participant User as 👤 User
participant UI as 🧩 UI
participant Stores as 🗄️ Stores
participant DB as 💾 IndexedDB
participant API as 🌐 llama-server
Note over User,API: 🚀 Initialization (see: server-flow.mmd, models-flow.mmd)
UI->>Stores: initialize()
Stores->>DB: load conversations
Stores->>API: GET /props
API-->>Stores: {role: "router"}
Stores->>API: GET /models
API-->>Stores: models[] with status (loaded/available)
loop each loaded model
Stores->>API: GET /props?model=X
API-->>Stores: modalities (vision/audio)
end
Note over User,API: 🔄 Model Selection (see: models-flow.mmd)
User->>UI: select model
alt model not loaded
Stores->>API: POST /models/load
loop poll status
Stores->>API: GET /models
API-->>Stores: check if loaded
end
Stores->>API: GET /props?model=X
API-->>Stores: cache modalities
end
Stores->>Stores: validate modalities vs conversation
alt valid
Stores->>Stores: select model
else invalid
Stores->>API: POST /models/unload
UI->>User: show error toast
end
Note over User,API: 💬 Chat Flow (see: chat-flow.mmd)
User->>UI: send message
UI->>Stores: sendMessage()
Stores->>DB: save user message
Stores->>API: POST /v1/chat/completions {model: X}
Note right of API: router forwards to model
loop streaming
API-->>Stores: SSE chunks + model info
Stores-->>UI: reactive update
end
API-->>Stores: done + timings
Stores->>DB: save assistant message + model used
Note over User,API: 🔁 Regenerate (optional: different model)
User->>UI: regenerate
Stores->>Stores: validate modalities up to this message
Stores->>DB: create message branch
Note right of Stores: same streaming flow
Note over User,API: ⏹️ Stop
User->>UI: stop
Stores->>Stores: abort stream
Stores->>DB: save partial response
Note over User,API: 🗑️ LRU Unloading
Note right of API: Server auto-unloads LRU models<br/>when cache full
User->>UI: select unloaded model
Note right of Stores: triggers load flow again
```
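
A hypothetical sketch of the load-then-poll sequence; the endpoints come from the diagram above, while the request body shape and poll interval are assumptions:

```typescript
async function ensureModelLoaded(modelId: string, timeoutMs = 30_000): Promise<void> {
	const status = async () => {
		const { data } = await fetch('/models').then((r) => r.json());
		return data.find((m: { id: string; status: string }) => m.id === modelId)?.status;
	};

	if ((await status()) === 'loaded') return;

	await fetch('/models/load', {
		method: 'POST',
		headers: { 'Content-Type': 'application/json' },
		body: JSON.stringify({ model: modelId }) // body shape assumed
	});

	const deadline = Date.now() + timeoutMs;
	while (Date.now() < deadline) {
		if ((await status()) === 'loaded') return;
		await new Promise((r) => setTimeout(r, 500)); // poll GET /models until loaded
	}
	throw new Error(`model ${modelId} did not reach "loaded" before timeout`);
}
```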

View File

@ -0,0 +1,155 @@
```mermaid
sequenceDiagram
participant Store as 🗄️ Stores
participant DbSvc as ⚙️ DatabaseService
participant Dexie as 📦 Dexie ORM
participant IDB as 💾 IndexedDB
Note over DbSvc: Stateless service - all methods static<br/>Database: "LlamacppWebui"
%% ═══════════════════════════════════════════════════════════════════════════
Note over Store,IDB: 📊 SCHEMA
%% ═══════════════════════════════════════════════════════════════════════════
rect rgb(240, 248, 255)
Note over IDB: conversations table:<br/>id (PK), lastModified, currNode, name
end
rect rgb(255, 248, 240)
Note over IDB: messages table:<br/>id (PK), convId (FK), type, role, timestamp,<br/>parent, children[], content, thinking,<br/>toolCalls, extra[], model, timings
end
%% ═══════════════════════════════════════════════════════════════════════════
Note over Store,IDB: 💬 CONVERSATIONS CRUD
%% ═══════════════════════════════════════════════════════════════════════════
Store->>DbSvc: createConversation(name)
activate DbSvc
DbSvc->>DbSvc: Generate UUID
DbSvc->>Dexie: db.conversations.add({id, name, lastModified, currNode: ""})
Dexie->>IDB: INSERT
IDB-->>Dexie: success
DbSvc-->>Store: DatabaseConversation
deactivate DbSvc
Store->>DbSvc: getConversation(convId)
DbSvc->>Dexie: db.conversations.get(convId)
Dexie->>IDB: SELECT WHERE id = ?
IDB-->>DbSvc: DatabaseConversation
Store->>DbSvc: getAllConversations()
DbSvc->>Dexie: db.conversations.orderBy('lastModified').reverse().toArray()
Dexie->>IDB: SELECT ORDER BY lastModified DESC
IDB-->>DbSvc: DatabaseConversation[]
Store->>DbSvc: updateConversation(convId, updates)
DbSvc->>Dexie: db.conversations.update(convId, {...updates, lastModified})
Dexie->>IDB: UPDATE
Store->>DbSvc: deleteConversation(convId)
activate DbSvc
DbSvc->>Dexie: db.conversations.delete(convId)
Dexie->>IDB: DELETE FROM conversations
DbSvc->>Dexie: db.messages.where('convId').equals(convId).delete()
Dexie->>IDB: DELETE FROM messages WHERE convId = ?
deactivate DbSvc
%% ═══════════════════════════════════════════════════════════════════════════
Note over Store,IDB: 📝 MESSAGES CRUD
%% ═══════════════════════════════════════════════════════════════════════════
Store->>DbSvc: createRootMessage(convId)
activate DbSvc
DbSvc->>DbSvc: Create root message {type: "root", parent: null}
DbSvc->>Dexie: db.messages.add(rootMsg)
Dexie->>IDB: INSERT
DbSvc-->>Store: rootMessageId
deactivate DbSvc
Store->>DbSvc: createMessageBranch(message, parentId)
activate DbSvc
DbSvc->>DbSvc: Generate UUID for new message
DbSvc->>Dexie: db.messages.add({...message, id, parent: parentId})
Dexie->>IDB: INSERT message
alt parentId exists
DbSvc->>Dexie: db.messages.get(parentId)
Dexie->>IDB: SELECT parent
DbSvc->>DbSvc: parent.children.push(newId)
DbSvc->>Dexie: db.messages.update(parentId, {children})
Dexie->>IDB: UPDATE parent.children
end
DbSvc->>Dexie: db.conversations.update(convId, {currNode: newId})
Dexie->>IDB: UPDATE conversation.currNode
DbSvc-->>Store: DatabaseMessage
deactivate DbSvc
Store->>DbSvc: getConversationMessages(convId)
DbSvc->>Dexie: db.messages.where('convId').equals(convId).toArray()
Dexie->>IDB: SELECT WHERE convId = ?
IDB-->>DbSvc: DatabaseMessage[]
Store->>DbSvc: updateMessage(msgId, updates)
DbSvc->>Dexie: db.messages.update(msgId, updates)
Dexie->>IDB: UPDATE
Store->>DbSvc: deleteMessage(msgId)
DbSvc->>Dexie: db.messages.delete(msgId)
Dexie->>IDB: DELETE
%% ═══════════════════════════════════════════════════════════════════════════
Note over Store,IDB: 🌳 BRANCHING OPERATIONS
%% ═══════════════════════════════════════════════════════════════════════════
Store->>DbSvc: updateCurrentNode(convId, nodeId)
DbSvc->>Dexie: db.conversations.update(convId, {currNode: nodeId, lastModified})
Dexie->>IDB: UPDATE
Store->>DbSvc: deleteMessageCascading(msgId)
activate DbSvc
DbSvc->>DbSvc: findDescendantMessages(msgId, allMessages)
Note right of DbSvc: Recursively find all children
loop each descendant
DbSvc->>Dexie: db.messages.delete(descendantId)
Dexie->>IDB: DELETE
end
DbSvc->>Dexie: db.messages.delete(msgId)
Dexie->>IDB: DELETE target message
deactivate DbSvc
%% ═══════════════════════════════════════════════════════════════════════════
Note over Store,IDB: 📥 IMPORT
%% ═══════════════════════════════════════════════════════════════════════════
Store->>DbSvc: importConversations(data)
activate DbSvc
loop each conversation in data
DbSvc->>DbSvc: Generate new UUIDs (avoid conflicts)
DbSvc->>Dexie: db.conversations.add(conversation)
Dexie->>IDB: INSERT conversation
loop each message
DbSvc->>Dexie: db.messages.add(message)
Dexie->>IDB: INSERT message
end
end
deactivate DbSvc
%% ═══════════════════════════════════════════════════════════════════════════
Note over Store,IDB: 🔗 MESSAGE TREE UTILITIES
%% ═══════════════════════════════════════════════════════════════════════════
Note over DbSvc: Used by stores (imported from utils):
rect rgb(240, 255, 240)
Note over DbSvc: filterByLeafNodeId(messages, leafId)<br/>→ Returns path from root to leaf<br/>→ Used to display current branch
end
rect rgb(240, 255, 240)
Note over DbSvc: findLeafNode(startId, messages)<br/>→ Traverse to deepest child<br/>→ Used for branch navigation
end
rect rgb(240, 255, 240)
Note over DbSvc: findDescendantMessages(msgId, messages)<br/>→ Find all children recursively<br/>→ Used for cascading deletes
end
```
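
A sketch of the Dexie setup implied by the schema notes above; the record shapes follow those notes, while the index choices are assumptions:

```typescript
import Dexie, { type Table } from 'dexie';

interface DatabaseConversation {
	id: string;
	name: string;
	lastModified: number;
	currNode: string;
}

interface DatabaseMessage {
	id: string;
	convId: string;
	parent: string | null;
	children: string[];
	role: string;
	content: string;
}

class LlamacppWebuiDB extends Dexie {
	conversations!: Table<DatabaseConversation, string>;
	messages!: Table<DatabaseMessage, string>;

	constructor() {
		super('LlamacppWebui'); // database name from the note above
		this.version(1).stores({
			conversations: 'id, lastModified', // PK + index backing ORDER BY lastModified
			messages: 'id, convId' // PK + index backing WHERE convId = ?
		});
	}
}

export const db = new LlamacppWebuiDB();
```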

View File

@ -0,0 +1,181 @@
```mermaid
sequenceDiagram
participant UI as 🧩 ModelsSelector
participant Hooks as 🪝 useModelChangeValidation
participant modelsStore as 🗄️ modelsStore
participant serverStore as 🗄️ serverStore
participant convStore as 🗄️ conversationsStore
participant ModelsSvc as ⚙️ ModelsService
participant PropsSvc as ⚙️ PropsService
participant API as 🌐 llama-server
Note over modelsStore: State:<br/>models: ModelOption[]<br/>routerModels: ApiModelDataEntry[]<br/>selectedModelId, selectedModelName<br/>loading, updating, error<br/>modelLoadingStates (Map)<br/>modelPropsCache (Map)<br/>propsCacheVersion
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: 🚀 INITIALIZATION (MODEL mode)
%% ═══════════════════════════════════════════════════════════════════════════
UI->>modelsStore: fetch()
activate modelsStore
modelsStore->>modelsStore: loading = true
alt serverStore.props not loaded
modelsStore->>serverStore: fetch()
Note over serverStore: → see server-flow.mmd
end
modelsStore->>ModelsSvc: list()
ModelsSvc->>API: GET /v1/models
API-->>ModelsSvc: ApiModelListResponse {data: [model]}
modelsStore->>modelsStore: models = $state(mapped)
Note right of modelsStore: Map to ModelOption[]:<br/>{id, name, model, description, capabilities}
Note over modelsStore: MODEL mode: Get modalities from serverStore.props
modelsStore->>modelsStore: modelPropsCache.set(model.id, serverStore.props)
modelsStore->>modelsStore: models[0].modalities = props.modalities
modelsStore->>modelsStore: Auto-select single model
Note right of modelsStore: selectedModelId = models[0].id
modelsStore->>modelsStore: loading = false
deactivate modelsStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: 🚀 INITIALIZATION (ROUTER mode)
%% ═══════════════════════════════════════════════════════════════════════════
UI->>modelsStore: fetch()
activate modelsStore
modelsStore->>ModelsSvc: list()
ModelsSvc->>API: GET /v1/models
API-->>ModelsSvc: ApiModelListResponse
modelsStore->>modelsStore: models = $state(mapped)
deactivate modelsStore
Note over UI: After models are loaded, the layout triggers:
UI->>modelsStore: fetchRouterModels()
activate modelsStore
modelsStore->>ModelsSvc: listRouter()
ModelsSvc->>API: GET /models
API-->>ModelsSvc: ApiRouterModelsListResponse
Note right of API: {data: [{id, status, path, in_cache}]}
modelsStore->>modelsStore: routerModels = $state(data)
modelsStore->>modelsStore: fetchModalitiesForLoadedModels()
loop each model where status === "loaded"
modelsStore->>PropsSvc: fetchForModel(modelId)
PropsSvc->>API: GET /props?model={modelId}
API-->>PropsSvc: ApiLlamaCppServerProps
modelsStore->>modelsStore: modelPropsCache.set(modelId, props)
end
modelsStore->>modelsStore: propsCacheVersion++
deactivate modelsStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: 🔄 MODEL SELECTION (ROUTER mode)
%% ═══════════════════════════════════════════════════════════════════════════
UI->>Hooks: useModelChangeValidation({getRequiredModalities, onSuccess?, onValidationFailure?})
Note over Hooks: Hook configured per-component:<br/>ChatForm: getRequiredModalities = usedModalities<br/>ChatMessage: getRequiredModalities = getModalitiesUpToMessage(msgId)
UI->>Hooks: handleModelChange(modelId, modelName)
activate Hooks
Hooks->>Hooks: previousSelectedModelId = modelsStore.selectedModelId
Hooks->>modelsStore: isModelLoaded(modelName)?
alt model NOT loaded
Hooks->>modelsStore: loadModel(modelName)
Note over modelsStore: → see LOAD MODEL section below
end
Note over Hooks: Always fetch props (from cache or API)
Hooks->>modelsStore: fetchModelProps(modelName)
modelsStore-->>Hooks: props
Hooks->>convStore: getRequiredModalities()
convStore-->>Hooks: {vision, audio}
Hooks->>Hooks: Validate: model.modalities ⊇ required?
alt validation PASSED
Hooks->>modelsStore: selectModelById(modelId)
Hooks-->>UI: return true
else validation FAILED
Hooks->>UI: toast.error("Model doesn't support required modalities")
alt model was just loaded
Hooks->>modelsStore: unloadModel(modelName)
end
alt onValidationFailure provided
Hooks->>modelsStore: selectModelById(previousSelectedModelId)
end
Hooks-->>UI: return false
end
deactivate Hooks
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: ⬆️ LOAD MODEL (ROUTER mode)
%% ═══════════════════════════════════════════════════════════════════════════
modelsStore->>modelsStore: loadModel(modelId)
activate modelsStore
alt already loaded
modelsStore-->>modelsStore: return (no-op)
end
modelsStore->>modelsStore: modelLoadingStates.set(modelId, true)
modelsStore->>ModelsSvc: load(modelId)
ModelsSvc->>API: POST /models/load {model: modelId}
API-->>ModelsSvc: {status: "loading"}
modelsStore->>modelsStore: pollForModelStatus(modelId, LOADED)
loop poll every 500ms (max 60 attempts)
modelsStore->>modelsStore: fetchRouterModels()
modelsStore->>ModelsSvc: listRouter()
ModelsSvc->>API: GET /models
API-->>ModelsSvc: models[]
modelsStore->>modelsStore: getModelStatus(modelId)
alt status === LOADED
Note right of modelsStore: break loop
else status === LOADING
Note right of modelsStore: wait 500ms, continue
end
end
modelsStore->>modelsStore: updateModelModalities(modelId)
modelsStore->>PropsSvc: fetchForModel(modelId)
PropsSvc->>API: GET /props?model={modelId}
API-->>PropsSvc: props with modalities
modelsStore->>modelsStore: modelPropsCache.set(modelId, props)
modelsStore->>modelsStore: propsCacheVersion++
modelsStore->>modelsStore: modelLoadingStates.set(modelId, false)
deactivate modelsStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: ⬇️ UNLOAD MODEL (ROUTER mode)
%% ═══════════════════════════════════════════════════════════════════════════
modelsStore->>modelsStore: unloadModel(modelId)
activate modelsStore
modelsStore->>modelsStore: modelLoadingStates.set(modelId, true)
modelsStore->>ModelsSvc: unload(modelId)
ModelsSvc->>API: POST /models/unload {model: modelId}
modelsStore->>modelsStore: pollForModelStatus(modelId, UNLOADED)
loop poll until unloaded
modelsStore->>ModelsSvc: listRouter()
ModelsSvc->>API: GET /models
end
modelsStore->>modelsStore: modelLoadingStates.set(modelId, false)
deactivate modelsStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: 📊 COMPUTED GETTERS
%% ═══════════════════════════════════════════════════════════════════════════
Note over modelsStore: Getters:<br/>- selectedModel: ModelOption | null<br/>- loadedModelIds: string[] (from routerModels)<br/>- loadingModelIds: string[] (from modelLoadingStates)<br/>- singleModelName: string | null (MODEL mode only)
Note over modelsStore: Modality helpers:<br/>- getModelModalities(modelId): {vision, audio}<br/>- modelSupportsVision(modelId): boolean<br/>- modelSupportsAudio(modelId): boolean
```
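The LOAD/UNLOAD sections boil down to bounded polling against `GET /models` (500 ms interval, up to 60 attempts per the diagram). A hedged sketch of that loop, with the router list call passed in as a parameter since the real store method and response types live elsewhere in this PR:

```ts
// Sketch only — the real method lives in modelsStore; the listRouter
// shape mirrors the GET /models response shown in the diagram.
type RouterModelStatus = 'loaded' | 'loading' | 'unloaded';

interface RouterModelEntry {
	id: string;
	status: RouterModelStatus;
}

async function pollForModelStatus(
	modelId: string,
	target: RouterModelStatus,
	listRouter: () => Promise<{ data: RouterModelEntry[] }>,
	intervalMs = 500,
	maxAttempts = 60
): Promise<boolean> {
	for (let attempt = 0; attempt < maxAttempts; attempt++) {
		const { data } = await listRouter(); // GET /models
		if (data.find((m) => m.id === modelId)?.status === target) {
			return true; // reached the desired status
		}
		await new Promise((resolve) => setTimeout(resolve, intervalMs));
	}
	return false; // gave up after maxAttempts * intervalMs
}
```

The validation step above is simpler still: before committing a selection, the hook checks that the target model's modalities are a superset of those the conversation already requires, and rolls back to the previous model on failure.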

View File

@ -0,0 +1,76 @@
```mermaid
sequenceDiagram
participant UI as 🧩 +layout.svelte
participant serverStore as 🗄️ serverStore
participant PropsSvc as ⚙️ PropsService
participant API as 🌐 llama-server
Note over serverStore: State:<br/>props: ApiLlamaCppServerProps | null<br/>loading, error<br/>role: ServerRole | null (MODEL | ROUTER)<br/>fetchPromise (deduplication)
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: 🚀 INITIALIZATION
%% ═══════════════════════════════════════════════════════════════════════════
UI->>serverStore: fetch()
activate serverStore
alt fetchPromise exists (already fetching)
serverStore-->>UI: return fetchPromise
Note right of serverStore: Deduplicate concurrent calls
end
serverStore->>serverStore: loading = true
serverStore->>serverStore: fetchPromise = new Promise()
serverStore->>PropsSvc: fetch()
PropsSvc->>API: GET /props
API-->>PropsSvc: ApiLlamaCppServerProps
Note right of API: {role, model_path, model_alias,<br/>modalities, default_generation_settings, ...}
PropsSvc-->>serverStore: props
serverStore->>serverStore: props = $state(data)
serverStore->>serverStore: detectRole(props)
Note right of serverStore: role = props.role === "router"<br/> ? ServerRole.ROUTER<br/> : ServerRole.MODEL
serverStore->>serverStore: loading = false
serverStore->>serverStore: fetchPromise = null
deactivate serverStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: 📊 COMPUTED GETTERS
%% ═══════════════════════════════════════════════════════════════════════════
Note over serverStore: Getters from props:
rect rgb(240, 255, 240)
Note over serverStore: defaultParams<br/>→ props.default_generation_settings.params<br/>(temperature, top_p, top_k, etc.)
end
rect rgb(240, 255, 240)
Note over serverStore: contextSize<br/>→ props.default_generation_settings.n_ctx
end
rect rgb(255, 240, 240)
Note over serverStore: isRouterMode<br/>→ role === ServerRole.ROUTER
end
rect rgb(255, 240, 240)
Note over serverStore: isModelMode<br/>→ role === ServerRole.MODEL
end
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: 🔗 RELATIONSHIPS
%% ═══════════════════════════════════════════════════════════════════════════
Note over serverStore: Used by:
Note right of serverStore: - modelsStore: role detection, MODEL mode modalities<br/>- settingsStore: syncWithServerDefaults (defaultParams)<br/>- chatStore: contextSize for processing state<br/>- UI components: isRouterMode for conditional rendering
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,API: ❌ ERROR HANDLING
%% ═══════════════════════════════════════════════════════════════════════════
Note over serverStore: getErrorMessage(): string | null<br/>Returns formatted error for UI display
Note over serverStore: clear(): void<br/>Resets all state (props, error, loading, role)
```
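The `fetchPromise` deduplication is the standard share-one-in-flight-request pattern. A minimal sketch, assuming only that something like `PropsService.fetch()` resolves to the props object (the class shape here is illustrative):

```ts
// Sketch of the deduplication shown above: concurrent fetch() calls
// share one in-flight GET /props instead of racing each other.
type ServerProps = { role: string; modalities?: string[] };

class ServerStoreSketch {
	props: ServerProps | null = null;
	private fetchPromise: Promise<void> | null = null;

	constructor(private fetchProps: () => Promise<ServerProps>) {}

	fetch(): Promise<void> {
		if (this.fetchPromise) return this.fetchPromise; // dedupe concurrent callers
		this.fetchPromise = this.fetchProps()
			.then((props) => {
				this.props = props;
			})
			.finally(() => {
				this.fetchPromise = null; // allow later refetches
			});
		return this.fetchPromise;
	}
}
```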

View File

@ -0,0 +1,144 @@
```mermaid
sequenceDiagram
participant UI as 🧩 ChatSettings
participant settingsStore as 🗄️ settingsStore
participant serverStore as 🗄️ serverStore
participant ParamSvc as ⚙️ ParameterSyncService
participant LS as 💾 LocalStorage
Note over settingsStore: State:<br/>config: SettingsConfigType<br/>theme: string ("auto" | "light" | "dark")<br/>isInitialized: boolean<br/>userOverrides: Set&lt;string&gt;
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,LS: 🚀 INITIALIZATION
%% ═══════════════════════════════════════════════════════════════════════════
Note over settingsStore: Auto-initialized in constructor (browser only)
settingsStore->>settingsStore: initialize()
activate settingsStore
settingsStore->>settingsStore: loadConfig()
settingsStore->>LS: get("llama-config")
LS-->>settingsStore: StoredConfig | null
alt config exists
settingsStore->>settingsStore: Merge with SETTING_CONFIG_DEFAULT
Note right of settingsStore: Fill missing keys with defaults
else no config
settingsStore->>settingsStore: config = SETTING_CONFIG_DEFAULT
end
settingsStore->>LS: get("llama-userOverrides")
LS-->>settingsStore: string[] | null
settingsStore->>settingsStore: userOverrides = new Set(data)
settingsStore->>settingsStore: loadTheme()
settingsStore->>LS: get("llama-theme")
LS-->>settingsStore: theme | "auto"
settingsStore->>settingsStore: isInitialized = true
deactivate settingsStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,LS: 🔄 SYNC WITH SERVER DEFAULTS
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI: Triggered from +layout.svelte once serverStore.props is loaded
UI->>settingsStore: syncWithServerDefaults()
activate settingsStore
settingsStore->>serverStore: defaultParams
serverStore-->>settingsStore: {temperature, top_p, top_k, ...}
settingsStore->>ParamSvc: extractServerDefaults(defaultParams)
ParamSvc-->>settingsStore: Record<string, value>
settingsStore->>ParamSvc: mergeWithServerDefaults(config, serverDefaults)
Note right of ParamSvc: For each syncable parameter:<br/>- If NOT in userOverrides → use server default<br/>- If in userOverrides → keep user value
ParamSvc-->>settingsStore: mergedConfig
settingsStore->>settingsStore: config = mergedConfig
settingsStore->>settingsStore: saveConfig()
deactivate settingsStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,LS: ⚙️ UPDATE CONFIG
%% ═══════════════════════════════════════════════════════════════════════════
UI->>settingsStore: updateConfig(key, value)
activate settingsStore
settingsStore->>settingsStore: config[key] = value
settingsStore->>settingsStore: userOverrides.add(key)
Note right of settingsStore: Mark as user-modified (won't be overwritten by server)
settingsStore->>settingsStore: saveConfig()
settingsStore->>LS: set("llama-config", config)
settingsStore->>LS: set("llama-userOverrides", [...userOverrides])
deactivate settingsStore
UI->>settingsStore: updateMultipleConfig({key1: val1, key2: val2})
activate settingsStore
Note right of settingsStore: Batch update, single save
settingsStore->>settingsStore: For each key: config[key] = value
settingsStore->>settingsStore: For each key: userOverrides.add(key)
settingsStore->>settingsStore: saveConfig()
deactivate settingsStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,LS: 🔄 RESET
%% ═══════════════════════════════════════════════════════════════════════════
UI->>settingsStore: resetConfig()
activate settingsStore
settingsStore->>settingsStore: config = SETTING_CONFIG_DEFAULT
settingsStore->>settingsStore: userOverrides.clear()
settingsStore->>settingsStore: syncWithServerDefaults()
Note right of settingsStore: Apply server defaults for syncable params
settingsStore->>settingsStore: saveConfig()
deactivate settingsStore
UI->>settingsStore: resetParameterToServerDefault(key)
activate settingsStore
settingsStore->>settingsStore: userOverrides.delete(key)
settingsStore->>serverStore: defaultParams[key]
settingsStore->>settingsStore: config[key] = serverDefault
settingsStore->>settingsStore: saveConfig()
deactivate settingsStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,LS: 🎨 THEME
%% ═══════════════════════════════════════════════════════════════════════════
UI->>settingsStore: updateTheme(newTheme)
activate settingsStore
settingsStore->>settingsStore: theme = newTheme
settingsStore->>settingsStore: saveTheme()
settingsStore->>LS: set("llama-theme", theme)
deactivate settingsStore
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,LS: 📊 PARAMETER INFO
%% ═══════════════════════════════════════════════════════════════════════════
UI->>settingsStore: getParameterInfo(key)
settingsStore->>ParamSvc: getParameterInfo(key, config, serverDefaults, userOverrides)
ParamSvc-->>settingsStore: ParameterInfo
Note right of ParamSvc: {<br/> currentValue,<br/> serverDefault,<br/> isUserOverride: boolean,<br/> canSync: boolean,<br/> isDifferentFromServer: boolean<br/>}
UI->>settingsStore: getParameterDiff()
settingsStore->>ParamSvc: createParameterDiff(config, serverDefaults, userOverrides)
ParamSvc-->>settingsStore: ParameterDiff[]
Note right of ParamSvc: Array of parameters where user != server
%% ═══════════════════════════════════════════════════════════════════════════
Note over UI,LS: 📋 CONFIG CATEGORIES
%% ═══════════════════════════════════════════════════════════════════════════
Note over settingsStore: Syncable with server (from /props):
rect rgb(240, 255, 240)
Note over settingsStore: temperature, top_p, top_k, min_p<br/>repeat_penalty, presence_penalty, frequency_penalty<br/>dynatemp_range, dynatemp_exponent<br/>typ_p, xtc_probability, xtc_threshold<br/>dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n
end
Note over settingsStore: UI-only (not synced):
rect rgb(255, 240, 240)
Note over settingsStore: systemMessage, custom (JSON)<br/>showStatistics, enableContinueGeneration<br/>autoMicOnEmpty, disableAutoScroll<br/>apiKey, pdfAsImage, disableReasoningFormat
end
```
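The sync rule in the diagram is the whole mechanism: a syncable key tracks the server default unless it appears in `userOverrides`. A sketch of that merge, assuming a plain object config and a `Set<string>` of override keys as in the state note:

```ts
// Illustrative version of the merge rule above: for each syncable key,
// keep the user's value only if they explicitly set it themselves.
type SettingsConfig = Record<string, unknown>;

function mergeWithServerDefaults(
	config: SettingsConfig,
	serverDefaults: SettingsConfig,
	userOverrides: Set<string>
): SettingsConfig {
	const merged = { ...config };
	for (const [key, serverValue] of Object.entries(serverDefaults)) {
		if (!userOverrides.has(key)) {
			merged[key] = serverValue; // not user-modified → track the server
		}
	}
	return merged;
}
```

`updateConfig(key, value)` adds the key to `userOverrides`, and `resetParameterToServerDefault(key)` removes it, so the same merge naturally re-adopts the server value on the next sync.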

View File

@ -64,7 +64,7 @@
"svelte": "^5.0.0",
"svelte-check": "^4.0.0",
"tailwind-merge": "^3.3.1",
"tailwind-variants": "^1.0.0",
"tailwind-variants": "^3.2.2",
"tailwindcss": "^4.0.0",
"tw-animate-css": "^1.3.5",
"typescript": "^5.0.0",
@ -8324,31 +8324,23 @@
}
},
"node_modules/tailwind-variants": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/tailwind-variants/-/tailwind-variants-1.0.0.tgz",
"integrity": "sha512-2WSbv4ulEEyuBKomOunut65D8UZwxrHoRfYnxGcQNnHqlSCp2+B7Yz2W+yrNDrxRodOXtGD/1oCcKGNBnUqMqA==",
"version": "3.2.2",
"resolved": "https://registry.npmjs.org/tailwind-variants/-/tailwind-variants-3.2.2.tgz",
"integrity": "sha512-Mi4kHeMTLvKlM98XPnK+7HoBPmf4gygdFmqQPaDivc3DpYS6aIY6KiG/PgThrGvii5YZJqRsPz0aPyhoFzmZgg==",
"dev": true,
"license": "MIT",
"dependencies": {
"tailwind-merge": "3.0.2"
},
"engines": {
"node": ">=16.x",
"pnpm": ">=7.x"
},
"peerDependencies": {
"tailwind-merge": ">=3.0.0",
"tailwindcss": "*"
}
},
"node_modules/tailwind-variants/node_modules/tailwind-merge": {
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/tailwind-merge/-/tailwind-merge-3.0.2.tgz",
"integrity": "sha512-l7z+OYZ7mu3DTqrL88RiKrKIqO3NcpEO8V/Od04bNpvk0kiIFndGEoqfuzvj4yuhRkHKjRkII2z+KS2HfPcSxw==",
"dev": true,
"license": "MIT",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/dcastil"
"peerDependenciesMeta": {
"tailwind-merge": {
"optional": true
}
}
},
"node_modules/tailwindcss": {

View File

@ -66,7 +66,7 @@
"svelte": "^5.0.0",
"svelte-check": "^4.0.0",
"tailwind-merge": "^3.3.1",
"tailwind-variants": "^1.0.0",
"tailwind-variants": "^3.2.2",
"tailwindcss": "^4.0.0",
"tw-animate-css": "^1.3.5",
"typescript": "^5.0.0",

View File

@ -7,5 +7,5 @@ export default defineConfig({
timeout: 120000,
reuseExistingServer: false
},
testDir: 'e2e'
testDir: 'tests/e2e'
});

View File

@ -49,7 +49,9 @@ trap cleanup SIGINT SIGTERM
echo "🚀 Starting development servers..."
echo "📝 Note: Make sure to start llama-server separately if needed"
cd tools/server/webui
storybook dev -p 6006 --ci & vite dev --host 0.0.0.0 &
# Use --insecure-http-parser to handle malformed HTTP responses from llama-server
# (some responses have both Content-Length and Transfer-Encoding headers)
storybook dev -p 6006 --ci & NODE_OPTIONS="--insecure-http-parser" vite dev --host 0.0.0.0 &
# Wait for all background processes
wait

View File

@ -29,7 +29,7 @@
--chart-3: oklch(0.398 0.07 227.392);
--chart-4: oklch(0.828 0.189 84.429);
--chart-5: oklch(0.769 0.188 70.08);
--sidebar: oklch(0.985 0 0);
--sidebar: oklch(0.987 0 0);
--sidebar-foreground: oklch(0.145 0 0);
--sidebar-primary: oklch(0.205 0 0);
--sidebar-primary-foreground: oklch(0.985 0 0);
@ -66,7 +66,7 @@
--chart-3: oklch(0.769 0.188 70.08);
--chart-4: oklch(0.627 0.265 303.9);
--chart-5: oklch(0.645 0.246 16.439);
--sidebar: oklch(0.205 0 0);
--sidebar: oklch(0.19 0 0);
--sidebar-foreground: oklch(0.985 0 0);
--sidebar-primary: oklch(0.488 0.243 264.376);
--sidebar-primary-foreground: oklch(0.985 0 0);

View File

@ -4,27 +4,38 @@
// Import chat types from dedicated module
import type {
// API types
ApiChatCompletionRequest,
ApiChatCompletionResponse,
ApiChatCompletionStreamChunk,
ApiChatCompletionToolCall,
ApiChatCompletionToolCallDelta,
ApiChatMessageData,
ApiChatMessageContentPart,
ApiContextSizeError,
ApiErrorResponse,
ApiLlamaCppServerProps,
ApiProcessingState
} from '$lib/types/api';
import type {
ApiModelDataEntry,
ApiModelListResponse,
ApiProcessingState,
ApiRouterModelMeta,
ApiRouterModelsLoadRequest,
ApiRouterModelsLoadResponse,
ApiRouterModelsStatusRequest,
ApiRouterModelsStatusResponse,
ApiRouterModelsListResponse,
ApiRouterModelsUnloadRequest,
ApiRouterModelsUnloadResponse,
// Chat types
ChatAttachmentDisplayItem,
ChatAttachmentPreviewItem,
ChatMessageType,
ChatRole,
ChatUploadedFile,
ChatMessageSiblingInfo,
ChatMessagePromptProgress,
ChatMessageTimings
} from '$lib/types/chat';
import type {
ChatMessageTimings,
// Database types
DatabaseConversation,
DatabaseMessage,
DatabaseMessageExtra,
@ -32,14 +43,20 @@ import type {
DatabaseMessageExtraImageFile,
DatabaseMessageExtraTextFile,
DatabaseMessageExtraPdfFile,
DatabaseMessageExtraLegacyContext
} from '$lib/types/database';
import type {
DatabaseMessageExtraLegacyContext,
ExportedConversation,
ExportedConversations,
// Model types
ModelModalities,
ModelOption,
// Settings types
SettingsChatServiceOptions,
SettingsConfigValue,
SettingsFieldConfig,
SettingsConfigType
} from '$lib/types/settings';
} from '$lib/types';
import { ServerRole, ServerModelStatus, ModelModality } from '$lib/enums';
declare global {
// namespace App {
@ -51,22 +68,38 @@ declare global {
// }
export {
// API types
ApiChatCompletionRequest,
ApiChatCompletionResponse,
ApiChatCompletionStreamChunk,
ApiChatCompletionToolCall,
ApiChatCompletionToolCallDelta,
ApiChatMessageData,
ApiChatMessageContentPart,
ApiContextSizeError,
ApiErrorResponse,
ApiLlamaCppServerProps,
ApiModelDataEntry,
ApiModelListResponse,
ApiProcessingState,
ChatMessageData,
ApiRouterModelMeta,
ApiRouterModelsLoadRequest,
ApiRouterModelsLoadResponse,
ApiRouterModelsStatusRequest,
ApiRouterModelsStatusResponse,
ApiRouterModelsListResponse,
ApiRouterModelsUnloadRequest,
ApiRouterModelsUnloadResponse,
// Chat types
ChatAttachmentDisplayItem,
ChatAttachmentPreviewItem,
ChatMessagePromptProgress,
ChatMessageSiblingInfo,
ChatMessageTimings,
ChatMessageType,
ChatRole,
ChatUploadedFile,
// Database types
DatabaseConversation,
DatabaseMessage,
DatabaseMessageExtra,
@ -75,9 +108,19 @@ declare global {
DatabaseMessageExtraTextFile,
DatabaseMessageExtraPdfFile,
DatabaseMessageExtraLegacyContext,
ExportedConversation,
ExportedConversations,
// Enum types
ModelModality,
ServerRole,
ServerModelStatus,
// Model types
ModelModalities,
ModelOption,
// Settings types
SettingsChatServiceOptions,
SettingsConfigValue,
SettingsFieldConfig,
SettingsConfigType,
SettingsChatServiceOptions
SettingsConfigType
};
}

View File

@ -1,9 +1,17 @@
<script lang="ts">
import { FileText, Image, Music, FileIcon, Eye } from '@lucide/svelte';
import { FileTypeCategory, MimeTypeApplication } from '$lib/enums/files';
import { convertPDFToImage } from '$lib/utils/pdf-processing';
import { Button } from '$lib/components/ui/button';
import { getFileTypeCategory } from '$lib/utils/file-type';
import * as Alert from '$lib/components/ui/alert';
import { SyntaxHighlightedCode } from '$lib/components/app';
import { FileText, Image, Music, FileIcon, Eye, Info } from '@lucide/svelte';
import {
isTextFile,
isImageFile,
isPdfFile,
isAudioFile,
getLanguageFromFilename
} from '$lib/utils';
import { convertPDFToImage } from '$lib/utils/browser-only';
import { modelsStore } from '$lib/stores/models.svelte';
interface Props {
// Either an uploaded file or a stored attachment
@ -12,53 +20,36 @@
// For uploaded files
preview?: string;
name?: string;
type?: string;
textContent?: string;
// For checking vision modality
activeModelId?: string;
}
let { uploadedFile, attachment, preview, name, type, textContent }: Props = $props();
let { uploadedFile, attachment, preview, name, textContent, activeModelId }: Props = $props();
let hasVisionModality = $derived(
activeModelId ? modelsStore.modelSupportsVision(activeModelId) : false
);
let displayName = $derived(uploadedFile?.name || attachment?.name || name || 'Unknown File');
let displayPreview = $derived(
uploadedFile?.preview || (attachment?.type === 'imageFile' ? attachment.base64Url : preview)
);
// Determine file type from uploaded file or attachment
let isAudio = $derived(isAudioFile(attachment, uploadedFile));
let isImage = $derived(isImageFile(attachment, uploadedFile));
let isPdf = $derived(isPdfFile(attachment, uploadedFile));
let isText = $derived(isTextFile(attachment, uploadedFile));
let displayType = $derived(
uploadedFile?.type ||
(attachment?.type === 'imageFile'
? 'image'
: attachment?.type === 'textFile'
? 'text'
: attachment?.type === 'audioFile'
? attachment.mimeType || 'audio'
: attachment?.type === 'pdfFile'
? MimeTypeApplication.PDF
: type || 'unknown')
let displayPreview = $derived(
uploadedFile?.preview ||
(isImage && attachment && 'base64Url' in attachment ? attachment.base64Url : preview)
);
let displayTextContent = $derived(
uploadedFile?.textContent ||
(attachment?.type === 'textFile'
? attachment.content
: attachment?.type === 'pdfFile'
? attachment.content
: textContent)
(attachment && 'content' in attachment ? attachment.content : textContent)
);
let isAudio = $derived(
getFileTypeCategory(displayType) === FileTypeCategory.AUDIO || displayType === 'audio'
);
let isImage = $derived(
getFileTypeCategory(displayType) === FileTypeCategory.IMAGE || displayType === 'image'
);
let isPdf = $derived(displayType === MimeTypeApplication.PDF);
let isText = $derived(
getFileTypeCategory(displayType) === FileTypeCategory.TEXT || displayType === 'text'
);
let language = $derived(getLanguageFromFilename(displayName));
let IconComponent = $derived(() => {
if (isImage) return Image;
@ -87,15 +78,20 @@
if (uploadedFile?.file) {
file = uploadedFile.file;
} else if (attachment?.type === 'pdfFile') {
} else if (isPdf && attachment) {
// Check if we have pre-processed images
if (attachment.images && Array.isArray(attachment.images)) {
if (
'images' in attachment &&
attachment.images &&
Array.isArray(attachment.images) &&
attachment.images.length > 0
) {
pdfImages = attachment.images;
return;
}
// Convert base64 back to File for processing
if (attachment.base64Data) {
if ('base64Data' in attachment && attachment.base64Data) {
const base64Data = attachment.base64Data;
const byteCharacters = atob(base64Data);
const byteNumbers = new Array(byteCharacters.length);
@ -103,7 +99,7 @@
byteNumbers[i] = byteCharacters.charCodeAt(i);
}
const byteArray = new Uint8Array(byteNumbers);
file = new File([byteArray], displayName, { type: MimeTypeApplication.PDF });
file = new File([byteArray], displayName, { type: 'application/pdf' });
}
}
@ -181,6 +177,24 @@
/>
</div>
{:else if isPdf && pdfViewMode === 'pages'}
{#if !hasVisionModality && activeModelId}
<Alert.Root class="mb-4">
<Info class="h-4 w-4" />
<Alert.Title>Preview only</Alert.Title>
<Alert.Description>
<span class="inline-flex">
The selected model does not support vision. Only the extracted
<!-- svelte-ignore a11y_click_events_have_key_events -->
<!-- svelte-ignore a11y_no_static_element_interactions -->
<span class="mx-1 cursor-pointer underline" onclick={() => (pdfViewMode = 'text')}>
text
</span>
will be sent to the model.
</span>
</Alert.Description>
</Alert.Root>
{/if}
{#if pdfImagesLoading}
<div class="flex items-center justify-center p-8">
<div class="text-center">
@ -227,28 +241,24 @@
</div>
{/if}
{:else if (isText || (isPdf && pdfViewMode === 'text')) && displayTextContent}
<div
class="max-h-[60vh] overflow-auto rounded-lg bg-muted p-4 font-mono text-sm break-words whitespace-pre-wrap"
>
{displayTextContent}
</div>
<SyntaxHighlightedCode code={displayTextContent} {language} maxWidth="69rem" />
{:else if isAudio}
<div class="flex items-center justify-center p-8">
<div class="w-full max-w-md text-center">
<Music class="mx-auto mb-4 h-16 w-16 text-muted-foreground" />
{#if attachment?.type === 'audioFile'}
{#if uploadedFile?.preview}
<audio controls class="mb-4 w-full" src={uploadedFile.preview}>
Your browser does not support the audio element.
</audio>
{:else if isAudio && attachment && 'mimeType' in attachment && 'base64Data' in attachment}
<audio
controls
class="mb-4 w-full"
src="data:{attachment.mimeType};base64,{attachment.base64Data}"
src={`data:${attachment.mimeType};base64,${attachment.base64Data}`}
>
Your browser does not support the audio element.
</audio>
{:else if uploadedFile?.preview}
<audio controls class="mb-4 w-full" src={uploadedFile.preview}>
Your browser does not support the audio element.
</audio>
{:else}
<p class="mb-4 text-muted-foreground">Audio preview not available</p>
{/if}

View File

@ -1,7 +1,7 @@
<script lang="ts">
import { RemoveButton } from '$lib/components/app';
import { formatFileSize, getFileTypeLabel, getPreviewText } from '$lib/utils/file-preview';
import { FileTypeCategory, MimeTypeText } from '$lib/enums/files';
import { getFileTypeLabel, getPreviewText, formatFileSize, isTextFile } from '$lib/utils';
import { AttachmentType } from '$lib/enums';
interface Props {
class?: string;
@ -12,7 +12,9 @@
readonly?: boolean;
size?: number;
textContent?: string;
type: string;
// Either uploaded file or stored attachment
uploadedFile?: ChatUploadedFile;
attachment?: DatabaseMessageExtra;
}
let {
@ -24,11 +26,41 @@
readonly = false,
size,
textContent,
type
uploadedFile,
attachment
}: Props = $props();
let isText = $derived(isTextFile(attachment, uploadedFile));
let fileTypeLabel = $derived.by(() => {
if (uploadedFile?.type) {
return getFileTypeLabel(uploadedFile.type);
}
if (attachment) {
if ('mimeType' in attachment && attachment.mimeType) {
return getFileTypeLabel(attachment.mimeType);
}
if (attachment.type) {
return getFileTypeLabel(attachment.type);
}
}
return getFileTypeLabel(name);
});
let pdfProcessingMode = $derived.by(() => {
if (attachment?.type === AttachmentType.PDF) {
const pdfAttachment = attachment as DatabaseMessageExtraPdfFile;
return pdfAttachment.processedAsImages ? 'Sent as Image' : 'Sent as Text';
}
return null;
});
</script>
{#if type === MimeTypeText.PLAIN || type === FileTypeCategory.TEXT}
{#if isText}
{#if readonly}
<!-- Readonly mode (ChatMessage) -->
<button
@ -45,7 +77,7 @@
<span class="text-xs text-muted-foreground">{formatFileSize(size)}</span>
{/if}
{#if textContent && type === 'text'}
{#if textContent}
<div class="relative mt-2 w-full">
<div
class="overflow-hidden font-mono text-xs leading-relaxed break-words whitespace-pre-wrap text-muted-foreground"
@ -105,17 +137,21 @@
<div
class="flex h-8 w-8 items-center justify-center rounded bg-primary/10 text-xs font-medium text-primary"
>
{getFileTypeLabel(type)}
{fileTypeLabel}
</div>
<div class="flex flex-col gap-1">
<div class="flex flex-col gap-0.5">
<span
class="max-w-24 truncate text-sm font-medium text-foreground group-hover:pr-6 md:max-w-32"
class="max-w-24 truncate text-sm font-medium text-foreground {readonly
? ''
: 'group-hover:pr-6'} md:max-w-32"
>
{name}
</span>
{#if size}
{#if pdfProcessingMode}
<span class="text-left text-xs text-muted-foreground">{pdfProcessingMode}</span>
{:else if size}
<span class="text-left text-xs text-muted-foreground">{formatFileSize(size)}</span>
{/if}
</div>

View File

@ -30,7 +30,9 @@
}: Props = $props();
</script>
<div class="group relative overflow-hidden rounded-lg border border-border bg-muted {className}">
<div
class="group relative overflow-hidden rounded-lg bg-muted shadow-lg dark:border dark:border-muted {className}"
>
{#if onClick}
<button
type="button"

View File

@ -2,10 +2,8 @@
import { ChatAttachmentThumbnailImage, ChatAttachmentThumbnailFile } from '$lib/components/app';
import { Button } from '$lib/components/ui/button';
import { ChevronLeft, ChevronRight } from '@lucide/svelte';
import { FileTypeCategory } from '$lib/enums/files';
import { getFileTypeCategory } from '$lib/utils/file-type';
import { DialogChatAttachmentPreview, DialogChatAttachmentsViewAll } from '$lib/components/app';
import type { ChatAttachmentDisplayItem, ChatAttachmentPreviewItem } from '$lib/types/chat';
import { getAttachmentDisplayItems } from '$lib/utils';
interface Props {
class?: string;
@ -22,6 +20,8 @@
imageWidth?: string;
// Limit display to single row with "+ X more" button
limitToSingleRow?: boolean;
// For vision modality check
activeModelId?: string;
}
let {
@ -35,10 +35,11 @@
imageClass = '',
imageHeight = 'h-24',
imageWidth = 'w-auto',
limitToSingleRow = false
limitToSingleRow = false,
activeModelId
}: Props = $props();
let displayItems = $derived(getDisplayItems());
let displayItems = $derived(getAttachmentDisplayItems({ uploadedFiles, attachments }));
let canScrollLeft = $state(false);
let canScrollRight = $state(false);
@ -49,81 +50,6 @@
let showViewAll = $derived(limitToSingleRow && displayItems.length > 0 && isScrollable);
let viewAllDialogOpen = $state(false);
function getDisplayItems(): ChatAttachmentDisplayItem[] {
const items: ChatAttachmentDisplayItem[] = [];
// Add uploaded files (ChatForm)
for (const file of uploadedFiles) {
items.push({
id: file.id,
name: file.name,
size: file.size,
preview: file.preview,
type: file.type,
isImage: getFileTypeCategory(file.type) === FileTypeCategory.IMAGE,
uploadedFile: file,
textContent: file.textContent
});
}
// Add stored attachments (ChatMessage)
for (const [index, attachment] of attachments.entries()) {
if (attachment.type === 'imageFile') {
items.push({
id: `attachment-${index}`,
name: attachment.name,
preview: attachment.base64Url,
type: 'image',
isImage: true,
attachment,
attachmentIndex: index
});
} else if (attachment.type === 'textFile') {
items.push({
id: `attachment-${index}`,
name: attachment.name,
type: 'text',
isImage: false,
attachment,
attachmentIndex: index,
textContent: attachment.content
});
} else if (attachment.type === 'context') {
// Legacy format from old webui - treat as text file
items.push({
id: `attachment-${index}`,
name: attachment.name,
type: 'text',
isImage: false,
attachment,
attachmentIndex: index,
textContent: attachment.content
});
} else if (attachment.type === 'audioFile') {
items.push({
id: `attachment-${index}`,
name: attachment.name,
type: attachment.mimeType || 'audio',
isImage: false,
attachment,
attachmentIndex: index
});
} else if (attachment.type === 'pdfFile') {
items.push({
id: `attachment-${index}`,
name: attachment.name,
type: 'application/pdf',
isImage: false,
attachment,
attachmentIndex: index,
textContent: attachment.content
});
}
}
return items.reverse();
}
function openPreview(item: ChatAttachmentDisplayItem, event?: MouseEvent) {
event?.stopPropagation();
event?.preventDefault();
@ -133,7 +59,6 @@
attachment: item.attachment,
preview: item.preview,
name: item.name,
type: item.type,
size: item.size,
textContent: item.textContent
};
@ -181,6 +106,7 @@
{#if displayItems.length > 0}
<div class={className} {style}>
{#if limitToSingleRow}
<div class="relative">
<button
class="absolute top-1/2 left-4 z-10 flex h-6 w-6 -translate-y-1/2 items-center justify-center rounded-full bg-foreground/15 shadow-md backdrop-blur-xs transition-opacity hover:bg-foreground/35 {canScrollLeft
@ -200,7 +126,9 @@
{#each displayItems as item (item.id)}
{#if item.isImage && item.preview}
<ChatAttachmentThumbnailImage
class="flex-shrink-0 cursor-pointer {limitToSingleRow ? 'first:ml-4 last:mr-4' : ''}"
class="flex-shrink-0 cursor-pointer {limitToSingleRow
? 'first:ml-4 last:mr-4'
: ''}"
id={item.id}
name={item.name}
preview={item.preview}
@ -213,14 +141,17 @@
/>
{:else}
<ChatAttachmentThumbnailFile
class="flex-shrink-0 cursor-pointer {limitToSingleRow ? 'first:ml-4 last:mr-4' : ''}"
class="flex-shrink-0 cursor-pointer {limitToSingleRow
? 'first:ml-4 last:mr-4'
: ''}"
id={item.id}
name={item.name}
type={item.type}
size={item.size}
{readonly}
onRemove={onFileRemove}
textContent={item.textContent}
attachment={item.attachment}
uploadedFile={item.uploadedFile}
onClick={(event) => openPreview(item, event)}
/>
{/if}
@ -247,10 +178,43 @@
class="h-6 text-xs text-muted-foreground hover:text-foreground"
onclick={() => (viewAllDialogOpen = true)}
>
View all
View all ({displayItems.length})
</Button>
</div>
{/if}
{:else}
<div class="flex flex-wrap items-start justify-end gap-3">
{#each displayItems as item (item.id)}
{#if item.isImage && item.preview}
<ChatAttachmentThumbnailImage
class="cursor-pointer"
id={item.id}
name={item.name}
preview={item.preview}
{readonly}
onRemove={onFileRemove}
height={imageHeight}
width={imageWidth}
{imageClass}
onClick={(event) => openPreview(item, event)}
/>
{:else}
<ChatAttachmentThumbnailFile
class="cursor-pointer"
id={item.id}
name={item.name}
size={item.size}
{readonly}
onRemove={onFileRemove}
textContent={item.textContent}
attachment={item.attachment}
uploadedFile={item.uploadedFile}
onClick={(event?: MouseEvent) => openPreview(item, event)}
/>
{/if}
{/each}
</div>
{/if}
</div>
{/if}
@ -261,9 +225,9 @@
attachment={previewItem.attachment}
preview={previewItem.preview}
name={previewItem.name}
type={previewItem.type}
size={previewItem.size}
textContent={previewItem.textContent}
{activeModelId}
/>
{/if}
@ -275,4 +239,5 @@
{onFileRemove}
imageHeight="h-64"
{imageClass}
{activeModelId}
/>

View File

@ -4,9 +4,7 @@
ChatAttachmentThumbnailFile,
DialogChatAttachmentPreview
} from '$lib/components/app';
import { FileTypeCategory } from '$lib/enums/files';
import { getFileTypeCategory } from '$lib/utils/file-type';
import type { ChatAttachmentDisplayItem, ChatAttachmentPreviewItem } from '$lib/types/chat';
import { getAttachmentDisplayItems } from '$lib/utils';
interface Props {
uploadedFiles?: ChatUploadedFile[];
@ -16,6 +14,7 @@
imageHeight?: string;
imageWidth?: string;
imageClass?: string;
activeModelId?: string;
}
let {
@ -25,89 +24,17 @@
onFileRemove,
imageHeight = 'h-24',
imageWidth = 'w-auto',
imageClass = ''
imageClass = '',
activeModelId
}: Props = $props();
let previewDialogOpen = $state(false);
let previewItem = $state<ChatAttachmentPreviewItem | null>(null);
let displayItems = $derived(getDisplayItems());
let displayItems = $derived(getAttachmentDisplayItems({ uploadedFiles, attachments }));
let imageItems = $derived(displayItems.filter((item) => item.isImage));
let fileItems = $derived(displayItems.filter((item) => !item.isImage));
function getDisplayItems(): ChatAttachmentDisplayItem[] {
const items: ChatAttachmentDisplayItem[] = [];
for (const file of uploadedFiles) {
items.push({
id: file.id,
name: file.name,
size: file.size,
preview: file.preview,
type: file.type,
isImage: getFileTypeCategory(file.type) === FileTypeCategory.IMAGE,
uploadedFile: file,
textContent: file.textContent
});
}
for (const [index, attachment] of attachments.entries()) {
if (attachment.type === 'imageFile') {
items.push({
id: `attachment-${index}`,
name: attachment.name,
preview: attachment.base64Url,
type: 'image',
isImage: true,
attachment,
attachmentIndex: index
});
} else if (attachment.type === 'textFile') {
items.push({
id: `attachment-${index}`,
name: attachment.name,
type: 'text',
isImage: false,
attachment,
attachmentIndex: index,
textContent: attachment.content
});
} else if (attachment.type === 'context') {
// Legacy format from old webui - treat as text file
items.push({
id: `attachment-${index}`,
name: attachment.name,
type: 'text',
isImage: false,
attachment,
attachmentIndex: index,
textContent: attachment.content
});
} else if (attachment.type === 'audioFile') {
items.push({
id: `attachment-${index}`,
name: attachment.name,
type: attachment.mimeType || 'audio',
isImage: false,
attachment,
attachmentIndex: index
});
} else if (attachment.type === 'pdfFile') {
items.push({
id: `attachment-${index}`,
name: attachment.name,
type: 'application/pdf',
isImage: false,
attachment,
attachmentIndex: index,
textContent: attachment.content
});
}
}
return items.reverse();
}
function openPreview(item: (typeof displayItems)[0], event?: Event) {
if (event) {
event.preventDefault();
@ -119,7 +46,6 @@
attachment: item.attachment,
preview: item.preview,
name: item.name,
type: item.type,
size: item.size,
textContent: item.textContent
};
@ -138,12 +64,13 @@
class="cursor-pointer"
id={item.id}
name={item.name}
type={item.type}
size={item.size}
{readonly}
onRemove={onFileRemove}
textContent={item.textContent}
onClick={(event) => openPreview(item, event)}
attachment={item.attachment}
uploadedFile={item.uploadedFile}
onClick={(event?: MouseEvent) => openPreview(item, event)}
/>
{/each}
</div>
@ -183,8 +110,8 @@
attachment={previewItem.attachment}
preview={previewItem.preview}
name={previewItem.name}
type={previewItem.type}
size={previewItem.size}
textContent={previewItem.textContent}
{activeModelId}
/>
{/if}

View File

@ -9,15 +9,13 @@
} from '$lib/components/app';
import { INPUT_CLASSES } from '$lib/constants/input-classes';
import { config } from '$lib/stores/settings.svelte';
import { FileTypeCategory, MimeTypeApplication } from '$lib/enums/files';
import {
AudioRecorder,
convertToWav,
createAudioFile,
isAudioRecordingSupported
} from '$lib/utils/audio-recording';
import { onMount } from 'svelte';
import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
import { isRouterMode } from '$lib/stores/server.svelte';
import { chatStore } from '$lib/stores/chat.svelte';
import { activeMessages } from '$lib/stores/conversations.svelte';
import {
FileTypeCategory,
MimeTypeApplication,
FileExtensionAudio,
FileExtensionImage,
FileExtensionPdf,
@ -25,8 +23,15 @@
MimeTypeAudio,
MimeTypeImage,
MimeTypeText
} from '$lib/enums/files';
import { isIMEComposing } from '$lib/utils/is-ime-composing';
} from '$lib/enums';
import { isIMEComposing } from '$lib/utils';
import {
AudioRecorder,
convertToWav,
createAudioFile,
isAudioRecordingSupported
} from '$lib/utils/browser-only';
import { onMount } from 'svelte';
interface Props {
class?: string;
@ -53,6 +58,7 @@
}: Props = $props();
let audioRecorder: AudioRecorder | undefined;
let chatFormActionsRef: ChatFormActions | undefined = $state(undefined);
let currentConfig = $derived(config());
let fileAcceptString = $state<string | undefined>(undefined);
let fileInputRef: ChatFormFileInputInvisible | undefined = $state(undefined);
@ -63,18 +69,97 @@
let recordingSupported = $state(false);
let textareaRef: ChatFormTextarea | undefined = $state(undefined);
// Check if model is selected (in ROUTER mode)
let conversationModel = $derived(
chatStore.getConversationModel(activeMessages() as DatabaseMessage[])
);
let isRouter = $derived(isRouterMode());
let hasModelSelected = $derived(!isRouter || !!conversationModel || !!selectedModelId());
// Get active model ID for capability detection
let activeModelId = $derived.by(() => {
const options = modelOptions();
if (!isRouter) {
return options.length > 0 ? options[0].model : null;
}
// First try user-selected model
const selectedId = selectedModelId();
if (selectedId) {
const model = options.find((m) => m.id === selectedId);
if (model) return model.model;
}
// Fall back to the conversation model
if (conversationModel) {
const model = options.find((m) => m.model === conversationModel);
if (model) return model.model;
}
return null;
});
// State for model props reactivity
let modelPropsVersion = $state(0);
// Fetch model props when the active model changes (works in both MODEL and ROUTER modes)
$effect(() => {
if (activeModelId) {
const cached = modelsStore.getModelProps(activeModelId);
if (!cached) {
modelsStore.fetchModelProps(activeModelId).then(() => {
modelPropsVersion++;
});
}
}
});
// Derive modalities from the active model (works in both MODEL and ROUTER modes)
let hasAudioModality = $derived.by(() => {
if (activeModelId) {
void modelPropsVersion; // Trigger reactivity on props fetch
return modelsStore.modelSupportsAudio(activeModelId);
}
return false;
});
let hasVisionModality = $derived.by(() => {
if (activeModelId) {
void modelPropsVersion; // Trigger reactivity on props fetch
return modelsStore.modelSupportsVision(activeModelId);
}
return false;
});
function checkModelSelected(): boolean {
if (!hasModelSelected) {
// Open the model selector
chatFormActionsRef?.openModelSelector();
return false;
}
return true;
}
function getAcceptStringForFileType(fileType: FileTypeCategory): string {
switch (fileType) {
case FileTypeCategory.IMAGE:
return [...Object.values(FileExtensionImage), ...Object.values(MimeTypeImage)].join(',');
case FileTypeCategory.AUDIO:
return [...Object.values(FileExtensionAudio), ...Object.values(MimeTypeAudio)].join(',');
case FileTypeCategory.PDF:
return [...Object.values(FileExtensionPdf), ...Object.values(MimeTypeApplication)].join(
','
);
case FileTypeCategory.TEXT:
return [...Object.values(FileExtensionText), MimeTypeText.PLAIN].join(',');
default:
return '';
}
@ -103,6 +188,9 @@
if ((!message.trim() && uploadedFiles.length === 0) || disabled || isLoading) return;
// Check if model is selected first
if (!checkModelSelected()) return;
const messageToSend = message.trim();
const filesToSend = [...uploadedFiles];
@ -131,6 +219,7 @@
if (files.length > 0) {
event.preventDefault();
onFileUpload?.(files);
return;
}
@ -154,6 +243,7 @@
async function handleMicClick() {
if (!audioRecorder || !recordingSupported) {
console.warn('Audio recording not supported');
return;
}
@ -187,6 +277,9 @@
event.preventDefault();
if ((!message.trim() && uploadedFiles.length === 0) || disabled || isLoading) return;
// Check if model is selected first
if (!checkModelSelected()) return;
const messageToSend = message.trim();
const filesToSend = [...uploadedFiles];
@ -225,12 +318,16 @@
<ChatFormFileInputInvisible
bind:this={fileInputRef}
bind:accept={fileAcceptString}
{hasAudioModality}
{hasVisionModality}
onFileSelect={handleFileSelect}
/>
<form
onsubmit={handleSubmit}
class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {className}"
class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled
? 'cursor-not-allowed opacity-60'
: ''} {className}"
>
<ChatAttachmentsList
bind:uploadedFiles
@ -238,6 +335,7 @@
limitToSingleRow
class="py-5"
style="scroll-padding: 1rem;"
activeModelId={activeModelId ?? undefined}
/>
<div
@ -252,10 +350,13 @@
/>
<ChatFormActions
bind:this={chatFormActionsRef}
canSend={message.trim().length > 0 || uploadedFiles.length > 0}
hasText={message.trim().length > 0}
{disabled}
{isLoading}
{isRecording}
{uploadedFiles}
onFileUpload={handleFileUpload}
onMicClick={handleMicClick}
onStop={handleStop}

View File

@ -1,22 +1,29 @@
<script lang="ts">
import { Paperclip, Image, FileText, File, Volume2 } from '@lucide/svelte';
import { Paperclip } from '@lucide/svelte';
import { Button } from '$lib/components/ui/button';
import * as DropdownMenu from '$lib/components/ui/dropdown-menu';
import * as Tooltip from '$lib/components/ui/tooltip';
import { TOOLTIP_DELAY_DURATION } from '$lib/constants/tooltip-config';
import { FileTypeCategory } from '$lib/enums/files';
import { supportsAudio, supportsVision } from '$lib/stores/server.svelte';
import { FILE_TYPE_ICONS } from '$lib/constants/icons';
import { FileTypeCategory } from '$lib/enums';
interface Props {
class?: string;
disabled?: boolean;
hasAudioModality?: boolean;
hasVisionModality?: boolean;
onFileUpload?: (fileType?: FileTypeCategory) => void;
}
let { class: className = '', disabled = false, onFileUpload }: Props = $props();
let {
class: className = '',
disabled = false,
hasAudioModality = false,
hasVisionModality = false,
onFileUpload
}: Props = $props();
const fileUploadTooltipText = $derived.by(() => {
return !supportsVision()
return !hasVisionModality
? 'Text files and PDFs supported. Images, audio, and video require vision models.'
: 'Attach files';
});
@ -29,7 +36,7 @@
<div class="flex items-center gap-1 {className}">
<DropdownMenu.Root>
<DropdownMenu.Trigger name="Attach files">
<Tooltip.Root delayDuration={TOOLTIP_DELAY_DURATION}>
<Tooltip.Root>
<Tooltip.Trigger>
<Button
class="file-upload-button h-8 w-8 rounded-full bg-transparent p-0 text-muted-foreground hover:bg-foreground/10 hover:text-foreground"
@ -49,40 +56,40 @@
</DropdownMenu.Trigger>
<DropdownMenu.Content align="start" class="w-48">
<Tooltip.Root delayDuration={TOOLTIP_DELAY_DURATION}>
<Tooltip.Root>
<Tooltip.Trigger class="w-full">
<DropdownMenu.Item
class="images-button flex cursor-pointer items-center gap-2"
disabled={!supportsVision()}
disabled={!hasVisionModality}
onclick={() => handleFileUpload(FileTypeCategory.IMAGE)}
>
<Image class="h-4 w-4" />
<FILE_TYPE_ICONS.image class="h-4 w-4" />
<span>Images</span>
</DropdownMenu.Item>
</Tooltip.Trigger>
{#if !supportsVision()}
{#if !hasVisionModality}
<Tooltip.Content>
<p>Images require vision models to be processed</p>
</Tooltip.Content>
{/if}
</Tooltip.Root>
<Tooltip.Root delayDuration={TOOLTIP_DELAY_DURATION}>
<Tooltip.Root>
<Tooltip.Trigger class="w-full">
<DropdownMenu.Item
class="audio-button flex cursor-pointer items-center gap-2"
disabled={!supportsAudio()}
disabled={!hasAudioModality}
onclick={() => handleFileUpload(FileTypeCategory.AUDIO)}
>
<Volume2 class="h-4 w-4" />
<FILE_TYPE_ICONS.audio class="h-4 w-4" />
<span>Audio Files</span>
</DropdownMenu.Item>
</Tooltip.Trigger>
{#if !supportsAudio()}
{#if !hasAudioModality}
<Tooltip.Content>
<p>Audio files require audio models to be processed</p>
</Tooltip.Content>
@ -93,24 +100,24 @@
class="flex cursor-pointer items-center gap-2"
onclick={() => handleFileUpload(FileTypeCategory.TEXT)}
>
<FileText class="h-4 w-4" />
<FILE_TYPE_ICONS.text class="h-4 w-4" />
<span>Text Files</span>
</DropdownMenu.Item>
<Tooltip.Root delayDuration={TOOLTIP_DELAY_DURATION}>
<Tooltip.Root>
<Tooltip.Trigger class="w-full">
<DropdownMenu.Item
class="flex cursor-pointer items-center gap-2"
onclick={() => handleFileUpload(FileTypeCategory.PDF)}
>
<File class="h-4 w-4" />
<FILE_TYPE_ICONS.pdf class="h-4 w-4" />
<span>PDF Files</span>
</DropdownMenu.Item>
</Tooltip.Trigger>
{#if !supportsVision()}
{#if !hasVisionModality}
<Tooltip.Content>
<p>PDFs will be converted to text. Image-based PDFs may not work properly.</p>
</Tooltip.Content>

View File

@ -1,12 +1,12 @@
<script lang="ts">
import { Mic } from '@lucide/svelte';
import { Mic, Square } from '@lucide/svelte';
import { Button } from '$lib/components/ui/button';
import * as Tooltip from '$lib/components/ui/tooltip';
import { supportsAudio } from '$lib/stores/server.svelte';
interface Props {
class?: string;
disabled?: boolean;
hasAudioModality?: boolean;
isLoading?: boolean;
isRecording?: boolean;
onMicClick?: () => void;
@ -15,6 +15,7 @@
let {
class: className = '',
disabled = false,
hasAudioModality = false,
isLoading = false,
isRecording = false,
onMicClick
@ -22,25 +23,27 @@
</script>
<div class="flex items-center gap-1 {className}">
<Tooltip.Root delayDuration={100}>
<Tooltip.Root>
<Tooltip.Trigger>
<Button
class="h-8 w-8 rounded-full p-0 {isRecording
? 'animate-pulse bg-red-500 text-white hover:bg-red-600'
: 'bg-transparent text-muted-foreground hover:bg-foreground/10 hover:text-foreground'} {!supportsAudio()
? 'cursor-not-allowed opacity-50'
: ''}"
disabled={disabled || isLoading || !supportsAudio()}
disabled={disabled || isLoading || !hasAudioModality}
onclick={onMicClick}
type="button"
>
<span class="sr-only">{isRecording ? 'Stop recording' : 'Start recording'}</span>
{#if isRecording}
<Square class="h-4 w-4 animate-pulse fill-white" />
{:else}
<Mic class="h-4 w-4" />
{/if}
</Button>
</Tooltip.Trigger>
{#if !supportsAudio()}
{#if !hasAudioModality}
<Tooltip.Content>
<p>Current model does not support audio</p>
</Tooltip.Content>

View File

@ -0,0 +1,55 @@
<script lang="ts">
import { ArrowUp } from '@lucide/svelte';
import { Button } from '$lib/components/ui/button';
import * as Tooltip from '$lib/components/ui/tooltip';
import { cn } from '$lib/components/ui/utils';
interface Props {
canSend?: boolean;
disabled?: boolean;
isLoading?: boolean;
showErrorState?: boolean;
tooltipLabel?: string;
}
let {
canSend = false,
disabled = false,
isLoading = false,
showErrorState = false,
tooltipLabel
}: Props = $props();
let isDisabled = $derived(!canSend || disabled || isLoading);
</script>
{#snippet submitButton(props = {})}
<Button
type="submit"
disabled={isDisabled}
class={cn(
'h-8 w-8 rounded-full p-0',
showErrorState
? 'bg-red-400/10 text-red-400 hover:bg-red-400/20 hover:text-red-400 disabled:opacity-100'
: ''
)}
{...props}
>
<span class="sr-only">Send</span>
<ArrowUp class="h-12 w-12" />
</Button>
{/snippet}
{#if tooltipLabel}
<Tooltip.Root>
<Tooltip.Trigger>
{@render submitButton()}
</Tooltip.Trigger>
<Tooltip.Content>
<p>{tooltipLabel}</p>
</Tooltip.Content>
</Tooltip.Root>
{:else}
{@render submitButton()}
{/if}

View File

@ -1,13 +1,20 @@
<script lang="ts">
import { Square, ArrowUp } from '@lucide/svelte';
import { Square } from '@lucide/svelte';
import { Button } from '$lib/components/ui/button';
import {
ChatFormActionFileAttachments,
ChatFormActionRecord,
ChatFormModelSelector
ChatFormActionSubmit,
ModelsSelector
} from '$lib/components/app';
import { FileTypeCategory } from '$lib/enums';
import { getFileTypeCategory } from '$lib/utils';
import { config } from '$lib/stores/settings.svelte';
import type { FileTypeCategory } from '$lib/enums/files';
import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
import { isRouterMode } from '$lib/stores/server.svelte';
import { chatStore } from '$lib/stores/chat.svelte';
import { activeMessages, usedModalities } from '$lib/stores/conversations.svelte';
import { useModelChangeValidation } from '$lib/hooks/use-model-change-validation.svelte';
interface Props {
canSend?: boolean;
@ -15,6 +22,8 @@
disabled?: boolean;
isLoading?: boolean;
isRecording?: boolean;
hasText?: boolean;
uploadedFiles?: ChatUploadedFile[];
onFileUpload?: (fileType?: FileTypeCategory) => void;
onMicClick?: () => void;
onStop?: () => void;
@ -26,20 +35,150 @@
disabled = false,
isLoading = false,
isRecording = false,
hasText = false,
uploadedFiles = [],
onFileUpload,
onMicClick,
onStop
}: Props = $props();
let currentConfig = $derived(config());
let isRouter = $derived(isRouterMode());
let conversationModel = $derived(
chatStore.getConversationModel(activeMessages() as DatabaseMessage[])
);
let previousConversationModel: string | null = null;
$effect(() => {
if (conversationModel && conversationModel !== previousConversationModel) {
previousConversationModel = conversationModel;
modelsStore.selectModelByName(conversationModel);
}
});
let activeModelId = $derived.by(() => {
const options = modelOptions();
if (!isRouter) {
return options.length > 0 ? options[0].model : null;
}
const selectedId = selectedModelId();
if (selectedId) {
const model = options.find((m) => m.id === selectedId);
if (model) return model.model;
}
if (conversationModel) {
const model = options.find((m) => m.model === conversationModel);
if (model) return model.model;
}
return null;
});
let modelPropsVersion = $state(0); // Used to trigger reactivity after fetch
$effect(() => {
if (activeModelId) {
const cached = modelsStore.getModelProps(activeModelId);
if (!cached) {
modelsStore.fetchModelProps(activeModelId).then(() => {
modelPropsVersion++;
});
}
}
});
let hasAudioModality = $derived.by(() => {
if (activeModelId) {
void modelPropsVersion;
return modelsStore.modelSupportsAudio(activeModelId);
}
return false;
});
let hasVisionModality = $derived.by(() => {
if (activeModelId) {
void modelPropsVersion;
return modelsStore.modelSupportsVision(activeModelId);
}
return false;
});
let hasAudioAttachments = $derived(
uploadedFiles.some((file) => getFileTypeCategory(file.type) === FileTypeCategory.AUDIO)
);
let shouldShowRecordButton = $derived(
hasAudioModality && !hasText && !hasAudioAttachments && currentConfig.autoMicOnEmpty
);
let hasModelSelected = $derived(!isRouter || !!conversationModel || !!selectedModelId());
let isSelectedModelInCache = $derived.by(() => {
if (!isRouter) return true;
if (conversationModel) {
return modelOptions().some((option) => option.model === conversationModel);
}
const currentModelId = selectedModelId();
if (!currentModelId) return false;
return modelOptions().some((option) => option.id === currentModelId);
});
let submitTooltip = $derived.by(() => {
if (!hasModelSelected) {
return 'Please select a model first';
}
if (!isSelectedModelInCache) {
return 'Selected model is not available, please select another';
}
return '';
});
let selectorModelRef: ModelsSelector | undefined = $state(undefined);
export function openModelSelector() {
selectorModelRef?.open();
}
const { handleModelChange } = useModelChangeValidation({
getRequiredModalities: () => usedModalities(),
onValidationFailure: async (previousModelId) => {
if (previousModelId) {
await modelsStore.selectModelById(previousModelId);
}
}
});
</script>
<div class="flex w-full items-center gap-2 {className}">
<ChatFormActionFileAttachments class="mr-auto" {disabled} {onFileUpload} />
<div class="flex w-full items-center gap-3 {className}" style="container-type: inline-size">
<ChatFormActionFileAttachments
class="mr-auto"
{disabled}
{hasAudioModality}
{hasVisionModality}
{onFileUpload}
/>
{#if currentConfig.modelSelectorEnabled}
<ChatFormModelSelector class="shrink-0" />
{/if}
<ModelsSelector
bind:this={selectorModelRef}
currentModel={conversationModel}
forceForegroundText={true}
useGlobalSelection={true}
onModelChange={handleModelChange}
/>
{#if isLoading}
<Button
@ -50,16 +189,15 @@
<span class="sr-only">Stop</span>
<Square class="h-8 w-8 fill-destructive stroke-destructive" />
</Button>
{:else if shouldShowRecordButton}
<ChatFormActionRecord {disabled} {hasAudioModality} {isLoading} {isRecording} {onMicClick} />
{:else}
<ChatFormActionRecord {disabled} {isLoading} {isRecording} {onMicClick} />
<Button
type="submit"
disabled={!canSend || disabled || isLoading}
class="h-8 w-8 rounded-full p-0"
>
<span class="sr-only">Send</span>
<ArrowUp class="h-12 w-12" />
</Button>
<ChatFormActionSubmit
canSend={canSend && hasModelSelected && isSelectedModelInCache}
{disabled}
{isLoading}
tooltipLabel={submitTooltip}
showErrorState={hasModelSelected && !isSelectedModelInCache}
/>
{/if}
</div>

View File

@ -1,9 +1,11 @@
<script lang="ts">
import { generateModalityAwareAcceptString } from '$lib/utils/modality-file-validation';
import { generateModalityAwareAcceptString } from '$lib/utils';
interface Props {
accept?: string;
class?: string;
hasAudioModality?: boolean;
hasVisionModality?: boolean;
multiple?: boolean;
onFileSelect?: (files: File[]) => void;
}
@ -11,6 +13,8 @@
let {
accept = $bindable(),
class: className = '',
hasAudioModality = false,
hasVisionModality = false,
multiple = true,
onFileSelect
}: Props = $props();
@ -18,7 +22,13 @@
let fileInputElement: HTMLInputElement | undefined;
// Use modality-aware accept string by default, but allow override
let finalAccept = $derived(accept ?? generateModalityAwareAcceptString());
let finalAccept = $derived(
accept ??
generateModalityAwareAcceptString({
hasVision: hasVisionModality,
hasAudio: hasAudioModality
})
);
export function click() {
fileInputElement?.click();

View File

@ -1,352 +0,0 @@
<script lang="ts">
import { onMount, tick } from 'svelte';
import { ChevronDown, Loader2 } from '@lucide/svelte';
import { cn } from '$lib/components/ui/utils';
import { portalToBody } from '$lib/utils/portal-to-body';
import {
fetchModels,
modelOptions,
modelsError,
modelsLoading,
modelsUpdating,
selectModel,
selectedModelId
} from '$lib/stores/models.svelte';
import type { ModelOption } from '$lib/types/models';
interface Props {
class?: string;
}
let { class: className = '' }: Props = $props();
let options = $derived(modelOptions());
let loading = $derived(modelsLoading());
let updating = $derived(modelsUpdating());
let error = $derived(modelsError());
let activeId = $derived(selectedModelId());
let isMounted = $state(false);
let isOpen = $state(false);
let container: HTMLDivElement | null = null;
let triggerButton = $state<HTMLButtonElement | null>(null);
let menuRef = $state<HTMLDivElement | null>(null);
let menuPosition = $state<{
top: number;
left: number;
width: number;
placement: 'top' | 'bottom';
maxHeight: number;
} | null>(null);
let lockedWidth: number | null = null;
onMount(async () => {
try {
await fetchModels();
} catch (error) {
console.error('Unable to load models:', error);
} finally {
isMounted = true;
}
});
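// Close the menu when a pointer lands outside both the trigger container and the portaled menu.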
function handlePointerDown(event: PointerEvent) {
if (!container) return;
const target = event.target as Node | null;
if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) {
closeMenu();
}
}
function handleKeydown(event: KeyboardEvent) {
if (event.key === 'Escape') {
closeMenu();
}
}
function handleResize() {
if (isOpen) {
updateMenuPosition();
}
}
async function handleSelect(value: string | undefined) {
if (!value) return;
const option = options.find((item) => item.id === value);
if (!option) {
console.error('Model is no longer available');
return;
}
try {
await selectModel(option.id);
} catch (error) {
console.error('Failed to switch model:', error);
}
}
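// Menu layout constants: minimum gap from the viewport edges, offset from the trigger, and a width cap (px).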
const VIEWPORT_GUTTER = 8;
const MENU_OFFSET = 6;
const MENU_MAX_WIDTH = 320;
async function openMenu() {
if (loading || updating) return;
isOpen = true;
await tick();
updateMenuPosition();
requestAnimationFrame(() => updateMenuPosition());
}
function toggleOpen() {
if (loading || updating) return;
if (isOpen) {
closeMenu();
} else {
void openMenu();
}
}
function closeMenu() {
if (!isOpen) return;
isOpen = false;
menuPosition = null;
lockedWidth = null;
}
async function handleOptionSelect(optionId: string) {
try {
await handleSelect(optionId);
} finally {
closeMenu();
}
}
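// Keep the menu closed while the model list is loading or being refreshed.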
$effect(() => {
if (loading || updating) {
closeMenu();
}
});
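// Re-measure after newly rendered options have been laid out.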
$effect(() => {
const optionCount = options.length;
if (!isOpen || optionCount <= 0) return;
queueMicrotask(() => updateMenuPosition());
});
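// Measures the trigger and viewport, picks a menu width (locked while open), chooses above/below
// placement, and clamps the menu's coordinates inside the viewport gutter.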
function updateMenuPosition() {
if (!isOpen || !triggerButton || !menuRef) return;
const triggerRect = triggerButton.getBoundingClientRect();
const viewportWidth = window.innerWidth;
const viewportHeight = window.innerHeight;
if (viewportWidth === 0 || viewportHeight === 0) return;
const scrollWidth = menuRef.scrollWidth;
const scrollHeight = menuRef.scrollHeight;
const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2);
const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH);
const safeMaxWidth =
constrainedMaxWidth > 0 ? constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth);
const desiredMinWidth = Math.min(160, safeMaxWidth || 160);
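// Lock the width on first measurement so the menu doesn't jitter as its contents change.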
let width = lockedWidth;
if (width === null) {
const naturalWidth = Math.min(scrollWidth, safeMaxWidth);
const baseWidth = Math.max(triggerRect.width, naturalWidth, desiredMinWidth);
width = Math.min(baseWidth, safeMaxWidth || baseWidth);
lockedWidth = width;
} else {
width = Math.min(Math.max(width, desiredMinWidth), safeMaxWidth || width);
}
if (width > 0) {
menuRef.style.width = `${width}px`;
}
const availableBelow = Math.max(
0,
viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET
);
const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET);
const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2);
const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight);
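// For a given side, derive the top coordinate and the max height that fits between the trigger and the viewport edge.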
function computePlacement(placement: 'top' | 'bottom') {
const available = placement === 'bottom' ? availableBelow : availableAbove;
const allowedHeight =
available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance;
const maxHeight = Math.min(scrollHeight, allowedHeight);
const height = Math.max(0, maxHeight);
let top: number;
if (placement === 'bottom') {
const rawTop = triggerRect.bottom + MENU_OFFSET;
const minTop = VIEWPORT_GUTTER;
const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
if (maxTop < minTop) {
top = minTop;
} else {
top = Math.min(Math.max(rawTop, minTop), maxTop);
}
} else {
const rawTop = triggerRect.top - MENU_OFFSET - height;
const minTop = VIEWPORT_GUTTER;
const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
if (maxTop < minTop) {
top = minTop;
} else {
top = Math.max(Math.min(rawTop, maxTop), minTop);
}
}
return { placement, top, height, maxHeight };
}
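// Prefer opening below; flip above only when the menu overflows below and there is strictly more room above.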
const belowMetrics = computePlacement('bottom');
const aboveMetrics = computePlacement('top');
let metrics = belowMetrics;
if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) {
metrics = aboveMetrics;
}
menuRef.style.maxHeight = metrics.maxHeight > 0 ? `${Math.round(metrics.maxHeight)}px` : '';
let left = triggerRect.right - width;
const maxLeft = viewportWidth - VIEWPORT_GUTTER - width;
if (maxLeft < VIEWPORT_GUTTER) {
left = VIEWPORT_GUTTER;
} else {
if (left > maxLeft) {
left = maxLeft;
}
if (left < VIEWPORT_GUTTER) {
left = VIEWPORT_GUTTER;
}
}
menuPosition = {
top: Math.round(metrics.top),
left: Math.round(left),
width: Math.round(width),
placement: metrics.placement,
maxHeight: Math.round(metrics.maxHeight)
};
}
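// Show the active model, falling back to the first option when nothing is selected yet.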
function getDisplayOption(): ModelOption | undefined {
if (activeId) {
return options.find((option) => option.id === activeId);
}
return options[0];
}
</script>
<svelte:window onresize={handleResize} />
<svelte:document onpointerdown={handlePointerDown} onkeydown={handleKeydown} />
<div
class={cn('relative z-10 flex max-w-[200px] min-w-[120px] flex-col items-end gap-1', className)}
bind:this={container}
>
{#if loading && options.length === 0 && !isMounted}
<div class="flex items-center gap-2 text-xs text-muted-foreground">
<Loader2 class="h-4 w-4 animate-spin" />
Loading models…
</div>
{:else if options.length === 0}
<p class="text-xs text-muted-foreground">No models available.</p>
{:else}
{@const selectedOption = getDisplayOption()}
<div class="relative w-full">
<button
type="button"
class={cn(
'flex w-full items-center justify-end gap-2 rounded-md px-2 py-1 text-sm text-muted-foreground transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60',
isOpen ? 'text-foreground' : ''
)}
aria-haspopup="listbox"
aria-expanded={isOpen}
onclick={toggleOpen}
bind:this={triggerButton}
disabled={loading || updating}
>
<span class="max-w-[160px] truncate text-right font-medium">
{selectedOption?.name || 'Select model'}
</span>
{#if updating}
<Loader2 class="h-3.5 w-3.5 animate-spin text-muted-foreground" />
{:else}
<ChevronDown
class={cn(
'h-4 w-4 text-muted-foreground transition-transform',
isOpen ? 'rotate-180 text-foreground' : ''
)}
/>
{/if}
</button>
{#if isOpen}
<div
bind:this={menuRef}
use:portalToBody
class={cn(
'fixed z-[1000] overflow-hidden rounded-md border bg-popover shadow-lg transition-opacity',
menuPosition ? 'opacity-100' : 'pointer-events-none opacity-0'
)}
role="listbox"
style:top={menuPosition ? `${menuPosition.top}px` : undefined}
style:left={menuPosition ? `${menuPosition.left}px` : undefined}
style:width={menuPosition ? `${menuPosition.width}px` : undefined}
data-placement={menuPosition?.placement ?? 'bottom'}
>
<div
class="overflow-y-auto py-1"
style:max-height={menuPosition && menuPosition.maxHeight > 0
? `${menuPosition.maxHeight}px`
: undefined}
>
{#each options as option (option.id)}
<button
type="button"
class={cn(
'flex w-full flex-col items-start gap-0.5 px-3 py-2 text-left text-sm transition hover:bg-muted focus:bg-muted focus:outline-none',
option.id === selectedOption?.id ? 'bg-accent text-accent-foreground' : ''
)}
role="option"
aria-selected={option.id === selectedOption?.id}
onclick={() => handleOptionSelect(option.id)}
>
<span class="block w-full truncate font-medium" title={option.name}>
{option.name}
</span>
{#if option.description}
<span class="text-xs text-muted-foreground">{option.description}</span>
{/if}
</button>
{/each}
</div>
</div>
{/if}
</div>
{/if}
{#if error}
<p class="text-xs text-destructive">{error}</p>
{/if}
</div>

Some files were not shown because too many files have changed in this diff.