Merge branch 'master' into tarek/feat/lfm2-asr-upstream

Commit 7865a1519e by Xuan Son Nguyen, 2025-12-16 17:30:59 +01:00
75 changed files with 4841 additions and 3248 deletions

View File

@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"]
# ENTRYPOINT ["/app/llama-server"]
### Target: light
# Lightweight image containing only llama-cli
# Lightweight image containing only llama-cli and llama-completion
# ==============================================================================
FROM base AS light

View File

@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
cmake --build build --config Release --target llama-cli
cmake --build build --config Release --target llama-cli && \
cmake --build build --config Release --target llama-completion
# TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
ENV LC_ALL=C.utf8

View File

@ -37,6 +37,7 @@ make -j GGML_CUDA=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
@ -68,6 +69,7 @@ rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cuda-cli
%{_bindir}/llama-cuda-completion
%{_bindir}/llama-cuda-server
%{_bindir}/llama-cuda-simple
/usr/lib/systemd/system/llamacuda.service

View File

@ -39,6 +39,7 @@ make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
@ -70,6 +71,7 @@ rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cli
%{_bindir}/llama-completion
%{_bindir}/llama-server
%{_bindir}/llama-simple
/usr/lib/systemd/system/llama.service

.github/workflows/server-webui.yml (new file, 295 lines)
View File

@ -0,0 +1,295 @@
# Server WebUI build and tests
name: Server WebUI
on:
workflow_dispatch: # allows manual triggering
inputs:
sha:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
env:
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
LLAMA_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
webui-setup:
name: WebUI Setup
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Cache node_modules
uses: actions/cache@v4
id: cache-node-modules
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install dependencies
if: steps.cache-node-modules.outputs.cache-hit != 'true'
run: npm ci
working-directory: tools/server/webui
webui-check:
needs: webui-setup
name: WebUI Check
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Run type checking
run: npm run check
working-directory: tools/server/webui
- name: Run linting
run: npm run lint
working-directory: tools/server/webui
webui-build:
needs: webui-check
name: WebUI Build
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Build application
run: npm run build
working-directory: tools/server/webui
webui-tests:
needs: webui-build
name: Run WebUI tests
permissions:
contents: read
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install Playwright browsers
run: npx playwright install --with-deps
working-directory: tools/server/webui
- name: Build Storybook
run: npm run build-storybook
working-directory: tools/server/webui
- name: Run Client tests
run: npm run test:client
working-directory: tools/server/webui
- name: Run Server tests
run: npm run test:server
working-directory: tools/server/webui
- name: Run UI tests
run: npm run test:ui -- --testTimeout=60000
working-directory: tools/server/webui
- name: Run E2E tests
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
needs: [webui-tests]
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Setup Node.js for WebUI
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui
- name: Build WebUI
run: npm run build
working-directory: tools/server/webui
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
env:
GITHUB_ACTIONS: "true"
run: |
cd tools/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd tools/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
SLOW_TESTS=1 ./tests.sh

View File

@ -76,270 +76,6 @@ jobs:
run: |
pip install -r tools/server/tests/requirements.txt
webui-setup:
name: WebUI Setup
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Cache node_modules
uses: actions/cache@v4
id: cache-node-modules
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install dependencies
if: steps.cache-node-modules.outputs.cache-hit != 'true'
run: npm ci
working-directory: tools/server/webui
webui-check:
needs: webui-setup
name: WebUI Check
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Run type checking
run: npm run check
working-directory: tools/server/webui
- name: Run linting
run: npm run lint
working-directory: tools/server/webui
webui-build:
needs: webui-check
name: WebUI Build
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Build application
run: npm run build
working-directory: tools/server/webui
webui-tests:
needs: webui-build
name: Run WebUI tests
permissions:
contents: read
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install Playwright browsers
run: npx playwright install --with-deps
working-directory: tools/server/webui
- name: Build Storybook
run: npm run build-storybook
working-directory: tools/server/webui
- name: Run Client tests
run: npm run test:client
working-directory: tools/server/webui
- name: Run Server tests
run: npm run test:server
working-directory: tools/server/webui
- name: Run UI tests
run: npm run test:ui -- --testTimeout=60000
working-directory: tools/server/webui
- name: Run E2E tests
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
needs: [webui-tests]
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Setup Node.js for WebUI
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui
- name: Build WebUI
run: npm run build
working-directory: tools/server/webui
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
env:
GITHUB_ACTIONS: "true"
run: |
cd tools/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd tools/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
SLOW_TESTS=1 ./tests.sh
server-windows:
runs-on: windows-2022

View File

@ -68,3 +68,6 @@ Please disclose it as a private [security advisory](https://github.com/ggml-org/
Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
> [!IMPORTANT]
> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080

View File

@ -835,6 +835,19 @@ bool common_arg_utils::is_autoy(const std::string & value) {
}
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
// per-example default params
// we define them here to make sure they are included in llama-gen-docs
if (ex == LLAMA_EXAMPLE_COMPLETION) {
params.use_jinja = false; // disable jinja by default
} else if (ex == LLAMA_EXAMPLE_MTMD) {
params.use_jinja = false; // disable jinja by default
params.sampling.temp = 0.2; // lower temp by default for better quality
} else if (ex == LLAMA_EXAMPLE_SERVER) {
params.n_parallel = -1; // auto by default
}
params.use_color = tty_can_use_colors();
// load dynamic backends
@ -1107,7 +1120,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
string_format("max number of context checkpoints to create per slot (default: %d)\n"
string_format("max number of context checkpoints to create per slot (default: %d)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
[](common_params & params, int value) {
params.n_ctx_checkpoints = value;
@ -1115,7 +1128,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--cache-ram", "-cram"}, "N",
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
[](common_params & params, int value) {
params.cache_ram_mib = value;
@ -1123,12 +1136,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--kv-unified", "-kvu"},
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
"use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
[](common_params & params) {
params.kv_unified = true;
}
).set_env("LLAMA_ARG_KV_UNIFIED"));
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
@ -1888,13 +1900,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
}
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
add_opt(common_arg(
{"-np", "--parallel"}, "N",
string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
[](common_params & params, int value) {
params.n_parallel = value;
}
).set_env("LLAMA_ARG_N_PARALLEL"));
if (ex == LLAMA_EXAMPLE_SERVER) {
// this is to make sure this option appears in the server-specific section of the help message
add_opt(common_arg(
{"-np", "--parallel"}, "N",
string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
[](common_params & params, int value) {
if (value == 0) {
throw std::invalid_argument("error: invalid value for n_parallel\n");
}
params.n_parallel = value;
}
).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
} else {
add_opt(common_arg(
{"-np", "--parallel"}, "N",
string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
[](common_params & params, int value) {
params.n_parallel = value;
}
).set_env("LLAMA_ARG_N_PARALLEL"));
}
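For context, a quick usage sketch of the resulting server behavior (the model path and slot count below are illustrative): when `-np` is not given, the slot count is resolved automatically, and a value of `0` is rejected.

```bash
# default: -np -1, the server picks the number of slots automatically
llama-server -m model.gguf

# explicit override, via flag or environment variable (0 would be rejected)
llama-server -m model.gguf -np 4
LLAMA_ARG_N_PARALLEL=4 llama-server -m model.gguf
```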
add_opt(common_arg(
{"-ns", "--sequences"}, "N",
string_format("number of sequences to decode (default: %d)", params.n_sequences),

View File

@ -4,9 +4,14 @@
using json = nlohmann::json;
static std::string_view trim_trailing_space(std::string_view sv) {
static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
int count = 0;
while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
if (max != -1 && count <= max) {
break;
}
sv.remove_suffix(1);
count++;
}
return sv;
}
@ -93,7 +98,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
if (is_arg_string && current_tool) {
// Serialize to JSON, but exclude the end quote
std::string dumped = json(node.text).dump();
std::string dumped = json(trim_trailing_space(node.text)).dump();
current_tool->arguments += dumped.substr(0, dumped.size() - 1);
needs_closing_quote = true;
}
@ -101,6 +106,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
if (is_arg_close && current_tool) {
if (needs_closing_quote) {
current_tool->arguments += "\"";
needs_closing_quote = false;
}
}
@ -109,6 +115,10 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
}
if (is_tool_close && current_tool) {
if (needs_closing_quote) {
current_tool->arguments += "\"";
needs_closing_quote = false;
}
current_tool->arguments += "}";
}
}

View File

@ -711,6 +711,25 @@ static void foreach_function(const json & tools, const std::function<void(const
}
}
static void foreach_parameter(const json & function, const std::function<void(const std::string &, const json &, bool)> & fn) {
if (!function.contains("parameters") || !function.at("parameters").is_object()) {
return;
}
const auto & params = function.at("parameters");
if (!params.contains("properties") || !params.at("properties").is_object()) {
return;
}
const auto & props = params.at("properties");
std::set<std::string> required;
if (params.contains("required") && params.at("required").is_array()) {
params.at("required").get_to(required);
}
for (const auto & [name, prop] : props.items()) {
bool is_required = (required.find(name) != required.end());
fn(name, prop, is_required);
}
}
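To illustrate what the new `foreach_parameter` helper yields for the chat-format initializers below, here is a small sketch with a hypothetical tool definition: the callback receives each entry of `parameters.properties` together with a flag indicating membership in `required`.

```cpp
// Sketch only: enumerate the declared parameters of one tool's "function" object.
json function = json::parse(R"({
  "name": "get_weather",
  "parameters": {
    "type": "object",
    "properties": {
      "city":  { "type": "string" },
      "units": { "type": "string", "enum": ["metric", "imperial"] }
    },
    "required": ["city"]
  }
})");

foreach_parameter(function, [](const std::string & name, const json & prop, bool is_required) {
    // invoked twice: ("city", {...}, true) and ("units", {...}, false)
});
```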
static std::string apply(
const common_chat_template & tmpl,
const struct templates_params & inputs,
@ -1409,6 +1428,123 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
return data;
}
static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED;
// Handle thinking tags appropriately based on inputs.enable_thinking
if (string_ends_with(data.prompt, "<think>\n")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
}
}
data.preserved_tokens = {
"<think>",
"</think>",
"<tool_call>",
"</tool_call>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = true;
auto parser = build_chat_peg_constructed_parser([&](auto & p) {
auto reasoning = p.eps();
if (inputs.enable_thinking && extract_reasoning) {
auto reasoning_content = p.reasoning(p.until("</think>")) + ("</think>" | p.end());
if (data.thinking_forced_open) {
reasoning = reasoning_content;
}
}
// Response format parser
if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
return reasoning << p.content(p.schema(p.json(), "response-format", inputs.json_schema));
}
// Tool call parser
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
auto tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
auto parameters = function.at("parameters");
auto schema_info = common_schema_info();
schema_info.resolve_refs(parameters);
auto tool_open = "<function=" + p.tool_name(p.literal(name)) + ">\n";
auto tool_close = p.literal("</function>\n");
auto args = p.sequence();
auto arg_string = p.rule("xml-arg-string", p.until_one_of({
"\n</parameter>",
"\n<parameter=",
"\n</function>"
}));
foreach_parameter(function, [&](const auto & param_name, const json & param_schema, bool is_required) {
auto rule_name = "tool-" + name + "-arg-" + param_name;
auto arg_open = "<parameter=" + p.tool_arg_name(p.literal(param_name)) + ">\n";
auto arg_close = p.literal("</parameter>\n");
auto arg_value = p.eps();
if (schema_info.resolves_to_string(param_schema)) {
arg_value = p.tool_arg_string_value(arg_string) + "\n";
} else {
arg_value = p.tool_arg_json_value(p.schema(p.json(), rule_name + "-schema", param_schema));
}
// Model may or may not close with </parameter>
auto arg_rule = p.rule(rule_name, p.tool_arg_open(arg_open) + arg_value + p.optional(p.tool_arg_close(arg_close)));
args += p.repeat(arg_rule, /* min = */ is_required ? 1 : 0, /* max = */ 1);
});
tool_choice |= p.rule("tool-" + name, p.tool_open(tool_open) + args + p.tool_close(tool_close));
});
auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
auto tool_call = p.rule("tool-call", "<tool_call>\n" + tool_choice + "</tool_call>" + p.space());
auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));
return reasoning << p.content(p.until("<tool_call>")) << tool_calls;
}
// Content only parser
include_grammar = false;
return reasoning << p.content(p.rest());
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
data.grammar_triggers = {
{COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_call>"}
};
}
return data;
}
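Putting the literals above together, the parser built by `common_chat_params_init_nemotron_v3` expects tool calls shaped like the following (tool name and argument are hypothetical; string-typed arguments are emitted as raw text followed by a newline, other types as JSON matching their schema, and when `thinking_forced_open` is set the reasoning text plus a closing `</think>` precede this block):

```
<tool_call>
<function=get_weather>
<parameter=city>
Paris
</parameter>
</function>
</tool_call>
```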
static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
@ -2534,6 +2670,10 @@ static common_chat_params common_chat_templates_apply_jinja(
src.find("<function=") != std::string::npos &&
src.find("<parameters>") != std::string::npos &&
src.find("<parameter=") != std::string::npos) {
// Nemotron 3 Nano 30B A3B
if (src.find("<think>") != std::string::npos) {
return common_chat_params_init_nemotron_v3(tmpl, params);
}
return common_chat_params_init_qwen3_coder_xml(tmpl, params);
}

View File

@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {
std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
class SchemaConverter {
class common_schema_converter {
private:
friend class common_schema_info;
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
@ -729,7 +730,7 @@ private:
}
public:
SchemaConverter(
common_schema_converter(
const std::function<json(const std::string &)> & fetch_json,
bool dotall)
: _fetch_json(fetch_json), _dotall(dotall)
@ -990,6 +991,134 @@ public:
}
};
// common_schema_info implementation (pimpl)
common_schema_info::common_schema_info()
: impl_(std::make_unique<common_schema_converter>(
[](const std::string &) { return json(); },
false)) {}
common_schema_info::~common_schema_info() = default;
common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;
void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
impl_->resolve_refs(schema, "");
}
// Determines if a JSON schema can resolve to a string type through any path.
// Some models emit raw string values rather than JSON-encoded strings for string parameters.
// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
// true, allowing callers to handle the value as a raw string for simplicity.
bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
std::unordered_set<std::string> visited_refs;
std::function<bool(const json &)> check = [&](const json & s) -> bool {
if (!s.is_object()) {
return false;
}
// Handle $ref
if (s.contains("$ref")) {
const std::string & ref = s["$ref"];
if (visited_refs.find(ref) != visited_refs.end()) {
// Circular reference, assume not a string to be safe
return false;
}
visited_refs.insert(ref);
auto it = impl_->_refs.find(ref);
if (it != impl_->_refs.end()) {
return check(it->second);
}
return false;
}
// Check type field
if (s.contains("type")) {
const json & schema_type = s["type"];
if (schema_type.is_string()) {
if (schema_type == "string") {
return true;
}
} else if (schema_type.is_array()) {
// Type can be an array like ["string", "null"]
for (const auto & t : schema_type) {
if (t == "string") {
return true;
}
}
}
}
// Check oneOf/anyOf - if any alternative can be a string
if (s.contains("oneOf")) {
for (const auto & alt : s["oneOf"]) {
if (check(alt)) {
return true;
}
}
}
if (s.contains("anyOf")) {
for (const auto & alt : s["anyOf"]) {
if (check(alt)) {
return true;
}
}
}
// Check allOf - all components must be compatible with string type
if (s.contains("allOf")) {
bool all_string = true;
for (const auto & component : s["allOf"]) {
if (!check(component)) {
all_string = false;
break;
}
}
if (all_string) {
return true;
}
}
// Check const - if the constant value is a string
if (s.contains("const")) {
if (s["const"].is_string()) {
return true;
}
}
// Check enum - if any enum value is a string
if (s.contains("enum")) {
for (const auto & val : s["enum"]) {
if (val.is_string()) {
return true;
}
}
}
// String-specific keywords imply string type
if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
return true;
}
// Check format - many formats imply string
if (s.contains("format")) {
const std::string & fmt = s["format"];
if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
fmt == "uri" || fmt == "email" || fmt == "hostname" ||
fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
fmt.find("uuid") == 0) {
return true;
}
}
return false;
};
return check(schema);
}
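A small usage sketch (the schemas are invented for illustration): a direct string `type`, a string branch inside `anyOf`, or string-specific keywords make the probe return true, while a plain object schema does not.

```cpp
// Illustrative only; exercises a few of the branches handled above.
common_schema_info info;

nlohmann::ordered_json s1 = {{"type", "string"}, {"minLength", 1}};
nlohmann::ordered_json s2 = {{"anyOf", nlohmann::ordered_json::array({
    {{"type", "integer"}},
    {{"type", "string"}},
})}};
nlohmann::ordered_json s3 = {{"type", "object"}};

bool a = info.resolves_to_string(s1); // true  - type is "string"
bool b = info.resolves_to_string(s2); // true  - one anyOf branch is a string
bool c = info.resolves_to_string(s3); // false - no path resolves to a string
```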
std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
#ifdef LLAMA_USE_LLGUIDANCE
if (!force_gbnf) {
@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
}
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
common_grammar_builder builder {
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
return converter._add_rule(name, rule);

View File

@ -3,11 +3,31 @@
#include <nlohmann/json_fwd.hpp>
#include <functional>
#include <memory>
#include <string>
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
bool force_gbnf = false);
class common_schema_converter;
// Probes a JSON schema to extract information about its structure and type constraints.
class common_schema_info {
std::unique_ptr<common_schema_converter> impl_;
public:
common_schema_info();
~common_schema_info();
common_schema_info(const common_schema_info &) = delete;
common_schema_info & operator=(const common_schema_info &) = delete;
common_schema_info(common_schema_info &&) noexcept;
common_schema_info & operator=(common_schema_info &&) noexcept;
void resolve_refs(nlohmann::ordered_json & schema);
bool resolves_to_string(const nlohmann::ordered_json & schema);
};
struct common_grammar_builder {
std::function<std::string(const std::string &, const std::string &)> add_rule;
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;

View File

@ -425,7 +425,7 @@ struct parser_executor {
if (result.need_more_input()) {
// Propagate - need to know what child would match before negating
return result;
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
}
// Child failed, so negation succeeds

View File

@ -865,6 +865,14 @@ class TextModel(ModelBase):
logger.warning(f"Unknown RoPE type: {rope_type}")
logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}")
if "mrope_section" in self.rope_parameters:
mrope_section = self.rope_parameters["mrope_section"]
# Pad to 4 dimensions [time, height, width, extra]
while len(mrope_section) < 4:
mrope_section.append(0)
self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
logger.info(f"gguf: mrope sections: {mrope_section[:4]}")
if (rope_theta := rope_params.get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base(rope_theta)
logger.info(f"gguf: rope theta = {rope_theta}")
@ -3742,9 +3750,6 @@ class Qwen2VLModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
mrope_section = self.hparams["rope_scaling"]["mrope_section"]
mrope_section += [0] * max(0, 4 - len(mrope_section))
self.gguf_writer.add_rope_dimension_sections(mrope_section)
def set_vocab(self):
try:
@ -4380,6 +4385,30 @@ class Qwen3VLVisionModel(MmprojModel):
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration")
class Glm4VVisionModel(Qwen3VLVisionModel):
def set_gguf_parameters(self):
MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters
assert self.hparams_vision is not None
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
if hidden_act == "gelu":
self.gguf_writer.add_vision_use_gelu(True)
elif hidden_act == "silu":
self.gguf_writer.add_vision_use_silu(True)
rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5)
self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("model.visual."):
name = name.replace("model.visual.", "visual.")
if name.startswith("visual.merger."):
return [(self.map_tensor_name(name), data_torch)]
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3VLForConditionalGeneration")
class Qwen3VLTextModel(Qwen3Model):
model_arch = gguf.MODEL_ARCH.QWEN3VL
@ -4388,20 +4417,6 @@ class Qwen3VLTextModel(Qwen3Model):
super().set_gguf_parameters()
# Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
text_config = self.hparams.get("text_config", {})
# rope_scaling is deprecated in V5, use rope_parameters instead
rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
if rope_scaling.get("mrope_section"):
# mrope_section contains [time, height, width] dimensions
mrope_section = rope_scaling["mrope_section"]
# Pad to 4 dimensions [time, height, width, extra]
while len(mrope_section) < 4:
mrope_section.append(0)
self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
logger.info(f"MRoPE sections: {mrope_section[:4]}")
vision_config = self.hparams.get("vision_config", {})
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
@ -4420,22 +4435,6 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
# Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
text_config = self.hparams.get("text_config", {})
# rope_scaling is deprecated in V5, use rope_parameters instead
rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
if rope_scaling.get("mrope_section"):
# mrope_section contains [time, height, width] dimensions
mrope_section = rope_scaling["mrope_section"]
# Pad to 4 dimensions [time, height, width, extra]
while len(mrope_section) < 4:
mrope_section.append(0)
self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
logger.info(f"MRoPE sections: {mrope_section[:4]}")
vision_config = self.hparams.get("vision_config", {})
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
@ -7798,6 +7797,15 @@ class JaisModel(TextModel):
@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
class Glm4Model(TextModel):
model_arch = gguf.MODEL_ARCH.GLM4
use_mrope = False
partial_rotary_factor = 0.5
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5)
if "mrope_section" in self.rope_parameters:
self.use_mrope = True
logger.info("Q/K weight will need to be permuted for M-RoPE")
def set_vocab(self):
from transformers import AutoTokenizer
@ -7819,17 +7827,49 @@ class Glm4Model(TextModel):
super().set_gguf_parameters()
if (rope_dim := self.hparams.get("head_dim")) is None:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor))
@staticmethod
def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor:
orig_shape = weights.shape
if len(orig_shape) == 1:
weights = weights.unsqueeze(1) # [out_dim, 1]
if len(weights.shape) != 2:
raise ValueError("Only 1D and 2D tensors are supported.")
n_effective_heads = weights.shape[0] // head_dim
if n_head_kv is not None and n_effective_heads != n_head:
if n_effective_heads != n_head_kv:
raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}")
rotary_dim = int(head_dim * partial_rotary_factor)
if rotary_dim % 2 != 0:
raise ValueError("rotary_dim must be even.")
reshaped = weights.reshape(n_effective_heads, head_dim, -1)
rot_part = reshaped[:, :rotary_dim, :]
non_rot_part = reshaped[:, rotary_dim:, :]
permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1)
combined = torch.cat((permuted_rot, non_rot_part), dim=1)
result = combined.reshape(weights.shape)
return result if len(orig_shape) != 1 else result.squeeze(1)
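A toy illustration of the reordering (values chosen for readability, not taken from any real checkpoint; it assumes `Glm4Model` from this script is importable): for one head with `head_dim = 4` and the rotary span covering the whole head, rows in interleaved ("normal") order `[r0, r1, r2, r3]` come out in the NeoX-style half split `[r0, r2, r1, r3]`.

```python
# Sketch only: one head, head_dim = 4, a single output column, fully-rotary head.
import torch

w = torch.arange(4, dtype=torch.float32).reshape(4, 1)
out = Glm4Model.normal_to_neox(w, n_head=1, n_head_kv=1, head_dim=4, partial_rotary_factor=1.0)
print(out.flatten().tolist())  # [0.0, 2.0, 1.0, 3.0]
```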
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("model.visual."): # ignore visual part of Glm4v
return []
elif name.startswith("model.language_model."):
name = name.replace("language_model.", "") # for Glm4v
if self.use_mrope:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams["num_key_value_heads"]
n_embd = self.hparams["hidden_size"]
head_dim = n_embd // n_head
# because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor)
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Glm4MoeForCausalLM")
@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration")
class Glm4MoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.GLM4_MOE
@ -7896,6 +7936,7 @@ class Glm4MoeModel(TextModel):
_experts: list[dict[str, Tensor]] | None = None
# note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already
def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]:
@ -8493,8 +8534,18 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
class NemotronHModel(GraniteHybridModel):
"""Hybrid mamba2/attention model from NVIDIA"""
model_arch = gguf.MODEL_ARCH.NEMOTRON_H
is_moe: bool = False
def __init__(self, *args, **kwargs):
# We have to determine the correct model architecture (MoE vs non-MoE) before
# calling the parent __init__. This is because the parent constructor
# uses self.model_arch to build the tensor name map, and all MoE-specific
# mappings would be missed if it were called with the default non-MoE arch.
hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
if "num_experts_per_tok" in hparams:
self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
self.is_moe = True
super().__init__(*args, **kwargs)
# Save the top-level head_dim for later
@ -8506,9 +8557,11 @@ class NemotronHModel(GraniteHybridModel):
# Update the ssm / attn / mlp layers
# M: Mamba2, *: Attention, -: MLP
# MoE:
# M: Mamba2, *: Attention, E: Expert
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "-"]
self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]
def get_attn_layers(self):
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
@ -8524,10 +8577,28 @@ class NemotronHModel(GraniteHybridModel):
# Set feed_forward_length
# NOTE: This will trigger an override warning. This is preferable to
# duplicating all the parent logic
n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
self.gguf_writer.add_feed_forward_length([
n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
])
if not self.is_moe:
n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
self.gguf_writer.add_feed_forward_length([
n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
])
else:
moe_intermediate_size = self.hparams["moe_intermediate_size"]
self.gguf_writer.add_feed_forward_length([
moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count)
])
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"])
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
self.gguf_writer.add_expert_group_count(self.hparams["n_group"])
# number of experts used per token (top-k)
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
def set_vocab(self):
super().set_vocab()
@ -8535,7 +8606,81 @@ class NemotronHModel(GraniteHybridModel):
# The tokenizer _does_ add a BOS token (via post_processor type
# TemplateProcessing) but does not set add_bos_token to true in the
# config, so we need to explicitly override it here.
self.gguf_writer.add_add_bos_token(True)
if not self.is_moe:
self.gguf_writer.add_add_bos_token(True)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if self.is_moe and bid is not None:
if name.endswith("mixer.gate.e_score_correction_bias"):
new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
mapped_name = self.map_tensor_name(new_name)
return [(mapped_name, data_torch)]
if name.endswith("mixer.dt_bias"):
new_name = name.replace("dt_bias", "dt.bias")
mapped_name = self.map_tensor_name(new_name)
return [(mapped_name, data_torch)]
if name.endswith("mixer.conv1d.weight"):
squeezed_data = data_torch.squeeze()
mapped_name = self.map_tensor_name(name)
return [(mapped_name, squeezed_data)]
if name.endswith("mixer.A_log"):
transformed_data = -torch.exp(data_torch)
reshaped_data = transformed_data.squeeze().reshape(-1, 1)
mapped_name = self.map_tensor_name(name)
return [(mapped_name, reshaped_data)]
if name.endswith("mixer.D"):
reshaped_data = data_torch.squeeze().reshape(-1, 1)
mapped_name = self.map_tensor_name(name)
return [(mapped_name, reshaped_data)]
if name.endswith("mixer.norm.weight"):
reshaped_data = data_torch.reshape(8, 512)
mapped_name = self.map_tensor_name(name)
return [(mapped_name, reshaped_data)]
if name.find("mixer.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 2:
# merge the experts into a single tensor
tensors: list[tuple[str, Tensor]] = []
for w_name in ["down_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return super().modify_tensors(data_torch, name, bid)
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("BailingMoeForCausalLM")

View File

@ -103,6 +103,8 @@ SYCL backend supports Intel GPU Family:
- Intel Built-in Arc GPU
- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).
On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md), although performance is not optimal, and some GPUs may not support OpenCL or have any GPGPU capability at all.
#### Verified devices
| Intel GPU | Status | Verified Model |

View File

@ -97,7 +97,7 @@ The model params and tensors layout must be defined in `llama.cpp` source files:
1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
2. In `src/llama-arch.cpp`:
- Add the architecture name to the `LLM_ARCH_NAMES` map.
- Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
- Add the list of model tensors to `llm_get_tensor_names` (you may also need to update `LLM_TENSOR_NAMES`); see the sketch after this list.
3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
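A minimal sketch of steps 1 and 2 (the architecture name `mymodel` is hypothetical, the surrounding declarations are abbreviated, and the exact container types may differ slightly from the current source):

```cpp
// src/llama-arch.h
enum llm_arch {
    // ...
    LLM_ARCH_MYMODEL,  // new architecture
    LLM_ARCH_UNKNOWN,
};

// src/llama-arch.cpp
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    // ...
    { LLM_ARCH_MYMODEL, "mymodel" },
};
```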

View File

@ -7,9 +7,9 @@
## Images
We have three Docker images available for this project:
1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the `llama-cli` and `llama-completion` executables. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the `llama-server` executable. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
Additionally, there are the following images, similar to the above:
@ -44,13 +44,15 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-o
On completion, you are ready to play!
```bash
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run-legacy -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
```
or with a light image:
```bash
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
docker run -v /path/to/models:/models --entrypoint /app/llama-cli ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf
docker run -v /path/to/models:/models --entrypoint /app/llama-completion ghcr.io/ggml-org/llama.cpp:light -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
```
or with a server image:
@ -59,6 +61,8 @@ or with a server image:
docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
```
In the above examples, `--entrypoint /app/llama-cli` is specified for clarity, but you can safely omit it since it's the default entrypoint in the container.
## Docker With CUDA
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@ -80,9 +84,9 @@ The defaults are:
The resulting images are essentially the same as the non-CUDA images:
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
1. `local/llama.cpp:full-cuda`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the `llama-cli` and `llama-completion` executables.
3. `local/llama.cpp:server-cuda`: This image only includes the `llama-server` executable.
## Usage
@ -114,9 +118,9 @@ The defaults are:
The resulting images are essentially the same as the non-MUSA images:
1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
1. `local/llama.cpp:full-musa`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-musa`: This image only includes the `llama-cli` and `llama-completion` executables.
3. `local/llama.cpp:server-musa`: This image only includes the `llama-server` executable.
## Usage

View File

@ -48,7 +48,7 @@ static void write_table(std::ofstream & file, std::vector<common_arg *> & opts)
}
}
static void export_md(std::string fname, llama_example ex) {
static void export_md(std::string fname, llama_example ex, std::string name) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
common_params params;
@ -72,13 +72,14 @@ static void export_md(std::string fname, llama_example ex) {
write_table(file, common_options);
file << "\n\n**Sampling params**\n\n";
write_table(file, sparam_options);
file << "\n\n**Example-specific params**\n\n";
file << "\n\n**" << name << "-specific params**\n\n";
write_table(file, specific_options);
}
int main(int, char **) {
export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION);
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
// TODO: add CLI
export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool");
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server");
return 0;
}

View File

@ -10,6 +10,13 @@ and in some cases perplexity checked of the quantized model. And finally the
model/models need to be uploaded to the ggml-org organization on Hugging Face. This tool/example tries to
help with this process.
> 📝 **Note:** When adding a new model from an existing family, verify that the
> previous version passes logits verification first. Existing models can have
> subtle numerical differences that don't affect generation quality but cause
> logits mismatches. Identifying upfront whether these differences originate in
> llama.cpp, the conversion script, or an upstream implementation can save
> significant debugging time.
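For instance, "passes logits verification" boils down to a comparison along these lines (the file names and binary layout are illustrative; the scripts in this directory handle the real workflow):

```python
# Sketch: compare logits dumped for the same prompt by the original model and by llama.cpp.
import numpy as np

ref = np.fromfile("data/pytorch-model-logits.bin", dtype=np.float32)
cpp = np.fromfile("data/llamacpp-model-logits.bin", dtype=np.float32)

assert ref.shape == cpp.shape, "logit counts differ"
print(f"max abs diff: {np.max(np.abs(ref - cpp)):.6f}")
# small differences are expected; large ones point at llama.cpp, the conversion
# script, or the upstream implementation, and are worth tracking down first
```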
### Overview
The idea is that the makefile targets and scripts here can be used in the
development/conversion process, assisting with things like:

View File

@ -7,7 +7,7 @@ base_model:
Recommended way to run this model:
```sh
llama-server -hf {namespace}/{model_name}-GGUF -c 0 -fa
llama-server -hf {namespace}/{model_name}-GGUF -c 0
```
Then, access http://localhost:8080

View File

@ -34,8 +34,11 @@ done
MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
CONVERTED_MODEL_PATH="${CONVERTED_EMBEDDING_PATH:-"$CONVERTED_EMBEDDING_MODEL"}"
CONVERTED_MODEL_NAME="${CONVERTED_MODEL_NAME:-$(basename "$CONVERTED_MODEL_PATH" .gguf)}"
if [ -t 0 ]; then
CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
else
# Process piped JSON data and convert to binary (matching logits.cpp format)
TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)

View File

@ -413,6 +413,7 @@ class MODEL_ARCH(IntEnum):
JAIS = auto()
NEMOTRON = auto()
NEMOTRON_H = auto()
NEMOTRON_H_MOE = auto()
EXAONE = auto()
EXAONE4 = auto()
GRANITE = auto()
@ -642,6 +643,7 @@ class MODEL_TENSOR(IntEnum):
V_MMPROJ_PEG = auto()
V_ENC_EMBD_CLS = auto()
V_ENC_EMBD_PATCH = auto()
V_ENC_EMBD_NORM = auto()
V_ENC_EMBD_POS = auto()
V_ENC_INPUT_NORM = auto()
V_ENC_ATTN_QKV = auto()
@ -660,6 +662,7 @@ class MODEL_TENSOR(IntEnum):
V_LAYER_SCALE_2 = auto()
V_PRE_NORM = auto()
V_POST_NORM = auto()
V_MM_POST_NORM = auto()
V_MM_INP_NORM = auto()
V_MM_INP_PROJ = auto() # gemma3
V_MM_SOFT_EMB_NORM = auto() # gemma3
@ -799,6 +802,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.JAIS: "jais",
MODEL_ARCH.NEMOTRON: "nemotron",
MODEL_ARCH.NEMOTRON_H: "nemotron_h",
MODEL_ARCH.NEMOTRON_H_MOE: "nemotron_h_moe",
MODEL_ARCH.EXAONE: "exaone",
MODEL_ARCH.EXAONE4: "exaone4",
MODEL_ARCH.GRANITE: "granite",
@ -1027,6 +1031,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}",
MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
MODEL_TENSOR.V_ENC_EMBD_NORM: "v.norm_embd",
MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv",
MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
@ -1045,6 +1050,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
MODEL_TENSOR.V_POST_NORM: "v.post_ln",
MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm",
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm",
@ -1118,6 +1124,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_MMPROJ_PEG,
MODEL_TENSOR.V_ENC_EMBD_CLS,
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_NORM,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_ATTN_QKV,
@ -1136,6 +1143,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_LAYER_SCALE_2,
MODEL_TENSOR.V_PRE_NORM,
MODEL_TENSOR.V_POST_NORM,
MODEL_TENSOR.V_MM_POST_NORM,
MODEL_TENSOR.V_MM_INP_PROJ,
MODEL_TENSOR.V_MM_INP_NORM,
MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
@ -2571,6 +2579,33 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.NEMOTRON_H_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.SSM_IN,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_DT,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_NORM,
MODEL_TENSOR.SSM_OUT,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
# experts
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
# shared expert
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
MODEL_ARCH.EXAONE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@ -3371,6 +3406,7 @@ class VisionProjectorType:
COGVLM = "cogvlm"
JANUS_PRO = "janus_pro"
LFM2A = "lfm2a" # audio
GLM4V = "glm4v"
# Items here are (block size, type size)

View File

@ -379,6 +379,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.gate", # lfm2moe
"model.layers.{bid}.mlp.router.gate", # afmoe
"layers.{bid}.gate", # mistral-large
"backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@ -392,6 +393,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.expert_bias", # afmoe
"model.layers.{bid}.feed_forward.expert_bias", # lfm2moe
"model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
"backbone.layers.{bid}.mixer.gate.e_score_correction" # nemotron-h-moe
),
# Feed-forward up
@ -440,7 +442,7 @@ class TensorNameMap:
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe, nemotron-h-moe (merged)
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.up_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
@ -454,6 +456,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.down_proj",
"model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
"layers.{bid}.shared_experts.w3", # mistral-large
"backbone.layers.{bid}.mixer.shared_experts.up_proj", # nemotron-h-moe
),
MODEL_TENSOR.FFN_UP_CHEXP: (
@ -548,7 +551,7 @@ class TensorNameMap:
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe nemotron-h-moe (merged)
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.down_proj", # llama4
@ -563,6 +566,7 @@ class TensorNameMap:
"model.layers.{bid}.shared_mlp.output_linear", # granitemoe
"model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan
"layers.{bid}.shared_experts.w2", # mistral-large
"backbone.layers.{bid}.mixer.shared_experts.down_proj", # nemotron-h-moe
),
MODEL_TENSOR.FFN_DOWN_CHEXP: (
@ -706,6 +710,7 @@ class TensorNameMap:
"model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.dt_proj", # plamo2
"model.layers.{bid}.linear_attn.dt_proj", # qwen3next
"backbone.layers.{bid}.mixer.dt", # nemotron-h-moe
),
MODEL_TENSOR.SSM_DT_NORM: (
@ -1215,6 +1220,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MMPROJ_FC: (
"model.connector.modality_projection.proj", # SmolVLM
"model.vision.linear_proj.linear_proj", # cogvlm
"visual.merger.proj", # glm4v
),
MODEL_TENSOR.V_MMPROJ_MLP: (
@ -1248,6 +1254,10 @@ class TensorNameMap:
"model.vision.patch_embedding.proj", # cogvlm
),
MODEL_TENSOR.V_ENC_EMBD_NORM: (
"visual.post_conv_layernorm", # glm4v
),
MODEL_TENSOR.V_ENC_EMBD_POS: (
"vision_tower.vision_model.embeddings.position_embedding",
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
@ -1257,6 +1267,7 @@ class TensorNameMap:
"vision_tower.patch_embed.pos_emb", # kimi-vl
"visual.pos_embed", # qwen3vl
"model.vision.patch_embedding.position_embedding", # cogvlm
"visual.embeddings.position_embedding", # glm4v
),
MODEL_TENSOR.V_ENC_ATTN_QKV: (
@ -1412,6 +1423,11 @@ class TensorNameMap:
"vision_model.layernorm_post", # llama4
"visual.merger.ln_q", # qwen2vl
"vision_tower.encoder.final_layernorm", # kimi-vl
"visual.post_layernorm", # glm4v
),
MODEL_TENSOR.V_MM_POST_NORM: (
"visual.merger.post_projection_norm", # glm4v
),
MODEL_TENSOR.V_MM_INP_PROJ: (
@ -1481,6 +1497,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_PATCH_MERGER: (
"multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf
"patch_merger.merging_layer", # mistral
"visual.downsample", # glm4v
),
MODEL_TENSOR.V_DS_NORM: (
@ -1501,14 +1518,17 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_UP: (
"model.vision.linear_proj.dense_h_to_4h", # cogvlm
"visual.merger.up_proj", # glm4v
),
MODEL_TENSOR.V_MM_DOWN: (
"model.vision.linear_proj.dense_4h_to_h", # cogvlm
"visual.merger.down_proj", # glm4v
),
MODEL_TENSOR.V_MM_GATE: (
"model.vision.linear_proj.gate_proj", # cogvlm
"visual.merger.gate_proj", # glm4v
),
MODEL_TENSOR.V_TOK_BOI: (

View File

@ -0,0 +1,204 @@
{% macro render_extra_keys(json_dict, handled_keys) %}
{%- if json_dict is mapping %}
{%- for json_key in json_dict if json_key not in handled_keys %}
{%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}
{{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}
{%- else %}
{{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}
{%- endif %}
{%- endfor %}
{%- endif %}
{% endmacro %}
{%- set enable_thinking = enable_thinking if enable_thinking is defined else True %}
{%- set truncate_history_thinking = truncate_history_thinking if truncate_history_thinking is defined else True %}
{%- set ns = namespace(last_user_idx = -1) %}
{%- set loop_messages = messages %}
{%- for m in loop_messages %}
{%- if m["role"] == "user" %}
{%- set ns.last_user_idx = loop.index0 %}
{%- endif %}
{%- endfor %}
{%- if messages[0]["role"] == "system" %}
{%- set system_message = messages[0]["content"] %}
{%- set loop_messages = messages[1:] %}
{%- else %}
{%- set system_message = "" %}
{%- set loop_messages = messages %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = [] %}
{%- endif %}
{# Recompute last_user_idx relative to loop_messages after handling system #}
{%- set ns = namespace(last_user_idx = -1) %}
{%- for m in loop_messages %}
{%- if m["role"] == "user" %}
{%- set ns.last_user_idx = loop.index0 %}
{%- endif %}
{%- endfor %}
{%- if system_message is defined %}
{{- "<|im_start|>system\n" + system_message }}
{%- else %}
{%- if tools is iterable and tools | length > 0 %}
{{- "<|im_start|>system\n" }}
{%- endif %}
{%- endif %}
{%- if tools is iterable and tools | length > 0 %}
{%- if system_message is defined and system_message | length > 0 %}
{{- "\n\n" }}
{%- endif %}
{{- "# Tools\n\nYou have access to the following functions:\n\n" }}
{{- "<tools>" }}
{%- for tool in tools %}
{%- if tool.function is defined %}
{%- set tool = tool.function %}
{%- endif %}
{{- "\n<function>\n<name>" ~ tool.name ~ "</name>" }}
{%- if tool.description is defined %}
{{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }}
{%- endif %}
{{- '\n<parameters>' }}
{%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}
{%- for param_name, param_fields in tool.parameters.properties|items %}
{{- '\n<parameter>' }}
{{- '\n<name>' ~ param_name ~ '</name>' }}
{%- if param_fields.type is defined %}
{{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }}
{%- endif %}
{%- if param_fields.description is defined %}
{{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }}
{%- endif %}
{%- if param_fields.enum is defined %}
{{- '\n<enum>' ~ (param_fields.enum | tojson | safe) ~ '</enum>' }}
{%- endif %}
{%- set handled_keys = ['name', 'type', 'description', 'enum'] %}
{{- render_extra_keys(param_fields, handled_keys) }}
{{- '\n</parameter>' }}
{%- endfor %}
{%- endif %}
{% set handled_keys = ['type', 'properties', 'required'] %}
{{- render_extra_keys(tool.parameters, handled_keys) }}
{%- if tool.parameters is defined and tool.parameters.required is defined %}
{{- '\n<required>' ~ (tool.parameters.required | tojson | safe) ~ '</required>' }}
{%- endif %}
{{- '\n</parameters>' }}
{%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}
{{- render_extra_keys(tool, handled_keys) }}
{{- '\n</function>' }}
{%- endfor %}
{{- "\n</tools>" }}
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
{%- endif %}
{%- if system_message is defined %}
{{- '<|im_end|>\n' }}
{%- else %}
{%- if tools is iterable and tools | length > 0 %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- for message in loop_messages %}
{%- if message.role == "assistant" %}
{# Add reasoning content in to content field for unified processing below. #}
{%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %}
{%- set content = "<think>\n" ~ message.reasoning_content ~ "\n</think>\n" ~ (message.content | default('', true)) %}
{%- else %}
{%- set content = message.content | default('', true) %}
{%- if content is string -%}
{# Allow downstream logic to take care of broken thinking; only handle coherent reasoning here. #}
{%- if '<think>' not in content and '</think>' not in content -%}
{%- set content = "<think></think>" ~ content -%}
{%- endif -%}
{%- else -%}
{%- set content = content -%}
{%- endif -%}
{%- endif %}
{%- if message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}
{# Assistant message has tool calls. #}
{{- '<|im_start|>assistant\n' }}
{%- set include_content = not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %}
{%- if content is string and content | trim | length > 0 %}
{%- if include_content %}
{{- (content | trim) ~ '\n' -}}
{%- else %}
{%- set c = (content | string) %}
{%- if '</think>' in c %}
{# Keep only content after the last closing think tag; the generation prompt can also produce this case. #}
{%- set c = c.split('</think>')[-1] %}
{%- elif '<think>' in c %}
{# If <think> was opened but never closed, drop the trailing think segment #}
{%- set c = c.split('<think>')[0] %}
{%- endif %}
{%- set c = "<think></think>" ~ c | trim %}
{%- if c | length > 0 %}
{{- c ~ '\n' -}}
{%- endif %}
{%- endif %}
{%- else %}
{{- "<think></think>" -}}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<tool_call>\n<function=' ~ tool_call.name ~ '>\n' -}}
{%- if tool_call.arguments is defined %}
{%- for args_name, args_value in tool_call.arguments|items %}
{{- '<parameter=' ~ args_name ~ '>\n' -}}
{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
{{- args_value ~ '\n</parameter>\n' -}}
{%- endfor %}
{%- endif %}
{{- '</function>\n</tool_call>\n' -}}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- else %}
{# Assistant message doesn't have tool calls. #}
{%- if not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %}
{{- '<|im_start|>assistant\n' ~ (content | default('', true) | string | trim) ~ '<|im_end|>\n' }}
{%- else %}
{%- set c = (content | default('', true) | string) %}
{%- if '<think>' in c and '</think>' in c %}
{%- set c = "<think></think>" ~ c.split('</think>')[-1] %}
{%- endif %}
{%- set c = c | trim %}
{%- if c | length > 0 %}
{{- '<|im_start|>assistant\n' ~ c ~ '<|im_end|>\n' }}
{%- else %}
{{- '<|im_start|>assistant\n<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endif %}
{%- elif message.role == "user" or message.role == "system" %}
{{- '<|im_start|>' + message.role + '\n' }}
{%- set content = message.content | string %}
{{- content }}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.previtem and loop.previtem.role != "tool" %}
{{- '<|im_start|>user\n' }}
{%- endif %}
{{- '<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>\n' }}
{%- if not loop.last and loop.nextitem.role != "tool" %}
{{- '<|im_end|>\n' }}
{%- elif loop.last %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{%- if enable_thinking %}
{{- '<|im_start|>assistant\n<think>\n' }}
{%- else %}
{{- '<|im_start|>assistant\n<think></think>' }}
{%- endif %}
{%- endif %}

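For a quick local sanity check, the template above can be rendered with stock Jinja2 before wiring it into the C++ side; llama.cpp renders these with its own template engine, so treat this purely as an inspection aid. A minimal sketch, assuming Jinja2 >= 3.1 (for the `items` filter) and HF-style whitespace settings; the template path matches the one used by the parser tests later in this diff, while the example tool and messages are illustrative only:

```python
# Inspection-only sketch: render the chat template with plain Jinja2 to eyeball
# the system/tool preamble and the generation prompt.
import jinja2

env = jinja2.Environment(
    loader=jinja2.FileSystemLoader("models/templates"),
    trim_blocks=True,     # HF-style whitespace handling; output may differ
    lstrip_blocks=True,   # slightly from llama.cpp's own template engine
)
tmpl = env.get_template("NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja")

messages = [{"role": "user", "content": "What is the weather in Paris?"}]
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",                                  # hypothetical tool
        "description": "Look up the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

print(tmpl.render(messages=messages, tools=tools, add_generation_prompt=True))
```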
File diff suppressed because it is too large

View File

@ -3,6 +3,7 @@
#include "ggml.h" // ggml_op
#include <string>
#include <set>
//
// gguf constants (sync with gguf.py)
@ -79,6 +80,7 @@ enum llm_arch {
LLM_ARCH_JAIS,
LLM_ARCH_NEMOTRON,
LLM_ARCH_NEMOTRON_H,
LLM_ARCH_NEMOTRON_H_MOE,
LLM_ARCH_EXAONE,
LLM_ARCH_EXAONE4,
LLM_ARCH_RWKV6,
@ -315,6 +317,7 @@ enum llm_tensor {
LLM_TENSOR_DENSE_3_OUT,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ROPE_FACTORS_LONG,
LLM_TENSOR_ROPE_FACTORS_SHORT,
@ -525,6 +528,10 @@ struct LLM_TN_IMPL {
const int bid;
const int xid;
const std::set<llm_tensor> model_tensors;
LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
std::string str() const;
operator std::string() const {
@ -546,11 +553,11 @@ struct LLM_TN {
llm_arch arch;
LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
return { arch, tensor, suffix, bid, xid };
return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
}
LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
return { arch, tensor, nullptr, bid, xid };
return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
}
};

View File

@ -254,6 +254,24 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
}
}
bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= s_copy->ne[0] == mctx->get_n_rs();
res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
res &= head == mctx->get_head();
res &= rs_z == mctx->get_rs_z();
return res;
}
void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
GGML_UNUSED(ubatch);
@ -461,8 +479,46 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
inp_attn->set_input(ubatch);
inp_rs->set_input(ubatch);
mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
const int64_t n_rs = mctx->get_recr()->get_n_rs();
if (inp_rs->s_copy) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for (uint32_t i = 0; i < n_rs; ++i) {
data[i] = mctx->get_recr()->s_copy(i);
}
}
}
bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
return res;
}
//
@ -1089,6 +1145,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cur = ggml_relu(ctx0, cur);
cb(cur, "ffn_moe_relu", il);
} break;
case LLM_FFN_RELU_SQR:
if (gate_exps) {
// TODO: add support for gated squared relu
GGML_ABORT("fatal error: gated squared relu not implemented");
} else {
cur = ggml_relu(ctx0, cur);
cur = ggml_sqr(ctx0, cur);
cb(cur, "ffn_moe_relu_sqr", il);
} break;
default:
GGML_ABORT("fatal error");
}
@ -1841,6 +1906,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
inp->head = mctx_cur->get_head();
inp->rs_z = mctx_cur->get_rs_z();
return inp;
}
@ -1909,10 +1977,10 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
}

View File

@ -225,6 +225,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * s_copy; // I32 [n_rs]
// views of s_copy, computed once per graph
@ -233,6 +235,10 @@ public:
ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
const llama_memory_recurrent_context * mctx;
// used in view offsets, need to match for valid graph reuse
uint32_t head;
int32_t rs_z;
};
class llm_graph_input_cross_embd : public llm_graph_input_i {
@ -365,22 +371,28 @@ public:
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
public:
llm_graph_input_mem_hybrid(
const llama_cparams & cparams,
std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
std::unique_ptr<llm_graph_input_rs> inp_rs,
const llama_memory_hybrid_context * mctx) :
std::unique_ptr<llm_graph_input_rs> inp_rs,
const llama_memory_hybrid_context * mctx) :
inp_attn(std::move(inp_attn)),
inp_rs(std::move(inp_rs)),
cparams(cparams),
mctx(mctx) { }
virtual ~llm_graph_input_mem_hybrid() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
std::unique_ptr<llm_graph_input_rs> inp_rs;
llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
const llama_cparams cparams;
const llama_memory_hybrid_context * mctx;
};

View File

@ -2,6 +2,7 @@
#include "ggml.h"
#include <algorithm>
#include <cassert>
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
@ -230,3 +231,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
return false;
}
bool llama_hparams::use_mrope() const {
return rope_sections[0] > 0 && rope_sections[1] > 0;
}

View File

@ -270,6 +270,8 @@ struct llama_hparams {
// TODO: think of a better place for this function
// TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
bool use_mrope() const;
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

View File

@ -222,7 +222,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}

View File

@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_16B_A1B: return "16B.A1B";
case LLM_TYPE_21B_A3B: return "21B.A3B";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
case LLM_TYPE_80B_A3B: return "80B.A3B";
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
@ -1688,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_GLM4:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
switch (hparams.n_layer) {
case 40: type = LLM_TYPE_9B; break;
case 61: type = LLM_TYPE_32B; break;
@ -1697,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_GLM4_MOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
// MoE parameters
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
@ -1797,6 +1800,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
} break;
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
{
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
@ -1812,7 +1816,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
switch (hparams.n_layer) {
case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
case 56: type = LLM_TYPE_9B; break;
default: type = LLM_TYPE_UNKNOWN;
}
@ -5159,6 +5170,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
} break;
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
{
// mamba2 Mixer SSM params
// NOTE: int64_t for tensor dimensions
@ -5169,6 +5181,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp;
// embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -5218,12 +5233,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
} else {
// mlp layers
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
} else {
if (n_expert != 0) {
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
// MoE branch
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
// Shared expert branch
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
} else {
// mlp layers
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
}
}
}
} break;
@ -6850,7 +6879,8 @@ void llama_model::print_info() const {
arch == LLM_ARCH_PLAMO2 ||
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_QWEN3NEXT ||
arch == LLM_ARCH_NEMOTRON_H) {
arch == LLM_ARCH_NEMOTRON_H ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@ -6905,7 +6935,8 @@ void llama_model::print_info() const {
if (arch == LLM_ARCH_MINICPM ||
arch == LLM_ARCH_GRANITE ||
arch == LLM_ARCH_GRANITE_MOE ||
arch == LLM_ARCH_GRANITE_HYBRID) {
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@ -7086,7 +7117,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
if (arch == LLM_ARCH_FALCON_H1) {
filter_attn = [&](int32_t) { return true; };
filter_recr = [&](int32_t) { return true; };
} else if (arch == LLM_ARCH_NEMOTRON_H) {
} else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
filter_attn = [&](int32_t il) {
return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
};
@ -7457,6 +7488,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
llm = std::make_unique<llm_build_nemotron>(*this, params);
} break;
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
{
llm = std::make_unique<llm_build_nemotron_h>(*this, params);
} break;
@ -7741,6 +7773,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_ARWKV7:
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
return LLAMA_ROPE_TYPE_NONE;
// use what we call a normal RoPE, operating on pairs of consecutive head values
@ -7761,7 +7794,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_PLM:
case LLM_ARCH_CHATGLM:
case LLM_ARCH_GLM4:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
@ -7823,7 +7855,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_LFM2:
case LLM_ARCH_LFM2MOE:
case LLM_ARCH_SMALLTHINKER:
case LLM_ARCH_GLM4_MOE:
case LLM_ARCH_SEED_OSS:
case LLM_ARCH_GROVEMOE:
case LLM_ARCH_APERTUS:
@ -7840,6 +7871,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_QWEN3VLMOE:
return LLAMA_ROPE_TYPE_IMROPE;
case LLM_ARCH_GLM4:
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
case LLM_ARCH_GLM4_MOE:
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");

View File

@ -113,6 +113,7 @@ enum llm_type {
LLM_TYPE_16B_A1B,
LLM_TYPE_21B_A3B, // Ernie MoE small
LLM_TYPE_30B_A3B,
LLM_TYPE_31B_A3_5B,
LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B,
LLM_TYPE_106B_A12B, // GLM-4.5-Air

View File

@ -241,6 +241,13 @@ static void llama_params_fit_impl(
global_surplus += memory_reduction;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (global_surplus >= 0) {
if (nd == 1) {
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
return;
}
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
}
} else {
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
__func__, hp_nct, n_ctx_min);
@ -249,10 +256,6 @@ static void llama_params_fit_impl(
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
}
}
if (global_surplus >= 0) {
LLAMA_LOG_INFO("%s: entire model can be fit across devices by reducing context\n", __func__);
return;
}
}
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {

View File

@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
bool use_mrope = hparams.use_mrope();
if (ubatch.embd && !use_mrope) {
// unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
}
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
if (use_mrope) {
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
} else {
// Normal RoPE
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);

View File

@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
bool use_mrope = hparams.use_mrope();
if (ubatch.embd && !use_mrope) {
// unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
}
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
if (use_mrope) {
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
} else {
// Normal RoPE
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);

View File

@ -441,23 +441,13 @@ private:
ggml_tensor * cur,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il);
ggml_tensor * build_layer_ffn(
ggml_tensor * cur,
int il);
ggml_tensor * build_delta_net_recurrent(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
int il);
ggml_tensor * build_delta_net_chunking(
ggml_tensor * q,
ggml_tensor * k,
@ -467,8 +457,18 @@ private:
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il);
ggml_tensor * build_delta_net_autoregressive(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
int il);
ggml_tensor * build_norm_gated(
ggml_tensor * input,
ggml_tensor * weights,

View File

@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
}
ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
if (model.layers[il].ffn_gate_inp == nullptr) {
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
ggml_tensor * ffn_inp = cur;
ggml_tensor * moe_out =
build_moe_ffn(ffn_inp,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
nullptr, // no gate
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
cb(moe_out, "ffn_moe_out", il);
ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
model.layers[il].ffn_up_shexp, NULL, NULL,
NULL /* no gate */ , NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
}
cur = build_cvec(cur, il);
cb(cur, "l_out", il);

View File

@ -17,13 +17,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
ggml_tensor * inp_out_ids = build_inp_out_ids();
ggml_tensor * causal_mask =
ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f),
ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
GGML_TRI_TYPE_LOWER);
ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f));
ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
ggml_build_forward_expand(gf, causal_mask);
ggml_build_forward_expand(gf, identity);
ggml_build_forward_expand(gf, diag_mask);
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
@ -34,7 +36,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
// Determine layer type and build appropriate attention mechanism
if (hparams.is_recurrent(il)) {
// Linear attention layer (gated delta net)
cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, il);
cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
} else {
// Full attention layer
cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
@ -93,14 +95,8 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il) {
GGML_ASSERT(ggml_is_contiguous(q));
GGML_ASSERT(ggml_is_contiguous(k));
GGML_ASSERT(ggml_is_contiguous(v));
GGML_ASSERT(ggml_is_contiguous(g));
GGML_ASSERT(ggml_is_contiguous(beta));
GGML_ASSERT(ggml_is_contiguous(state));
const int64_t S_k = q->ne[0];
const int64_t H_k = q->ne[1];
const int64_t n_tokens = q->ne[2];
@ -120,15 +116,10 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
// TODO: can this ever be false?
const bool use_qk_l2norm = true;
const float eps_norm = hparams.f_norm_rms_eps;
if (use_qk_l2norm) {
const float eps_norm = hparams.f_norm_rms_eps;
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
}
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
const float scale = 1.0f / sqrtf(S_v);
@ -136,8 +127,6 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
beta = ggml_sigmoid(ctx0, beta);
ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
cb(q, "q_in", il);
cb(k, "k_in", il);
cb(v, "v_in", il);
@ -188,36 +177,21 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
cb(v_beta, "v_beta", il);
cb(k_beta, "k_beta", il);
ggml_tensor * chunked_mask =
ggml_view_4d(ctx0, causal_mask, chunk_size,
chunk_size, causal_mask->ne[2], causal_mask->ne[3],
causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0);
q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * chunked_diag_mask =
ggml_view_4d(ctx0, causal_diag_mask, chunk_size,
chunk_size, causal_diag_mask->ne[2], causal_diag_mask->ne[3],
causal_diag_mask->nb[1], causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0);
ggml_tensor * chunked_identity =
ggml_view_4d(ctx0, identity, chunk_size,
chunk_size, identity->ne[2], identity->ne[3],
identity->nb[1], identity->nb[2], identity->nb[3], 0);
q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
v = ggml_cont_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
g = ggml_cont_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
cb(g_cumsum, "g_cumsum", il);
ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j_broadcast =
ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
@ -226,23 +200,23 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
cb(decay_mask, "decay_mask", il);
decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
decay_mask = ggml_exp(ctx0, decay_mask);
decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, chunked_mask));
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
cb(attn, "attn_pre_solve", il);
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, chunked_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower);
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_mul(ctx0, lin_solve, chunked_mask);
attn = ggml_add(ctx0, attn, chunked_identity);
attn = ggml_mul(ctx0, lin_solve, causal_mask);
attn = ggml_add(ctx0, attn, identity);
cb(attn, "attn_solved", il);
@ -291,7 +265,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
attn = ggml_mul_mat(ctx0, k_chunk, q_chunk);
attn = ggml_mul(ctx0, attn, decay_mask_chunk);
attn = ggml_mul(ctx0, attn, ggml_add(ctx0, chunked_identity, chunked_mask));
attn = ggml_mul(ctx0, attn, diag_mask);
ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
@ -361,23 +335,14 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
return ggml_concat(ctx0, flat_output, flat_state, 0);
}
ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
int il) {
GGML_ASSERT(ggml_is_contiguous(q));
GGML_ASSERT(ggml_is_contiguous(k));
GGML_ASSERT(ggml_is_contiguous(v));
GGML_ASSERT(ggml_is_contiguous(g));
GGML_ASSERT(ggml_is_contiguous(beta));
GGML_ASSERT(ggml_is_contiguous(state));
const int64_t S_k = q->ne[0];
const int64_t H_k = q->ne[1];
const int64_t n_tokens = q->ne[2];
@ -386,6 +351,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
const int64_t S_v = v->ne[0];
const int64_t H_v = v->ne[1];
GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
GGML_ASSERT(v->ne[2] == n_tokens);
GGML_ASSERT(k->ne[2] == n_tokens);
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
@ -397,215 +363,65 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
// TODO: can this ever be false?
const bool use_qk_l2norm = true;
const float eps_norm = hparams.f_norm_rms_eps;
if (use_qk_l2norm) {
const float eps_norm = hparams.f_norm_rms_eps;
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
}
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
const float scale = 1.0f / sqrtf(S_v);
q = ggml_scale(ctx0, q, scale);
q = ggml_scale(ctx0, q, scale);
beta = ggml_sigmoid(ctx0, beta);
ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
cb(q, "q_in", il);
cb(k, "k_in", il);
cb(v, "v_in", il);
cb(beta, "beta_in", il);
cb(g, "g_in", il);
q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
cb(q, "q_perm", il);
cb(k, "k_perm", il);
cb(v, "v_perm", il);
cb(beta, "beta_perm", il);
cb(g, "g_perm", il);
cb(state, "state_in", il);
ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
// Apply exponential to g_t
g_t = ggml_exp(ctx0, g_t);
ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
// Apply the gated delta rule for the single timestep
// last_recurrent_state = last_recurrent_state * g_t
state = ggml_mul(ctx0, state, g_t);
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
// kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
// we need to sum over dim=-2, so we transpose, sum, then transpose again
kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
cb(k_beta, "k_beta", il);
cb(v_beta, "v_beta", il);
cb(g_cumsum, "g_cumsum", il);
// v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
// delta = (v_t - kv_mem) * beta_t
ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, n_tokens, 1, H_v, n_seqs); // [chunk_size, 1, n_tokens, n_seqs]
ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, n_tokens, H_v, n_seqs); // [1, chunk_size, n_tokens, n_seqs]
// last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
state = ggml_add(ctx0, state, k_t_delta);
// Broadcast both tensors to [chunk_size, chunk_size, H_v, n_seqs]
// ggml_tensor * gcs_i_broadcast =
// ggml_repeat_4d(ctx0, gcs_i, GGML_DELTA_NET_CHUNK, GGML_DELTA_NET_CHUNK, num_chunks * H_v,
// n_seqs); // [chunk_size, 1, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]
// Don't need this, this one will get auto-broadcast
ggml_tensor * gcs_j_broadcast =
ggml_repeat_4d(ctx0, gcs_j, n_tokens, n_tokens, H_v, n_seqs); // [1, chunk_size, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]
ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
// Apply lower triangular mask to ensure attention is causal (only past tokens influence current)
decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
// Apply exponential to get the decay mask values
decay_mask = ggml_exp(ctx0, decay_mask);
// Apply lower triangular mask again to ensure only lower triangular values remain
decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
cb(decay_mask, "decay_mask", il);
// attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
cb(kmulkbeta, "kmulkbeta", il);
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
cb(attn, "attn_pre_rec", il);
// for i in range(1, chunk_size):
// row = attn[..., i, :i].clone()
// sub = attn[..., :i, :i].clone()
// attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
// attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
//
// We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A)
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_mul(ctx0, lin_solve, causal_mask);
attn = ggml_add(ctx0, attn, identity);
// value = attn @ v_beta
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
cb(v, "value_beta", il);
// k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
cb(gexp, "g_cum_exp", il);
ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
cb(kbeta_gexp, "kbeta_gexp", il);
ggml_tensor * k_cumdecay =
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
cb(k_cumdecay, "k_cumdecay", il);
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
attn = ggml_mul_mat(ctx0, k, q);
attn = ggml_mul(ctx0, attn, decay_mask);
attn = ggml_mul(ctx0, attn, ggml_add(ctx0, identity, causal_mask));
cb(attn, "attn_decay_key", il);
ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
// v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay);
cb(v_prime, "v_prime", il);
// v_new = v_i - v_prime
ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v, v_prime), v_prime);
ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
cb(v_new, "v_new", il);
// attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
ggml_tensor * q_g_exp = ggml_mul(ctx0, q, gexp);
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
cb(attn_inter, "attn_inter", il);
// core_attn_out[:, :, i] = attn_inter + attn @ v_new
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);
cb(v_attn, "v_attn", il);
ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn);
cb(core_attn_out, "core_attn_out", il);
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
// g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
// key_gdiff = key * g_diff.unsqueeze(-1)
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * g_cum_last =
ggml_cont(ctx0, ggml_view_4d(ctx0, g_cumsum_t, g_cumsum_t->ne[0], 1, g_cumsum_t->ne[2], g_cumsum_t->ne[3],
g_cumsum_t->nb[1], g_cumsum_t->nb[2], g_cumsum_t->nb[3],
g_cumsum_t->nb[0] * (g_cumsum_t->ne[1] - 1)));
cb(g_cum_last, "g_cum_last", il);
ggml_tensor * gexp_last =
ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);
cb(gexp_last, "gexp_last", il);
ggml_tensor * g_cum_last_3d =
ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);
cb(g_cum_last_3d, "g_cum_last_3d", il);
ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cumsum, g_cumsum->ne[0], g_cumsum->ne[2], g_cumsum->ne[3]);
cb(g_cumsum_3d, "g_cumsum_3d", il);
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));
cb(g_diff, "g_diff", il);
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
cb(g_diff_exp, "g_diff_exp", il);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k,
ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
g_diff_exp->ne[2] * g_diff_exp->ne[3]));
cb(key_gdiff, "key_gdiff", il);
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));
cb(kgdmulvnew, "kgdmulvnew", il);
state = ggml_add(ctx0, ggml_mul(ctx0, state, gexp_last), kgdmulvnew);
// Compute the attention output
// core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
// again, since it's over dim = -2, transpose, sum, transpose back
ggml_tensor * core_attn_out =
ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
// core_attn_out should be [S_v, 1, H_v, n_seqs] after this
cb(core_attn_out, "output_tokens", il);
cb(state, "new_state", il);
// flatten output
ggml_tensor * flat_output =
ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs);
ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
// flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise
ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs);
ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
return ggml_concat(ctx0, flat_output, flat_state, 0);
}
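
For readers following the graph construction above, a plain per-head restatement of the single-token gated delta rule it implements (matching the PyTorch-style comments in the code) may help. This is a readability sketch only, not the ggml layout: it assumes the state is stored as (d_k, d_v), that `beta` arrives pre-sigmoid and `g` as a log-decay, and it glosses over the exact epsilon placement of `ggml_l2_norm`.

```python
import numpy as np

def gated_delta_step(S, q, k, v, g, beta_logit, eps=1e-6):
    """One gated-delta-rule step for a single head and a single token (sketch).

    S          : (d_k, d_v) recurrent state carried between tokens
    q, k       : (d_k,) query / key for this token
    v          : (d_v,) value for this token
    g          : scalar log-decay; exp(g) acts as the forget factor
    beta_logit : scalar gate before the sigmoid
    """
    q = q / (np.linalg.norm(q) + eps)         # qk L2 norm, cf. ggml_l2_norm
    k = k / (np.linalg.norm(k) + eps)
    q = q / np.sqrt(v.shape[0])               # 1/sqrt(S_v) scaling
    beta = 1.0 / (1.0 + np.exp(-beta_logit))  # ggml_sigmoid

    S = S * np.exp(g)                         # decay the previous state
    kv_mem = k @ S                            # what the state predicts for this key
    delta  = (v - kv_mem) * beta              # gated correction toward the new value
    S = S + np.outer(k, delta)                # rank-1 state update
    out = q @ S                               # read-out for the query
    return out, S
```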
@ -712,6 +528,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * cur,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il) {
const auto * mctx_cur = inp->mctx;
@ -737,11 +554,11 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(mixed_ba, "linear_attn_mixed_ba", il);
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
ggml_tensor * mixed_qkvz_reshaped = ggml_cont_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
ggml_tensor * mixed_ba_reshaped = ggml_cont_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Split mixed_ba into b and a (beta and alpha parameters)
int64_t split_sizes_ba[2] = {
@ -762,8 +579,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * beta = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
GGML_ASSERT(ggml_nelements(beta) + ggml_nelements(alpha) == ggml_nelements(mixed_ba));
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
cb(alpha_softplus, "a_softplus", il);
@ -799,9 +614,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
cb(z, "z", il);
GGML_ASSERT(ggml_nelements(query) + ggml_nelements(key) + ggml_nelements(value) + ggml_nelements(z) ==
ggml_nelements(mixed_qkvz));
// After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
// query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
@ -925,10 +737,13 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(k_conv, "k_conv_predelta", il);
cb(v_conv, "v_conv_predelta", il);
// Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ?
build_delta_net_chunking (q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il) :
build_delta_net_recurrent(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il);
// Choose between build_delta_net_autoregressive and build_delta_net_chunking based on n_tokens
ggml_tensor * attn_out;
if (n_seq_tokens == 1) {
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
} else {
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
}
cb(attn_out, "attn_out", il);
// The tensors were concatenated 1d, so we need to extract them 1d as well

View File

@ -3588,6 +3588,163 @@ static void test_template_output_peg_parsers() {
t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})";
});
}
{
// NVIDIA Nemotron-3 Nano
auto tmpls = read_templates("models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja");
// Test basic message
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input = "Hello, world!\nWhat's up?";
t.expect = message_assist;
});
// Test basic message and reasoning with reasoning_format = none
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
t.expect.content = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
});
// Test basic message and reasoning with reasoning_format = auto
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
t.params.enable_thinking = true;
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.expect = message_assist_thoughts;
});
// Test tool call
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"<tool_call>\n"
"<function=special_function>\n"
"<parameter=arg1>\n"
"1\n"
"</parameter>\n"
"</function>\n"
"</tool_call>";
t.params.enable_thinking = false;
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.tools = {special_function_tool};
t.expect = message_assist_call;
});
// Test tool call with reasoning
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"I'm\nthinking\n</think>\n"
"<tool_call>\n"
"<function=special_function>\n"
"<parameter=arg1>\n"
"1\n"
"</parameter>\n"
"</function>\n"
"</tool_call>";
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.tools = {special_function_tool};
t.expect = message_assist_call_thoughts;
});
// Test parallel tool calls
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"<tool_call>\n"
"<function=special_function>\n"
"<parameter=arg1>\n"
"1\n"
"</parameter>\n"
"</function>\n"
"</tool_call>\n"
"<tool_call>\n"
"<function=special_function_with_opt>\n"
"<parameter=arg1>\n"
"1\n"
"</parameter>\n"
"<parameter=arg2>\n"
"2\n"
"</parameter>\n"
"</function>\n"
"</tool_call>";
t.params.enable_thinking = false;
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.parallel_tool_calls = true;
t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
t.expect.tool_calls = {{
/* .name = */ "special_function",
/* .arguments = */ R"({"arg1": 1})",
/* .id = */ {},
}, {
/* .name = */ "special_function_with_opt",
/* .arguments = */ R"({"arg1": 1, "arg2": 2})",
/* .id = */ {},
}};
});
// Test tool call with string parameter
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"<tool_call>\n"
"<function=python>\n"
"<parameter=code>\n"
"def hello():\n"
" print(\"Hello, world!\")\n"
"\n"
"hello()\n"
"</parameter>\n"
"</function>\n"
"</tool_call>";
t.params.enable_thinking = false;
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.tools = {python_tool};
t.expect.tool_calls = {{
/* .name = */ "python",
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
/* .id = */ {},
}};
});
// Test tool call with string parameter and no closing </parameter> tag
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"<tool_call>\n"
"<function=python>\n"
"<parameter=code>\n"
"def hello():\n"
" print(\"Hello, world!\")\n"
"\n"
"hello()\n"
"</function>\n"
"</tool_call>";
t.params.enable_thinking = false;
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.tools = {python_tool};
t.expect.tool_calls = {{
/* .name = */ "python",
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
/* .id = */ {},
}};
});
// Test response format
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"I need to output the invoice details in JSON\n"
"</think>\n"
R"({"amount": 123.45, "date": "2025-12-03"})";
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.json_schema = invoice_schema;
t.expect.reasoning_content = "I need to output the invoice details in JSON";
t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})";
});
}
}
static void test_msg_diffs_compute() {

View File

@ -1367,10 +1367,85 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
});
}
static void test_resolves_to_string() {
fprintf(stderr, "#\n# Testing resolves_to_string\n#\n");
auto test = [](const std::string & name, const std::string & schema_str, bool expected) {
fprintf(stderr, "- %s\n", name.c_str());
common_schema_info info;
auto schema = nlohmann::ordered_json::parse(schema_str);
info.resolve_refs(schema);
bool result = info.resolves_to_string(schema);
if (result != expected) {
fprintf(stderr, "#\n# Test '%s' failed.\n#\n", name.c_str());
fprintf(stderr, "Schema: %s\n", schema_str.c_str());
fprintf(stderr, "Expected: %s, Got: %s\n", expected ? "true" : "false", result ? "true" : "false");
assert(false);
}
};
// Basic type checks
test("type string", R"({"type": "string"})", true);
test("type integer", R"({"type": "integer"})", false);
test("type number", R"({"type": "number"})", false);
test("type boolean", R"({"type": "boolean"})", false);
test("type object", R"({"type": "object"})", false);
test("type array", R"({"type": "array"})", false);
// Type array (nullable string)
test("type array with string", R"({"type": ["string", "null"]})", true);
test("type array without string", R"({"type": ["integer", "null"]})", false);
// String-specific keywords
test("minLength implies string", R"({"minLength": 1})", true);
test("maxLength implies string", R"({"maxLength": 10})", true);
test("pattern implies string", R"({"pattern": "^[a-z]+$"})", true);
// Format
test("format date", R"({"format": "date"})", true);
test("format uuid", R"({"format": "uuid"})", true);
test("format email", R"({"format": "email"})", true);
// Const
test("const string", R"({"const": "hello"})", true);
test("const number", R"({"const": 123})", false);
// Enum
test("enum with strings", R"({"enum": ["a", "b", "c"]})", true);
test("enum with numbers", R"({"enum": [1, 2, 3]})", false);
test("enum mixed with string", R"({"enum": [1, "a", null]})", true);
// anyOf
test("anyOf with string", R"({"anyOf": [{"type": "string"}, {"type": "integer"}]})", true);
test("anyOf without string", R"({"anyOf": [{"type": "integer"}, {"type": "boolean"}]})", false);
// oneOf
test("oneOf with string", R"({"oneOf": [{"type": "string"}, {"type": "number"}]})", true);
test("oneOf without string", R"({"oneOf": [{"type": "object"}, {"type": "array"}]})", false);
// allOf - all must be strings
test("allOf all strings", R"({"allOf": [{"type": "string"}, {"minLength": 1}]})", true);
test("allOf mixed types", R"({"allOf": [{"type": "string"}, {"type": "integer"}]})", false);
// $ref
test("$ref to string",
R"({"$ref": "#/$defs/str", "$defs": {"str": {"type": "string"}}})", true);
test("$ref to integer",
R"({"$ref": "#/$defs/num", "$defs": {"num": {"type": "integer"}}})", false);
// Nested
test("nested anyOf with string",
R"({"anyOf": [{"anyOf": [{"type": "integer"}, {"type": "string"}]}, {"type": "boolean"}]})", true);
fprintf(stderr, "All resolves_to_string tests passed!\n");
}
int main() {
fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
test_resolves_to_string();
test_all("C++", [](const TestCase & tc) {
try {
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));

View File

@ -87,9 +87,6 @@ int main(int argc, char ** argv) {
common_params params;
g_params = &params;
// disable jinja by default
params.use_jinja = false;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
return 1;
}

View File

@ -16,6 +16,7 @@ add_library(mtmd
models/models.h
models/cogvlm.cpp
models/conformer.cpp
models/glm4v.cpp
models/internvl.cpp
models/kimivl.cpp
models/llama4.cpp

View File

@ -9,6 +9,8 @@
#include <vector>
#include <functional>
#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)
struct clip_graph {
const clip_model & model;
const clip_hparams & hparams;
@ -49,7 +51,7 @@ struct clip_graph {
void cb(ggml_tensor * cur0, const char * name, int il) const;
// siglip2 naflex
ggml_tensor * resize_position_embeddings();
ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE);
// build vision transformer (ViT) cgraph
// this function should cover most of the models

View File

@ -68,6 +68,7 @@
#define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename tensor with ".0" postfix for backward compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_NORM_EMBD "v.norm_embd.%s"
#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
@ -86,6 +87,10 @@
#define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s"
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MM_UP "mm.up.%s"
#define TN_MM_GATE "mm.gate.%s"
#define TN_MM_DOWN "mm.down.%s"
#define TN_MM_POST_NORM "mm.post_norm.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
@ -95,7 +100,7 @@
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
@ -177,6 +182,7 @@ enum projector_type {
PROJECTOR_TYPE_COGVLM,
PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_UNKNOWN,
};
@ -205,6 +211,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
@ -508,6 +515,8 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
}
}
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value);
//
// API used internally with mtmd
//

View File

@ -183,6 +183,8 @@ struct clip_model {
ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
ggml_tensor * patch_bias = nullptr;
ggml_tensor * position_embeddings = nullptr;
ggml_tensor * norm_embd_w = nullptr;
ggml_tensor * norm_embd_b = nullptr;
ggml_tensor * pre_ln_w = nullptr;
ggml_tensor * pre_ln_b = nullptr;
@ -197,6 +199,14 @@ struct clip_model {
ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
ggml_tensor * mm_fc_w;
ggml_tensor * mm_fc_b;
ggml_tensor * mm_ffn_up_w = nullptr;
ggml_tensor * mm_ffn_up_b = nullptr;
ggml_tensor * mm_ffn_gate_w = nullptr;
ggml_tensor * mm_ffn_gate_b = nullptr;
ggml_tensor * mm_ffn_down_w = nullptr;
ggml_tensor * mm_ffn_down_b = nullptr;
ggml_tensor * mm_post_norm_w = nullptr;
ggml_tensor * mm_post_norm_b = nullptr;
// LLaVA projection
ggml_tensor * mm_input_norm_w = nullptr;
@ -278,9 +288,10 @@ struct clip_model {
ggml_tensor * mm_input_proj_w = nullptr;
ggml_tensor * mm_soft_emb_norm_w = nullptr;
// pixtral
// pixtral, glm4v
ggml_tensor * token_embd_img_break = nullptr;
ggml_tensor * mm_patch_merger_w = nullptr;
ggml_tensor * mm_patch_merger_b = nullptr;
// ultravox / whisper encoder
ggml_tensor * conv1d_1_w = nullptr;

View File

@ -264,11 +264,11 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
}
// siglip2 naflex
ggml_tensor * clip_graph::resize_position_embeddings() {
ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
ggml_tensor * pos_embd = model.position_embeddings;
const int height = img.ny / patch_size;
const int width = img.nx / patch_size;
const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS;
const uint32_t mode = interpolation_mode;
const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
GGML_ASSERT(pos_embd);
@ -485,19 +485,14 @@ ggml_tensor * clip_graph::build_norm(
? ggml_rms_norm(ctx0, cur, norm_eps)
: ggml_norm(ctx0, cur, norm_eps);
if (mw || mb) {
cb(cur, "norm", il);
}
if (mw) {
cur = ggml_mul(ctx0, cur, mw);
if (mb) {
cb(cur, "norm_w", il);
}
cb(cur, "norm_w", il);
}
if (mb) {
cur = ggml_add(ctx0, cur, mb);
cb(cur, "norm_b", il);
}
return cur;
@ -846,6 +841,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_conformer>(ctx, img);
} break;
case PROJECTOR_TYPE_GLM4V:
{
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
} break;
default:
GGML_ABORT("missing cgraph builder");
}
@ -1159,6 +1158,14 @@ struct clip_model_loader {
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
}
} break;
case PROJECTOR_TYPE_GLM4V:
{
hparams.rope_theta = 10000.0f;
hparams.n_merge = 2; // default value for GLM4-V
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
hparams.set_limit_image_tokens(8, 4096);
hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_LLAMA4:
{
hparams.rope_theta = 10000.0f;
@ -1295,6 +1302,9 @@ struct clip_model_loader {
model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false);
model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false);
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
// layers
@ -1483,6 +1493,20 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_GLM4V:
{
model.projection = get_tensor(TN_MM_PROJECTOR);
model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight"));
model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false);
model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false);
model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight"));
model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false);
model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false);
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"));
model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias"));
} break;
case PROJECTOR_TYPE_GEMMA3:
{
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
@ -1511,8 +1535,8 @@ struct clip_model_loader {
// [IMG_BREAK] token embedding
model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
// for mistral small 3.1
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
} break;
case PROJECTOR_TYPE_LIGHTONOCR:
{
@ -1520,8 +1544,8 @@ struct clip_model_loader {
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
} break;
case PROJECTOR_TYPE_ULTRAVOX:
{
@ -1932,6 +1956,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
if (ctx_params.warmup) {
loader.warmup(*ctx_vision);
}
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
}
if (loader.has_audio) {
@ -2641,6 +2667,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
{
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
clip_image_u8 resized;
@ -2883,16 +2910,30 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->model.hparams;
const int n_total = clip_n_output_tokens(ctx, img);
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
return img->nx / (params.patch_size * 2);
const auto & proj = ctx->proj_type();
switch (proj) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
return (img->nx / params.patch_size) / 2;
default:
break;
}
return n_total;
}
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->model.hparams;
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
return img->ny / (params.patch_size * 2);
const auto & proj = ctx->proj_type();
switch (proj) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
return (img->ny / params.patch_size) / 2;
default:
break;
}
return 1;
}
@ -2949,6 +2990,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
{
// dynamic size (2 conv, so double patch size)
int x_patch = img->nx / (params.patch_size * 2);
@ -3200,6 +3242,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
{
const int merge_ratio = hparams.n_merge;
const int pw = image_size_width / patch_size;
@ -3447,7 +3490,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
// copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
if (vec != nullptr) {
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
}
return true;
}
@ -3497,6 +3542,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_4h_to_h_w->ne[1];
case PROJECTOR_TYPE_LFM2A:
return ctx->model.position_embeddings->ne[0];
case PROJECTOR_TYPE_GLM4V:
return ctx->model.mm_ffn_down_w->ne[1];
default:
GGML_ABORT("Unknown projector type");
}
@ -3513,10 +3560,11 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
}
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
bool clip_is_mrope(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL;
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
|| ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
}
bool clip_is_llava(const struct clip_ctx * ctx) {
@ -3577,3 +3625,22 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
return &ctx->model.hparams;
}
//
// API for debugging
//
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
clip_image_f32 img;
img.nx = w;
img.ny = h;
img.buf.resize(h * w * 3);
for (int i = 0; i < h * w * 3; i++) {
img.buf[i] = static_cast<float>(fill_value);
}
bool cur_debug_graph = ctx->debug_graph;
ctx->debug_graph = true;
clip_image_encode(ctx, 1, &img, nullptr);
ctx->debug_graph = cur_debug_graph;
GGML_ASSERT(img.buf.empty() && "expected, always stop here");
}

View File

@ -104,7 +104,7 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_is_gemma3(const struct clip_ctx * ctx);

120
tools/mtmd/models/glm4v.cpp Normal file
View File

@ -0,0 +1,120 @@
#include "models.h"
ggml_cgraph * clip_graph_glm4v::build() {
GGML_ASSERT(model.patch_bias != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
norm_type norm_t = NORM_TYPE_RMS;
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
n_embd, n_patches_x * n_patches_y, batch_size);
}
// add patch bias
inp = ggml_add(ctx0, inp, model.patch_bias);
cb(inp, "patch_bias", -1);
// pos-conv norm
inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);
// calculate absolute position embedding and apply
ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
learned_pos_embd = ggml_cont_4d(
ctx0, learned_pos_embd,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
learned_pos_embd = ggml_reshape_4d(
ctx0, learned_pos_embd,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
learned_pos_embd = ggml_cont_3d(
ctx0, learned_pos_embd,
n_embd, n_patches_x * n_patches_y, batch_size);
cb(learned_pos_embd, "learned_pos_embd", -1);
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
return ggml_rope_multi(
ctx0, cur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
32768, hparams.rope_theta, 1, 0, 1, 32, 1);
};
ggml_tensor * cur = build_vit(
inp, n_patches,
norm_t,
hparams.ffn_op,
learned_pos_embd,
add_pos);
cb(cur, "vit_out", -1);
// cb(ggml_sum(ctx0, cur), "vit_out_sum", -1);
// GLM4V projector
// ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130
// patch merger (downsample)
{
int n_merge = hparams.n_merge;
GGML_ASSERT(n_merge > 0);
int n_token_out = n_patches / n_merge / n_merge;
cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out]
cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1);
cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out]
cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
}
// FC projector
{
cur = ggml_mul_mat(ctx0, model.projection, cur);
// default LayerNorm (post_projection_norm)
cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
cur = ggml_gelu_erf(ctx0, cur);
cb(cur, "after_fc_proj", -1);
}
// FFN projector
{
cur = build_ffn(cur,
model.mm_ffn_up_w, model.mm_ffn_up_b,
model.mm_ffn_gate_w, model.mm_ffn_gate_b,
model.mm_ffn_down_w, model.mm_ffn_down_b,
hparams.ffn_op, -1);
cb(cur, "after_ffn_proj", -1);
// cb(ggml_sum(ctx0, cur), "merged_sum", -1);
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@ -61,3 +61,8 @@ struct clip_graph_conformer : clip_graph {
clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_glm4v : clip_graph {
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};

View File

@ -270,8 +270,6 @@ int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
params.use_jinja = false; // disable jinja by default
params.sampling.temp = 0.2; // lower temp by default for better quality
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
return 1;

View File

@ -217,7 +217,7 @@ struct mtmd_context {
void init_vision() {
GGML_ASSERT(ctx_v != nullptr);
use_mrope = clip_is_qwen2vl(ctx_v);
use_mrope = clip_is_mrope(ctx_v);
projector_type proj = clip_get_projector_type(ctx_v);
int minicpmv_version = clip_is_minicpmv(ctx_v);
@ -309,6 +309,10 @@ struct mtmd_context {
img_beg = "<|image_start|>";
img_end = "<|image_end|>";
} else if (proj == PROJECTOR_TYPE_GLM4V) {
img_beg = "<|begin_of_image|>";
img_end = "<|end_of_image|>";
}
}

View File

@ -52,7 +52,6 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
| `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
@ -67,11 +66,10 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_HOST) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
@ -150,19 +148,20 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
**Example-specific params**
**Server-specific params**
| Argument | Explanation |
| -------- | ----------- |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `--kv-unified, -kvu` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
| `-sp, --special` | special tokens output enabled (default: false) |
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-np, --parallel N` | number of server slots (default: -1, -1 = auto)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
@ -1430,7 +1429,7 @@ Model presets allow advanced users to define custom configurations using an `.in
llama-server --models-preset ./my-models.ini
```
Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layer 123` is written as `n-gpu-layer = 123`.
Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layers 123` is written as `n-gpu-layers = 123`.
Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys.
@ -1445,7 +1444,7 @@ version = 1
; string value
chat-template = chatml
; numeric value
n-gpu-layer = 123
n-gpu-layers = 123
; flag value (for certain flags, you need to use the "no-" prefix for negation)
jinja = true
; shorthand argument (for example, context size)

Binary file not shown.

View File

@ -73,12 +73,17 @@ int main(int argc, char ** argv, char ** envp) {
return 1;
}
// TODO: should we have a separate n_parallel parameter for the server?
// https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177
// TODO: this is a common configuration that is suitable for most local use cases
// however, overriding the parameters is a bit confusing - figure out something more intuitive
if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) {
LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__);
// validate batch size for embeddings
// embeddings require all tokens to be processed in a single ubatch
// see https://github.com/ggml-org/llama.cpp/issues/12836
if (params.embedding && params.n_batch > params.n_ubatch) {
LOG_WRN("%s: embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", __func__, params.n_batch, params.n_ubatch);
LOG_WRN("%s: setting n_batch = n_ubatch = %d to avoid assertion failure\n", __func__, params.n_ubatch);
params.n_batch = params.n_ubatch;
}
if (params.n_parallel < 0) {
LOG_INF("%s: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n", __func__);
params.n_parallel = 4;
params.kv_unified = true;

View File

@ -619,11 +619,12 @@ flowchart TB
### Test Types
| Type | Tool | Location | Command |
| ------------- | ------------------ | -------------------------------- | ------------------- |
| **E2E** | Playwright | `tests/e2e/` | `npm run test:e2e` |
| **Unit** | Vitest | `tests/client/`, `tests/server/` | `npm run test:unit` |
| **UI/Visual** | Storybook + Vitest | `tests/stories/` | `npm run test:ui` |
| Type | Tool | Location | Command |
| ------------- | ------------------ | ---------------- | ------------------- |
| **Unit** | Vitest | `tests/unit/` | `npm run test:unit` |
| **UI/Visual** | Storybook + Vitest | `tests/stories/` | `npm run test:ui` |
| **E2E** | Playwright | `tests/e2e/` | `npm run test:e2e` |
| **Client** | Vitest | `tests/client/` | `npm run test:unit` |
### Running Tests

View File

@ -13,12 +13,11 @@
"reset": "rm -rf .svelte-kit node_modules",
"format": "prettier --write .",
"lint": "prettier --check . && eslint .",
"test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:server -- --run && npm run test:e2e",
"test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:unit -- --run && npm run test:e2e",
"test:e2e": "playwright test",
"test:client": "vitest --project=client",
"test:server": "vitest --project=server",
"test:unit": "vitest --project=unit",
"test:ui": "vitest --project=ui",
"test:unit": "vitest",
"storybook": "storybook dev -p 6006",
"build-storybook": "storybook build",
"cleanup": "rm -rf .svelte-kit build node_modules test-results"

View File

@ -241,7 +241,7 @@
</div>
{/if}
{:else if (isText || (isPdf && pdfViewMode === 'text')) && displayTextContent}
<SyntaxHighlightedCode code={displayTextContent} {language} maxWidth="69rem" />
<SyntaxHighlightedCode code={displayTextContent} {language} maxWidth="calc(69rem - 2rem)" />
{:else if isAudio}
<div class="flex items-center justify-center p-8">
<div class="w-full max-w-md text-center">

View File

@ -24,7 +24,7 @@
MimeTypeImage,
MimeTypeText
} from '$lib/enums';
import { isIMEComposing } from '$lib/utils';
import { isIMEComposing, parseClipboardContent } from '$lib/utils';
import {
AudioRecorder,
convertToWav,
@ -191,7 +191,6 @@
if ((!message.trim() && uploadedFiles.length === 0) || disabled || isLoading) return;
// Check if model is selected first
if (!checkModelSelected()) return;
const messageToSend = message.trim();
@ -228,6 +227,31 @@
const text = event.clipboardData.getData(MimeTypeText.PLAIN);
if (text.startsWith('"')) {
const parsed = parseClipboardContent(text);
if (parsed.textAttachments.length > 0) {
event.preventDefault();
message = parsed.message;
const attachmentFiles = parsed.textAttachments.map(
(att) =>
new File([att.content], att.name, {
type: MimeTypeText.PLAIN
})
);
onFileUpload?.(attachmentFiles);
setTimeout(() => {
textareaRef?.focus();
}, 10);
return;
}
}
if (
text.length > 0 &&
pasteLongTextToFileLength > 0 &&

View File

@ -35,7 +35,7 @@
<div class="flex items-center gap-1 {className}">
<DropdownMenu.Root>
<DropdownMenu.Trigger name="Attach files">
<DropdownMenu.Trigger name="Attach files" {disabled}>
<Tooltip.Root>
<Tooltip.Trigger>
<Button

View File

@ -173,6 +173,7 @@
/>
<ModelsSelector
{disabled}
bind:this={selectorModelRef}
currentModel={conversationModel}
forceForegroundText={true}

View File

@ -1,6 +1,7 @@
<script lang="ts">
import { chatStore } from '$lib/stores/chat.svelte';
import { copyToClipboard, isIMEComposing } from '$lib/utils';
import { config } from '$lib/stores/settings.svelte';
import { copyToClipboard, isIMEComposing, formatMessageForClipboard } from '$lib/utils';
import ChatMessageAssistant from './ChatMessageAssistant.svelte';
import ChatMessageUser from './ChatMessageUser.svelte';
import ChatMessageSystem from './ChatMessageSystem.svelte';
@ -87,7 +88,9 @@
}
async function handleCopy() {
await copyToClipboard(message.content, 'Message copied to clipboard');
const asPlainText = Boolean(config().copyTextAttachmentsAsPlainText);
const clipboardContent = formatMessageForClipboard(message.content, message.extra, asPlainText);
await copyToClipboard(clipboardContent, 'Message copied to clipboard');
onCopy?.(message);
}

View File

@ -57,6 +57,11 @@
label: 'Paste long text to file length',
type: 'input'
},
{
key: 'copyTextAttachmentsAsPlainText',
label: 'Copy text attachments as plain text',
type: 'checkbox'
},
{
key: 'enableContinueGeneration',
label: 'Enable "Continue" button',
@ -109,6 +114,16 @@
key: 'disableAutoScroll',
label: 'Disable automatic scroll',
type: 'checkbox'
},
{
key: 'alwaysShowSidebarOnDesktop',
label: 'Always show sidebar on desktop',
type: 'checkbox'
},
{
key: 'autoShowSidebarOnNewChat',
label: 'Auto-show sidebar on new chat',
type: 'checkbox'
}
]
},
@ -404,7 +419,7 @@
</div>
<!-- Mobile Header with Horizontal Scrollable Menu -->
<div class="flex flex-col md:hidden">
<div class="flex flex-col pt-6 md:hidden">
<div class="border-b border-border/30 py-4">
<!-- Horizontal Scrollable Category Menu with Navigation -->
<div class="relative flex items-center" style="scroll-padding: 1rem;">

View File

@ -72,9 +72,10 @@
<div
class="code-preview-wrapper overflow-auto rounded-lg border border-border bg-muted {className}"
style="max-height: {maxHeight};"
style="max-height: {maxHeight}; max-width: {maxWidth};"
>
<pre class="m-0 overflow-x-auto p-4 max-w-[{maxWidth}]"><code class="hljs text-sm leading-relaxed"
<!-- Needs to be formatted as single line for proper rendering -->
<pre class="m-0 overflow-x-auto p-4"><code class="hljs text-sm leading-relaxed"
>{@html highlightedHtml}</code
></pre>
</div>

View File

@ -179,51 +179,37 @@
});
});
// Handle open/close changes for the model selector popover or the model dialog, depending on
// whether the server is in router mode.
function handleOpenChange(open: boolean) {
if (loading || updating) return;
if (open) {
isOpen = true;
searchTerm = '';
highlightedIndex = -1;
if (isRouter) {
if (open) {
isOpen = true;
searchTerm = '';
highlightedIndex = -1;
// Focus search input after popover opens
tick().then(() => {
requestAnimationFrame(() => searchInputRef?.focus());
});
// Focus search input after popover opens
tick().then(() => {
requestAnimationFrame(() => searchInputRef?.focus());
});
if (isRouter) {
modelsStore.fetchRouterModels().then(() => {
modelsStore.fetchModalitiesForLoadedModels();
});
} else {
isOpen = false;
searchTerm = '';
highlightedIndex = -1;
}
} else {
isOpen = false;
searchTerm = '';
highlightedIndex = -1;
showModelDialog = open;
}
}
function handleTriggerClick() {
if (loading || updating) return;
if (!isRouter) {
// Single model mode: show dialog instead of popover
showModelDialog = true;
}
// For router mode, the Popover handles open/close
}
export function open() {
if (isRouter) {
handleOpenChange(true);
} else {
showModelDialog = true;
}
}
function closeMenu() {
handleOpenChange(false);
handleOpenChange(true);
}
function handleSearchKeyDown(event: KeyboardEvent) {
@ -292,7 +278,7 @@
}
if (shouldCloseMenu) {
closeMenu();
handleOpenChange(false);
// Focus the chat textarea after model selection
requestAnimationFrame(() => {
@ -360,8 +346,181 @@
{:else}
{@const selectedOption = getDisplayOption()}
<Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}>
<Popover.Trigger
{#if isRouter}
<Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}>
<Popover.Trigger
class={cn(
`inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
!isCurrentModelInCache()
? 'bg-red-400/10 !text-red-400 hover:bg-red-400/20 hover:text-red-400'
: forceForegroundText
? 'text-foreground'
: isHighlightedCurrentModelActive
? 'text-foreground'
: 'text-muted-foreground',
isOpen ? 'text-foreground' : ''
)}
style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
disabled={disabled || updating}
>
<Package class="h-3.5 w-3.5" />
<span class="truncate font-medium">
{selectedOption?.model || 'Select model'}
</span>
{#if updating}
<Loader2 class="h-3 w-3.5 animate-spin" />
{:else}
<ChevronDown class="h-3 w-3.5" />
{/if}
</Popover.Trigger>
<Popover.Content
class="group/popover-content w-96 max-w-[calc(100vw-2rem)] p-0"
align="end"
sideOffset={8}
collisionPadding={16}
>
<div class="flex max-h-[50dvh] flex-col overflow-hidden">
<div
class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0"
>
<SearchInput
id="model-search"
placeholder="Search models..."
bind:value={searchTerm}
bind:ref={searchInputRef}
onClose={() => handleOpenChange(false)}
onKeyDown={handleSearchKeyDown}
/>
</div>
<div
class="models-list order-2 min-h-0 flex-1 overflow-y-auto group-data-[side=top]/popover-content:order-1"
>
{#if !isCurrentModelInCache() && currentModel}
<!-- Show unavailable model as first option (disabled) -->
<button
type="button"
class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-4 py-2 text-left text-sm text-red-400"
role="option"
aria-selected="true"
aria-disabled="true"
disabled
>
<span class="truncate">{selectedOption?.name || currentModel}</span>
<span class="ml-2 text-xs whitespace-nowrap opacity-70">(not available)</span>
</button>
<div class="my-1 h-px bg-border"></div>
{/if}
{#if filteredOptions.length === 0}
<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
{/if}
{#each filteredOptions as option, index (option.id)}
{@const status = getModelStatus(option.model)}
{@const isLoaded = status === ServerModelStatus.LOADED}
{@const isLoading = status === ServerModelStatus.LOADING}
{@const isSelected = currentModel === option.model || activeId === option.id}
{@const isCompatible = isModelCompatible(option)}
{@const isHighlighted = index === highlightedIndex}
{@const missingModalities = getMissingModalities(option)}
<div
class={cn(
'group flex w-full items-center gap-2 px-4 py-2 text-left text-sm transition focus:outline-none',
isCompatible
? 'cursor-pointer hover:bg-muted focus:bg-muted'
: 'cursor-not-allowed opacity-50',
isSelected || isHighlighted
? 'bg-accent text-accent-foreground'
: isCompatible
? 'hover:bg-accent hover:text-accent-foreground'
: '',
isLoaded ? 'text-popover-foreground' : 'text-muted-foreground'
)}
role="option"
aria-selected={isSelected || isHighlighted}
aria-disabled={!isCompatible}
tabindex={isCompatible ? 0 : -1}
onclick={() => isCompatible && handleSelect(option.id)}
onmouseenter={() => (highlightedIndex = index)}
onkeydown={(e) => {
if (isCompatible && (e.key === 'Enter' || e.key === ' ')) {
e.preventDefault();
handleSelect(option.id);
}
}}
>
<span class="min-w-0 flex-1 truncate">{option.model}</span>
{#if missingModalities}
<span class="flex shrink-0 items-center gap-1 text-muted-foreground/70">
{#if missingModalities.vision}
<Tooltip.Root>
<Tooltip.Trigger>
<EyeOff class="h-3.5 w-3.5" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>No vision support</p>
</Tooltip.Content>
</Tooltip.Root>
{/if}
{#if missingModalities.audio}
<Tooltip.Root>
<Tooltip.Trigger>
<MicOff class="h-3.5 w-3.5" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>No audio support</p>
</Tooltip.Content>
</Tooltip.Root>
{/if}
</span>
{/if}
{#if isLoading}
<Tooltip.Root>
<Tooltip.Trigger>
<Loader2 class="h-4 w-4 shrink-0 animate-spin text-muted-foreground" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>Loading model...</p>
</Tooltip.Content>
</Tooltip.Root>
{:else if isLoaded}
<Tooltip.Root>
<Tooltip.Trigger>
<button
type="button"
class="relative ml-2 flex h-4 w-4 shrink-0 items-center justify-center"
onclick={(e) => {
e.stopPropagation();
modelsStore.unloadModel(option.model);
}}
>
<span
class="mr-2 h-2 w-2 rounded-full bg-green-500 transition-opacity group-hover:opacity-0"
></span>
<Power
class="absolute mr-2 h-4 w-4 text-red-500 opacity-0 transition-opacity group-hover:opacity-100 hover:text-red-600"
/>
</button>
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>Unload model</p>
</Tooltip.Content>
</Tooltip.Root>
{:else}
<span class="mx-2 h-2 w-2 rounded-full bg-muted-foreground/50"></span>
{/if}
</div>
{/each}
</div>
</div>
</Popover.Content>
</Popover.Root>
{:else}
<button
class={cn(
`inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
!isCurrentModelInCache()
@ -374,165 +533,20 @@
isOpen ? 'text-foreground' : ''
)}
style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
onclick={handleTriggerClick}
disabled={disabled || updating || !isRouter}
onclick={() => handleOpenChange(true)}
disabled={disabled || updating}
>
<Package class="h-3.5 w-3.5" />
<span class="truncate font-medium">
{selectedOption?.model || 'Select model'}
{selectedOption?.model}
</span>
{#if updating}
<Loader2 class="h-3 w-3.5 animate-spin" />
{:else if isRouter}
<ChevronDown class="h-3 w-3.5" />
{/if}
</Popover.Trigger>
<Popover.Content
class="group/popover-content w-96 max-w-[calc(100vw-2rem)] p-0"
align="end"
sideOffset={8}
collisionPadding={16}
>
<div class="flex max-h-[50dvh] flex-col overflow-hidden">
<div
class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0"
>
<SearchInput
id="model-search"
placeholder="Search models..."
bind:value={searchTerm}
bind:ref={searchInputRef}
onClose={closeMenu}
onKeyDown={handleSearchKeyDown}
/>
</div>
<div
class="models-list order-2 min-h-0 flex-1 overflow-y-auto group-data-[side=top]/popover-content:order-1"
>
{#if !isCurrentModelInCache() && currentModel}
<!-- Show unavailable model as first option (disabled) -->
<button
type="button"
class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-4 py-2 text-left text-sm text-red-400"
role="option"
aria-selected="true"
aria-disabled="true"
disabled
>
<span class="truncate">{selectedOption?.name || currentModel}</span>
<span class="ml-2 text-xs whitespace-nowrap opacity-70">(not available)</span>
</button>
<div class="my-1 h-px bg-border"></div>
{/if}
{#if filteredOptions.length === 0}
<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
{/if}
{#each filteredOptions as option, index (option.id)}
{@const status = getModelStatus(option.model)}
{@const isLoaded = status === ServerModelStatus.LOADED}
{@const isLoading = status === ServerModelStatus.LOADING}
{@const isSelected = currentModel === option.model || activeId === option.id}
{@const isCompatible = isModelCompatible(option)}
{@const isHighlighted = index === highlightedIndex}
{@const missingModalities = getMissingModalities(option)}
<div
class={cn(
'group flex w-full items-center gap-2 px-4 py-2 text-left text-sm transition focus:outline-none',
isCompatible
? 'cursor-pointer hover:bg-muted focus:bg-muted'
: 'cursor-not-allowed opacity-50',
isSelected || isHighlighted
? 'bg-accent text-accent-foreground'
: isCompatible
? 'hover:bg-accent hover:text-accent-foreground'
: '',
isLoaded ? 'text-popover-foreground' : 'text-muted-foreground'
)}
role="option"
aria-selected={isSelected || isHighlighted}
aria-disabled={!isCompatible}
tabindex={isCompatible ? 0 : -1}
onclick={() => isCompatible && handleSelect(option.id)}
onmouseenter={() => (highlightedIndex = index)}
onkeydown={(e) => {
if (isCompatible && (e.key === 'Enter' || e.key === ' ')) {
e.preventDefault();
handleSelect(option.id);
}
}}
>
<span class="min-w-0 flex-1 truncate">{option.model}</span>
{#if missingModalities}
<span class="flex shrink-0 items-center gap-1 text-muted-foreground/70">
{#if missingModalities.vision}
<Tooltip.Root>
<Tooltip.Trigger>
<EyeOff class="h-3.5 w-3.5" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>No vision support</p>
</Tooltip.Content>
</Tooltip.Root>
{/if}
{#if missingModalities.audio}
<Tooltip.Root>
<Tooltip.Trigger>
<MicOff class="h-3.5 w-3.5" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>No audio support</p>
</Tooltip.Content>
</Tooltip.Root>
{/if}
</span>
{/if}
{#if isLoading}
<Tooltip.Root>
<Tooltip.Trigger>
<Loader2 class="h-4 w-4 shrink-0 animate-spin text-muted-foreground" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>Loading model...</p>
</Tooltip.Content>
</Tooltip.Root>
{:else if isLoaded}
<Tooltip.Root>
<Tooltip.Trigger>
<button
type="button"
class="relative ml-2 flex h-4 w-4 shrink-0 items-center justify-center"
onclick={(e) => {
e.stopPropagation();
modelsStore.unloadModel(option.model);
}}
>
<span
class="mr-2 h-2 w-2 rounded-full bg-green-500 transition-opacity group-hover:opacity-0"
></span>
<Power
class="absolute mr-2 h-4 w-4 text-red-500 opacity-0 transition-opacity group-hover:opacity-100 hover:text-red-600"
/>
</button>
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>Unload model</p>
</Tooltip.Content>
</Tooltip.Root>
{:else}
<span class="mx-2 h-2 w-2 rounded-full bg-muted-foreground/50"></span>
{/if}
</div>
{/each}
</div>
</div>
</Popover.Content>
</Popover.Root>
</button>
{/if}
{/if}
</div>

View File

@ -12,9 +12,12 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
showMessageStats: true,
askForTitleConfirmation: false,
pasteLongTextToFileLen: 2500,
copyTextAttachmentsAsPlainText: false,
pdfAsImage: false,
disableAutoScroll: false,
renderUserContentAsMarkdown: false,
alwaysShowSidebarOnDesktop: false,
autoShowSidebarOnNewChat: true,
autoMicOnEmpty: false,
// make sure these default values are in sync with `common.h`
samplers: 'top_k;typ_p;top_p;min_p;temperature',
@ -50,6 +53,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
'Choose the color theme for the interface. You can choose between System (follows your device settings), Light, or Dark.',
pasteLongTextToFileLen:
'On pasting long text, it will be converted to a file. You can control the file length by setting the value of this parameter. Value 0 means disable.',
copyTextAttachmentsAsPlainText:
'When copying a message with text attachments, combine them into a single plain text string instead of a special format that can be pasted back as attachments.',
samplers:
'The order at which samplers are applied, in simplified way. Default is "top_k;typ_p;top_p;min_p;temperature": top_k->typ_p->top_p->min_p->temperature',
temperature:
@ -96,6 +101,10 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
disableAutoScroll:
'Disable automatic scrolling while messages stream so you can control the viewport position manually.',
renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.',
alwaysShowSidebarOnDesktop:
'Always keep the sidebar visible on desktop instead of auto-hiding it.',
autoShowSidebarOnNewChat:
'Automatically show sidebar when starting a new chat. Disable to keep the sidebar hidden until you click on it.',
autoMicOnEmpty:
'Automatically show microphone button instead of send button when textarea is empty for models with audio modality support.',
pyInterpreterEnabled:

View File

@ -0,0 +1,262 @@
import { toast } from 'svelte-sonner';
import { AttachmentType } from '$lib/enums';
import type {
DatabaseMessageExtra,
DatabaseMessageExtraTextFile,
DatabaseMessageExtraLegacyContext
} from '$lib/types/database';
/**
* Copy text to clipboard with toast notification
* Uses modern clipboard API when available, falls back to legacy method for non-secure contexts
* @param text - Text to copy to clipboard
* @param successMessage - Custom success message (optional)
* @param errorMessage - Custom error message (optional)
* @returns Promise<boolean> - True if successful, false otherwise
*/
export async function copyToClipboard(
text: string,
successMessage = 'Copied to clipboard',
errorMessage = 'Failed to copy to clipboard'
): Promise<boolean> {
try {
// Try modern clipboard API first (secure contexts only)
if (navigator.clipboard && navigator.clipboard.writeText) {
await navigator.clipboard.writeText(text);
toast.success(successMessage);
return true;
}
// Fallback for non-secure contexts
const textArea = document.createElement('textarea');
textArea.value = text;
textArea.style.position = 'fixed';
textArea.style.left = '-999999px';
textArea.style.top = '-999999px';
document.body.appendChild(textArea);
textArea.focus();
textArea.select();
const successful = document.execCommand('copy');
document.body.removeChild(textArea);
if (successful) {
toast.success(successMessage);
return true;
} else {
throw new Error('execCommand failed');
}
} catch (error) {
console.error('Failed to copy to clipboard:', error);
toast.error(errorMessage);
return false;
}
}
/**
* Copy code with HTML entity decoding and toast notification
* @param rawCode - Raw code string that may contain HTML entities
* @param successMessage - Custom success message (optional)
* @param errorMessage - Custom error message (optional)
* @returns Promise<boolean> - True if successful, false otherwise
*/
export async function copyCodeToClipboard(
rawCode: string,
successMessage = 'Code copied to clipboard',
errorMessage = 'Failed to copy code'
): Promise<boolean> {
const doc = new DOMParser().parseFromString(rawCode, 'text/html');
const decodedCode = doc.body.textContent ?? rawCode;
return copyToClipboard(decodedCode, successMessage, errorMessage);
}
/**
* Format for text attachments when copied to clipboard
*/
export interface ClipboardTextAttachment {
type: typeof AttachmentType.TEXT;
name: string;
content: string;
}
/**
* Parsed result from clipboard content
*/
export interface ParsedClipboardContent {
message: string;
textAttachments: ClipboardTextAttachment[];
}
/**
* Formats a message with text attachments for clipboard copying.
*
* Default format (asPlainText = false):
* ```
* "Text message content"
* [
* {"type":"TEXT","name":"filename.txt","content":"..."},
* {"type":"TEXT","name":"another.txt","content":"..."}
* ]
* ```
*
* Plain text format (asPlainText = true):
* ```
* Text message content
*
* file content here
*
* another file content
* ```
*
* @param content - The message text content
* @param extras - Optional array of message attachments
* @param asPlainText - If true, format as plain text without JSON structure
* @returns Formatted string for clipboard
*/
export function formatMessageForClipboard(
content: string,
extras?: DatabaseMessageExtra[],
asPlainText: boolean = false
): string {
// Filter only text attachments (TEXT type and legacy CONTEXT type)
const textAttachments =
extras?.filter(
(extra): extra is DatabaseMessageExtraTextFile | DatabaseMessageExtraLegacyContext =>
extra.type === AttachmentType.TEXT || extra.type === AttachmentType.LEGACY_CONTEXT
) ?? [];
if (textAttachments.length === 0) {
return content;
}
if (asPlainText) {
const parts = [content];
for (const att of textAttachments) {
parts.push(att.content);
}
return parts.join('\n\n');
}
const clipboardAttachments: ClipboardTextAttachment[] = textAttachments.map((att) => ({
type: AttachmentType.TEXT,
name: att.name,
content: att.content
}));
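// Special clipboard format: the JSON-encoded message on the first line, followed by a
// pretty-printed JSON array of text attachments (see the JSDoc example above).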
return `${JSON.stringify(content)}\n${JSON.stringify(clipboardAttachments, null, 2)}`;
}
/**
* Parses clipboard content to extract message and text attachments.
* Supports both plain text and the special format with attachments.
*
* @param clipboardText - Raw text from clipboard
* @returns Parsed content with message and attachments
*/
export function parseClipboardContent(clipboardText: string): ParsedClipboardContent {
const defaultResult: ParsedClipboardContent = {
message: clipboardText,
textAttachments: []
};
if (!clipboardText.startsWith('"')) {
return defaultResult;
}
try {
let stringEndIndex = -1;
let escaped = false;
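// Scan for the closing quote of the leading JSON string, skipping any character that
// follows a backslash so escaped quotes inside the message are not mistaken for the end.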
for (let i = 1; i < clipboardText.length; i++) {
const char = clipboardText[i];
if (escaped) {
escaped = false;
continue;
}
if (char === '\\') {
escaped = true;
continue;
}
if (char === '"') {
stringEndIndex = i;
break;
}
}
if (stringEndIndex === -1) {
return defaultResult;
}
const jsonStringPart = clipboardText.substring(0, stringEndIndex + 1);
const remainingPart = clipboardText.substring(stringEndIndex + 1).trim();
const message = JSON.parse(jsonStringPart) as string;
if (!remainingPart || !remainingPart.startsWith('[')) {
return {
message,
textAttachments: []
};
}
const attachments = JSON.parse(remainingPart) as unknown[];
const validAttachments: ClipboardTextAttachment[] = [];
for (const att of attachments) {
if (isValidTextAttachment(att)) {
validAttachments.push({
type: AttachmentType.TEXT,
name: att.name,
content: att.content
});
}
}
return {
message,
textAttachments: validAttachments
};
} catch {
return defaultResult;
}
}
/**
* Type guard to validate a text attachment object
* @param obj The object to validate
* @returns true if the object is a valid text attachment
*/
function isValidTextAttachment(
obj: unknown
): obj is { type: string; name: string; content: string } {
if (typeof obj !== 'object' || obj === null) {
return false;
}
const record = obj as Record<string, unknown>;
return (
(record.type === AttachmentType.TEXT || record.type === 'TEXT') &&
typeof record.name === 'string' &&
typeof record.content === 'string'
);
}
/**
* Checks if clipboard content contains our special format with attachments
* @param clipboardText - Raw text from clipboard
* @returns true if the clipboard content contains our special format with attachments
*/
export function hasClipboardAttachments(clipboardText: string): boolean {
if (!clipboardText.startsWith('"')) {
return false;
}
const parsed = parseClipboardContent(clipboardText);
return parsed.textAttachments.length > 0;
}
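// Illustrative usage sketch (not part of this change), showing the round trip the
// functions above implement; the values are assumed for the example:
//
//   const formatted = formatMessageForClipboard('See attached notes', [
//     { type: AttachmentType.TEXT, name: 'notes.txt', content: 'line 1\nline 2' }
//   ]);
//   // formatted -> '"See attached notes"' followed by a pretty-printed JSON array
//   //              of { "type": "TEXT", name, content } objects on the next lines
//
//   const parsed = parseClipboardContent(formatted);
//   // parsed.message === 'See attached notes'
//   // parsed.textAttachments[0].name === 'notes.txt'
//
//   hasClipboardAttachments(formatted); // true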

View File

@ -1,71 +0,0 @@
import { toast } from 'svelte-sonner';
/**
* Copy text to clipboard with toast notification
* Uses modern clipboard API when available, falls back to legacy method for non-secure contexts
* @param text - Text to copy to clipboard
* @param successMessage - Custom success message (optional)
* @param errorMessage - Custom error message (optional)
* @returns Promise<boolean> - True if successful, false otherwise
*/
export async function copyToClipboard(
text: string,
successMessage = 'Copied to clipboard',
errorMessage = 'Failed to copy to clipboard'
): Promise<boolean> {
try {
// Try modern clipboard API first (secure contexts only)
if (navigator.clipboard && navigator.clipboard.writeText) {
await navigator.clipboard.writeText(text);
toast.success(successMessage);
return true;
}
// Fallback for non-secure contexts
const textArea = document.createElement('textarea');
textArea.value = text;
textArea.style.position = 'fixed';
textArea.style.left = '-999999px';
textArea.style.top = '-999999px';
document.body.appendChild(textArea);
textArea.focus();
textArea.select();
const successful = document.execCommand('copy');
document.body.removeChild(textArea);
if (successful) {
toast.success(successMessage);
return true;
} else {
throw new Error('execCommand failed');
}
} catch (error) {
console.error('Failed to copy to clipboard:', error);
toast.error(errorMessage);
return false;
}
}
/**
* Copy code with HTML entity decoding and toast notification
* @param rawCode - Raw code string that may contain HTML entities
* @param successMessage - Custom success message (optional)
* @param errorMessage - Custom error message (optional)
* @returns Promise<boolean> - True if successful, false otherwise
*/
export async function copyCodeToClipboard(
rawCode: string,
successMessage = 'Code copied to clipboard',
errorMessage = 'Failed to copy code'
): Promise<boolean> {
// Decode HTML entities
const decodedCode = rawCode
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
return copyToClipboard(decodedCode, successMessage, errorMessage);
}

View File

@ -40,7 +40,15 @@ export { setConfigValue, getConfigValue, configToParameterRecord } from './confi
export { createMessageCountMap, getMessageCount } from './conversation-utils';
// Clipboard utilities
export { copyToClipboard, copyCodeToClipboard } from './copy';
export {
copyToClipboard,
copyCodeToClipboard,
formatMessageForClipboard,
parseClipboardContent,
hasClipboardAttachments,
type ClipboardTextAttachment,
type ParsedClipboardContent
} from './clipboard';
// File preview utilities
export { getFileTypeLabel } from './file-preview';

View File

@ -14,6 +14,7 @@
import { goto } from '$app/navigation';
import { modelsStore } from '$lib/stores/models.svelte';
import { TOOLTIP_DELAY_DURATION } from '$lib/constants/tooltip-config';
import { IsMobile } from '$lib/hooks/is-mobile.svelte';
let { children } = $props();
@ -21,6 +22,10 @@
let isHomeRoute = $derived(page.route.id === '/');
let isNewChatMode = $derived(page.url.searchParams.get('new_chat') === 'true');
let showSidebarByDefault = $derived(activeMessages().length > 0 || isLoading());
let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop);
let autoShowSidebarOnNewChat = $derived(config().autoShowSidebarOnNewChat);
let isMobile = new IsMobile();
let isDesktop = $derived(!isMobile.current);
let sidebarOpen = $state(false);
let innerHeight = $state<number | undefined>();
let chatSidebar:
@ -76,6 +81,11 @@
}
$effect(() => {
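// When the sidebar is pinned on desktop, force it open and skip the route-based logic below.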
if (alwaysShowSidebarOnDesktop && isDesktop) {
sidebarOpen = true;
return;
}
if (isHomeRoute && !isNewChatMode) {
// Auto-collapse sidebar when navigating to home route (but not in new chat mode)
sidebarOpen = false;
@ -83,8 +93,11 @@
// Keep sidebar open in new chat mode
sidebarOpen = true;
} else if (isChatRoute) {
// On chat routes, show sidebar by default
sidebarOpen = true;
// On chat routes, only auto-show sidebar if setting is enabled
if (autoShowSidebarOnNewChat) {
sidebarOpen = true;
}
// If setting is disabled, don't change sidebar state - let user control it manually
} else {
// Other routes follow default behavior
sidebarOpen = showSidebarByDefault;
@ -190,12 +203,14 @@
<ChatSidebar bind:this={chatSidebar} />
</Sidebar.Root>
<Sidebar.Trigger
class="transition-left absolute left-0 z-[900] h-8 w-8 duration-200 ease-linear {sidebarOpen
? 'md:left-[var(--sidebar-width)]'
: ''}"
style="translate: 1rem 1rem;"
/>
{#if !(alwaysShowSidebarOnDesktop && isDesktop)}
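<!-- The trigger is not rendered when the sidebar is pinned open on desktop, since the effect above keeps it open. -->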
<Sidebar.Trigger
class="transition-left absolute left-0 z-[900] h-8 w-8 duration-200 ease-linear {sidebarOpen
? 'md:left-[var(--sidebar-width)]'
: ''}"
style="translate: 1rem 1rem;"
/>
{/if}
<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden">
{@render children?.()}

View File

@ -1,7 +0,0 @@
import { describe, it, expect } from 'vitest';
describe('sum test', () => {
it('adds 1 + 2 to equal 3', () => {
expect(1 + 2).toBe(3);
});
});

View File

@ -0,0 +1,423 @@
import { describe, it, expect } from 'vitest';
import { AttachmentType } from '$lib/enums';
import {
formatMessageForClipboard,
parseClipboardContent,
hasClipboardAttachments
} from '$lib/utils/clipboard';
describe('formatMessageForClipboard', () => {
it('returns plain content when no extras', () => {
const result = formatMessageForClipboard('Hello world', undefined);
expect(result).toBe('Hello world');
});
it('returns plain content when extras is empty array', () => {
const result = formatMessageForClipboard('Hello world', []);
expect(result).toBe('Hello world');
});
it('handles empty string content', () => {
const result = formatMessageForClipboard('', undefined);
expect(result).toBe('');
});
it('returns plain content when extras has only non-text attachments', () => {
const extras = [
{
type: AttachmentType.IMAGE as const,
name: 'image.png',
base64Url: 'data:image/png;base64,...'
}
];
const result = formatMessageForClipboard('Hello world', extras);
expect(result).toBe('Hello world');
});
it('filters non-text attachments and keeps only text ones', () => {
const extras = [
{
type: AttachmentType.IMAGE as const,
name: 'image.png',
base64Url: 'data:image/png;base64,...'
},
{
type: AttachmentType.TEXT as const,
name: 'file.txt',
content: 'Text content'
},
{
type: AttachmentType.PDF as const,
name: 'doc.pdf',
base64Data: 'data:application/pdf;base64,...',
content: 'PDF content',
processedAsImages: false
}
];
const result = formatMessageForClipboard('Hello', extras);
expect(result).toContain('"file.txt"');
expect(result).not.toContain('image.png');
expect(result).not.toContain('doc.pdf');
});
it('formats message with text attachments', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'file1.txt',
content: 'File 1 content'
},
{
type: AttachmentType.TEXT as const,
name: 'file2.txt',
content: 'File 2 content'
}
];
const result = formatMessageForClipboard('Hello world', extras);
expect(result).toContain('"Hello world"');
expect(result).toContain('"type": "TEXT"');
expect(result).toContain('"name": "file1.txt"');
expect(result).toContain('"content": "File 1 content"');
expect(result).toContain('"name": "file2.txt"');
});
it('handles content with quotes and special characters', () => {
const content = 'Hello "world" with\nnewline';
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'test.txt',
content: 'Test content'
}
];
const result = formatMessageForClipboard(content, extras);
// Should be valid JSON
expect(result.startsWith('"')).toBe(true);
// The content should be properly escaped
const parsed = JSON.parse(result.split('\n')[0]);
expect(parsed).toBe(content);
});
it('converts legacy context type to TEXT type', () => {
const extras = [
{
type: AttachmentType.LEGACY_CONTEXT as const,
name: 'legacy.txt',
content: 'Legacy content'
}
];
const result = formatMessageForClipboard('Hello', extras);
expect(result).toContain('"type": "TEXT"');
expect(result).not.toContain('"context"');
});
it('handles attachment content with special characters', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'code.js',
content: 'const x = "hello\\nworld";\nconst y = `template ${var}`;'
}
];
const formatted = formatMessageForClipboard('Check this code', extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.textAttachments[0].content).toBe(
'const x = "hello\\nworld";\nconst y = `template ${var}`;'
);
});
it('handles unicode characters in content and attachments', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'unicode.txt',
content: '日本語テスト 🎉 émojis'
}
];
const formatted = formatMessageForClipboard('Привет мир 👋', extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe('Привет мир 👋');
expect(parsed.textAttachments[0].content).toBe('日本語テスト 🎉 émojis');
});
it('formats as plain text when asPlainText is true', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'file1.txt',
content: 'File 1 content'
},
{
type: AttachmentType.TEXT as const,
name: 'file2.txt',
content: 'File 2 content'
}
];
const result = formatMessageForClipboard('Hello world', extras, true);
expect(result).toBe('Hello world\n\nFile 1 content\n\nFile 2 content');
});
it('returns plain content when asPlainText is true but no attachments', () => {
const result = formatMessageForClipboard('Hello world', [], true);
expect(result).toBe('Hello world');
});
it('plain text mode does not use JSON format', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'test.txt',
content: 'Test content'
}
];
const result = formatMessageForClipboard('Hello', extras, true);
expect(result).not.toContain('"type"');
expect(result).not.toContain('[');
expect(result).toBe('Hello\n\nTest content');
});
});
describe('parseClipboardContent', () => {
it('returns plain text as message when not in special format', () => {
const result = parseClipboardContent('Hello world');
expect(result.message).toBe('Hello world');
expect(result.textAttachments).toHaveLength(0);
});
it('handles empty string input', () => {
const result = parseClipboardContent('');
expect(result.message).toBe('');
expect(result.textAttachments).toHaveLength(0);
});
it('handles whitespace-only input', () => {
const result = parseClipboardContent(' \n\t ');
expect(result.message).toBe(' \n\t ');
expect(result.textAttachments).toHaveLength(0);
});
it('returns plain text as message when starts with quote but invalid format', () => {
const result = parseClipboardContent('"Unclosed quote');
expect(result.message).toBe('"Unclosed quote');
expect(result.textAttachments).toHaveLength(0);
});
it('returns original text when JSON array is malformed', () => {
const input = '"Hello"\n[invalid json';
const result = parseClipboardContent(input);
expect(result.message).toBe('"Hello"\n[invalid json');
expect(result.textAttachments).toHaveLength(0);
});
it('parses message with text attachments', () => {
const input = `"Hello world"
[
{"type":"TEXT","name":"file1.txt","content":"File 1 content"},
{"type":"TEXT","name":"file2.txt","content":"File 2 content"}
]`;
const result = parseClipboardContent(input);
expect(result.message).toBe('Hello world');
expect(result.textAttachments).toHaveLength(2);
expect(result.textAttachments[0].name).toBe('file1.txt');
expect(result.textAttachments[0].content).toBe('File 1 content');
expect(result.textAttachments[1].name).toBe('file2.txt');
expect(result.textAttachments[1].content).toBe('File 2 content');
});
it('handles escaped quotes in message', () => {
const input = `"Hello \\"world\\" with quotes"
[
{"type":"TEXT","name":"file.txt","content":"test"}
]`;
const result = parseClipboardContent(input);
expect(result.message).toBe('Hello "world" with quotes');
expect(result.textAttachments).toHaveLength(1);
});
it('handles newlines in message', () => {
const input = `"Hello\\nworld"
[
{"type":"TEXT","name":"file.txt","content":"test"}
]`;
const result = parseClipboardContent(input);
expect(result.message).toBe('Hello\nworld');
expect(result.textAttachments).toHaveLength(1);
});
it('returns message only when no array follows', () => {
const input = '"Just a quoted string"';
const result = parseClipboardContent(input);
expect(result.message).toBe('Just a quoted string');
expect(result.textAttachments).toHaveLength(0);
});
it('filters out invalid attachment objects', () => {
const input = `"Hello"
[
{"type":"TEXT","name":"valid.txt","content":"valid"},
{"type":"INVALID","name":"invalid.txt","content":"invalid"},
{"name":"missing-type.txt","content":"missing"},
{"type":"TEXT","content":"missing name"}
]`;
const result = parseClipboardContent(input);
expect(result.message).toBe('Hello');
expect(result.textAttachments).toHaveLength(1);
expect(result.textAttachments[0].name).toBe('valid.txt');
});
it('handles empty attachments array', () => {
const input = '"Hello"\n[]';
const result = parseClipboardContent(input);
expect(result.message).toBe('Hello');
expect(result.textAttachments).toHaveLength(0);
});
it('roundtrips correctly with formatMessageForClipboard', () => {
const originalContent = 'Hello "world" with\nspecial characters';
const originalExtras = [
{
type: AttachmentType.TEXT as const,
name: 'file1.txt',
content: 'Content with\nnewlines and "quotes"'
},
{
type: AttachmentType.TEXT as const,
name: 'file2.txt',
content: 'Another file'
}
];
const formatted = formatMessageForClipboard(originalContent, originalExtras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe(originalContent);
expect(parsed.textAttachments).toHaveLength(2);
expect(parsed.textAttachments[0].name).toBe('file1.txt');
expect(parsed.textAttachments[0].content).toBe('Content with\nnewlines and "quotes"');
expect(parsed.textAttachments[1].name).toBe('file2.txt');
expect(parsed.textAttachments[1].content).toBe('Another file');
});
});
describe('hasClipboardAttachments', () => {
it('returns false for plain text', () => {
expect(hasClipboardAttachments('Hello world')).toBe(false);
});
it('returns false for empty string', () => {
expect(hasClipboardAttachments('')).toBe(false);
});
it('returns false for quoted string without attachments', () => {
expect(hasClipboardAttachments('"Hello world"')).toBe(false);
});
it('returns true for valid format with attachments', () => {
const input = `"Hello"
[{"type":"TEXT","name":"file.txt","content":"test"}]`;
expect(hasClipboardAttachments(input)).toBe(true);
});
it('returns false for format with empty attachments array', () => {
const input = '"Hello"\n[]';
expect(hasClipboardAttachments(input)).toBe(false);
});
it('returns false for malformed JSON', () => {
expect(hasClipboardAttachments('"Hello"\n[broken')).toBe(false);
});
});
describe('roundtrip edge cases', () => {
it('preserves empty message with attachments', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'file.txt',
content: 'Content only'
}
];
const formatted = formatMessageForClipboard('', extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe('');
expect(parsed.textAttachments).toHaveLength(1);
expect(parsed.textAttachments[0].content).toBe('Content only');
});
it('preserves attachment with empty content', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'empty.txt',
content: ''
}
];
const formatted = formatMessageForClipboard('Message', extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe('Message');
expect(parsed.textAttachments).toHaveLength(1);
expect(parsed.textAttachments[0].content).toBe('');
});
it('preserves multiple backslashes', () => {
const content = 'Path: C:\\\\Users\\\\test\\\\file.txt';
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'path.txt',
content: 'D:\\\\Data\\\\file'
}
];
const formatted = formatMessageForClipboard(content, extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe(content);
expect(parsed.textAttachments[0].content).toBe('D:\\\\Data\\\\file');
});
it('preserves tabs and various whitespace', () => {
const content = 'Line1\t\tTabbed\n Spaced\r\nCRLF';
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'whitespace.txt',
content: '\t\t\n\n '
}
];
const formatted = formatMessageForClipboard(content, extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe(content);
expect(parsed.textAttachments[0].content).toBe('\t\t\n\n ');
});
});

View File

@ -1,6 +1,6 @@
/* eslint-disable no-irregular-whitespace */
import { describe, it, expect, test } from 'vitest';
import { maskInlineLaTeX, preprocessLaTeX } from './latex-protection';
import { maskInlineLaTeX, preprocessLaTeX } from '$lib/utils/latex-protection';
describe('maskInlineLaTeX', () => {
it('should protect LaTeX $x + y$ but not money $3.99', () => {

View File

@ -1,5 +1,5 @@
import { describe, expect, it } from 'vitest';
import { isValidModelName, normalizeModelName } from './model-names';
import { isValidModelName, normalizeModelName } from '$lib/utils/model-names';
describe('normalizeModelName', () => {
it('preserves Hugging Face org/model format (single slash)', () => {

View File

@ -125,9 +125,9 @@ export default defineConfig({
{
extends: './vite.config.ts',
test: {
name: 'server',
name: 'unit',
environment: 'node',
include: ['tests/server/**/*.{test,spec}.{js,ts}']
include: ['tests/unit/**/*.{test,spec}.{js,ts}']
}
},
{