Merge branch 'ggml-org:master' into power-law-sampler

ddh0 2025-12-16 13:33:03 -06:00 committed by GitHub
commit 58aa1c6f5a
GPG Key ID: B5690EEEBB952194
75 changed files with 4843 additions and 3250 deletions


@@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"]
# ENTRYPOINT ["/app/llama-server"]
### Target: light
-# Lightweight image containing only llama-cli
+# Lightweight image containing only llama-cli and llama-completion
# ==============================================================================
FROM base AS light


@@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target llama-cli && \
+cmake --build build --config Release --target llama-completion
# TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
ENV LC_ALL=C.utf8


@@ -37,6 +37,7 @@ make -j GGML_CUDA=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
@@ -68,6 +69,7 @@ rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-completion
%{_bindir}/llama-cuda-server
%{_bindir}/llama-cuda-simple
/usr/lib/systemd/system/llamacuda.service


@@ -39,6 +39,7 @@ make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
@@ -70,6 +71,7 @@ rm -rf %{_builddir}/*
%files
%{_bindir}/llama-cli
%{_bindir}/llama-completion
%{_bindir}/llama-server
%{_bindir}/llama-simple
/usr/lib/systemd/system/llama.service

.github/workflows/server-webui.yml (new file, 295 lines)

@@ -0,0 +1,295 @@
# Server WebUI build and tests
name: Server WebUI
on:
workflow_dispatch: # allows manual triggering
inputs:
sha:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
env:
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
LLAMA_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
webui-setup:
name: WebUI Setup
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Cache node_modules
uses: actions/cache@v4
id: cache-node-modules
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install dependencies
if: steps.cache-node-modules.outputs.cache-hit != 'true'
run: npm ci
working-directory: tools/server/webui
webui-check:
needs: webui-setup
name: WebUI Check
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Run type checking
run: npm run check
working-directory: tools/server/webui
- name: Run linting
run: npm run lint
working-directory: tools/server/webui
webui-build:
needs: webui-check
name: WebUI Build
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Build application
run: npm run build
working-directory: tools/server/webui
webui-tests:
needs: webui-build
name: Run WebUI tests
permissions:
contents: read
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install Playwright browsers
run: npx playwright install --with-deps
working-directory: tools/server/webui
- name: Build Storybook
run: npm run build-storybook
working-directory: tools/server/webui
- name: Run Client tests
run: npm run test:client
working-directory: tools/server/webui
- name: Run Server tests
run: npm run test:server
working-directory: tools/server/webui
- name: Run UI tests
run: npm run test:ui -- --testTimeout=60000
working-directory: tools/server/webui
- name: Run E2E tests
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
needs: [webui-tests]
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Setup Node.js for WebUI
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui
- name: Build WebUI
run: npm run build
working-directory: tools/server/webui
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
env:
GITHUB_ACTIONS: "true"
run: |
cd tools/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd tools/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
SLOW_TESTS=1 ./tests.sh


@@ -76,270 +76,6 @@ jobs:
run: |
pip install -r tools/server/tests/requirements.txt
webui-setup:
name: WebUI Setup
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Cache node_modules
uses: actions/cache@v4
id: cache-node-modules
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install dependencies
if: steps.cache-node-modules.outputs.cache-hit != 'true'
run: npm ci
working-directory: tools/server/webui
webui-check:
needs: webui-setup
name: WebUI Check
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Run type checking
run: npm run check
working-directory: tools/server/webui
- name: Run linting
run: npm run lint
working-directory: tools/server/webui
webui-build:
needs: webui-check
name: WebUI Build
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Build application
run: npm run build
working-directory: tools/server/webui
webui-tests:
needs: webui-build
name: Run WebUI tests
permissions:
contents: read
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install Playwright browsers
run: npx playwright install --with-deps
working-directory: tools/server/webui
- name: Build Storybook
run: npm run build-storybook
working-directory: tools/server/webui
- name: Run Client tests
run: npm run test:client
working-directory: tools/server/webui
- name: Run Server tests
run: npm run test:server
working-directory: tools/server/webui
- name: Run UI tests
run: npm run test:ui -- --testTimeout=60000
working-directory: tools/server/webui
- name: Run E2E tests
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
needs: [webui-tests]
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Setup Node.js for WebUI
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui
- name: Build WebUI
run: npm run build
working-directory: tools/server/webui
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
env:
GITHUB_ACTIONS: "true"
run: |
cd tools/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd tools/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
SLOW_TESTS=1 ./tests.sh
server-windows:
runs-on: windows-2022


@@ -68,3 +68,6 @@ Please disclose it as a private [security advisory](https://github.com/ggml-org/
Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
+> [!IMPORTANT]
+> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080


@@ -835,6 +835,19 @@ bool common_arg_utils::is_autoy(const std::string & value) {
}
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
// per-example default params
// we define here to make sure it's included in llama-gen-docs
if (ex == LLAMA_EXAMPLE_COMPLETION) {
params.use_jinja = false; // disable jinja by default
} else if (ex == LLAMA_EXAMPLE_MTMD) {
params.use_jinja = false; // disable jinja by default
params.sampling.temp = 0.2; // lower temp by default for better quality
} else if (ex == LLAMA_EXAMPLE_SERVER) {
params.n_parallel = -1; // auto by default
}
params.use_color = tty_can_use_colors();
// load dynamic backends
@@ -1107,7 +1120,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
-string_format("max number of context checkpoints to create per slot (default: %d)\n"
+string_format("max number of context checkpoints to create per slot (default: %d)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
[](common_params & params, int value) {
params.n_ctx_checkpoints = value;
@@ -1115,7 +1128,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--cache-ram", "-cram"}, "N",
-string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
[](common_params & params, int value) {
params.cache_ram_mib = value;
@@ -1123,12 +1136,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--kv-unified", "-kvu"},
-string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
-"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+"use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
[](common_params & params) {
params.kv_unified = true;
}
-).set_env("LLAMA_ARG_KV_UNIFIED"));
+).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
@@ -1906,13 +1918,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
}
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
-add_opt(common_arg(
-{"-np", "--parallel"}, "N",
-string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
-[](common_params & params, int value) {
-params.n_parallel = value;
-}
-).set_env("LLAMA_ARG_N_PARALLEL"));
+if (ex == LLAMA_EXAMPLE_SERVER) {
+// this is to make sure this option appears in the server-specific section of the help message
+add_opt(common_arg(
+{"-np", "--parallel"}, "N",
+string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
+[](common_params & params, int value) {
+if (value == 0) {
+throw std::invalid_argument("error: invalid value for n_parallel\n");
+}
+params.n_parallel = value;
+}
+).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
+} else {
+add_opt(common_arg(
+{"-np", "--parallel"}, "N",
+string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+[](common_params & params, int value) {
+params.n_parallel = value;
+}
+).set_env("LLAMA_ARG_N_PARALLEL"));
+}
add_opt(common_arg(
{"-ns", "--sequences"}, "N",
string_format("number of sequences to decode (default: %d)", params.n_sequences),


@@ -4,9 +4,14 @@
using json = nlohmann::json;
-static std::string_view trim_trailing_space(std::string_view sv) {
+static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
+int count = 0;
while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
+if (max != -1 && count <= max) {
+break;
+}
sv.remove_suffix(1);
+count++;
}
return sv;
}
@@ -93,7 +98,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
if (is_arg_string && current_tool) {
// Serialize to JSON, but exclude the end quote
-std::string dumped = json(node.text).dump();
+std::string dumped = json(trim_trailing_space(node.text)).dump();
current_tool->arguments += dumped.substr(0, dumped.size() - 1);
needs_closing_quote = true;
}
@@ -101,6 +106,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
if (is_arg_close && current_tool) {
if (needs_closing_quote) {
current_tool->arguments += "\"";
+needs_closing_quote = false;
}
}
@@ -109,6 +115,10 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
}
if (is_tool_close && current_tool) {
+if (needs_closing_quote) {
+current_tool->arguments += "\"";
+needs_closing_quote = false;
+}
current_tool->arguments += "}";
}
}


@@ -711,6 +711,25 @@ static void foreach_function(const json & tools, const std::function<void(const
}
}
static void foreach_parameter(const json & function, const std::function<void(const std::string &, const json &, bool)> & fn) {
if (!function.contains("parameters") || !function.at("parameters").is_object()) {
return;
}
const auto & params = function.at("parameters");
if (!params.contains("properties") || !params.at("properties").is_object()) {
return;
}
const auto & props = params.at("properties");
std::set<std::string> required;
if (params.contains("required") && params.at("required").is_array()) {
params.at("required").get_to(required);
}
for (const auto & [name, prop] : props.items()) {
bool is_required = (required.find(name) != required.end());
fn(name, prop, is_required);
}
}
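As an illustration of what the new `foreach_parameter` helper iterates over, here is a small Python sketch (the tool definition below is hypothetical, not part of this diff): it pulls `properties` and `required` out of a function's `parameters` schema and yields each parameter name with its schema and a required flag, mirroring the C++ loop above.

```python
# Hypothetical tool definition, used only to show which (name, schema, is_required)
# triples a loop like foreach_parameter would visit.
function = {
    "name": "get_weather",
    "parameters": {
        "type": "object",
        "properties": {
            "city": {"type": "string"},
            "days": {"type": "integer"},
        },
        "required": ["city"],
    },
}

params = function.get("parameters", {})
props = params.get("properties", {})
required = set(params.get("required", []))

for name, schema in props.items():
    print(name, schema, name in required)
# city {'type': 'string'} True
# days {'type': 'integer'} False
```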
static std::string apply(
const common_chat_template & tmpl,
const struct templates_params & inputs,
@@ -1409,6 +1428,123 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
return data;
}
static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED;
// Handle thinking tags appropriately based on inputs.enable_thinking
if (string_ends_with(data.prompt, "<think>\n")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
}
}
data.preserved_tokens = {
"<think>",
"</think>",
"<tool_call>",
"</tool_call>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = true;
auto parser = build_chat_peg_constructed_parser([&](auto & p) {
auto reasoning = p.eps();
if (inputs.enable_thinking && extract_reasoning) {
auto reasoning_content = p.reasoning(p.until("</think>")) + ("</think>" | p.end());
if (data.thinking_forced_open) {
reasoning = reasoning_content;
}
}
// Response format parser
if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
return reasoning << p.content(p.schema(p.json(), "response-format", inputs.json_schema));
}
// Tool call parser
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
auto tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
auto parameters = function.at("parameters");
auto schema_info = common_schema_info();
schema_info.resolve_refs(parameters);
auto tool_open = "<function=" + p.tool_name(p.literal(name)) + ">\n";
auto tool_close = p.literal("</function>\n");
auto args = p.sequence();
auto arg_string = p.rule("xml-arg-string", p.until_one_of({
"\n</parameter>",
"\n<parameter=",
"\n</function>"
}));
foreach_parameter(function, [&](const auto & param_name, const json & param_schema, bool is_required) {
auto rule_name = "tool-" + name + "-arg-" + param_name;
auto arg_open = "<parameter=" + p.tool_arg_name(p.literal(param_name)) + ">\n";
auto arg_close = p.literal("</parameter>\n");
auto arg_value = p.eps();
if (schema_info.resolves_to_string(param_schema)) {
arg_value = p.tool_arg_string_value(arg_string) + "\n";
} else {
arg_value = p.tool_arg_json_value(p.schema(p.json(), rule_name + "-schema", param_schema));
}
// Model may or may not close with </parameter>
auto arg_rule = p.rule(rule_name, p.tool_arg_open(arg_open) + arg_value + p.optional(p.tool_arg_close(arg_close)));
args += p.repeat(arg_rule, /* min = */ is_required ? 1 : 0, /* max = */ 1);
});
tool_choice |= p.rule("tool-" + name, p.tool_open(tool_open) + args + p.tool_close(tool_close));
});
auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
auto tool_call = p.rule("tool-call", "<tool_call>\n" + tool_choice + "</tool_call>" + p.space());
auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));
return reasoning << p.content(p.until("<tool_call>")) << tool_calls;
}
// Content only parser
include_grammar = false;
return reasoning << p.content(p.rest());
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
data.grammar_triggers = {
{COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_call>"}
};
}
return data;
}
static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
@@ -2534,6 +2670,10 @@ static common_chat_params common_chat_templates_apply_jinja(
src.find("<function=") != std::string::npos &&
src.find("<parameters>") != std::string::npos &&
src.find("<parameter=") != std::string::npos) {
// Nemotron 3 Nano 30B A3B
if (src.find("<think>") != std::string::npos) {
return common_chat_params_init_nemotron_v3(tmpl, params);
}
return common_chat_params_init_qwen3_coder_xml(tmpl, params);
}


@@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {
std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
-class SchemaConverter {
+class common_schema_converter {
private:
+friend class common_schema_info;
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
@@ -729,7 +730,7 @@
}
public:
-SchemaConverter(
+common_schema_converter(
const std::function<json(const std::string &)> & fetch_json,
bool dotall)
: _fetch_json(fetch_json), _dotall(dotall)
@@ -990,6 +991,134 @@
}
};
// common_schema_info implementation (pimpl)
common_schema_info::common_schema_info()
: impl_(std::make_unique<common_schema_converter>(
[](const std::string &) { return json(); },
false)) {}
common_schema_info::~common_schema_info() = default;
common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;
void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
impl_->resolve_refs(schema, "");
}
// Determines if a JSON schema can resolve to a string type through any path.
// Some models emit raw string values rather than JSON-encoded strings for string parameters.
// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
// true, allowing callers to handle the value as a raw string for simplicity.
bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
std::unordered_set<std::string> visited_refs;
std::function<bool(const json &)> check = [&](const json & s) -> bool {
if (!s.is_object()) {
return false;
}
// Handle $ref
if (s.contains("$ref")) {
const std::string & ref = s["$ref"];
if (visited_refs.find(ref) != visited_refs.end()) {
// Circular reference, assume not a string to be safe
return false;
}
visited_refs.insert(ref);
auto it = impl_->_refs.find(ref);
if (it != impl_->_refs.end()) {
return check(it->second);
}
return false;
}
// Check type field
if (s.contains("type")) {
const json & schema_type = s["type"];
if (schema_type.is_string()) {
if (schema_type == "string") {
return true;
}
} else if (schema_type.is_array()) {
// Type can be an array like ["string", "null"]
for (const auto & t : schema_type) {
if (t == "string") {
return true;
}
}
}
}
// Check oneOf/anyOf - if any alternative can be a string
if (s.contains("oneOf")) {
for (const auto & alt : s["oneOf"]) {
if (check(alt)) {
return true;
}
}
}
if (s.contains("anyOf")) {
for (const auto & alt : s["anyOf"]) {
if (check(alt)) {
return true;
}
}
}
// Check allOf - all components must be compatible with string type
if (s.contains("allOf")) {
bool all_string = true;
for (const auto & component : s["allOf"]) {
if (!check(component)) {
all_string = false;
break;
}
}
if (all_string) {
return true;
}
}
// Check const - if the constant value is a string
if (s.contains("const")) {
if (s["const"].is_string()) {
return true;
}
}
// Check enum - if any enum value is a string
if (s.contains("enum")) {
for (const auto & val : s["enum"]) {
if (val.is_string()) {
return true;
}
}
}
// String-specific keywords imply string type
if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
return true;
}
// Check format - many formats imply string
if (s.contains("format")) {
const std::string & fmt = s["format"];
if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
fmt == "uri" || fmt == "email" || fmt == "hostname" ||
fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
fmt.find("uuid") == 0) {
return true;
}
}
return false;
};
return check(schema);
}
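To make the intent of `resolves_to_string` easier to see, here is a minimal Python sketch of the same decision rules (illustrative only; it ignores `$ref` resolution, which the C++ version handles through the converter's reference table):

```python
# Minimal sketch of the "can this schema be a raw string?" rules, for illustration.
def resolves_to_string(s):
    if not isinstance(s, dict):
        return False
    t = s.get("type")
    if t == "string" or (isinstance(t, list) and "string" in t):
        return True
    for key in ("oneOf", "anyOf"):
        if any(resolves_to_string(alt) for alt in s.get(key, [])):
            return True
    if "allOf" in s and all(resolves_to_string(c) for c in s["allOf"]):
        return True
    if isinstance(s.get("const"), str):
        return True
    if any(isinstance(v, str) for v in s.get("enum", [])):
        return True
    if any(k in s for k in ("pattern", "minLength", "maxLength")):
        return True
    return s.get("format") in {"date", "time", "date-time", "uri", "email",
                               "hostname", "ipv4", "ipv6", "uuid"}

assert resolves_to_string({"type": ["string", "null"]})
assert resolves_to_string({"anyOf": [{"type": "integer"}, {"const": "auto"}]})
assert not resolves_to_string({"type": "object"})
```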
std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
#ifdef LLAMA_USE_LLGUIDANCE
if (!force_gbnf) {
@@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
}
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
-SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
+common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
common_grammar_builder builder {
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
return converter._add_rule(name, rule);


@@ -3,11 +3,31 @@
#include <nlohmann/json_fwd.hpp>
#include <functional>
+#include <memory>
#include <string>
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
bool force_gbnf = false);
class common_schema_converter;
// Probes a JSON schema to extract information about its structure and type constraints.
class common_schema_info {
std::unique_ptr<common_schema_converter> impl_;
public:
common_schema_info();
~common_schema_info();
common_schema_info(const common_schema_info &) = delete;
common_schema_info & operator=(const common_schema_info &) = delete;
common_schema_info(common_schema_info &&) noexcept;
common_schema_info & operator=(common_schema_info &&) noexcept;
void resolve_refs(nlohmann::ordered_json & schema);
bool resolves_to_string(const nlohmann::ordered_json & schema);
};
struct common_grammar_builder {
std::function<std::string(const std::string &, const std::string &)> add_rule;
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;


@@ -425,7 +425,7 @@ struct parser_executor {
if (result.need_more_input()) {
// Propagate - need to know what child would match before negating
-return result;
+return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
}
// Child failed, so negation succeeds


@@ -862,6 +862,14 @@ class TextModel(ModelBase):
logger.warning(f"Unknown RoPE type: {rope_type}")
logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}")
if "mrope_section" in self.rope_parameters:
mrope_section = self.rope_parameters["mrope_section"]
# Pad to 4 dimensions [time, height, width, extra]
while len(mrope_section) < 4:
mrope_section.append(0)
self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
logger.info(f"gguf: mrope sections: {mrope_section[:4]}")
if (rope_theta := rope_params.get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base(rope_theta)
logger.info(f"gguf: rope theta = {rope_theta}")
@@ -3739,9 +3747,6 @@ class Qwen2VLModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
-mrope_section = self.hparams["rope_scaling"]["mrope_section"]
-mrope_section += [0] * max(0, 4 - len(mrope_section))
-self.gguf_writer.add_rope_dimension_sections(mrope_section)
def set_vocab(self):
try:
@@ -4377,6 +4382,30 @@ class Qwen3VLVisionModel(MmprojModel):
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration")
class Glm4VVisionModel(Qwen3VLVisionModel):
def set_gguf_parameters(self):
MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters
assert self.hparams_vision is not None
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
if hidden_act == "gelu":
self.gguf_writer.add_vision_use_gelu(True)
elif hidden_act == "silu":
self.gguf_writer.add_vision_use_silu(True)
rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5)
self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("model.visual."):
name = name.replace("model.visual.", "visual.")
if name.startswith("visual.merger."):
return [(self.map_tensor_name(name), data_torch)]
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3VLForConditionalGeneration") @ModelBase.register("Qwen3VLForConditionalGeneration")
class Qwen3VLTextModel(Qwen3Model): class Qwen3VLTextModel(Qwen3Model):
model_arch = gguf.MODEL_ARCH.QWEN3VL model_arch = gguf.MODEL_ARCH.QWEN3VL
@@ -4385,20 +4414,6 @@ class Qwen3VLTextModel(Qwen3Model):
super().set_gguf_parameters()
# Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
-text_config = self.hparams.get("text_config", {})
-# rope_scaling is deprecated in V5, use rope_parameters instead
-rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
-if rope_scaling.get("mrope_section"):
-# mrope_section contains [time, height, width] dimensions
-mrope_section = rope_scaling["mrope_section"]
-# Pad to 4 dimensions [time, height, width, extra]
-while len(mrope_section) < 4:
-mrope_section.append(0)
-self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
-logger.info(f"MRoPE sections: {mrope_section[:4]}")
vision_config = self.hparams.get("vision_config", {})
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
@@ -4417,22 +4432,6 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
-# Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
-text_config = self.hparams.get("text_config", {})
-# rope_scaling is deprecated in V5, use rope_parameters instead
-rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
-if rope_scaling.get("mrope_section"):
-# mrope_section contains [time, height, width] dimensions
-mrope_section = rope_scaling["mrope_section"]
-# Pad to 4 dimensions [time, height, width, extra]
-while len(mrope_section) < 4:
-mrope_section.append(0)
-self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
-logger.info(f"MRoPE sections: {mrope_section[:4]}")
vision_config = self.hparams.get("vision_config", {})
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
@@ -7795,6 +7794,15 @@ class JaisModel(TextModel):
@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
class Glm4Model(TextModel):
model_arch = gguf.MODEL_ARCH.GLM4
use_mrope = False
partial_rotary_factor = 0.5
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5)
if "mrope_section" in self.rope_parameters:
self.use_mrope = True
logger.info("Q/K weight will need to be permuted for M-RoPE")
def set_vocab(self):
from transformers import AutoTokenizer
@@ -7816,17 +7824,49 @@ class Glm4Model(TextModel):
super().set_gguf_parameters()
if (rope_dim := self.hparams.get("head_dim")) is None:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor))
@staticmethod
def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor:
orig_shape = weights.shape
if len(orig_shape) == 1:
weights = weights.unsqueeze(1) # [out_dim, 1]
if len(weights.shape) != 2:
raise ValueError("Only 1D and 2D tensors are supported.")
n_effective_heads = weights.shape[0] // head_dim
if n_head_kv is not None and n_effective_heads != n_head:
if n_effective_heads != n_head_kv:
raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}")
rotary_dim = int(head_dim * partial_rotary_factor)
if rotary_dim % 2 != 0:
raise ValueError("rotary_dim must be even.")
reshaped = weights.reshape(n_effective_heads, head_dim, -1)
rot_part = reshaped[:, :rotary_dim, :]
non_rot_part = reshaped[:, rotary_dim:, :]
permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1)
combined = torch.cat((permuted_rot, non_rot_part), dim=1)
result = combined.reshape(weights.shape)
return result if len(orig_shape) != 1 else result.squeeze(1)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("model.visual."): # ignore visual part of Glm4v
return []
elif name.startswith("model.language_model."):
name = name.replace("language_model.", "") # for Glm4v
if self.use_mrope:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams["num_key_value_heads"]
n_embd = self.hparams["hidden_size"]
head_dim = n_embd // n_head
# because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor)
return super().modify_tensors(data_torch, name, bid)
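A toy illustration of the Neox reordering that `normal_to_neox` applies to the first `rotary_dim` rows of each head (this is not the converter code itself; the sizes below are made up): even-indexed rows are gathered before odd-indexed ones, so row order 0, 1, 2, 3 becomes 0, 2, 1, 3.

```python
# Toy demo of the even-then-odd row permutation used for Neox rotary ordering.
# head_dim and rotary_dim are arbitrary small values chosen for illustration.
import torch

head_dim = 4
rotary_dim = 4
w = torch.arange(head_dim, dtype=torch.float32).unsqueeze(1)  # rows r0..r3, one column

rot = w[:rotary_dim]
permuted = torch.cat((rot[::2], rot[1::2]), dim=0)  # [r0, r2, r1, r3]
print(permuted.squeeze(1).tolist())  # [0.0, 2.0, 1.0, 3.0]
```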
@ModelBase.register("Glm4MoeForCausalLM") @ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration")
class Glm4MoeModel(TextModel): class Glm4MoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.GLM4_MOE model_arch = gguf.MODEL_ARCH.GLM4_MOE
@@ -7893,6 +7933,7 @@ class Glm4MoeModel(TextModel):
_experts: list[dict[str, Tensor]] | None = None
# note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already
def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]:
@@ -8490,8 +8531,18 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
class NemotronHModel(GraniteHybridModel):
"""Hybrid mamba2/attention model from NVIDIA"""
model_arch = gguf.MODEL_ARCH.NEMOTRON_H
is_moe: bool = False
def __init__(self, *args, **kwargs):
# We have to determine the correct model architecture (MoE vs non-MoE) before
# calling the parent __init__. This is because the parent constructor
# uses self.model_arch to build the tensor name map, and all MoE-specific
# mappings would be missed if it were called with the default non-MoE arch.
hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
if "num_experts_per_tok" in hparams:
self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
self.is_moe = True
super().__init__(*args, **kwargs)
# Save the top-level head_dim for later
@@ -8503,9 +8554,11 @@ class NemotronHModel(GraniteHybridModel):
# Update the ssm / attn / mlp layers
# M: Mamba2, *: Attention, -: MLP
# MoE:
# M: Mamba2, *: Attention, E: Expert
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
-self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "-"]
+self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]
def get_attn_layers(self):
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
@@ -8521,10 +8574,28 @@ class NemotronHModel(GraniteHybridModel):
# Set feed_forward_length
# NOTE: This will trigger an override warning. This is preferable to
# duplicating all the parent logic
-n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
-self.gguf_writer.add_feed_forward_length([
-n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
-])
+if not self.is_moe:
+n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
+self.gguf_writer.add_feed_forward_length([
+n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
+])
else:
moe_intermediate_size = self.hparams["moe_intermediate_size"]
self.gguf_writer.add_feed_forward_length([
moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count)
])
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"])
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
self.gguf_writer.add_expert_group_count(self.hparams["n_group"])
# number of experts used per token (top-k)
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
def set_vocab(self):
super().set_vocab()
@@ -8532,7 +8603,81 @@ class NemotronHModel(GraniteHybridModel):
# The tokenizer _does_ add a BOS token (via post_processor type
# TemplateProcessing) but does not set add_bos_token to true in the
# config, so we need to explicitly override it here.
-self.gguf_writer.add_add_bos_token(True)
+if not self.is_moe:
+self.gguf_writer.add_add_bos_token(True)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if self.is_moe and bid is not None:
if name.endswith("mixer.gate.e_score_correction_bias"):
new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
mapped_name = self.map_tensor_name(new_name)
return [(mapped_name, data_torch)]
if name.endswith("mixer.dt_bias"):
new_name = name.replace("dt_bias", "dt.bias")
mapped_name = self.map_tensor_name(new_name)
return [(mapped_name, data_torch)]
if name.endswith("mixer.conv1d.weight"):
squeezed_data = data_torch.squeeze()
mapped_name = self.map_tensor_name(name)
return [(mapped_name, squeezed_data)]
if name.endswith("mixer.A_log"):
transformed_data = -torch.exp(data_torch)
reshaped_data = transformed_data.squeeze().reshape(-1, 1)
mapped_name = self.map_tensor_name(name)
return [(mapped_name, reshaped_data)]
if name.endswith("mixer.D"):
reshaped_data = data_torch.squeeze().reshape(-1, 1)
mapped_name = self.map_tensor_name(name)
return [(mapped_name, reshaped_data)]
if name.endswith("mixer.norm.weight"):
reshaped_data = data_torch.reshape(8, 512)
mapped_name = self.map_tensor_name(name)
return [(mapped_name, reshaped_data)]
if name.find("mixer.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 2:
# merge the experts into a single tensor
tensors: list[tuple[str, Tensor]] = []
for w_name in ["down_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return super().modify_tensors(data_torch, name, bid)
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
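The expert-merge step above can be summarized with a small stand-alone sketch (tensor sizes and names here are made up): the per-expert 2D weights collected in `self._experts[bid]` are stacked along a new leading dimension so each merged tensor is indexed by expert id.

```python
# Stand-alone illustration of merging per-expert weights with torch.stack.
# n_experts, out_dim and in_dim are arbitrary values for the demo.
import torch

n_experts, out_dim, in_dim = 4, 6, 8
per_expert = [torch.randn(out_dim, in_dim) for _ in range(n_experts)]
merged = torch.stack(per_expert, dim=0)  # shape: (n_experts, out_dim, in_dim)
print(merged.shape)  # torch.Size([4, 6, 8])
```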
@ModelBase.register("BailingMoeForCausalLM") @ModelBase.register("BailingMoeForCausalLM")


@@ -103,6 +103,8 @@ SYCL backend supports Intel GPU Family:
- Intel Built-in Arc GPU
- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).
+On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md), although performance is not optimal, and some GPUs may not support OpenCL or have any GPGPU capabilities.
#### Verified devices
| Intel GPU | Status | Verified Model |


@@ -97,7 +97,7 @@ The model params and tensors layout must be defined in `llama.cpp` source files:
1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
2. In `src/llama-arch.cpp`:
- Add the architecture name to the `LLM_ARCH_NAMES` map.
-- Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+- Add the list of model tensors to `llm_get_tensor_names` (you may also need to update `LLM_TENSOR_NAMES`)
3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.


@@ -7,9 +7,9 @@
## Images
We have three Docker images available for this project:
-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the `llama-cli` and `llama-completion` executables. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the `llama-server` executable. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
Additionally, there are the following images, similar to the above:
@ -44,13 +44,15 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-o
On completion, you are ready to play!
```bash
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run-legacy -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
```
or with a light image:
```bash
docker run -v /path/to/models:/models --entrypoint /app/llama-cli ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf
docker run -v /path/to/models:/models --entrypoint /app/llama-completion ghcr.io/ggml-org/llama.cpp:light -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
```
or with a server image:
@ -59,6 +61,8 @@ or with a server image:
docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
```
In the above examples, `--entrypoint /app/llama-cli` is specified for clarity, but you can safely omit it since it's the default entrypoint in the container.
## Docker With CUDA
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@ -80,9 +84,9 @@ The defaults are:
The resulting images are essentially the same as the non-CUDA images:
1. `local/llama.cpp:full-cuda`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the `llama-cli` and `llama-completion` executables.
3. `local/llama.cpp:server-cuda`: This image only includes the `llama-server` executable.
## Usage
@ -114,9 +118,9 @@ The defaults are:
The resulting images are essentially the same as the non-MUSA images:
1. `local/llama.cpp:full-musa`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-musa`: This image only includes the `llama-cli` and `llama-completion` executables.
3. `local/llama.cpp:server-musa`: This image only includes the `llama-server` executable.
## Usage

View File

@ -48,7 +48,7 @@ static void write_table(std::ofstream & file, std::vector<common_arg *> & opts)
}
}
static void export_md(std::string fname, llama_example ex, std::string name) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
common_params params;
@ -72,13 +72,14 @@ static void export_md(std::string fname, llama_example ex) {
write_table(file, common_options);
file << "\n\n**Sampling params**\n\n";
write_table(file, sparam_options);
file << "\n\n**" << name << "-specific params**\n\n";
write_table(file, specific_options);
}
int main(int, char **) {
// TODO: add CLI
export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool");
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server");
return 0;
}

View File

@ -10,6 +10,13 @@ and in some cases perplexity checked of the quantized model. And finally the
model/models need to be uploaded to the ggml-org on Hugging Face. This tool/example tries to
help with this process.
> 📝 **Note:** When adding a new model from an existing family, verify that the
> previous version passes logits verification first. Existing models can have
> subtle numerical differences that don't affect generation quality but cause
> logits mismatches. Identifying upfront whether these exist in llama.cpp,
> the conversion script, or an upstream implementation can save significant
> debugging time.
### Overview
The idea is that the makefile targets and scripts here can be used in the
development/conversion process assisting with things like:

View File

@ -7,7 +7,7 @@ base_model:
Recommended way to run this model:
```sh
llama-server -hf {namespace}/{model_name}-GGUF -c 0
```
Then, access http://localhost:8080

View File

@ -34,8 +34,11 @@ done
MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
CONVERTED_MODEL_PATH="${CONVERTED_EMBEDDING_PATH:-"$CONVERTED_EMBEDDING_MODEL"}"
CONVERTED_MODEL_NAME="${CONVERTED_MODEL_NAME:-$(basename "$CONVERTED_MODEL_PATH" .gguf)}"
if [ -t 0 ]; then
    CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
else
    # Process piped JSON data and convert to binary (matching logits.cpp format)
    TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)

View File

@ -413,6 +413,7 @@ class MODEL_ARCH(IntEnum):
JAIS = auto()
NEMOTRON = auto()
NEMOTRON_H = auto()
NEMOTRON_H_MOE = auto()
EXAONE = auto()
EXAONE4 = auto()
GRANITE = auto()
@ -642,6 +643,7 @@ class MODEL_TENSOR(IntEnum):
V_MMPROJ_PEG = auto()
V_ENC_EMBD_CLS = auto()
V_ENC_EMBD_PATCH = auto()
V_ENC_EMBD_NORM = auto()
V_ENC_EMBD_POS = auto()
V_ENC_INPUT_NORM = auto()
V_ENC_ATTN_QKV = auto()
@ -660,6 +662,7 @@ class MODEL_TENSOR(IntEnum):
V_LAYER_SCALE_2 = auto()
V_PRE_NORM = auto()
V_POST_NORM = auto()
V_MM_POST_NORM = auto()
V_MM_INP_NORM = auto()
V_MM_INP_PROJ = auto() # gemma3
V_MM_SOFT_EMB_NORM = auto() # gemma3
@ -786,6 +789,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.JAIS: "jais",
MODEL_ARCH.NEMOTRON: "nemotron",
MODEL_ARCH.NEMOTRON_H: "nemotron_h",
MODEL_ARCH.NEMOTRON_H_MOE: "nemotron_h_moe",
MODEL_ARCH.EXAONE: "exaone",
MODEL_ARCH.EXAONE4: "exaone4",
MODEL_ARCH.GRANITE: "granite",
@ -1014,6 +1018,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}",
MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
MODEL_TENSOR.V_ENC_EMBD_NORM: "v.norm_embd",
MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv",
MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
@ -1032,6 +1037,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
MODEL_TENSOR.V_POST_NORM: "v.post_ln",
MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm",
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm",
@ -1092,6 +1098,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_MMPROJ_PEG,
MODEL_TENSOR.V_ENC_EMBD_CLS,
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_NORM,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_ATTN_QKV,
@ -1110,6 +1117,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_LAYER_SCALE_2,
MODEL_TENSOR.V_PRE_NORM,
MODEL_TENSOR.V_POST_NORM,
MODEL_TENSOR.V_MM_POST_NORM,
MODEL_TENSOR.V_MM_INP_PROJ,
MODEL_TENSOR.V_MM_INP_NORM,
MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
@ -2529,6 +2537,33 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.NEMOTRON_H_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.SSM_IN,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_DT,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_NORM,
MODEL_TENSOR.SSM_OUT,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
# experts
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
# shared expert
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
MODEL_ARCH.EXAONE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@ -3328,6 +3363,7 @@ class VisionProjectorType:
LIGHTONOCR = "lightonocr"
COGVLM = "cogvlm"
JANUS_PRO = "janus_pro"
GLM4V = "glm4v"
# Items here are (block size, type size)

View File

@ -379,6 +379,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.gate", # lfm2moe
"model.layers.{bid}.mlp.router.gate", # afmoe
"layers.{bid}.gate", # mistral-large
"backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@ -392,6 +393,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.expert_bias", # afmoe
"model.layers.{bid}.feed_forward.expert_bias", # lfm2moe
"model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
"backbone.layers.{bid}.mixer.gate.e_score_correction" # nemotron-h-moe
),
# Feed-forward up
@ -440,7 +442,7 @@ class TensorNameMap:
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe, nemotron-h-moe (merged)
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.up_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
@ -454,6 +456,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.down_proj",
"model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
"layers.{bid}.shared_experts.w3", # mistral-large
"backbone.layers.{bid}.mixer.shared_experts.up_proj", # nemotron-h-moe
),
MODEL_TENSOR.FFN_UP_CHEXP: (
@ -548,7 +551,7 @@ class TensorNameMap:
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe nemotron-h-moe (merged)
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.down_proj", # llama4
@ -563,6 +566,7 @@ class TensorNameMap:
"model.layers.{bid}.shared_mlp.output_linear", # granitemoe
"model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan
"layers.{bid}.shared_experts.w2", # mistral-large
"backbone.layers.{bid}.mixer.shared_experts.down_proj", # nemotron-h-moe
),
MODEL_TENSOR.FFN_DOWN_CHEXP: (
@ -706,6 +710,7 @@ class TensorNameMap:
"model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.dt_proj", # plamo2
"model.layers.{bid}.linear_attn.dt_proj", # qwen3next
"backbone.layers.{bid}.mixer.dt", # nemotron-h-moe
),
MODEL_TENSOR.SSM_DT_NORM: (
@ -1207,6 +1212,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MMPROJ_FC: (
"model.connector.modality_projection.proj", # SmolVLM
"model.vision.linear_proj.linear_proj", # cogvlm
"visual.merger.proj", # glm4v
),
MODEL_TENSOR.V_MMPROJ_MLP: (
@ -1240,6 +1246,10 @@ class TensorNameMap:
"model.vision.patch_embedding.proj", # cogvlm
),
MODEL_TENSOR.V_ENC_EMBD_NORM: (
"visual.post_conv_layernorm", # glm4v
),
MODEL_TENSOR.V_ENC_EMBD_POS: (
"vision_tower.vision_model.embeddings.position_embedding",
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
@ -1249,6 +1259,7 @@ class TensorNameMap:
"vision_tower.patch_embed.pos_emb", # kimi-vl
"visual.pos_embed", # qwen3vl
"model.vision.patch_embedding.position_embedding", # cogvlm
"visual.embeddings.position_embedding", # glm4v
),
MODEL_TENSOR.V_ENC_ATTN_QKV: (
@ -1404,6 +1415,11 @@ class TensorNameMap:
"vision_model.layernorm_post", # llama4
"visual.merger.ln_q", # qwen2vl
"vision_tower.encoder.final_layernorm", # kimi-vl
"visual.post_layernorm", # glm4v
),
MODEL_TENSOR.V_MM_POST_NORM: (
"visual.merger.post_projection_norm", # glm4v
),
MODEL_TENSOR.V_MM_INP_PROJ: (
@ -1473,6 +1489,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_PATCH_MERGER: (
"multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf
"patch_merger.merging_layer", # mistral
"visual.downsample", # glm4v
),
MODEL_TENSOR.V_DS_NORM: (
@ -1493,14 +1510,17 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_UP: (
"model.vision.linear_proj.dense_h_to_4h", # cogvlm
"visual.merger.up_proj", # glm4v
),
MODEL_TENSOR.V_MM_DOWN: (
"model.vision.linear_proj.dense_4h_to_h", # cogvlm
"visual.merger.down_proj", # glm4v
),
MODEL_TENSOR.V_MM_GATE: (
"model.vision.linear_proj.gate_proj", # cogvlm
"visual.merger.gate_proj", # glm4v
),
MODEL_TENSOR.V_TOK_BOI: (

View File

@ -0,0 +1,204 @@
{% macro render_extra_keys(json_dict, handled_keys) %}
{%- if json_dict is mapping %}
{%- for json_key in json_dict if json_key not in handled_keys %}
{%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}
{{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}
{%- else %}
{{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}
{%- endif %}
{%- endfor %}
{%- endif %}
{% endmacro %}
{%- set enable_thinking = enable_thinking if enable_thinking is defined else True %}
{%- set truncate_history_thinking = truncate_history_thinking if truncate_history_thinking is defined else True %}
{%- set ns = namespace(last_user_idx = -1) %}
{%- set loop_messages = messages %}
{%- for m in loop_messages %}
{%- if m["role"] == "user" %}
{%- set ns.last_user_idx = loop.index0 %}
{%- endif %}
{%- endfor %}
{%- if messages[0]["role"] == "system" %}
{%- set system_message = messages[0]["content"] %}
{%- set loop_messages = messages[1:] %}
{%- else %}
{%- set system_message = "" %}
{%- set loop_messages = messages %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = [] %}
{%- endif %}
{# Recompute last_user_idx relative to loop_messages after handling system #}
{%- set ns = namespace(last_user_idx = -1) %}
{%- for m in loop_messages %}
{%- if m["role"] == "user" %}
{%- set ns.last_user_idx = loop.index0 %}
{%- endif %}
{%- endfor %}
{%- if system_message is defined %}
{{- "<|im_start|>system\n" + system_message }}
{%- else %}
{%- if tools is iterable and tools | length > 0 %}
{{- "<|im_start|>system\n" }}
{%- endif %}
{%- endif %}
{%- if tools is iterable and tools | length > 0 %}
{%- if system_message is defined and system_message | length > 0 %}
{{- "\n\n" }}
{%- endif %}
{{- "# Tools\n\nYou have access to the following functions:\n\n" }}
{{- "<tools>" }}
{%- for tool in tools %}
{%- if tool.function is defined %}
{%- set tool = tool.function %}
{%- endif %}
{{- "\n<function>\n<name>" ~ tool.name ~ "</name>" }}
{%- if tool.description is defined %}
{{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }}
{%- endif %}
{{- '\n<parameters>' }}
{%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}
{%- for param_name, param_fields in tool.parameters.properties|items %}
{{- '\n<parameter>' }}
{{- '\n<name>' ~ param_name ~ '</name>' }}
{%- if param_fields.type is defined %}
{{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }}
{%- endif %}
{%- if param_fields.description is defined %}
{{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }}
{%- endif %}
{%- if param_fields.enum is defined %}
{{- '\n<enum>' ~ (param_fields.enum | tojson | safe) ~ '</enum>' }}
{%- endif %}
{%- set handled_keys = ['name', 'type', 'description', 'enum'] %}
{{- render_extra_keys(param_fields, handled_keys) }}
{{- '\n</parameter>' }}
{%- endfor %}
{%- endif %}
{% set handled_keys = ['type', 'properties', 'required'] %}
{{- render_extra_keys(tool.parameters, handled_keys) }}
{%- if tool.parameters is defined and tool.parameters.required is defined %}
{{- '\n<required>' ~ (tool.parameters.required | tojson | safe) ~ '</required>' }}
{%- endif %}
{{- '\n</parameters>' }}
{%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}
{{- render_extra_keys(tool, handled_keys) }}
{{- '\n</function>' }}
{%- endfor %}
{{- "\n</tools>" }}
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
{%- endif %}
{%- if system_message is defined %}
{{- '<|im_end|>\n' }}
{%- else %}
{%- if tools is iterable and tools | length > 0 %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- for message in loop_messages %}
{%- if message.role == "assistant" %}
{# Add reasoning content in to content field for unified processing below. #}
{%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %}
{%- set content = "<think>\n" ~ message.reasoning_content ~ "\n</think>\n" ~ (message.content | default('', true)) %}
{%- else %}
{%- set content = message.content | default('', true) %}
{%- if content is string -%}
{# Allow downstream logic to take care of broken thought, only handle coherent reasoning here. #}
{%- if '<think>' not in content and '</think>' not in content -%}
{%- set content = "<think></think>" ~ content -%}
{%- endif -%}
{%- else -%}
{%- set content = content -%}
{%- endif -%}
{%- endif %}
{%- if message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}
{# Assistant message has tool calls. #}
{{- '<|im_start|>assistant\n' }}
{%- set include_content = not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %}
{%- if content is string and content | trim | length > 0 %}
{%- if include_content %}
{{- (content | trim) ~ '\n' -}}
{%- else %}
{%- set c = (content | string) %}
{%- if '</think>' in c %}
{# Keep only content after the last closing think. Also generation prompt causes this. #}
{%- set c = c.split('</think>')[-1] %}
{%- elif '<think>' in c %}
{# If <think> was opened but never closed, drop the trailing think segment #}
{%- set c = c.split('<think>')[0] %}
{%- endif %}
{%- set c = "<think></think>" ~ c | trim %}
{%- if c | length > 0 %}
{{- c ~ '\n' -}}
{%- endif %}
{%- endif %}
{%- else %}
{{- "<think></think>" -}}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<tool_call>\n<function=' ~ tool_call.name ~ '>\n' -}}
{%- if tool_call.arguments is defined %}
{%- for args_name, args_value in tool_call.arguments|items %}
{{- '<parameter=' ~ args_name ~ '>\n' -}}
{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
{{- args_value ~ '\n</parameter>\n' -}}
{%- endfor %}
{%- endif %}
{{- '</function>\n</tool_call>\n' -}}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- else %}
{# Assistant message doesn't have tool calls. #}
{%- if not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %}
{{- '<|im_start|>assistant\n' ~ (content | default('', true) | string | trim) ~ '<|im_end|>\n' }}
{%- else %}
{%- set c = (content | default('', true) | string) %}
{%- if '<think>' in c and '</think>' in c %}
{%- set c = "<think></think>" ~ c.split('</think>')[-1] %}
{%- endif %}
{%- set c = c | trim %}
{%- if c | length > 0 %}
{{- '<|im_start|>assistant\n' ~ c ~ '<|im_end|>\n' }}
{%- else %}
{{- '<|im_start|>assistant\n<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endif %}
{%- elif message.role == "user" or message.role == "system" %}
{{- '<|im_start|>' + message.role + '\n' }}
{%- set content = message.content | string %}
{{- content }}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.previtem and loop.previtem.role != "tool" %}
{{- '<|im_start|>user\n' }}
{%- endif %}
{{- '<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>\n' }}
{%- if not loop.last and loop.nextitem.role != "tool" %}
{{- '<|im_end|>\n' }}
{%- elif loop.last %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{%- if enable_thinking %}
{{- '<|im_start|>assistant\n<think>\n' }}
{%- else %}
{{- '<|im_start|>assistant\n<think></think>' }}
{%- endif %}
{%- endif %}

File diff suppressed because it is too large

View File

@ -3,6 +3,7 @@
#include "ggml.h" // ggml_op
#include <string>
#include <set>
//
// gguf constants (sync with gguf.py)
@ -79,6 +80,7 @@ enum llm_arch {
LLM_ARCH_JAIS,
LLM_ARCH_NEMOTRON,
LLM_ARCH_NEMOTRON_H,
LLM_ARCH_NEMOTRON_H_MOE,
LLM_ARCH_EXAONE,
LLM_ARCH_EXAONE4,
LLM_ARCH_RWKV6,
@ -315,6 +317,7 @@ enum llm_tensor {
LLM_TENSOR_DENSE_3_OUT,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ROPE_FACTORS_LONG,
LLM_TENSOR_ROPE_FACTORS_SHORT,
@ -525,6 +528,10 @@ struct LLM_TN_IMPL {
const int bid;
const int xid;
const std::set<llm_tensor> model_tensors;
LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
std::string str() const;
operator std::string() const {
@ -546,11 +553,11 @@ struct LLM_TN {
llm_arch arch;
LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
}
LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
}
};

View File

@ -254,6 +254,24 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
}
}
bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= s_copy->ne[0] == mctx->get_n_rs();
res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
res &= head == mctx->get_head();
res &= rs_z == mctx->get_rs_z();
return res;
}
void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
GGML_UNUSED(ubatch);
@ -461,8 +479,46 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
const int64_t n_rs = mctx->get_recr()->get_n_rs();
if (inp_rs->s_copy) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for (uint32_t i = 0; i < n_rs; ++i) {
data[i] = mctx->get_recr()->s_copy(i);
}
}
}
bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
return res;
}
//
@ -1089,6 +1145,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cur = ggml_relu(ctx0, cur);
cb(cur, "ffn_moe_relu", il);
} break;
} break; } break;
case LLM_FFN_RELU_SQR:
if (gate_exps) {
// TODO: add support for gated squared relu
GGML_ABORT("fatal error: gated squared relu not implemented");
} else {
cur = ggml_relu(ctx0, cur);
cur = ggml_sqr(ctx0, cur);
cb(cur, "ffn_moe_relu_sqr", il);
} break;
default:
GGML_ABORT("fatal error");
}
@ -1841,6 +1906,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
inp->head = mctx_cur->get_head();
inp->rs_z = mctx_cur->get_rs_z();
return inp;
}
@ -1909,10 +1977,10 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
}

View File

@ -225,6 +225,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * s_copy; // I32 [n_rs]
// views of s_copy, computed once per graph
@ -233,6 +235,10 @@ public:
ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
const llama_memory_recurrent_context * mctx;
// used in view offsets, need to match for valid graph reuse
uint32_t head;
int32_t rs_z;
};
class llm_graph_input_cross_embd : public llm_graph_input_i {
@ -365,22 +371,28 @@ public:
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
public:
llm_graph_input_mem_hybrid(
const llama_cparams & cparams,
std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
std::unique_ptr<llm_graph_input_rs> inp_rs,
const llama_memory_hybrid_context * mctx) :
inp_attn(std::move(inp_attn)),
inp_rs(std::move(inp_rs)),
cparams(cparams),
mctx(mctx) { }
virtual ~llm_graph_input_mem_hybrid() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
std::unique_ptr<llm_graph_input_rs> inp_rs;
llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
const llama_cparams cparams;
const llama_memory_hybrid_context * mctx;
};

View File

@ -2,6 +2,7 @@
#include "ggml.h"
#include <algorithm>
#include <cassert>
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
@ -230,3 +231,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
return false;
}
bool llama_hparams::use_mrope() const {
return rope_sections[0] > 0 && rope_sections[1] > 0;
}

View File

@ -270,6 +270,8 @@ struct llama_hparams {
// TODO: think of a better place for this function
// TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
bool use_mrope() const;
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

View File

@ -222,7 +222,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}

View File

@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_16B_A1B: return "16B.A1B";
case LLM_TYPE_21B_A3B: return "21B.A3B";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
case LLM_TYPE_80B_A3B: return "80B.A3B";
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
@ -1688,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_GLM4:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
switch (hparams.n_layer) {
case 40: type = LLM_TYPE_9B; break;
case 61: type = LLM_TYPE_32B; break;
@ -1697,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_GLM4_MOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
// MoE parameters
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
@ -1797,6 +1800,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
} break;
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
{
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
@ -1812,7 +1816,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
switch (hparams.n_layer) {
case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
case 56: type = LLM_TYPE_9B; break;
default: type = LLM_TYPE_UNKNOWN;
}
@ -5159,6 +5170,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
} break;
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
{
// mamba2 Mixer SSM params
// NOTE: int64_t for tensor dimensions
@ -5169,6 +5181,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp;
// embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -5218,12 +5233,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
} else {
if (n_expert != 0) {
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
// MoE branch
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
// Shared expert branch
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
} else {
// mlp layers
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
}
}
}
} break;
@ -6207,8 +6236,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@ -6850,7 +6879,8 @@ void llama_model::print_info() const {
arch == LLM_ARCH_PLAMO2 ||
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_QWEN3NEXT ||
arch == LLM_ARCH_NEMOTRON_H ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@ -6905,7 +6935,8 @@ void llama_model::print_info() const {
if (arch == LLM_ARCH_MINICPM ||
arch == LLM_ARCH_GRANITE ||
arch == LLM_ARCH_GRANITE_MOE ||
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@ -7086,7 +7117,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
if (arch == LLM_ARCH_FALCON_H1) {
filter_attn = [&](int32_t) { return true; };
filter_recr = [&](int32_t) { return true; };
} else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
filter_attn = [&](int32_t il) {
return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
};
@ -7457,6 +7488,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
llm = std::make_unique<llm_build_nemotron>(*this, params);
} break;
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
{
llm = std::make_unique<llm_build_nemotron_h>(*this, params);
} break;
@ -7741,6 +7773,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_ARWKV7:
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
return LLAMA_ROPE_TYPE_NONE;
// use what we call a normal RoPE, operating on pairs of consecutive head values
@ -7761,7 +7794,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_PLM:
case LLM_ARCH_CHATGLM:
case LLM_ARCH_GLM4:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
@ -7823,7 +7855,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_LFM2:
case LLM_ARCH_LFM2MOE:
case LLM_ARCH_SMALLTHINKER:
case LLM_ARCH_GLM4_MOE:
case LLM_ARCH_SEED_OSS:
case LLM_ARCH_GROVEMOE:
case LLM_ARCH_APERTUS:
@ -7840,6 +7871,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_QWEN3VLMOE:
return LLAMA_ROPE_TYPE_IMROPE;
case LLM_ARCH_GLM4:
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
case LLM_ARCH_GLM4_MOE:
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");

View File

@ -113,6 +113,7 @@ enum llm_type {
LLM_TYPE_16B_A1B,
LLM_TYPE_21B_A3B, // Ernie MoE small
LLM_TYPE_30B_A3B,
LLM_TYPE_31B_A3_5B,
LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B,
LLM_TYPE_106B_A12B, // GLM-4.5-Air

View File

@ -241,6 +241,13 @@ static void llama_params_fit_impl(
global_surplus += memory_reduction;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (global_surplus >= 0) {
if (nd == 1) {
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
return;
}
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
}
} else {
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
__func__, hp_nct, n_ctx_min);
@ -249,10 +256,6 @@ static void llama_params_fit_impl(
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
}
}
if (global_surplus >= 0) {
LLAMA_LOG_INFO("%s: entire model can be fit across devices by reducing context\n", __func__);
return;
}
}
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {

View File

@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
bool use_mrope = hparams.use_mrope();
if (ubatch.embd && !use_mrope) {
// unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
}
if (use_mrope) {
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
} else {
// Normal RoPE
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);

View File

@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
bool use_mrope = hparams.use_mrope();
if (ubatch.embd && !use_mrope) {
// unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
}
if (use_mrope) {
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
} else {
// Normal RoPE
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);

View File

@ -441,23 +441,13 @@ private:
ggml_tensor * cur, ggml_tensor * cur,
ggml_tensor * causal_mask, ggml_tensor * causal_mask,
ggml_tensor * identity, ggml_tensor * identity,
ggml_tensor * diag_mask,
int il); int il);
ggml_tensor * build_layer_ffn( ggml_tensor * build_layer_ffn(
ggml_tensor * cur, ggml_tensor * cur,
int il); int il);
ggml_tensor * build_delta_net_recurrent(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
int il);
ggml_tensor * build_delta_net_chunking( ggml_tensor * build_delta_net_chunking(
ggml_tensor * q, ggml_tensor * q,
ggml_tensor * k, ggml_tensor * k,
@ -467,8 +457,18 @@ private:
ggml_tensor * state, ggml_tensor * state,
ggml_tensor * causal_mask, ggml_tensor * causal_mask,
ggml_tensor * identity, ggml_tensor * identity,
ggml_tensor * diag_mask,
int il); int il);
ggml_tensor * build_delta_net_autoregressive(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
int il);
ggml_tensor * build_norm_gated( ggml_tensor * build_norm_gated(
ggml_tensor * input, ggml_tensor * input,
ggml_tensor * weights, ggml_tensor * weights,

View File

@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
} }
ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) { ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
cur = build_ffn(cur, if (model.layers[il].ffn_gate_inp == nullptr) {
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, cur = build_ffn(cur,
NULL, NULL, NULL, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, NULL, NULL,
NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il); model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
cb(cur, "ffn_out", il); NULL,
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
ggml_tensor * ffn_inp = cur;
ggml_tensor * moe_out =
build_moe_ffn(ffn_inp,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
nullptr, // no gate
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
cb(moe_out, "ffn_moe_out", il);
ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
model.layers[il].ffn_up_shexp, NULL, NULL,
NULL /* no gate */ , NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
}
cur = build_cvec(cur, il); cur = build_cvec(cur, il);
cb(cur, "l_out", il); cb(cur, "l_out", il);
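The else branch turns a Nemotron-H FFN block into a sparse MoE whenever ffn_gate_inp is present: build_moe_ffn routes each token with sigmoid gating over squared-ReLU experts (no gate projection), optionally renormalizing the selected weights, and a shared-expert FFN is added unconditionally on top of the routed output. A rough numeric sketch of that routing under those assumptions, with made-up dimensions and weights:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

using vec = std::vector<float>;
using mat = std::vector<vec>; // rows x cols

static vec matvec(const mat & W, const vec & x) {
    vec y(W.size(), 0.0f);
    for (size_t r = 0; r < W.size(); ++r)
        for (size_t c = 0; c < x.size(); ++c)
            y[r] += W[r][c] * x[c];
    return y;
}

// squared-ReLU MLP without a gate branch: down(relu(up(x))^2)
static vec ffn_relu_sqr(const mat & up, const mat & down, const vec & x) {
    vec h = matvec(up, x);
    for (float & v : h) { v = std::max(v, 0.0f); v = v * v; }
    return matvec(down, h);
}

int main() {
    const int n_embd = 4, n_ff = 6, n_expert = 4, n_expert_used = 2;
    vec x = {0.1f, -0.2f, 0.3f, 0.4f};

    // hypothetical weights: router, per-expert up/down, shared-expert up/down
    mat router(n_expert, vec(n_embd, 0.05f));
    for (int e = 0; e < n_expert; ++e) router[e][e % n_embd] += 0.2f * (e + 1);
    std::vector<mat> up(n_expert, mat(n_ff, vec(n_embd, 0.01f)));
    std::vector<mat> down(n_expert, mat(n_embd, vec(n_ff, 0.02f)));
    mat up_sh(n_ff, vec(n_embd, 0.01f)), down_sh(n_embd, vec(n_ff, 0.02f));

    // sigmoid router scores, top-k selection, renormalization of the selected weights
    vec score = matvec(router, x);
    for (float & s : score) s = 1.0f / (1.0f + std::exp(-s));
    std::vector<int> idx(n_expert);
    for (int e = 0; e < n_expert; ++e) idx[e] = e;
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return score[a] > score[b]; });
    float norm = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) norm += score[idx[k]];

    // weighted sum of the selected experts plus the always-on shared expert
    vec out(n_embd, 0.0f);
    for (int k = 0; k < n_expert_used; ++k) {
        const int e = idx[k];
        vec ye = ffn_relu_sqr(up[e], down[e], x);
        for (int i = 0; i < n_embd; ++i) out[i] += (score[e] / norm) * ye[i];
    }
    vec ysh = ffn_relu_sqr(up_sh, down_sh, x);
    for (int i = 0; i < n_embd; ++i) out[i] += ysh[i];

    for (float v : out) std::printf("%.4f ", v);
    std::printf("\n");
    return 0;
}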

View File

@ -17,13 +17,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
ggml_tensor * causal_mask = ggml_tensor * causal_mask =
ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f), ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
GGML_TRI_TYPE_LOWER); GGML_TRI_TYPE_LOWER);
ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f)); ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
ggml_build_forward_expand(gf, causal_mask); ggml_build_forward_expand(gf, causal_mask);
ggml_build_forward_expand(gf, identity); ggml_build_forward_expand(gf, identity);
ggml_build_forward_expand(gf, diag_mask);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
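The masks are now built once per graph at a fixed CHUNK_SIZE x CHUNK_SIZE shape instead of depending on ubatch.n_seq_tokens: causal_mask is a lower-triangular matrix of ones (with the diagonal supplied separately by identity, judging by how diag_mask = causal_mask + identity is used below), and all three tensors are shared by every recurrent layer. A tiny sketch of the values they hold, assuming that strictly-lower reading and n = 4 in place of CHUNK_SIZE:

#include <cstdio>
#include <vector>

int main() {
    const int n = 4; // stands in for CHUNK_SIZE
    std::vector<std::vector<float>> causal(n, std::vector<float>(n, 0.0f));
    std::vector<std::vector<float>> identity(n, std::vector<float>(n, 0.0f));
    std::vector<std::vector<float>> diag_mask(n, std::vector<float>(n, 0.0f));

    for (int i = 0; i < n; ++i) {
        identity[i][i] = 1.0f;
        for (int j = 0; j < i; ++j) {
            causal[i][j] = 1.0f; // strictly below the diagonal
        }
        for (int j = 0; j <= i; ++j) {
            diag_mask[i][j] = 1.0f; // causal + identity
        }
    }

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) std::printf("%.0f ", diag_mask[i][j]);
        std::printf("\n");
    }
    return 0;
}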
@ -34,7 +36,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
// Determine layer type and build appropriate attention mechanism // Determine layer type and build appropriate attention mechanism
if (hparams.is_recurrent(il)) { if (hparams.is_recurrent(il)) {
// Linear attention layer (gated delta net) // Linear attention layer (gated delta net)
cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, il); cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
} else { } else {
// Full attention layer // Full attention layer
cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il); cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
@ -93,14 +95,8 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
ggml_tensor * state, ggml_tensor * state,
ggml_tensor * causal_mask, ggml_tensor * causal_mask,
ggml_tensor * identity, ggml_tensor * identity,
ggml_tensor * diag_mask,
int il) { int il) {
GGML_ASSERT(ggml_is_contiguous(q));
GGML_ASSERT(ggml_is_contiguous(k));
GGML_ASSERT(ggml_is_contiguous(v));
GGML_ASSERT(ggml_is_contiguous(g));
GGML_ASSERT(ggml_is_contiguous(beta));
GGML_ASSERT(ggml_is_contiguous(state));
const int64_t S_k = q->ne[0]; const int64_t S_k = q->ne[0];
const int64_t H_k = q->ne[1]; const int64_t H_k = q->ne[1];
const int64_t n_tokens = q->ne[2]; const int64_t n_tokens = q->ne[2];
@ -120,15 +116,10 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
// TODO: can this ever be false? const float eps_norm = hparams.f_norm_rms_eps;
const bool use_qk_l2norm = true;
if (use_qk_l2norm) { q = ggml_l2_norm(ctx0, q, eps_norm);
const float eps_norm = hparams.f_norm_rms_eps; k = ggml_l2_norm(ctx0, k, eps_norm);
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
}
const float scale = 1.0f / sqrtf(S_v); const float scale = 1.0f / sqrtf(S_v);
@ -136,8 +127,6 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
beta = ggml_sigmoid(ctx0, beta); beta = ggml_sigmoid(ctx0, beta);
ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
cb(q, "q_in", il); cb(q, "q_in", il);
cb(k, "k_in", il); cb(k, "k_in", il);
cb(v, "v_in", il); cb(v, "v_in", il);
@ -188,36 +177,21 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
cb(v_beta, "v_beta", il); cb(v_beta, "v_beta", il);
cb(k_beta, "k_beta", il); cb(k_beta, "k_beta", il);
ggml_tensor * chunked_mask = q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
ggml_view_4d(ctx0, causal_mask, chunk_size, k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
chunk_size, causal_mask->ne[2], causal_mask->ne[3], k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0); v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * chunked_diag_mask = g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
ggml_view_4d(ctx0, causal_diag_mask, chunk_size, beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
chunk_size, causal_diag_mask->ne[2], causal_diag_mask->ne[3],
causal_diag_mask->nb[1], causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0);
ggml_tensor * chunked_identity =
ggml_view_4d(ctx0, identity, chunk_size,
chunk_size, identity->ne[2], identity->ne[3],
identity->nb[1], identity->nb[2], identity->nb[3], 0);
q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
v = ggml_cont_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
g = ggml_cont_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g); ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
cb(g_cumsum, "g_cumsum", il); cb(g_cumsum, "g_cumsum", il);
ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs); ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs); ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j_broadcast = ggml_tensor * gcs_j_broadcast =
ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
@ -226,23 +200,23 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
cb(decay_mask, "decay_mask", il); cb(decay_mask, "decay_mask", il);
decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
decay_mask = ggml_exp(ctx0, decay_mask); decay_mask = ggml_exp(ctx0, decay_mask);
decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask); decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta); ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, chunked_mask)); ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
cb(attn, "attn_pre_solve", il); cb(attn, "attn_pre_solve", il);
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, chunked_mask); ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower); ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_mul(ctx0, lin_solve, chunked_mask); attn = ggml_mul(ctx0, lin_solve, causal_mask);
attn = ggml_add(ctx0, attn, chunked_identity); attn = ggml_add(ctx0, attn, identity);
cb(attn, "attn_solved", il); cb(attn, "attn_solved", il);
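In the chunked delta net, decay_mask is exp(gcs_j - gcs_i) confined to the diagonal-inclusive lower triangle, and the per-row accumulation loop quoted in the comments of the removed recurrent variant further down ("for i in range(1, chunk_size): ...") is folded into a single ggml_solve_tri call: with A = I - tril(attn) and B = attn, a unit-diagonal lower-triangular solve reproduces the same cumulative rows. A scalar sketch of that forward substitution:

#include <cstdio>
#include <vector>

using mat = std::vector<std::vector<float>>;

// Solve (I - L) * X = B for X, where L is strictly lower triangular.
// Forward substitution: X[i] = B[i] + sum_{j < i} L[i][j] * X[j].
static mat solve_unit_lower(const mat & L, const mat & B) {
    const int n = (int) B.size();
    const int m = (int) B[0].size();
    mat X(n, std::vector<float>(m, 0.0f));
    for (int i = 0; i < n; ++i) {
        for (int c = 0; c < m; ++c) {
            float acc = B[i][c];
            for (int j = 0; j < i; ++j) {
                acc += L[i][j] * X[j][c];
            }
            X[i][c] = acc;
        }
    }
    return X;
}

int main() {
    // tiny example: L strictly lower, B arbitrary
    mat L = {{0, 0, 0}, {0.5f, 0, 0}, {0.25f, 0.5f, 0}};
    mat B = {{1, 2, 0}, {3, 4, 0}, {5, 6, 0}};
    mat X = solve_unit_lower(L, B);
    for (auto & row : X) {
        for (float v : row) std::printf("%.3f ", v);
        std::printf("\n");
    }
    return 0;
}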
@ -291,7 +265,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0) // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
attn = ggml_mul_mat(ctx0, k_chunk, q_chunk); attn = ggml_mul_mat(ctx0, k_chunk, q_chunk);
attn = ggml_mul(ctx0, attn, decay_mask_chunk); attn = ggml_mul(ctx0, attn, decay_mask_chunk);
attn = ggml_mul(ctx0, attn, ggml_add(ctx0, chunked_identity, chunked_mask)); attn = ggml_mul(ctx0, attn, diag_mask);
ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
@ -361,23 +335,14 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
return ggml_concat(ctx0, flat_output, flat_state, 0); return ggml_concat(ctx0, flat_output, flat_state, 0);
} }
ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent( ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
ggml_tensor * q, ggml_tensor * q,
ggml_tensor * k, ggml_tensor * k,
ggml_tensor * v, ggml_tensor * v,
ggml_tensor * g, ggml_tensor * g,
ggml_tensor * beta, ggml_tensor * beta,
ggml_tensor * state, ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
int il) { int il) {
GGML_ASSERT(ggml_is_contiguous(q));
GGML_ASSERT(ggml_is_contiguous(k));
GGML_ASSERT(ggml_is_contiguous(v));
GGML_ASSERT(ggml_is_contiguous(g));
GGML_ASSERT(ggml_is_contiguous(beta));
GGML_ASSERT(ggml_is_contiguous(state));
const int64_t S_k = q->ne[0]; const int64_t S_k = q->ne[0];
const int64_t H_k = q->ne[1]; const int64_t H_k = q->ne[1];
const int64_t n_tokens = q->ne[2]; const int64_t n_tokens = q->ne[2];
@ -386,6 +351,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
const int64_t S_v = v->ne[0]; const int64_t S_v = v->ne[0];
const int64_t H_v = v->ne[1]; const int64_t H_v = v->ne[1];
GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
GGML_ASSERT(v->ne[2] == n_tokens); GGML_ASSERT(v->ne[2] == n_tokens);
GGML_ASSERT(k->ne[2] == n_tokens); GGML_ASSERT(k->ne[2] == n_tokens);
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
@ -397,215 +363,65 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
// TODO: can this ever be false? const float eps_norm = hparams.f_norm_rms_eps;
const bool use_qk_l2norm = true;
if (use_qk_l2norm) { q = ggml_l2_norm(ctx0, q, eps_norm);
const float eps_norm = hparams.f_norm_rms_eps; k = ggml_l2_norm(ctx0, k, eps_norm);
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
}
const float scale = 1.0f / sqrtf(S_v); const float scale = 1.0f / sqrtf(S_v);
q = ggml_scale(ctx0, q, scale); q = ggml_scale(ctx0, q, scale);
beta = ggml_sigmoid(ctx0, beta); beta = ggml_sigmoid(ctx0, beta);
ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
cb(q, "q_in", il); cb(q, "q_in", il);
cb(k, "k_in", il); cb(k, "k_in", il);
cb(v, "v_in", il); cb(v, "v_in", il);
cb(beta, "beta_in", il); cb(beta, "beta_in", il);
cb(g, "g_in", il); cb(g, "g_in", il);
q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
cb(q, "q_perm", il); ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
cb(k, "k_perm", il); ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
cb(v, "v_perm", il);
cb(beta, "beta_perm", il);
cb(g, "g_perm", il);
cb(state, "state_in", il);
GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); // Apply exponential to g_t
GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); g_t = ggml_exp(ctx0, g_t);
GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); // Apply the gated delta rule for the single timestep
ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); // last_recurrent_state = last_recurrent_state * g_t
state = ggml_mul(ctx0, state, g_t);
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g); // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
// we need to sum over dim=-2, so we transpose, sum, then transpose again
kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
cb(k_beta, "k_beta", il); // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
cb(v_beta, "v_beta", il); ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
cb(g_cumsum, "g_cumsum", il); // delta = (v_t - kv_mem) * beta_t
ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, n_tokens, 1, H_v, n_seqs); // [chunk_size, 1, n_tokens, n_seqs] // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, n_tokens, H_v, n_seqs); // [1, chunk_size, n_tokens, n_seqs] ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
state = ggml_add(ctx0, state, k_t_delta);
// Broadcast both tensors to [chunk_size, chunk_size, H_v, n_seqs] // Compute the attention output
// ggml_tensor * gcs_i_broadcast = // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
// ggml_repeat_4d(ctx0, gcs_i, GGML_DELTA_NET_CHUNK, GGML_DELTA_NET_CHUNK, num_chunks * H_v, ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
// n_seqs); // [chunk_size, 1, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs] ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
// Don't need this, this one will get auto-broadcast // again, since it's over dim = -2, transpose, sum, transpose back
ggml_tensor * gcs_j_broadcast = ggml_tensor * core_attn_out =
ggml_repeat_4d(ctx0, gcs_j, n_tokens, n_tokens, H_v, n_seqs); // [1, chunk_size, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs] ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
// Apply lower triangular mask to ensure attention is causal (only past tokens influence current)
decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
// Apply exponential to get the decay mask values
decay_mask = ggml_exp(ctx0, decay_mask);
// Apply lower triangular mask again to ensure only lower triangular values remain
decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
cb(decay_mask, "decay_mask", il);
// attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
cb(kmulkbeta, "kmulkbeta", il);
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
cb(attn, "attn_pre_rec", il);
// for i in range(1, chunk_size):
// row = attn[..., i, :i].clone()
// sub = attn[..., :i, :i].clone()
// attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
// attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
//
// We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A)
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_mul(ctx0, lin_solve, causal_mask);
attn = ggml_add(ctx0, attn, identity);
// value = attn @ v_beta
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
cb(v, "value_beta", il);
// k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
cb(gexp, "g_cum_exp", il);
ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
cb(kbeta_gexp, "kbeta_gexp", il);
ggml_tensor * k_cumdecay =
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
cb(k_cumdecay, "k_cumdecay", il);
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
attn = ggml_mul_mat(ctx0, k, q);
attn = ggml_mul(ctx0, attn, decay_mask);
attn = ggml_mul(ctx0, attn, ggml_add(ctx0, identity, causal_mask));
cb(attn, "attn_decay_key", il);
ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
// v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay);
cb(v_prime, "v_prime", il);
// v_new = v_i - v_prime
ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v, v_prime), v_prime);
ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
cb(v_new, "v_new", il);
// attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
ggml_tensor * q_g_exp = ggml_mul(ctx0, q, gexp);
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
cb(attn_inter, "attn_inter", il);
// core_attn_out[:, :, i] = attn_inter + attn @ v_new
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);
cb(v_attn, "v_attn", il);
ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn);
cb(core_attn_out, "core_attn_out", il);
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
// g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
// key_gdiff = key * g_diff.unsqueeze(-1)
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * g_cum_last =
ggml_cont(ctx0, ggml_view_4d(ctx0, g_cumsum_t, g_cumsum_t->ne[0], 1, g_cumsum_t->ne[2], g_cumsum_t->ne[3],
g_cumsum_t->nb[1], g_cumsum_t->nb[2], g_cumsum_t->nb[3],
g_cumsum_t->nb[0] * (g_cumsum_t->ne[1] - 1)));
cb(g_cum_last, "g_cum_last", il);
ggml_tensor * gexp_last =
ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);
cb(gexp_last, "gexp_last", il);
ggml_tensor * g_cum_last_3d =
ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);
cb(g_cum_last_3d, "g_cum_last_3d", il);
ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cumsum, g_cumsum->ne[0], g_cumsum->ne[2], g_cumsum->ne[3]);
cb(g_cumsum_3d, "g_cumsum_3d", il);
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));
cb(g_diff, "g_diff", il);
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
cb(g_diff_exp, "g_diff_exp", il);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k,
ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
g_diff_exp->ne[2] * g_diff_exp->ne[3]));
cb(key_gdiff, "key_gdiff", il);
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));
cb(kgdmulvnew, "kgdmulvnew", il);
state = ggml_add(ctx0, ggml_mul(ctx0, state, gexp_last), kgdmulvnew);
// core_attn_out should be [S_v, 1, H_v, n_seqs] after this
cb(core_attn_out, "output_tokens", il);
cb(state, "new_state", il); cb(state, "new_state", il);
// flatten output // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise
ggml_tensor * flat_output = ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs);
ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs); ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
return ggml_concat(ctx0, flat_output, flat_state, 0); return ggml_concat(ctx0, flat_output, flat_state, 0);
} }
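For single-token decoding the new autoregressive path applies the gated delta rule directly to the recurrent state, per head: decay the state by exp(g), predict the value from the state with the current key, correct the state by the beta-scaled prediction error as a rank-1 update with k, then read the output out with q, exactly as the inline comments quote from the reference implementation. A minimal per-head sketch, assuming the state is stored as a d x d matrix indexed [key_dim][value_dim]:

#include <cmath>
#include <cstdio>
#include <vector>

using vec = std::vector<float>;
using mat = std::vector<vec>; // S[key_dim][value_dim]

// One gated delta-rule step for a single head and a single token.
static vec delta_step(mat & S, const vec & q, const vec & k, const vec & v,
                      float g, float beta) {
    const size_t d = v.size();
    const float decay = std::exp(g);
    for (auto & row : S)
        for (float & s : row) s *= decay;             // S <- exp(g) * S

    vec kv(d, 0.0f);                                  // kv = S^T k (prediction of v)
    for (size_t i = 0; i < S.size(); ++i)
        for (size_t j = 0; j < d; ++j) kv[j] += S[i][j] * k[i];

    for (size_t i = 0; i < S.size(); ++i)             // S <- S + k (outer) beta*(v - kv)
        for (size_t j = 0; j < d; ++j) S[i][j] += k[i] * beta * (v[j] - kv[j]);

    vec out(d, 0.0f);                                 // out = S^T q
    for (size_t i = 0; i < S.size(); ++i)
        for (size_t j = 0; j < d; ++j) out[j] += S[i][j] * q[i];
    return out;
}

int main() {
    const size_t d = 3;
    mat S(d, vec(d, 0.0f));
    vec q = {1, 0, 0}, k = {0.5f, 0.5f, 0}, v = {1, 2, 3};
    vec out = delta_step(S, q, k, v, /*g=*/-0.1f, /*beta=*/0.9f);
    for (float o : out) std::printf("%.3f ", o);
    std::printf("\n");
    return 0;
}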
@ -712,6 +528,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * cur, ggml_tensor * cur,
ggml_tensor * causal_mask, ggml_tensor * causal_mask,
ggml_tensor * identity, ggml_tensor * identity,
ggml_tensor * diag_mask,
int il) { int il) {
const auto * mctx_cur = inp->mctx; const auto * mctx_cur = inp->mctx;
@ -737,11 +554,11 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(mixed_ba, "linear_attn_mixed_ba", il); cb(mixed_ba, "linear_attn_mixed_ba", il);
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads); int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
ggml_tensor * mixed_qkvz_reshaped = ggml_cont_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs); ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads] // Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
int64_t ba_new_dim = 2 * num_v_heads / num_k_heads; int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
ggml_tensor * mixed_ba_reshaped = ggml_cont_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs); ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Split mixed_ba into b and a (beta and alpha parameters) // Split mixed_ba into b and a (beta and alpha parameters)
int64_t split_sizes_ba[2] = { int64_t split_sizes_ba[2] = {
@ -762,8 +579,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * beta = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs); ggml_tensor * beta = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs); ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
GGML_ASSERT(ggml_nelements(beta) + ggml_nelements(alpha) == ggml_nelements(mixed_ba));
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased); ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
cb(alpha_softplus, "a_softplus", il); cb(alpha_softplus, "a_softplus", il);
@ -799,9 +614,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float)); (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
cb(z, "z", il); cb(z, "z", il);
GGML_ASSERT(ggml_nelements(query) + ggml_nelements(key) + ggml_nelements(value) + ggml_nelements(z) ==
ggml_nelements(mixed_qkvz));
// After creating query, key, and value_reshaped, reshape each to flatten the head dimensions // After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
// query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs] // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs); ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
@ -925,10 +737,13 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(k_conv, "k_conv_predelta", il); cb(k_conv, "k_conv_predelta", il);
cb(v_conv, "v_conv_predelta", il); cb(v_conv, "v_conv_predelta", il);
// Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens // Choose between build_delta_net_chunking and build_delta_net_autoregressive based on n_tokens
ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ? ggml_tensor * attn_out;
build_delta_net_chunking (q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il) : if (n_seq_tokens == 1) {
build_delta_net_recurrent(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il); attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
} else {
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
}
cb(attn_out, "attn_out", il); cb(attn_out, "attn_out", il);
// The tensors were concatenated 1d, so we need to extract them 1d as well // The tensors were concatenated 1d, so we need to extract them 1d as well
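A single token now takes the autoregressive path and anything longer goes through the chunked path with the precomputed masks. Both variants hand their result back as one flat tensor, the per-token outputs followed by the updated recurrent state concatenated along dim 0, which the caller slices apart again by element counts. A trivial sketch of that pack/unpack convention on plain buffers:

#include <cstdio>
#include <vector>

// Pack outputs and state into one flat buffer, then split them back by size.
int main() {
    std::vector<float> out   = {1, 2, 3, 4};          // stands in for S_v*H_v*n_tokens*n_seqs values
    std::vector<float> state = {9, 8, 7, 6, 5, 4};    // stands in for S_v*S_v*H_v*n_seqs values

    std::vector<float> packed;
    packed.insert(packed.end(), out.begin(), out.end());
    packed.insert(packed.end(), state.begin(), state.end());

    std::vector<float> out2(packed.begin(), packed.begin() + out.size());
    std::vector<float> state2(packed.begin() + out.size(), packed.end());

    std::printf("out2 size = %zu, state2 size = %zu\n", out2.size(), state2.size());
    return 0;
}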

View File

@ -3588,6 +3588,163 @@ static void test_template_output_peg_parsers() {
t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})"; t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})";
}); });
} }
{
// NVIDIA Nemotron-3 Nano
auto tmpls = read_templates("models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja");
// Test basic message
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input = "Hello, world!\nWhat's up?";
t.expect = message_assist;
});
// Test basic message and reasoning with reasoning_format = none
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
t.expect.content = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
});
// Test basic message and reasoning with reasoning_format = auto
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
t.params.enable_thinking = true;
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.expect = message_assist_thoughts;
});
// Test tool call
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"<tool_call>\n"
"<function=special_function>\n"
"<parameter=arg1>\n"
"1\n"
"</parameter>\n"
"</function>\n"
"</tool_call>";
t.params.enable_thinking = false;
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.tools = {special_function_tool};
t.expect = message_assist_call;
});
// Test tool call with reasoning
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"I'm\nthinking\n</think>\n"
"<tool_call>\n"
"<function=special_function>\n"
"<parameter=arg1>\n"
"1\n"
"</parameter>\n"
"</function>\n"
"</tool_call>";
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.tools = {special_function_tool};
t.expect = message_assist_call_thoughts;
});
// Test parallel tool calls
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"<tool_call>\n"
"<function=special_function>\n"
"<parameter=arg1>\n"
"1\n"
"</parameter>\n"
"</function>\n"
"</tool_call>\n"
"<tool_call>\n"
"<function=special_function_with_opt>\n"
"<parameter=arg1>\n"
"1\n"
"</parameter>\n"
"<parameter=arg2>\n"
"2\n"
"</parameter>\n"
"</function>\n"
"</tool_call>";
t.params.enable_thinking = false;
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.parallel_tool_calls = true;
t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
t.expect.tool_calls = {{
/* .name = */ "special_function",
/* .arguments = */ R"({"arg1": 1})",
/* .id = */ {},
}, {
/* .name = */ "special_function_with_opt",
/* .arguments = */ R"({"arg1": 1, "arg2": 2})",
/* .id = */ {},
}};
});
// Test tool call with string parameter
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"<tool_call>\n"
"<function=python>\n"
"<parameter=code>\n"
"def hello():\n"
" print(\"Hello, world!\")\n"
"\n"
"hello()\n"
"</parameter>\n"
"</function>\n"
"</tool_call>";
t.params.enable_thinking = false;
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.tools = {python_tool};
t.expect.tool_calls = {{
/* .name = */ "python",
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
/* .id = */ {},
}};
});
// Test tool call with string parameter and no closing </parameter> tag
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"<tool_call>\n"
"<function=python>\n"
"<parameter=code>\n"
"def hello():\n"
" print(\"Hello, world!\")\n"
"\n"
"hello()\n"
"</function>\n"
"</tool_call>";
t.params.enable_thinking = false;
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.tools = {python_tool};
t.expect.tool_calls = {{
/* .name = */ "python",
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
/* .id = */ {},
}};
});
// Test response format
test_peg_parser(tmpls.get(), [&](auto & t) {
t.input =
"I need to output the invoice details in JSON\n"
"</think>\n"
R"({"amount": 123.45, "date": "2025-12-03"})";
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
t.params.json_schema = invoice_schema;
t.expect.reasoning_content = "I need to output the invoice details in JSON";
t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})";
});
}
} }
static void test_msg_diffs_compute() { static void test_msg_diffs_compute() {
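These cases pin down the Nemotron tool-call syntax: each call is a <tool_call> wrapping a <function=NAME> block whose <parameter=KEY> bodies become the JSON arguments, numbers staying numeric and multi-line bodies becoming strings, and the second python case checks that a missing </parameter> is tolerated by treating </function> as the terminator. A rough string-search sketch of just that tolerance rule, not the actual PEG grammar used by the parser:

#include <cstdio>
#include <string>

// Extract the body of <parameter=KEY> ..., stopping at </parameter> if present,
// otherwise falling back to the enclosing </function> (illustrative only).
static std::string param_body(const std::string & block, const std::string & key) {
    const std::string open = "<parameter=" + key + ">\n";
    size_t start = block.find(open);
    if (start == std::string::npos) return "";
    start += open.size();
    size_t end = block.find("\n</parameter>", start);
    if (end == std::string::npos) end = block.find("\n</function>", start);
    if (end == std::string::npos) end = block.size();
    return block.substr(start, end - start);
}

int main() {
    std::string call =
        "<tool_call>\n<function=python>\n<parameter=code>\n"
        "def hello():\n    print(\"Hello, world!\")\n\nhello()\n"
        "</function>\n</tool_call>";
    std::printf("%s\n", param_body(call, "code").c_str());
    return 0;
}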

View File

@ -1367,10 +1367,85 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
}); });
} }
static void test_resolves_to_string() {
fprintf(stderr, "#\n# Testing resolves_to_string\n#\n");
auto test = [](const std::string & name, const std::string & schema_str, bool expected) {
fprintf(stderr, "- %s\n", name.c_str());
common_schema_info info;
auto schema = nlohmann::ordered_json::parse(schema_str);
info.resolve_refs(schema);
bool result = info.resolves_to_string(schema);
if (result != expected) {
fprintf(stderr, "#\n# Test '%s' failed.\n#\n", name.c_str());
fprintf(stderr, "Schema: %s\n", schema_str.c_str());
fprintf(stderr, "Expected: %s, Got: %s\n", expected ? "true" : "false", result ? "true" : "false");
assert(false);
}
};
// Basic type checks
test("type string", R"({"type": "string"})", true);
test("type integer", R"({"type": "integer"})", false);
test("type number", R"({"type": "number"})", false);
test("type boolean", R"({"type": "boolean"})", false);
test("type object", R"({"type": "object"})", false);
test("type array", R"({"type": "array"})", false);
// Type array (nullable string)
test("type array with string", R"({"type": ["string", "null"]})", true);
test("type array without string", R"({"type": ["integer", "null"]})", false);
// String-specific keywords
test("minLength implies string", R"({"minLength": 1})", true);
test("maxLength implies string", R"({"maxLength": 10})", true);
test("pattern implies string", R"({"pattern": "^[a-z]+$"})", true);
// Format
test("format date", R"({"format": "date"})", true);
test("format uuid", R"({"format": "uuid"})", true);
test("format email", R"({"format": "email"})", true);
// Const
test("const string", R"({"const": "hello"})", true);
test("const number", R"({"const": 123})", false);
// Enum
test("enum with strings", R"({"enum": ["a", "b", "c"]})", true);
test("enum with numbers", R"({"enum": [1, 2, 3]})", false);
test("enum mixed with string", R"({"enum": [1, "a", null]})", true);
// anyOf
test("anyOf with string", R"({"anyOf": [{"type": "string"}, {"type": "integer"}]})", true);
test("anyOf without string", R"({"anyOf": [{"type": "integer"}, {"type": "boolean"}]})", false);
// oneOf
test("oneOf with string", R"({"oneOf": [{"type": "string"}, {"type": "number"}]})", true);
test("oneOf without string", R"({"oneOf": [{"type": "object"}, {"type": "array"}]})", false);
// allOf - all must be strings
test("allOf all strings", R"({"allOf": [{"type": "string"}, {"minLength": 1}]})", true);
test("allOf mixed types", R"({"allOf": [{"type": "string"}, {"type": "integer"}]})", false);
// $ref
test("$ref to string",
R"({"$ref": "#/$defs/str", "$defs": {"str": {"type": "string"}}})", true);
test("$ref to integer",
R"({"$ref": "#/$defs/num", "$defs": {"num": {"type": "integer"}}})", false);
// Nested
test("nested anyOf with string",
R"({"anyOf": [{"anyOf": [{"type": "integer"}, {"type": "string"}]}, {"type": "boolean"}]})", true);
fprintf(stderr, "All resolves_to_string tests passed!\n");
}
int main() { int main() {
fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false"); fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false"); fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
test_resolves_to_string();
test_all("C++", [](const TestCase & tc) { test_all("C++", [](const TestCase & tc) {
try { try {
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true)); tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
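The new tests spell out the contract of resolves_to_string: a schema is string-like if its type is or contains "string", if it carries string-only keywords (minLength, maxLength, pattern, format), if its const or any enum member is a string, if any anyOf/oneOf branch resolves to a string, if every allOf branch does, or if a $ref target does. A compact re-implementation of those rules exactly as the tests exercise them, written against nlohmann::json as a sketch rather than the real helper, and without the $ref resolution that resolve_refs performs:

#include <nlohmann/json.hpp>
#include <cassert>

using json = nlohmann::ordered_json;

// String-likeness rules as exercised by the tests above ($ref resolution omitted).
static bool looks_like_string(const json & s) {
    if (!s.is_object()) return false;
    if (s.contains("type")) {
        const auto & t = s["type"];
        if (t.is_string()) return t == "string";
        if (t.is_array()) {
            for (const auto & v : t) if (v == "string") return true;
            return false;
        }
    }
    if (s.contains("minLength") || s.contains("maxLength") ||
        s.contains("pattern")   || s.contains("format")) return true;
    if (s.contains("const")) return s["const"].is_string();
    if (s.contains("enum")) {
        for (const auto & v : s["enum"]) if (v.is_string()) return true;
        return false;
    }
    for (const char * key : {"anyOf", "oneOf"}) {
        if (s.contains(key)) {
            for (const auto & sub : s[key]) if (looks_like_string(sub)) return true;
            return false;
        }
    }
    if (s.contains("allOf")) {
        for (const auto & sub : s["allOf"]) if (!looks_like_string(sub)) return false;
        return true;
    }
    return false;
}

int main() {
    assert( looks_like_string(json::parse(R"({"type": "string"})")));
    assert( looks_like_string(json::parse(R"({"enum": [1, "a", null]})")));
    assert(!looks_like_string(json::parse(R"({"allOf": [{"type": "string"}, {"type": "integer"}]})")));
    return 0;
}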

View File

@ -87,9 +87,6 @@ int main(int argc, char ** argv) {
common_params params; common_params params;
g_params = &params; g_params = &params;
// disable jinja by default
params.use_jinja = false;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
return 1; return 1;
} }

View File

@ -15,6 +15,7 @@ add_library(mtmd
clip-graph.h clip-graph.h
models/models.h models/models.h
models/cogvlm.cpp models/cogvlm.cpp
models/glm4v.cpp
models/internvl.cpp models/internvl.cpp
models/kimivl.cpp models/kimivl.cpp
models/llama4.cpp models/llama4.cpp

View File

@ -9,6 +9,8 @@
#include <vector> #include <vector>
#include <functional> #include <functional>
#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)
struct clip_graph { struct clip_graph {
const clip_model & model; const clip_model & model;
const clip_hparams & hparams; const clip_hparams & hparams;
@ -49,7 +51,7 @@ struct clip_graph {
void cb(ggml_tensor * cur0, const char * name, int il) const; void cb(ggml_tensor * cur0, const char * name, int il) const;
// siglip2 naflex // siglip2 naflex
ggml_tensor * resize_position_embeddings(); ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE);
// build vision transformer (ViT) cgraph // build vision transformer (ViT) cgraph
// this function should cover most of the models // this function should cover most of the models

View File

@ -68,6 +68,7 @@
#define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename tensor with ".0" postfix for backward compat #define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename tensor with ".0" postfix for backward compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias" #define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_NORM_EMBD "v.norm_embd.%s"
#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s" #define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
@ -86,6 +87,10 @@
#define TN_LN_PRE "%s.pre_ln.%s" #define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s" #define TN_LN_POST "%s.post_ln.%s"
#define TN_LLAVA_PROJ "mm.%d.%s" #define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MM_UP "mm.up.%s"
#define TN_MM_GATE "mm.gate.%s"
#define TN_MM_DOWN "mm.down.%s"
#define TN_MM_POST_NORM "mm.post_norm.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
@ -95,7 +100,7 @@
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1 #define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral #define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model) #define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model) #define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
@ -165,6 +170,7 @@ enum projector_type {
PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_LIGHTONOCR,
PROJECTOR_TYPE_COGVLM, PROJECTOR_TYPE_COGVLM,
PROJECTOR_TYPE_JANUS_PRO, PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_UNKNOWN,
}; };
@ -192,6 +198,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
{ PROJECTOR_TYPE_COGVLM, "cogvlm"}, { PROJECTOR_TYPE_COGVLM, "cogvlm"},
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
}; };
static projector_type clip_projector_type_from_string(const std::string & str) { static projector_type clip_projector_type_from_string(const std::string & str) {
@ -495,6 +502,8 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
} }
} }
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value);
// //
// API used internally with mtmd // API used internally with mtmd
// //

View File

@ -158,6 +158,8 @@ struct clip_model {
ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL) ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
ggml_tensor * patch_bias = nullptr; ggml_tensor * patch_bias = nullptr;
ggml_tensor * position_embeddings = nullptr; ggml_tensor * position_embeddings = nullptr;
ggml_tensor * norm_embd_w = nullptr;
ggml_tensor * norm_embd_b = nullptr;
ggml_tensor * pre_ln_w = nullptr; ggml_tensor * pre_ln_w = nullptr;
ggml_tensor * pre_ln_b = nullptr; ggml_tensor * pre_ln_b = nullptr;
@ -172,6 +174,14 @@ struct clip_model {
ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
ggml_tensor * mm_fc_w; ggml_tensor * mm_fc_w;
ggml_tensor * mm_fc_b; ggml_tensor * mm_fc_b;
ggml_tensor * mm_ffn_up_w = nullptr;
ggml_tensor * mm_ffn_up_b = nullptr;
ggml_tensor * mm_ffn_gate_w = nullptr;
ggml_tensor * mm_ffn_gate_b = nullptr;
ggml_tensor * mm_ffn_down_w = nullptr;
ggml_tensor * mm_ffn_down_b = nullptr;
ggml_tensor * mm_post_norm_w = nullptr;
ggml_tensor * mm_post_norm_b = nullptr;
// LLaVA projection // LLaVA projection
ggml_tensor * mm_input_norm_w = nullptr; ggml_tensor * mm_input_norm_w = nullptr;
@ -253,9 +263,10 @@ struct clip_model {
ggml_tensor * mm_input_proj_w = nullptr; ggml_tensor * mm_input_proj_w = nullptr;
ggml_tensor * mm_soft_emb_norm_w = nullptr; ggml_tensor * mm_soft_emb_norm_w = nullptr;
// pixtral // pixtral, glm4v
ggml_tensor * token_embd_img_break = nullptr; ggml_tensor * token_embd_img_break = nullptr;
ggml_tensor * mm_patch_merger_w = nullptr; ggml_tensor * mm_patch_merger_w = nullptr;
ggml_tensor * mm_patch_merger_b = nullptr;
// ultravox / whisper encoder // ultravox / whisper encoder
ggml_tensor * conv1d_1_w = nullptr; ggml_tensor * conv1d_1_w = nullptr;

View File

@ -264,11 +264,11 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
} }
// siglip2 naflex // siglip2 naflex
ggml_tensor * clip_graph::resize_position_embeddings() { ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
ggml_tensor * pos_embd = model.position_embeddings; ggml_tensor * pos_embd = model.position_embeddings;
const int height = img.ny / patch_size; const int height = img.ny / patch_size;
const int width = img.nx / patch_size; const int width = img.nx / patch_size;
const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS; const uint32_t mode = interpolation_mode;
const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
GGML_ASSERT(pos_embd); GGML_ASSERT(pos_embd);
@ -485,19 +485,14 @@ ggml_tensor * clip_graph::build_norm(
? ggml_rms_norm(ctx0, cur, norm_eps) ? ggml_rms_norm(ctx0, cur, norm_eps)
: ggml_norm(ctx0, cur, norm_eps); : ggml_norm(ctx0, cur, norm_eps);
if (mw || mb) {
cb(cur, "norm", il);
}
if (mw) { if (mw) {
cur = ggml_mul(ctx0, cur, mw); cur = ggml_mul(ctx0, cur, mw);
if (mb) { cb(cur, "norm_w", il);
cb(cur, "norm_w", il);
}
} }
if (mb) { if (mb) {
cur = ggml_add(ctx0, cur, mb); cur = ggml_add(ctx0, cur, mb);
cb(cur, "norm_b", il);
} }
return cur; return cur;
@ -842,6 +837,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{ {
builder = std::make_unique<clip_graph_llava>(ctx, img); builder = std::make_unique<clip_graph_llava>(ctx, img);
} break; } break;
case PROJECTOR_TYPE_GLM4V:
{
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
} break;
default: default:
GGML_ABORT("missing cgraph builder"); GGML_ABORT("missing cgraph builder");
} }
@ -1155,6 +1154,14 @@ struct clip_model_loader {
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__); LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
} }
} break; } break;
case PROJECTOR_TYPE_GLM4V:
{
hparams.rope_theta = 10000.0f;
hparams.n_merge = 2; // default value for GLM4-V
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
hparams.set_limit_image_tokens(8, 4096);
hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_LLAMA4: case PROJECTOR_TYPE_LLAMA4:
{ {
hparams.rope_theta = 10000.0f; hparams.rope_theta = 10000.0f;
@ -1282,6 +1289,9 @@ struct clip_model_loader {
model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false);
model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false);
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
// layers // layers
@ -1470,6 +1480,20 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break; } break;
case PROJECTOR_TYPE_GLM4V:
{
model.projection = get_tensor(TN_MM_PROJECTOR);
model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight"));
model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false);
model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false);
model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight"));
model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false);
model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false);
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"));
model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias"));
} break;
case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3:
{ {
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
@ -1498,8 +1522,8 @@ struct clip_model_loader {
// [IMG_BREAK] token embedding // [IMG_BREAK] token embedding
model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
// for mistral small 3.1 // for mistral small 3.1
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false); model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
} break; } break;
case PROJECTOR_TYPE_LIGHTONOCR: case PROJECTOR_TYPE_LIGHTONOCR:
{ {
@ -1507,8 +1531,8 @@ struct clip_model_loader {
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false); model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
} break; } break;
case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_ULTRAVOX:
{ {
@ -1873,6 +1897,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
if (ctx_params.warmup) { if (ctx_params.warmup) {
loader.warmup(*ctx_vision); loader.warmup(*ctx_vision);
} }
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
} }
if (loader.has_audio) { if (loader.has_audio) {
@ -2582,6 +2608,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
{ {
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
clip_image_u8 resized; clip_image_u8 resized;
@ -2824,16 +2851,30 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->model.hparams; const auto & params = ctx->model.hparams;
const int n_total = clip_n_output_tokens(ctx, img); const int n_total = clip_n_output_tokens(ctx, img);
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) { const auto & proj = ctx->proj_type();
return img->nx / (params.patch_size * 2); switch (proj) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
return (img->nx / params.patch_size) / 2;
default:
break;
} }
return n_total; return n_total;
} }
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->model.hparams; const auto & params = ctx->model.hparams;
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) { const auto & proj = ctx->proj_type();
return img->ny / (params.patch_size * 2); switch (proj) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
return (img->ny / params.patch_size) / 2;
default:
break;
} }
return 1; return 1;
} }
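For the mrope projectors the output grid is the patch grid downsampled by the 2x2 merge, so each axis contributes (pixels / patch_size) / 2 tokens. A quick worked example with hypothetical numbers, a 672x448 preprocessed image and a 14-pixel patch: 672/14 = 48 and 448/14 = 32 patches, giving a 24x16 grid and 384 output tokens.

#include <cstdio>

int main() {
    const int patch_size = 14;    // hypothetical ViT patch size
    const int nx = 672, ny = 448; // hypothetical preprocessed image size
    const int tok_x = (nx / patch_size) / 2;
    const int tok_y = (ny / patch_size) / 2;
    std::printf("grid %dx%d -> %d output tokens\n", tok_x, tok_y, tok_x * tok_y);
    return 0;
}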
@ -2890,6 +2931,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
{ {
// dynamic size (2 conv, so double patch size) // dynamic size (2 conv, so double patch size)
int x_patch = img->nx / (params.patch_size * 2); int x_patch = img->nx / (params.patch_size * 2);
@ -3137,6 +3179,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} break; } break;
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
{ {
const int merge_ratio = hparams.n_merge; const int merge_ratio = hparams.n_merge;
const int pw = image_size_width / patch_size; const int pw = image_size_width / patch_size;
@ -3363,7 +3406,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
// copy the embeddings to the location passed by the user // copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); if (vec != nullptr) {
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
}
return true; return true;
} }
@ -3411,6 +3456,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_2_w->ne[1]; return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_COGVLM: case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1]; return ctx->model.mm_4h_to_h_w->ne[1];
case PROJECTOR_TYPE_GLM4V:
return ctx->model.mm_ffn_down_w->ne[1];
default: default:
GGML_ABORT("Unknown projector type"); GGML_ABORT("Unknown projector type");
} }
@ -3427,10 +3474,11 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
} }
bool clip_is_qwen2vl(const struct clip_ctx * ctx) { bool clip_is_mrope(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL; || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
|| ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
} }
bool clip_is_llava(const struct clip_ctx * ctx) { bool clip_is_llava(const struct clip_ctx * ctx) {
@ -3491,3 +3539,22 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
return &ctx->model.hparams; return &ctx->model.hparams;
} }
//
// API for debugging
//
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
clip_image_f32 img;
img.nx = w;
img.ny = h;
img.buf.resize(h * w * 3);
for (int i = 0; i < h * w * 3; i++) {
img.buf[i] = static_cast<float>(fill_value);
}
bool cur_debug_graph = ctx->debug_graph;
ctx->debug_graph = true;
clip_image_encode(ctx, 1, &img, nullptr);
ctx->debug_graph = cur_debug_graph;
GGML_ASSERT(img.buf.empty() && "expected, always stop here");
}

View File

@ -104,7 +104,7 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
int clip_is_minicpmv(const struct clip_ctx * ctx); int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_qwen2vl(const struct clip_ctx * ctx); bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_is_gemma3(const struct clip_ctx * ctx); bool clip_is_gemma3(const struct clip_ctx * ctx);

120
tools/mtmd/models/glm4v.cpp Normal file
View File

@ -0,0 +1,120 @@
#include "models.h"
ggml_cgraph * clip_graph_glm4v::build() {
GGML_ASSERT(model.patch_bias != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
norm_type norm_t = NORM_TYPE_RMS;
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
n_embd, n_patches_x * n_patches_y, batch_size);
}
// add patch bias
inp = ggml_add(ctx0, inp, model.patch_bias);
cb(inp, "patch_bias", -1);
// pos-conv norm
inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);
// calculate absolute position embedding and apply
ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
learned_pos_embd = ggml_cont_4d(
ctx0, learned_pos_embd,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
learned_pos_embd = ggml_reshape_4d(
ctx0, learned_pos_embd,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
learned_pos_embd = ggml_cont_3d(
ctx0, learned_pos_embd,
n_embd, n_patches_x * n_patches_y, batch_size);
cb(learned_pos_embd, "learned_pos_embd", -1);
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
return ggml_rope_multi(
ctx0, cur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
32768, hparams.rope_theta, 1, 0, 1, 32, 1);
};
ggml_tensor * cur = build_vit(
inp, n_patches,
norm_t,
hparams.ffn_op,
learned_pos_embd,
add_pos);
cb(cur, "vit_out", -1);
// cb(ggml_sum(ctx0, cur), "vit_out_sum", -1);
// GLM4V projector
// ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130
// patch merger (downsample)
{
int n_merge = hparams.n_merge;
GGML_ASSERT(n_merge > 0);
int n_token_out = n_patches / n_merge / n_merge;
cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out]
cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1);
cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out]
cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
}
// FC projector
{
cur = ggml_mul_mat(ctx0, model.projection, cur);
// default LayerNorm (post_projection_norm)
cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
cur = ggml_gelu_erf(ctx0, cur);
cb(cur, "after_fc_proj", -1);
}
// FFN projector
{
cur = build_ffn(cur,
model.mm_ffn_up_w, model.mm_ffn_up_b,
model.mm_ffn_gate_w, model.mm_ffn_gate_b,
model.mm_ffn_down_w, model.mm_ffn_down_b,
hparams.ffn_op, -1);
cb(cur, "after_ffn_proj", -1);
// cb(ggml_sum(ctx0, cur), "merged_sum", -1);
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@ -56,3 +56,8 @@ struct clip_graph_whisper_enc : clip_graph {
clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override; ggml_cgraph * build() override;
}; };
struct clip_graph_glm4v : clip_graph {
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};

View File

@ -270,8 +270,6 @@ int main(int argc, char ** argv) {
ggml_time_init(); ggml_time_init();
common_params params; common_params params;
params.use_jinja = false; // disable jinja by default
params.sampling.temp = 0.2; // lower temp by default for better quality
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) { if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
return 1; return 1;

View File

@ -217,7 +217,7 @@ struct mtmd_context {
void init_vision() { void init_vision() {
GGML_ASSERT(ctx_v != nullptr); GGML_ASSERT(ctx_v != nullptr);
use_mrope = clip_is_qwen2vl(ctx_v); use_mrope = clip_is_mrope(ctx_v);
projector_type proj = clip_get_projector_type(ctx_v); projector_type proj = clip_get_projector_type(ctx_v);
int minicpmv_version = clip_is_minicpmv(ctx_v); int minicpmv_version = clip_is_minicpmv(ctx_v);
@ -309,6 +309,10 @@ struct mtmd_context {
img_beg = "<|image_start|>"; img_beg = "<|image_start|>";
img_end = "<|image_end|>"; img_end = "<|image_end|>";
} else if (proj == PROJECTOR_TYPE_GLM4V) {
img_beg = "<|begin_of_image|>";
img_end = "<|end_of_image|>";
} }
} }

View File

@ -52,7 +52,6 @@ For the full list of features, please refer to [server's changelog](https://githu
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) | | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) | | `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) | | `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) | | `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
| `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
@ -67,11 +66,10 @@ For the full list of features, please refer to [server's changelog](https://githu
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) | | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) | | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) | | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_HOST) | | `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) | | `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
@ -150,19 +148,20 @@ For the full list of features, please refer to [server's changelog](https://githu
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
**Example-specific params** **Server-specific params**
| Argument | Explanation | | Argument | Explanation |
| -------- | ----------- | | -------- | ----------- |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) | | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) | | `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `--kv-unified, -kvu` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
| `-sp, --special` | special tokens output enabled (default: false) | | `-sp, --special` | special tokens output enabled (default: false) |
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) | | `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) | | `-np, --parallel N` | number of server slots (default: -1, -1 = auto)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) | | `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) | | `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) | | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
@ -1430,7 +1429,7 @@ Model presets allow advanced users to define custom configurations using an `.in
llama-server --models-preset ./my-models.ini llama-server --models-preset ./my-models.ini
``` ```
Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layer 123` is written as `n-gpu-layer = 123`. Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layers 123` is written as `n-gpu-layers = 123`.
Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys. Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys.
@ -1445,7 +1444,7 @@ version = 1
; string value ; string value
chat-template = chatml chat-template = chatml
; numeric value ; numeric value
n-gpu-layer = 123 n-gpu-layers = 123
; flag value (for certain flags, you need to use the "no-" prefix for negation) ; flag value (for certain flags, you need to use the "no-" prefix for negation)
jinja = true jinja = true
; shorthand argument (for example, context size) ; shorthand argument (for example, context size)

Binary file not shown.

View File

@ -73,12 +73,17 @@ int main(int argc, char ** argv, char ** envp) {
return 1; return 1;
} }
// TODO: should we have a separate n_parallel parameter for the server? // validate batch size for embeddings
// https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 // embeddings require all tokens to be processed in a single ubatch
// TODO: this is a common configuration that is suitable for most local use cases // see https://github.com/ggml-org/llama.cpp/issues/12836
// however, overriding the parameters is a bit confusing - figure out something more intuitive if (params.embedding && params.n_batch > params.n_ubatch) {
if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { LOG_WRN("%s: embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", __func__, params.n_batch, params.n_ubatch);
LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); LOG_WRN("%s: setting n_batch = n_ubatch = %d to avoid assertion failure\n", __func__, params.n_ubatch);
params.n_batch = params.n_ubatch;
}
if (params.n_parallel < 0) {
LOG_INF("%s: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n", __func__);
params.n_parallel = 4; params.n_parallel = 4;
params.kv_unified = true; params.kv_unified = true;

View File

@ -619,11 +619,12 @@ flowchart TB
### Test Types ### Test Types
| Type | Tool | Location | Command | | Type | Tool | Location | Command |
| ------------- | ------------------ | -------------------------------- | ------------------- | | ------------- | ------------------ | ---------------- | ------------------- |
| **E2E** | Playwright | `tests/e2e/` | `npm run test:e2e` | | **Unit** | Vitest | `tests/unit/` | `npm run test:unit` |
| **Unit** | Vitest | `tests/client/`, `tests/server/` | `npm run test:unit` | | **UI/Visual** | Storybook + Vitest | `tests/stories/` | `npm run test:ui` |
| **UI/Visual** | Storybook + Vitest | `tests/stories/` | `npm run test:ui` | | **E2E** | Playwright | `tests/e2e/` | `npm run test:e2e` |
| **Client** | Vitest | `tests/client/` | `npm run test:client` |
### Running Tests ### Running Tests

View File

@ -13,12 +13,11 @@
"reset": "rm -rf .svelte-kit node_modules", "reset": "rm -rf .svelte-kit node_modules",
"format": "prettier --write .", "format": "prettier --write .",
"lint": "prettier --check . && eslint .", "lint": "prettier --check . && eslint .",
"test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:server -- --run && npm run test:e2e", "test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:unit -- --run && npm run test:e2e",
"test:e2e": "playwright test", "test:e2e": "playwright test",
"test:client": "vitest --project=client", "test:client": "vitest --project=client",
"test:server": "vitest --project=server", "test:unit": "vitest --project=unit",
"test:ui": "vitest --project=ui", "test:ui": "vitest --project=ui",
"test:unit": "vitest",
"storybook": "storybook dev -p 6006", "storybook": "storybook dev -p 6006",
"build-storybook": "storybook build", "build-storybook": "storybook build",
"cleanup": "rm -rf .svelte-kit build node_modules test-results" "cleanup": "rm -rf .svelte-kit build node_modules test-results"

View File

@ -241,7 +241,7 @@
</div> </div>
{/if} {/if}
{:else if (isText || (isPdf && pdfViewMode === 'text')) && displayTextContent} {:else if (isText || (isPdf && pdfViewMode === 'text')) && displayTextContent}
<SyntaxHighlightedCode code={displayTextContent} {language} maxWidth="69rem" /> <SyntaxHighlightedCode code={displayTextContent} {language} maxWidth="calc(69rem - 2rem)" />
{:else if isAudio} {:else if isAudio}
<div class="flex items-center justify-center p-8"> <div class="flex items-center justify-center p-8">
<div class="w-full max-w-md text-center"> <div class="w-full max-w-md text-center">

View File

@ -24,7 +24,7 @@
MimeTypeImage, MimeTypeImage,
MimeTypeText MimeTypeText
} from '$lib/enums'; } from '$lib/enums';
import { isIMEComposing } from '$lib/utils'; import { isIMEComposing, parseClipboardContent } from '$lib/utils';
import { import {
AudioRecorder, AudioRecorder,
convertToWav, convertToWav,
@ -191,7 +191,6 @@
if ((!message.trim() && uploadedFiles.length === 0) || disabled || isLoading) return; if ((!message.trim() && uploadedFiles.length === 0) || disabled || isLoading) return;
// Check if model is selected first
if (!checkModelSelected()) return; if (!checkModelSelected()) return;
const messageToSend = message.trim(); const messageToSend = message.trim();
@ -228,6 +227,31 @@
const text = event.clipboardData.getData(MimeTypeText.PLAIN); const text = event.clipboardData.getData(MimeTypeText.PLAIN);
if (text.startsWith('"')) {
const parsed = parseClipboardContent(text);
if (parsed.textAttachments.length > 0) {
event.preventDefault();
message = parsed.message;
const attachmentFiles = parsed.textAttachments.map(
(att) =>
new File([att.content], att.name, {
type: MimeTypeText.PLAIN
})
);
onFileUpload?.(attachmentFiles);
setTimeout(() => {
textareaRef?.focus();
}, 10);
return;
}
}
if ( if (
text.length > 0 && text.length > 0 &&
pasteLongTextToFileLength > 0 && pasteLongTextToFileLength > 0 &&

View File

@ -35,7 +35,7 @@
<div class="flex items-center gap-1 {className}"> <div class="flex items-center gap-1 {className}">
<DropdownMenu.Root> <DropdownMenu.Root>
<DropdownMenu.Trigger name="Attach files"> <DropdownMenu.Trigger name="Attach files" {disabled}>
<Tooltip.Root> <Tooltip.Root>
<Tooltip.Trigger> <Tooltip.Trigger>
<Button <Button

View File

@ -173,6 +173,7 @@
/> />
<ModelsSelector <ModelsSelector
{disabled}
bind:this={selectorModelRef} bind:this={selectorModelRef}
currentModel={conversationModel} currentModel={conversationModel}
forceForegroundText={true} forceForegroundText={true}

View File

@ -1,6 +1,7 @@
<script lang="ts"> <script lang="ts">
import { chatStore } from '$lib/stores/chat.svelte'; import { chatStore } from '$lib/stores/chat.svelte';
import { copyToClipboard, isIMEComposing } from '$lib/utils'; import { config } from '$lib/stores/settings.svelte';
import { copyToClipboard, isIMEComposing, formatMessageForClipboard } from '$lib/utils';
import ChatMessageAssistant from './ChatMessageAssistant.svelte'; import ChatMessageAssistant from './ChatMessageAssistant.svelte';
import ChatMessageUser from './ChatMessageUser.svelte'; import ChatMessageUser from './ChatMessageUser.svelte';
import ChatMessageSystem from './ChatMessageSystem.svelte'; import ChatMessageSystem from './ChatMessageSystem.svelte';
@ -87,7 +88,9 @@
} }
async function handleCopy() { async function handleCopy() {
await copyToClipboard(message.content, 'Message copied to clipboard'); const asPlainText = Boolean(config().copyTextAttachmentsAsPlainText);
const clipboardContent = formatMessageForClipboard(message.content, message.extra, asPlainText);
await copyToClipboard(clipboardContent, 'Message copied to clipboard');
onCopy?.(message); onCopy?.(message);
} }

View File

@ -57,6 +57,11 @@
label: 'Paste long text to file length', label: 'Paste long text to file length',
type: 'input' type: 'input'
}, },
{
key: 'copyTextAttachmentsAsPlainText',
label: 'Copy text attachments as plain text',
type: 'checkbox'
},
{ {
key: 'enableContinueGeneration', key: 'enableContinueGeneration',
label: 'Enable "Continue" button', label: 'Enable "Continue" button',
@ -109,6 +114,16 @@
key: 'disableAutoScroll', key: 'disableAutoScroll',
label: 'Disable automatic scroll', label: 'Disable automatic scroll',
type: 'checkbox' type: 'checkbox'
},
{
key: 'alwaysShowSidebarOnDesktop',
label: 'Always show sidebar on desktop',
type: 'checkbox'
},
{
key: 'autoShowSidebarOnNewChat',
label: 'Auto-show sidebar on new chat',
type: 'checkbox'
} }
] ]
}, },
@ -404,7 +419,7 @@
</div> </div>
<!-- Mobile Header with Horizontal Scrollable Menu --> <!-- Mobile Header with Horizontal Scrollable Menu -->
<div class="flex flex-col md:hidden"> <div class="flex flex-col pt-6 md:hidden">
<div class="border-b border-border/30 py-4"> <div class="border-b border-border/30 py-4">
<!-- Horizontal Scrollable Category Menu with Navigation --> <!-- Horizontal Scrollable Category Menu with Navigation -->
<div class="relative flex items-center" style="scroll-padding: 1rem;"> <div class="relative flex items-center" style="scroll-padding: 1rem;">

View File

@ -72,9 +72,10 @@
<div <div
class="code-preview-wrapper overflow-auto rounded-lg border border-border bg-muted {className}" class="code-preview-wrapper overflow-auto rounded-lg border border-border bg-muted {className}"
style="max-height: {maxHeight};" style="max-height: {maxHeight}; max-width: {maxWidth};"
> >
<pre class="m-0 overflow-x-auto p-4 max-w-[{maxWidth}]"><code class="hljs text-sm leading-relaxed" <!-- Needs to be formatted as single line for proper rendering -->
<pre class="m-0 overflow-x-auto p-4"><code class="hljs text-sm leading-relaxed"
>{@html highlightedHtml}</code >{@html highlightedHtml}</code
></pre> ></pre>
</div> </div>

View File

@ -179,51 +179,37 @@
}); });
}); });
// Handle changes to the model selector pop-down or the model dialog, depending on if the server is in
// router mode or not.
function handleOpenChange(open: boolean) { function handleOpenChange(open: boolean) {
if (loading || updating) return; if (loading || updating) return;
if (open) { if (isRouter) {
isOpen = true; if (open) {
searchTerm = ''; isOpen = true;
highlightedIndex = -1; searchTerm = '';
highlightedIndex = -1;
// Focus search input after popover opens // Focus search input after popover opens
tick().then(() => { tick().then(() => {
requestAnimationFrame(() => searchInputRef?.focus()); requestAnimationFrame(() => searchInputRef?.focus());
}); });
if (isRouter) {
modelsStore.fetchRouterModels().then(() => { modelsStore.fetchRouterModels().then(() => {
modelsStore.fetchModalitiesForLoadedModels(); modelsStore.fetchModalitiesForLoadedModels();
}); });
} else {
isOpen = false;
searchTerm = '';
highlightedIndex = -1;
} }
} else { } else {
isOpen = false; showModelDialog = open;
searchTerm = '';
highlightedIndex = -1;
} }
} }
function handleTriggerClick() {
if (loading || updating) return;
if (!isRouter) {
// Single model mode: show dialog instead of popover
showModelDialog = true;
}
// For router mode, the Popover handles open/close
}
export function open() { export function open() {
if (isRouter) { handleOpenChange(true);
handleOpenChange(true);
} else {
showModelDialog = true;
}
}
function closeMenu() {
handleOpenChange(false);
} }
function handleSearchKeyDown(event: KeyboardEvent) { function handleSearchKeyDown(event: KeyboardEvent) {
@ -292,7 +278,7 @@
} }
if (shouldCloseMenu) { if (shouldCloseMenu) {
closeMenu(); handleOpenChange(false);
// Focus the chat textarea after model selection // Focus the chat textarea after model selection
requestAnimationFrame(() => { requestAnimationFrame(() => {
@ -360,8 +346,181 @@
{:else} {:else}
{@const selectedOption = getDisplayOption()} {@const selectedOption = getDisplayOption()}
<Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}> {#if isRouter}
<Popover.Trigger <Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}>
<Popover.Trigger
class={cn(
`inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
!isCurrentModelInCache()
? 'bg-red-400/10 !text-red-400 hover:bg-red-400/20 hover:text-red-400'
: forceForegroundText
? 'text-foreground'
: isHighlightedCurrentModelActive
? 'text-foreground'
: 'text-muted-foreground',
isOpen ? 'text-foreground' : ''
)}
style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
disabled={disabled || updating}
>
<Package class="h-3.5 w-3.5" />
<span class="truncate font-medium">
{selectedOption?.model || 'Select model'}
</span>
{#if updating}
<Loader2 class="h-3 w-3.5 animate-spin" />
{:else}
<ChevronDown class="h-3 w-3.5" />
{/if}
</Popover.Trigger>
<Popover.Content
class="group/popover-content w-96 max-w-[calc(100vw-2rem)] p-0"
align="end"
sideOffset={8}
collisionPadding={16}
>
<div class="flex max-h-[50dvh] flex-col overflow-hidden">
<div
class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0"
>
<SearchInput
id="model-search"
placeholder="Search models..."
bind:value={searchTerm}
bind:ref={searchInputRef}
onClose={() => handleOpenChange(false)}
onKeyDown={handleSearchKeyDown}
/>
</div>
<div
class="models-list order-2 min-h-0 flex-1 overflow-y-auto group-data-[side=top]/popover-content:order-1"
>
{#if !isCurrentModelInCache() && currentModel}
<!-- Show unavailable model as first option (disabled) -->
<button
type="button"
class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-4 py-2 text-left text-sm text-red-400"
role="option"
aria-selected="true"
aria-disabled="true"
disabled
>
<span class="truncate">{selectedOption?.name || currentModel}</span>
<span class="ml-2 text-xs whitespace-nowrap opacity-70">(not available)</span>
</button>
<div class="my-1 h-px bg-border"></div>
{/if}
{#if filteredOptions.length === 0}
<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
{/if}
{#each filteredOptions as option, index (option.id)}
{@const status = getModelStatus(option.model)}
{@const isLoaded = status === ServerModelStatus.LOADED}
{@const isLoading = status === ServerModelStatus.LOADING}
{@const isSelected = currentModel === option.model || activeId === option.id}
{@const isCompatible = isModelCompatible(option)}
{@const isHighlighted = index === highlightedIndex}
{@const missingModalities = getMissingModalities(option)}
<div
class={cn(
'group flex w-full items-center gap-2 px-4 py-2 text-left text-sm transition focus:outline-none',
isCompatible
? 'cursor-pointer hover:bg-muted focus:bg-muted'
: 'cursor-not-allowed opacity-50',
isSelected || isHighlighted
? 'bg-accent text-accent-foreground'
: isCompatible
? 'hover:bg-accent hover:text-accent-foreground'
: '',
isLoaded ? 'text-popover-foreground' : 'text-muted-foreground'
)}
role="option"
aria-selected={isSelected || isHighlighted}
aria-disabled={!isCompatible}
tabindex={isCompatible ? 0 : -1}
onclick={() => isCompatible && handleSelect(option.id)}
onmouseenter={() => (highlightedIndex = index)}
onkeydown={(e) => {
if (isCompatible && (e.key === 'Enter' || e.key === ' ')) {
e.preventDefault();
handleSelect(option.id);
}
}}
>
<span class="min-w-0 flex-1 truncate">{option.model}</span>
{#if missingModalities}
<span class="flex shrink-0 items-center gap-1 text-muted-foreground/70">
{#if missingModalities.vision}
<Tooltip.Root>
<Tooltip.Trigger>
<EyeOff class="h-3.5 w-3.5" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>No vision support</p>
</Tooltip.Content>
</Tooltip.Root>
{/if}
{#if missingModalities.audio}
<Tooltip.Root>
<Tooltip.Trigger>
<MicOff class="h-3.5 w-3.5" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>No audio support</p>
</Tooltip.Content>
</Tooltip.Root>
{/if}
</span>
{/if}
{#if isLoading}
<Tooltip.Root>
<Tooltip.Trigger>
<Loader2 class="h-4 w-4 shrink-0 animate-spin text-muted-foreground" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>Loading model...</p>
</Tooltip.Content>
</Tooltip.Root>
{:else if isLoaded}
<Tooltip.Root>
<Tooltip.Trigger>
<button
type="button"
class="relative ml-2 flex h-4 w-4 shrink-0 items-center justify-center"
onclick={(e) => {
e.stopPropagation();
modelsStore.unloadModel(option.model);
}}
>
<span
class="mr-2 h-2 w-2 rounded-full bg-green-500 transition-opacity group-hover:opacity-0"
></span>
<Power
class="absolute mr-2 h-4 w-4 text-red-500 opacity-0 transition-opacity group-hover:opacity-100 hover:text-red-600"
/>
</button>
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>Unload model</p>
</Tooltip.Content>
</Tooltip.Root>
{:else}
<span class="mx-2 h-2 w-2 rounded-full bg-muted-foreground/50"></span>
{/if}
</div>
{/each}
</div>
</div>
</Popover.Content>
</Popover.Root>
{:else}
<button
class={cn( class={cn(
`inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`, `inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
!isCurrentModelInCache() !isCurrentModelInCache()
@ -374,165 +533,20 @@
isOpen ? 'text-foreground' : '' isOpen ? 'text-foreground' : ''
)} )}
style="max-width: min(calc(100cqw - 6.5rem), 32rem)" style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
onclick={handleTriggerClick} onclick={() => handleOpenChange(true)}
disabled={disabled || updating || !isRouter} disabled={disabled || updating}
> >
<Package class="h-3.5 w-3.5" /> <Package class="h-3.5 w-3.5" />
<span class="truncate font-medium"> <span class="truncate font-medium">
{selectedOption?.model || 'Select model'} {selectedOption?.model}
</span> </span>
{#if updating} {#if updating}
<Loader2 class="h-3 w-3.5 animate-spin" /> <Loader2 class="h-3 w-3.5 animate-spin" />
{:else if isRouter}
<ChevronDown class="h-3 w-3.5" />
{/if} {/if}
</Popover.Trigger> </button>
{/if}
<Popover.Content
class="group/popover-content w-96 max-w-[calc(100vw-2rem)] p-0"
align="end"
sideOffset={8}
collisionPadding={16}
>
<div class="flex max-h-[50dvh] flex-col overflow-hidden">
<div
class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0"
>
<SearchInput
id="model-search"
placeholder="Search models..."
bind:value={searchTerm}
bind:ref={searchInputRef}
onClose={closeMenu}
onKeyDown={handleSearchKeyDown}
/>
</div>
<div
class="models-list order-2 min-h-0 flex-1 overflow-y-auto group-data-[side=top]/popover-content:order-1"
>
{#if !isCurrentModelInCache() && currentModel}
<!-- Show unavailable model as first option (disabled) -->
<button
type="button"
class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-4 py-2 text-left text-sm text-red-400"
role="option"
aria-selected="true"
aria-disabled="true"
disabled
>
<span class="truncate">{selectedOption?.name || currentModel}</span>
<span class="ml-2 text-xs whitespace-nowrap opacity-70">(not available)</span>
</button>
<div class="my-1 h-px bg-border"></div>
{/if}
{#if filteredOptions.length === 0}
<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
{/if}
{#each filteredOptions as option, index (option.id)}
{@const status = getModelStatus(option.model)}
{@const isLoaded = status === ServerModelStatus.LOADED}
{@const isLoading = status === ServerModelStatus.LOADING}
{@const isSelected = currentModel === option.model || activeId === option.id}
{@const isCompatible = isModelCompatible(option)}
{@const isHighlighted = index === highlightedIndex}
{@const missingModalities = getMissingModalities(option)}
<div
class={cn(
'group flex w-full items-center gap-2 px-4 py-2 text-left text-sm transition focus:outline-none',
isCompatible
? 'cursor-pointer hover:bg-muted focus:bg-muted'
: 'cursor-not-allowed opacity-50',
isSelected || isHighlighted
? 'bg-accent text-accent-foreground'
: isCompatible
? 'hover:bg-accent hover:text-accent-foreground'
: '',
isLoaded ? 'text-popover-foreground' : 'text-muted-foreground'
)}
role="option"
aria-selected={isSelected || isHighlighted}
aria-disabled={!isCompatible}
tabindex={isCompatible ? 0 : -1}
onclick={() => isCompatible && handleSelect(option.id)}
onmouseenter={() => (highlightedIndex = index)}
onkeydown={(e) => {
if (isCompatible && (e.key === 'Enter' || e.key === ' ')) {
e.preventDefault();
handleSelect(option.id);
}
}}
>
<span class="min-w-0 flex-1 truncate">{option.model}</span>
{#if missingModalities}
<span class="flex shrink-0 items-center gap-1 text-muted-foreground/70">
{#if missingModalities.vision}
<Tooltip.Root>
<Tooltip.Trigger>
<EyeOff class="h-3.5 w-3.5" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>No vision support</p>
</Tooltip.Content>
</Tooltip.Root>
{/if}
{#if missingModalities.audio}
<Tooltip.Root>
<Tooltip.Trigger>
<MicOff class="h-3.5 w-3.5" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>No audio support</p>
</Tooltip.Content>
</Tooltip.Root>
{/if}
</span>
{/if}
{#if isLoading}
<Tooltip.Root>
<Tooltip.Trigger>
<Loader2 class="h-4 w-4 shrink-0 animate-spin text-muted-foreground" />
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>Loading model...</p>
</Tooltip.Content>
</Tooltip.Root>
{:else if isLoaded}
<Tooltip.Root>
<Tooltip.Trigger>
<button
type="button"
class="relative ml-2 flex h-4 w-4 shrink-0 items-center justify-center"
onclick={(e) => {
e.stopPropagation();
modelsStore.unloadModel(option.model);
}}
>
<span
class="mr-2 h-2 w-2 rounded-full bg-green-500 transition-opacity group-hover:opacity-0"
></span>
<Power
class="absolute mr-2 h-4 w-4 text-red-500 opacity-0 transition-opacity group-hover:opacity-100 hover:text-red-600"
/>
</button>
</Tooltip.Trigger>
<Tooltip.Content class="z-[9999]">
<p>Unload model</p>
</Tooltip.Content>
</Tooltip.Root>
{:else}
<span class="mx-2 h-2 w-2 rounded-full bg-muted-foreground/50"></span>
{/if}
</div>
{/each}
</div>
</div>
</Popover.Content>
</Popover.Root>
{/if} {/if}
</div> </div>

View File

@ -12,9 +12,12 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
showMessageStats: true, showMessageStats: true,
askForTitleConfirmation: false, askForTitleConfirmation: false,
pasteLongTextToFileLen: 2500, pasteLongTextToFileLen: 2500,
copyTextAttachmentsAsPlainText: false,
pdfAsImage: false, pdfAsImage: false,
disableAutoScroll: false, disableAutoScroll: false,
renderUserContentAsMarkdown: false, renderUserContentAsMarkdown: false,
alwaysShowSidebarOnDesktop: false,
autoShowSidebarOnNewChat: true,
autoMicOnEmpty: false, autoMicOnEmpty: false,
// make sure these default values are in sync with `common.h` // make sure these default values are in sync with `common.h`
samplers: 'top_k;typ_p;top_p;min_p;temperature', samplers: 'top_k;typ_p;top_p;min_p;temperature',
@ -50,6 +53,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
'Choose the color theme for the interface. You can choose between System (follows your device settings), Light, or Dark.', 'Choose the color theme for the interface. You can choose between System (follows your device settings), Light, or Dark.',
pasteLongTextToFileLen: pasteLongTextToFileLen:
'On pasting long text, it will be converted to a file. You can control the file length by setting the value of this parameter. Value 0 means disable.', 'On pasting long text, it will be converted to a file. You can control the file length by setting the value of this parameter. Value 0 means disable.',
copyTextAttachmentsAsPlainText:
'When copying a message with text attachments, combine them into a single plain text string instead of a special format that can be pasted back as attachments.',
samplers: samplers:
'The order at which samplers are applied, in simplified way. Default is "top_k;typ_p;top_p;min_p;temperature": top_k->typ_p->top_p->min_p->temperature', 'The order at which samplers are applied, in simplified way. Default is "top_k;typ_p;top_p;min_p;temperature": top_k->typ_p->top_p->min_p->temperature',
temperature: temperature:
@ -96,6 +101,10 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
disableAutoScroll: disableAutoScroll:
'Disable automatic scrolling while messages stream so you can control the viewport position manually.', 'Disable automatic scrolling while messages stream so you can control the viewport position manually.',
renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.', renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.',
alwaysShowSidebarOnDesktop:
'Always keep the sidebar visible on desktop instead of auto-hiding it.',
autoShowSidebarOnNewChat:
'Automatically show sidebar when starting a new chat. Disable to keep the sidebar hidden until you click on it.',
autoMicOnEmpty: autoMicOnEmpty:
'Automatically show microphone button instead of send button when textarea is empty for models with audio modality support.', 'Automatically show microphone button instead of send button when textarea is empty for models with audio modality support.',
pyInterpreterEnabled: pyInterpreterEnabled:

View File

@ -0,0 +1,262 @@
import { toast } from 'svelte-sonner';
import { AttachmentType } from '$lib/enums';
import type {
DatabaseMessageExtra,
DatabaseMessageExtraTextFile,
DatabaseMessageExtraLegacyContext
} from '$lib/types/database';
/**
* Copy text to clipboard with toast notification
* Uses modern clipboard API when available, falls back to legacy method for non-secure contexts
* @param text - Text to copy to clipboard
* @param successMessage - Custom success message (optional)
* @param errorMessage - Custom error message (optional)
* @returns Promise<boolean> - True if successful, false otherwise
*/
export async function copyToClipboard(
text: string,
successMessage = 'Copied to clipboard',
errorMessage = 'Failed to copy to clipboard'
): Promise<boolean> {
try {
// Try modern clipboard API first (secure contexts only)
if (navigator.clipboard && navigator.clipboard.writeText) {
await navigator.clipboard.writeText(text);
toast.success(successMessage);
return true;
}
// Fallback for non-secure contexts
const textArea = document.createElement('textarea');
textArea.value = text;
textArea.style.position = 'fixed';
textArea.style.left = '-999999px';
textArea.style.top = '-999999px';
document.body.appendChild(textArea);
textArea.focus();
textArea.select();
const successful = document.execCommand('copy');
document.body.removeChild(textArea);
if (successful) {
toast.success(successMessage);
return true;
} else {
throw new Error('execCommand failed');
}
} catch (error) {
console.error('Failed to copy to clipboard:', error);
toast.error(errorMessage);
return false;
}
}
/**
* Copy code with HTML entity decoding and toast notification
* @param rawCode - Raw code string that may contain HTML entities
* @param successMessage - Custom success message (optional)
* @param errorMessage - Custom error message (optional)
* @returns Promise<boolean> - True if successful, false otherwise
*/
export async function copyCodeToClipboard(
rawCode: string,
successMessage = 'Code copied to clipboard',
errorMessage = 'Failed to copy code'
): Promise<boolean> {
const doc = new DOMParser().parseFromString(rawCode, 'text/html');
const decodedCode = doc.body.textContent ?? rawCode;
return copyToClipboard(decodedCode, successMessage, errorMessage);
}
/**
* Format for text attachments when copied to clipboard
*/
export interface ClipboardTextAttachment {
type: typeof AttachmentType.TEXT;
name: string;
content: string;
}
/**
* Parsed result from clipboard content
*/
export interface ParsedClipboardContent {
message: string;
textAttachments: ClipboardTextAttachment[];
}
/**
* Formats a message with text attachments for clipboard copying.
*
* Default format (asPlainText = false):
* ```
* "Text message content"
* [
* {"type":"TEXT","name":"filename.txt","content":"..."},
* {"type":"TEXT","name":"another.txt","content":"..."}
* ]
* ```
*
* Plain text format (asPlainText = true):
* ```
* Text message content
*
* file content here
*
* another file content
* ```
*
* @param content - The message text content
* @param extras - Optional array of message attachments
* @param asPlainText - If true, format as plain text without JSON structure
* @returns Formatted string for clipboard
*/
export function formatMessageForClipboard(
content: string,
extras?: DatabaseMessageExtra[],
asPlainText: boolean = false
): string {
// Filter only text attachments (TEXT type and legacy CONTEXT type)
const textAttachments =
extras?.filter(
(extra): extra is DatabaseMessageExtraTextFile | DatabaseMessageExtraLegacyContext =>
extra.type === AttachmentType.TEXT || extra.type === AttachmentType.LEGACY_CONTEXT
) ?? [];
if (textAttachments.length === 0) {
return content;
}
if (asPlainText) {
const parts = [content];
for (const att of textAttachments) {
parts.push(att.content);
}
return parts.join('\n\n');
}
const clipboardAttachments: ClipboardTextAttachment[] = textAttachments.map((att) => ({
type: AttachmentType.TEXT,
name: att.name,
content: att.content
}));
return `${JSON.stringify(content)}\n${JSON.stringify(clipboardAttachments, null, 2)}`;
}
/**
* Parses clipboard content to extract message and text attachments.
* Supports both plain text and the special format with attachments.
*
* @param clipboardText - Raw text from clipboard
* @returns Parsed content with message and attachments
*/
export function parseClipboardContent(clipboardText: string): ParsedClipboardContent {
const defaultResult: ParsedClipboardContent = {
message: clipboardText,
textAttachments: []
};
if (!clipboardText.startsWith('"')) {
return defaultResult;
}
try {
let stringEndIndex = -1;
let escaped = false;
for (let i = 1; i < clipboardText.length; i++) {
const char = clipboardText[i];
if (escaped) {
escaped = false;
continue;
}
if (char === '\\') {
escaped = true;
continue;
}
if (char === '"') {
stringEndIndex = i;
break;
}
}
if (stringEndIndex === -1) {
return defaultResult;
}
const jsonStringPart = clipboardText.substring(0, stringEndIndex + 1);
const remainingPart = clipboardText.substring(stringEndIndex + 1).trim();
const message = JSON.parse(jsonStringPart) as string;
if (!remainingPart || !remainingPart.startsWith('[')) {
return {
message,
textAttachments: []
};
}
const attachments = JSON.parse(remainingPart) as unknown[];
const validAttachments: ClipboardTextAttachment[] = [];
for (const att of attachments) {
if (isValidTextAttachment(att)) {
validAttachments.push({
type: AttachmentType.TEXT,
name: att.name,
content: att.content
});
}
}
return {
message,
textAttachments: validAttachments
};
} catch {
return defaultResult;
}
}
/**
* Type guard to validate a text attachment object
* @param obj The object to validate
* @returns true if the object is a valid text attachment
*/
function isValidTextAttachment(
obj: unknown
): obj is { type: string; name: string; content: string } {
if (typeof obj !== 'object' || obj === null) {
return false;
}
const record = obj as Record<string, unknown>;
return (
(record.type === AttachmentType.TEXT || record.type === 'TEXT') &&
typeof record.name === 'string' &&
typeof record.content === 'string'
);
}
/**
* Checks if clipboard content contains our special format with attachments
* @param clipboardText - Raw text from clipboard
* @returns true if the clipboard content contains our special format with attachments
*/
export function hasClipboardAttachments(clipboardText: string): boolean {
if (!clipboardText.startsWith('"')) {
return false;
}
const parsed = parseClipboardContent(clipboardText);
return parsed.textAttachments.length > 0;
}
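A minimal round-trip sketch of the new clipboard helpers, assuming the same `$lib` aliases as the webui source (the message text and attachment below are made-up illustration data):

```ts
import { AttachmentType } from '$lib/enums';
import { formatMessageForClipboard, parseClipboardContent } from '$lib/utils/clipboard';

// Hypothetical message with one text attachment.
const extras = [
  { type: AttachmentType.TEXT as const, name: 'notes.txt', content: 'attached notes' }
];

// Copy path: message + attachments -> quoted-JSON clipboard format.
const clipboardText = formatMessageForClipboard('Hello world', extras);

// Paste path: the special format is detected and split back apart.
const parsed = parseClipboardContent(clipboardText);
console.log(parsed.message);                 // "Hello world"
console.log(parsed.textAttachments[0].name); // "notes.txt"
```

Passing `asPlainText = true` to `formatMessageForClipboard` instead joins the message and attachment contents with blank lines, which `parseClipboardContent` then treats as ordinary text.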

View File

@ -1,71 +0,0 @@
import { toast } from 'svelte-sonner';
/**
* Copy text to clipboard with toast notification
* Uses modern clipboard API when available, falls back to legacy method for non-secure contexts
* @param text - Text to copy to clipboard
* @param successMessage - Custom success message (optional)
* @param errorMessage - Custom error message (optional)
* @returns Promise<boolean> - True if successful, false otherwise
*/
export async function copyToClipboard(
text: string,
successMessage = 'Copied to clipboard',
errorMessage = 'Failed to copy to clipboard'
): Promise<boolean> {
try {
// Try modern clipboard API first (secure contexts only)
if (navigator.clipboard && navigator.clipboard.writeText) {
await navigator.clipboard.writeText(text);
toast.success(successMessage);
return true;
}
// Fallback for non-secure contexts
const textArea = document.createElement('textarea');
textArea.value = text;
textArea.style.position = 'fixed';
textArea.style.left = '-999999px';
textArea.style.top = '-999999px';
document.body.appendChild(textArea);
textArea.focus();
textArea.select();
const successful = document.execCommand('copy');
document.body.removeChild(textArea);
if (successful) {
toast.success(successMessage);
return true;
} else {
throw new Error('execCommand failed');
}
} catch (error) {
console.error('Failed to copy to clipboard:', error);
toast.error(errorMessage);
return false;
}
}
/**
* Copy code with HTML entity decoding and toast notification
* @param rawCode - Raw code string that may contain HTML entities
* @param successMessage - Custom success message (optional)
* @param errorMessage - Custom error message (optional)
* @returns Promise<boolean> - True if successful, false otherwise
*/
export async function copyCodeToClipboard(
rawCode: string,
successMessage = 'Code copied to clipboard',
errorMessage = 'Failed to copy code'
): Promise<boolean> {
// Decode HTML entities
const decodedCode = rawCode
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
return copyToClipboard(decodedCode, successMessage, errorMessage);
}

View File

@ -40,7 +40,15 @@ export { setConfigValue, getConfigValue, configToParameterRecord } from './confi
export { createMessageCountMap, getMessageCount } from './conversation-utils'; export { createMessageCountMap, getMessageCount } from './conversation-utils';
// Clipboard utilities // Clipboard utilities
export { copyToClipboard, copyCodeToClipboard } from './copy'; export {
copyToClipboard,
copyCodeToClipboard,
formatMessageForClipboard,
parseClipboardContent,
hasClipboardAttachments,
type ClipboardTextAttachment,
type ParsedClipboardContent
} from './clipboard';
// File preview utilities // File preview utilities
export { getFileTypeLabel } from './file-preview'; export { getFileTypeLabel } from './file-preview';

View File

@ -14,6 +14,7 @@
import { goto } from '$app/navigation'; import { goto } from '$app/navigation';
import { modelsStore } from '$lib/stores/models.svelte'; import { modelsStore } from '$lib/stores/models.svelte';
import { TOOLTIP_DELAY_DURATION } from '$lib/constants/tooltip-config'; import { TOOLTIP_DELAY_DURATION } from '$lib/constants/tooltip-config';
import { IsMobile } from '$lib/hooks/is-mobile.svelte';
let { children } = $props(); let { children } = $props();
@ -21,6 +22,10 @@
let isHomeRoute = $derived(page.route.id === '/'); let isHomeRoute = $derived(page.route.id === '/');
let isNewChatMode = $derived(page.url.searchParams.get('new_chat') === 'true'); let isNewChatMode = $derived(page.url.searchParams.get('new_chat') === 'true');
let showSidebarByDefault = $derived(activeMessages().length > 0 || isLoading()); let showSidebarByDefault = $derived(activeMessages().length > 0 || isLoading());
let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop);
let autoShowSidebarOnNewChat = $derived(config().autoShowSidebarOnNewChat);
let isMobile = new IsMobile();
let isDesktop = $derived(!isMobile.current);
let sidebarOpen = $state(false); let sidebarOpen = $state(false);
let innerHeight = $state<number | undefined>(); let innerHeight = $state<number | undefined>();
let chatSidebar: let chatSidebar:
@ -76,6 +81,11 @@
} }
$effect(() => { $effect(() => {
if (alwaysShowSidebarOnDesktop && isDesktop) {
sidebarOpen = true;
return;
}
if (isHomeRoute && !isNewChatMode) { if (isHomeRoute && !isNewChatMode) {
// Auto-collapse sidebar when navigating to home route (but not in new chat mode) // Auto-collapse sidebar when navigating to home route (but not in new chat mode)
sidebarOpen = false; sidebarOpen = false;
@ -83,8 +93,11 @@
// Keep sidebar open in new chat mode // Keep sidebar open in new chat mode
sidebarOpen = true; sidebarOpen = true;
} else if (isChatRoute) { } else if (isChatRoute) {
// On chat routes, show sidebar by default // On chat routes, only auto-show sidebar if setting is enabled
sidebarOpen = true; if (autoShowSidebarOnNewChat) {
sidebarOpen = true;
}
// If setting is disabled, don't change sidebar state - let user control it manually
} else { } else {
// Other routes follow default behavior // Other routes follow default behavior
sidebarOpen = showSidebarByDefault; sidebarOpen = showSidebarByDefault;
@ -190,12 +203,14 @@
<ChatSidebar bind:this={chatSidebar} /> <ChatSidebar bind:this={chatSidebar} />
</Sidebar.Root> </Sidebar.Root>
<Sidebar.Trigger {#if !(alwaysShowSidebarOnDesktop && isDesktop)}
class="transition-left absolute left-0 z-[900] h-8 w-8 duration-200 ease-linear {sidebarOpen <Sidebar.Trigger
? 'md:left-[var(--sidebar-width)]' class="transition-left absolute left-0 z-[900] h-8 w-8 duration-200 ease-linear {sidebarOpen
: ''}" ? 'md:left-[var(--sidebar-width)]'
style="translate: 1rem 1rem;" : ''}"
/> style="translate: 1rem 1rem;"
/>
{/if}
<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden"> <Sidebar.Inset class="flex flex-1 flex-col overflow-hidden">
{@render children?.()} {@render children?.()}

View File

@ -1,7 +0,0 @@
import { describe, it, expect } from 'vitest';
describe('sum test', () => {
it('adds 1 + 2 to equal 3', () => {
expect(1 + 2).toBe(3);
});
});

View File

@ -0,0 +1,423 @@
import { describe, it, expect } from 'vitest';
import { AttachmentType } from '$lib/enums';
import {
formatMessageForClipboard,
parseClipboardContent,
hasClipboardAttachments
} from '$lib/utils/clipboard';
describe('formatMessageForClipboard', () => {
it('returns plain content when no extras', () => {
const result = formatMessageForClipboard('Hello world', undefined);
expect(result).toBe('Hello world');
});
it('returns plain content when extras is empty array', () => {
const result = formatMessageForClipboard('Hello world', []);
expect(result).toBe('Hello world');
});
it('handles empty string content', () => {
const result = formatMessageForClipboard('', undefined);
expect(result).toBe('');
});
it('returns plain content when extras has only non-text attachments', () => {
const extras = [
{
type: AttachmentType.IMAGE as const,
name: 'image.png',
base64Url: 'data:image/png;base64,...'
}
];
const result = formatMessageForClipboard('Hello world', extras);
expect(result).toBe('Hello world');
});
it('filters non-text attachments and keeps only text ones', () => {
const extras = [
{
type: AttachmentType.IMAGE as const,
name: 'image.png',
base64Url: 'data:image/png;base64,...'
},
{
type: AttachmentType.TEXT as const,
name: 'file.txt',
content: 'Text content'
},
{
type: AttachmentType.PDF as const,
name: 'doc.pdf',
base64Data: 'data:application/pdf;base64,...',
content: 'PDF content',
processedAsImages: false
}
];
const result = formatMessageForClipboard('Hello', extras);
expect(result).toContain('"file.txt"');
expect(result).not.toContain('image.png');
expect(result).not.toContain('doc.pdf');
});
it('formats message with text attachments', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'file1.txt',
content: 'File 1 content'
},
{
type: AttachmentType.TEXT as const,
name: 'file2.txt',
content: 'File 2 content'
}
];
const result = formatMessageForClipboard('Hello world', extras);
expect(result).toContain('"Hello world"');
expect(result).toContain('"type": "TEXT"');
expect(result).toContain('"name": "file1.txt"');
expect(result).toContain('"content": "File 1 content"');
expect(result).toContain('"name": "file2.txt"');
});
it('handles content with quotes and special characters', () => {
const content = 'Hello "world" with\nnewline';
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'test.txt',
content: 'Test content'
}
];
const result = formatMessageForClipboard(content, extras);
// Should be valid JSON
expect(result.startsWith('"')).toBe(true);
// The content should be properly escaped
const parsed = JSON.parse(result.split('\n')[0]);
expect(parsed).toBe(content);
});
it('converts legacy context type to TEXT type', () => {
const extras = [
{
type: AttachmentType.LEGACY_CONTEXT as const,
name: 'legacy.txt',
content: 'Legacy content'
}
];
const result = formatMessageForClipboard('Hello', extras);
expect(result).toContain('"type": "TEXT"');
expect(result).not.toContain('"context"');
});
it('handles attachment content with special characters', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'code.js',
content: 'const x = "hello\\nworld";\nconst y = `template ${var}`;'
}
];
const formatted = formatMessageForClipboard('Check this code', extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.textAttachments[0].content).toBe(
'const x = "hello\\nworld";\nconst y = `template ${var}`;'
);
});
it('handles unicode characters in content and attachments', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'unicode.txt',
content: '日本語テスト 🎉 émojis'
}
];
const formatted = formatMessageForClipboard('Привет мир 👋', extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe('Привет мир 👋');
expect(parsed.textAttachments[0].content).toBe('日本語テスト 🎉 émojis');
});
it('formats as plain text when asPlainText is true', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'file1.txt',
content: 'File 1 content'
},
{
type: AttachmentType.TEXT as const,
name: 'file2.txt',
content: 'File 2 content'
}
];
const result = formatMessageForClipboard('Hello world', extras, true);
expect(result).toBe('Hello world\n\nFile 1 content\n\nFile 2 content');
});
it('returns plain content when asPlainText is true but no attachments', () => {
const result = formatMessageForClipboard('Hello world', [], true);
expect(result).toBe('Hello world');
});
it('plain text mode does not use JSON format', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'test.txt',
content: 'Test content'
}
];
const result = formatMessageForClipboard('Hello', extras, true);
expect(result).not.toContain('"type"');
expect(result).not.toContain('[');
expect(result).toBe('Hello\n\nTest content');
});
});
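// parseClipboardContent is the inverse of formatMessageForClipboard: it recovers
// the message and its text attachments from the serialized format, and falls back
// to treating the whole string as a plain message when the format is not recognized.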
describe('parseClipboardContent', () => {
it('returns plain text as message when not in special format', () => {
const result = parseClipboardContent('Hello world');
expect(result.message).toBe('Hello world');
expect(result.textAttachments).toHaveLength(0);
});
it('handles empty string input', () => {
const result = parseClipboardContent('');
expect(result.message).toBe('');
expect(result.textAttachments).toHaveLength(0);
});
it('handles whitespace-only input', () => {
const result = parseClipboardContent(' \n\t ');
expect(result.message).toBe(' \n\t ');
expect(result.textAttachments).toHaveLength(0);
});
it('returns plain text as message when starts with quote but invalid format', () => {
const result = parseClipboardContent('"Unclosed quote');
expect(result.message).toBe('"Unclosed quote');
expect(result.textAttachments).toHaveLength(0);
});
it('returns original text when JSON array is malformed', () => {
const input = '"Hello"\n[invalid json';
const result = parseClipboardContent(input);
expect(result.message).toBe('"Hello"\n[invalid json');
expect(result.textAttachments).toHaveLength(0);
});
it('parses message with text attachments', () => {
const input = `"Hello world"
[
{"type":"TEXT","name":"file1.txt","content":"File 1 content"},
{"type":"TEXT","name":"file2.txt","content":"File 2 content"}
]`;
const result = parseClipboardContent(input);
expect(result.message).toBe('Hello world');
expect(result.textAttachments).toHaveLength(2);
expect(result.textAttachments[0].name).toBe('file1.txt');
expect(result.textAttachments[0].content).toBe('File 1 content');
expect(result.textAttachments[1].name).toBe('file2.txt');
expect(result.textAttachments[1].content).toBe('File 2 content');
});
it('handles escaped quotes in message', () => {
const input = `"Hello \\"world\\" with quotes"
[
{"type":"TEXT","name":"file.txt","content":"test"}
]`;
const result = parseClipboardContent(input);
expect(result.message).toBe('Hello "world" with quotes');
expect(result.textAttachments).toHaveLength(1);
});
it('handles newlines in message', () => {
const input = `"Hello\\nworld"
[
{"type":"TEXT","name":"file.txt","content":"test"}
]`;
const result = parseClipboardContent(input);
expect(result.message).toBe('Hello\nworld');
expect(result.textAttachments).toHaveLength(1);
});
it('returns message only when no array follows', () => {
const input = '"Just a quoted string"';
const result = parseClipboardContent(input);
expect(result.message).toBe('Just a quoted string');
expect(result.textAttachments).toHaveLength(0);
});
it('filters out invalid attachment objects', () => {
const input = `"Hello"
[
{"type":"TEXT","name":"valid.txt","content":"valid"},
{"type":"INVALID","name":"invalid.txt","content":"invalid"},
{"name":"missing-type.txt","content":"missing"},
{"type":"TEXT","content":"missing name"}
]`;
const result = parseClipboardContent(input);
expect(result.message).toBe('Hello');
expect(result.textAttachments).toHaveLength(1);
expect(result.textAttachments[0].name).toBe('valid.txt');
});
it('handles empty attachments array', () => {
const input = '"Hello"\n[]';
const result = parseClipboardContent(input);
expect(result.message).toBe('Hello');
expect(result.textAttachments).toHaveLength(0);
});
it('roundtrips correctly with formatMessageForClipboard', () => {
const originalContent = 'Hello "world" with\nspecial characters';
const originalExtras = [
{
type: AttachmentType.TEXT as const,
name: 'file1.txt',
content: 'Content with\nnewlines and "quotes"'
},
{
type: AttachmentType.TEXT as const,
name: 'file2.txt',
content: 'Another file'
}
];
const formatted = formatMessageForClipboard(originalContent, originalExtras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe(originalContent);
expect(parsed.textAttachments).toHaveLength(2);
expect(parsed.textAttachments[0].name).toBe('file1.txt');
expect(parsed.textAttachments[0].content).toBe('Content with\nnewlines and "quotes"');
expect(parsed.textAttachments[1].name).toBe('file2.txt');
expect(parsed.textAttachments[1].content).toBe('Another file');
});
});
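// hasClipboardAttachments is a quick check for the serialized format: it returns
// true only when the pasted text carries a non-empty, well-formed attachment array.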
describe('hasClipboardAttachments', () => {
it('returns false for plain text', () => {
expect(hasClipboardAttachments('Hello world')).toBe(false);
});
it('returns false for empty string', () => {
expect(hasClipboardAttachments('')).toBe(false);
});
it('returns false for quoted string without attachments', () => {
expect(hasClipboardAttachments('"Hello world"')).toBe(false);
});
it('returns true for valid format with attachments', () => {
const input = `"Hello"
[{"type":"TEXT","name":"file.txt","content":"test"}]`;
expect(hasClipboardAttachments(input)).toBe(true);
});
it('returns false for format with empty attachments array', () => {
const input = '"Hello"\n[]';
expect(hasClipboardAttachments(input)).toBe(false);
});
it('returns false for malformed JSON', () => {
expect(hasClipboardAttachments('"Hello"\n[broken')).toBe(false);
});
});
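// Roundtrip tests: whatever formatMessageForClipboard produces should come back
// unchanged through parseClipboardContent, including tricky escaping cases.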
describe('roundtrip edge cases', () => {
it('preserves empty message with attachments', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'file.txt',
content: 'Content only'
}
];
const formatted = formatMessageForClipboard('', extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe('');
expect(parsed.textAttachments).toHaveLength(1);
expect(parsed.textAttachments[0].content).toBe('Content only');
});
it('preserves attachment with empty content', () => {
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'empty.txt',
content: ''
}
];
const formatted = formatMessageForClipboard('Message', extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe('Message');
expect(parsed.textAttachments).toHaveLength(1);
expect(parsed.textAttachments[0].content).toBe('');
});
it('preserves multiple backslashes', () => {
const content = 'Path: C:\\\\Users\\\\test\\\\file.txt';
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'path.txt',
content: 'D:\\\\Data\\\\file'
}
];
const formatted = formatMessageForClipboard(content, extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe(content);
expect(parsed.textAttachments[0].content).toBe('D:\\\\Data\\\\file');
});
it('preserves tabs and various whitespace', () => {
const content = 'Line1\t\tTabbed\n Spaced\r\nCRLF';
const extras = [
{
type: AttachmentType.TEXT as const,
name: 'whitespace.txt',
content: '\t\t\n\n '
}
];
const formatted = formatMessageForClipboard(content, extras);
const parsed = parseClipboardContent(formatted);
expect(parsed.message).toBe(content);
expect(parsed.textAttachments[0].content).toBe('\t\t\n\n ');
});
});

View File

@@ -1,6 +1,6 @@
 /* eslint-disable no-irregular-whitespace */
 import { describe, it, expect, test } from 'vitest';
-import { maskInlineLaTeX, preprocessLaTeX } from './latex-protection';
+import { maskInlineLaTeX, preprocessLaTeX } from '$lib/utils/latex-protection';
 describe('maskInlineLaTeX', () => {
 	it('should protect LaTeX $x + y$ but not money $3.99', () => {

View File

@@ -1,5 +1,5 @@
 import { describe, expect, it } from 'vitest';
-import { isValidModelName, normalizeModelName } from './model-names';
+import { isValidModelName, normalizeModelName } from '$lib/utils/model-names';
 describe('normalizeModelName', () => {
 	it('preserves Hugging Face org/model format (single slash)', () => {

View File

@@ -125,9 +125,9 @@ export default defineConfig({
 		{
 			extends: './vite.config.ts',
 			test: {
-				name: 'server',
+				name: 'unit',
 				environment: 'node',
-				include: ['tests/server/**/*.{test,spec}.{js,ts}']
+				include: ['tests/unit/**/*.{test,spec}.{js,ts}']
 			}
 		},
 		{