Merge branch 'ggml-org:master' into power-law-sampler

Commit: 58aa1c6f5a
@@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ENTRYPOINT ["/app/llama-server"]

 ### Target: light
-# Lightweight image containing only llama-cli
+# Lightweight image containing only llama-cli and llama-completion
 # ==============================================================================
 FROM base AS light

@@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
     source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
     cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
-    cmake --build build --config Release --target llama-cli
+    cmake --build build --config Release --target llama-cli && \
+    cmake --build build --config Release --target llama-completion

 # TODO: use image with NNRT
 FROM ascendai/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /

 ENV LC_ALL=C.utf8

@@ -37,6 +37,7 @@ make -j GGML_CUDA=1
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

@@ -68,6 +69,7 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-completion
 %{_bindir}/llama-cuda-server
 %{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service

@@ -39,6 +39,7 @@ make -j
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

@@ -70,6 +71,7 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cli
+%{_bindir}/llama-completion
 %{_bindir}/llama-server
 %{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service

@@ -0,0 +1,295 @@
+# Server WebUI build and tests
+name: Server WebUI
+
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+      slow_tests:
+        description: 'Run slow tests'
+        required: true
+        type: boolean
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+
+env:
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_LOG_VERBOSITY: 10
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  webui-setup:
+    name: WebUI Setup
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Cache node_modules
+        uses: actions/cache@v4
+        id: cache-node-modules
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Install dependencies
+        if: steps.cache-node-modules.outputs.cache-hit != 'true'
+        run: npm ci
+        working-directory: tools/server/webui
+
+  webui-check:
+    needs: webui-setup
+    name: WebUI Check
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+
+      - name: Restore node_modules cache
+        uses: actions/cache@v4
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Run type checking
+        run: npm run check
+        working-directory: tools/server/webui
+
+      - name: Run linting
+        run: npm run lint
+        working-directory: tools/server/webui
+
+  webui-build:
+    needs: webui-check
+    name: WebUI Build
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+
+      - name: Restore node_modules cache
+        uses: actions/cache@v4
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Build application
+        run: npm run build
+        working-directory: tools/server/webui
+
+  webui-tests:
+    needs: webui-build
+    name: Run WebUI tests
+    permissions:
+      contents: read
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+
+      - name: Restore node_modules cache
+        uses: actions/cache@v4
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Install Playwright browsers
+        run: npx playwright install --with-deps
+        working-directory: tools/server/webui
+
+      - name: Build Storybook
+        run: npm run build-storybook
+        working-directory: tools/server/webui
+
+      - name: Run Client tests
+        run: npm run test:client
+        working-directory: tools/server/webui
+
+      - name: Run Server tests
+        run: npm run test:server
+        working-directory: tools/server/webui
+
+      - name: Run UI tests
+        run: npm run test:ui -- --testTimeout=60000
+        working-directory: tools/server/webui
+
+      - name: Run E2E tests
+        run: npm run test:e2e
+        working-directory: tools/server/webui
+
+  server-build:
+    needs: [webui-tests]
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
+        build_type: [RelWithDebInfo]
+        include:
+          - build_type: Release
+            sanitizer: ""
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
+
+    steps:
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install \
+            build-essential \
+            xxd \
+            git \
+            cmake \
+            curl \
+            wget \
+            language-pack-en \
+            libssl-dev
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r tools/server/tests/requirements.txt
+
+      - name: Setup Node.js for WebUI
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Install WebUI dependencies
+        run: npm ci
+        working-directory: tools/server/webui
+
+      - name: Build WebUI
+        run: npm run build
+        working-directory: tools/server/webui
+
+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DGGML_NATIVE=OFF \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DGGML_OPENMP=OFF ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Build (sanitizers)
+        id: cmake_build_sanitizers
+        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DGGML_NATIVE=OFF \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Build (sanitizers)
+        id: cmake_build
+        if: ${{ matrix.sanitizer == '' }}
+        run: |
+          cmake -B build \
+            -DGGML_NATIVE=OFF \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Tests
+        id: server_integration_tests
+        if: ${{ matrix.sanitizer == '' }}
+        env:
+          GITHUB_ACTIONS: "true"
+        run: |
+          cd tools/server/tests
+          ./tests.sh
+
+      - name: Tests (sanitizers)
+        id: server_integration_tests_sanitizers
+        if: ${{ matrix.sanitizer != '' }}
+        run: |
+          cd tools/server/tests
+          LLAMA_SANITIZE=1 ./tests.sh
+
+      - name: Slow tests
+        id: server_integration_tests_slow
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
+        run: |
+          cd tools/server/tests
+          SLOW_TESTS=1 ./tests.sh
@@ -76,270 +76,6 @@ jobs:
         run: |
           pip install -r tools/server/tests/requirements.txt

-  webui-setup:
-    name: WebUI Setup
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-          cache: "npm"
-          cache-dependency-path: "tools/server/webui/package-lock.json"
-
-      - name: Cache node_modules
-        uses: actions/cache@v4
-        id: cache-node-modules
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Install dependencies
-        if: steps.cache-node-modules.outputs.cache-hit != 'true'
-        run: npm ci
-        working-directory: tools/server/webui
-
-  webui-check:
-    needs: webui-setup
-    name: WebUI Check
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Run type checking
-        run: npm run check
-        working-directory: tools/server/webui
-
-      - name: Run linting
-        run: npm run lint
-        working-directory: tools/server/webui
-
-  webui-build:
-    needs: webui-check
-    name: WebUI Build
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Build application
-        run: npm run build
-        working-directory: tools/server/webui
-
-  webui-tests:
-    needs: webui-build
-    name: Run WebUI tests
-    permissions:
-      contents: read
-
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Install Playwright browsers
-        run: npx playwright install --with-deps
-        working-directory: tools/server/webui
-
-      - name: Build Storybook
-        run: npm run build-storybook
-        working-directory: tools/server/webui
-
-      - name: Run Client tests
-        run: npm run test:client
-        working-directory: tools/server/webui
-
-      - name: Run Server tests
-        run: npm run test:server
-        working-directory: tools/server/webui
-
-      - name: Run UI tests
-        run: npm run test:ui -- --testTimeout=60000
-        working-directory: tools/server/webui
-
-      - name: Run E2E tests
-        run: npm run test:e2e
-        working-directory: tools/server/webui
-
-  server-build:
-    needs: [webui-tests]
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
-        build_type: [RelWithDebInfo]
-        include:
-          - build_type: Release
-            sanitizer: ""
-      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
-
-    steps:
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libssl-dev
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r tools/server/tests/requirements.txt
-
-      - name: Setup Node.js for WebUI
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-          cache: "npm"
-          cache-dependency-path: "tools/server/webui/package-lock.json"
-
-      - name: Install WebUI dependencies
-        run: npm ci
-        working-directory: tools/server/webui
-
-      - name: Build WebUI
-        run: npm run build
-        working-directory: tools/server/webui
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_OPENMP=OFF ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build_sanitizers
-        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build
-        if: ${{ matrix.sanitizer == '' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ matrix.sanitizer == '' }}
-        env:
-          GITHUB_ACTIONS: "true"
-        run: |
-          cd tools/server/tests
-          ./tests.sh
-
-      - name: Tests (sanitizers)
-        id: server_integration_tests_sanitizers
-        if: ${{ matrix.sanitizer != '' }}
-        run: |
-          cd tools/server/tests
-          LLAMA_SANITIZE=1 ./tests.sh
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd tools/server/tests
-          SLOW_TESTS=1 ./tests.sh
-

   server-windows:
     runs-on: windows-2022

@@ -68,3 +68,6 @@ Please disclose it as a private [security advisory](https://github.com/ggml-org/
 Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.

 A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
+
+> [!IMPORTANT]
+> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080

@@ -835,6 +835,19 @@ bool common_arg_utils::is_autoy(const std::string & value) {
 }

 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // per-example default params
+    // we define here to make sure it's included in llama-gen-docs
+    if (ex == LLAMA_EXAMPLE_COMPLETION) {
+        params.use_jinja = false; // disable jinja by default
+
+    } else if (ex == LLAMA_EXAMPLE_MTMD) {
+        params.use_jinja = false; // disable jinja by default
+        params.sampling.temp = 0.2; // lower temp by default for better quality
+
+    } else if (ex == LLAMA_EXAMPLE_SERVER) {
+        params.n_parallel = -1; // auto by default
+    }
+
     params.use_color = tty_can_use_colors();

     // load dynamic backends
@@ -1107,7 +1120,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
-        string_format("max number of context checkpoints to create per slot (default: %d)\n"
+        string_format("max number of context checkpoints to create per slot (default: %d)"
             "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
         [](common_params & params, int value) {
             params.n_ctx_checkpoints = value;
@@ -1115,7 +1128,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--cache-ram", "-cram"}, "N",
-        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
         [](common_params & params, int value) {
             params.cache_ram_mib = value;
@@ -1123,12 +1136,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
-        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
-            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+        "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
         [](common_params & params) {
             params.kv_unified = true;
         }
-    ).set_env("LLAMA_ARG_KV_UNIFIED"));
+    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--context-shift"},
         {"--no-context-shift"},
@@ -1906,6 +1918,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
+    if (ex == LLAMA_EXAMPLE_SERVER) {
+        // this is to make sure this option appears in the server-specific section of the help message
+        add_opt(common_arg(
+            {"-np", "--parallel"}, "N",
+            string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
+            [](common_params & params, int value) {
+                if (value == 0) {
+                    throw std::invalid_argument("error: invalid value for n_parallel\n");
+                }
+                params.n_parallel = value;
+            }
+        ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
+    } else {
     add_opt(common_arg(
         {"-np", "--parallel"}, "N",
         string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
@@ -1913,6 +1938,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_parallel = value;
         }
     ).set_env("LLAMA_ARG_N_PARALLEL"));
+    }
     add_opt(common_arg(
         {"-ns", "--sequences"}, "N",
         string_format("number of sequences to decode (default: %d)", params.n_sequences),

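Net effect of the argument changes above: llama-server now treats the slot count as automatic by default (n_parallel = -1), an explicit `-np 0` is rejected as an invalid argument, and `--kv-unified` is now listed only for the server example; for the other tools, `-np`/`--parallel` keeps its old meaning of the number of parallel sequences to decode.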
@@ -4,9 +4,14 @@

 using json = nlohmann::json;

-static std::string_view trim_trailing_space(std::string_view sv) {
+static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
+    int count = 0;
     while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
+        if (max != -1 && count <= max) {
+            break;
+        }
         sv.remove_suffix(1);
+        count++;
     }
     return sv;
 }
@@ -93,7 +98,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {

     if (is_arg_string && current_tool) {
         // Serialize to JSON, but exclude the end quote
-        std::string dumped = json(node.text).dump();
+        std::string dumped = json(trim_trailing_space(node.text)).dump();
         current_tool->arguments += dumped.substr(0, dumped.size() - 1);
         needs_closing_quote = true;
     }
@@ -101,6 +106,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
     if (is_arg_close && current_tool) {
         if (needs_closing_quote) {
             current_tool->arguments += "\"";
+            needs_closing_quote = false;
         }
     }

@@ -109,6 +115,10 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
     }

     if (is_tool_close && current_tool) {
+        if (needs_closing_quote) {
+            current_tool->arguments += "\"";
+            needs_closing_quote = false;
+        }
         current_tool->arguments += "}";
     }
 }
common/chat.cpp (140 changed lines)
@@ -711,6 +711,25 @@ static void foreach_function(const json & tools, const std::function<void(const
     }
 }

+static void foreach_parameter(const json & function, const std::function<void(const std::string &, const json &, bool)> & fn) {
+    if (!function.contains("parameters") || !function.at("parameters").is_object()) {
+        return;
+    }
+    const auto & params = function.at("parameters");
+    if (!params.contains("properties") || !params.at("properties").is_object()) {
+        return;
+    }
+    const auto & props = params.at("properties");
+    std::set<std::string> required;
+    if (params.contains("required") && params.at("required").is_array()) {
+        params.at("required").get_to(required);
+    }
+    for (const auto & [name, prop] : props.items()) {
+        bool is_required = (required.find(name) != required.end());
+        fn(name, prop, is_required);
+    }
+}
+
 static std::string apply(
     const common_chat_template & tmpl,
     const struct templates_params & inputs,
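As a reference for how the new helper is driven, here is a standalone sketch of the same traversal contract; the tool definition is a made-up example, not one from the repository. Each property is visited together with its schema and a flag saying whether it appears in the "required" array:

// Standalone illustration of the foreach_parameter contract (hypothetical tool).
#include <iostream>
#include <set>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    json function = json::parse(R"({
        "name": "get_weather",
        "parameters": {
            "type": "object",
            "properties": {
                "city":  { "type": "string" },
                "units": { "type": "string", "enum": ["C", "F"] }
            },
            "required": ["city"]
        }
    })");

    // Same walk as foreach_parameter: iterate "properties", treat "required" as a set.
    const auto & params = function.at("parameters");
    std::set<std::string> required = params.value("required", std::set<std::string>{});
    for (const auto & [name, prop] : params.at("properties").items()) {
        bool is_required = required.count(name) > 0;
        std::cout << name << " required=" << is_required << "\n";
    }
    // prints: city required=1
    //         units required=0
}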
@@ -1409,6 +1428,123 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
     return data;
 }

+static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    data.preserved_tokens = {
+        "<think>",
+        "</think>",
+        "<tool_call>",
+        "</tool_call>",
+    };
+
+    auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar = true;
+
+    auto parser = build_chat_peg_constructed_parser([&](auto & p) {
+        auto reasoning = p.eps();
+        if (inputs.enable_thinking && extract_reasoning) {
+            auto reasoning_content = p.reasoning(p.until("</think>")) + ("</think>" | p.end());
+            if (data.thinking_forced_open) {
+                reasoning = reasoning_content;
+            }
+        }
+
+        // Response format parser
+        if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
+            return reasoning << p.content(p.schema(p.json(), "response-format", inputs.json_schema));
+        }
+
+        // Tool call parser
+        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+            auto tool_choice = p.choice();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+
+                auto schema_info = common_schema_info();
+                schema_info.resolve_refs(parameters);
+
+                auto tool_open = "<function=" + p.tool_name(p.literal(name)) + ">\n";
+                auto tool_close = p.literal("</function>\n");
+                auto args = p.sequence();
+                auto arg_string = p.rule("xml-arg-string", p.until_one_of({
+                    "\n</parameter>",
+                    "\n<parameter=",
+                    "\n</function>"
+                }));
+
+                foreach_parameter(function, [&](const auto & param_name, const json & param_schema, bool is_required) {
+                    auto rule_name = "tool-" + name + "-arg-" + param_name;
+
+                    auto arg_open = "<parameter=" + p.tool_arg_name(p.literal(param_name)) + ">\n";
+                    auto arg_close = p.literal("</parameter>\n");
+                    auto arg_value = p.eps();
+
+                    if (schema_info.resolves_to_string(param_schema)) {
+                        arg_value = p.tool_arg_string_value(arg_string) + "\n";
+                    } else {
+                        arg_value = p.tool_arg_json_value(p.schema(p.json(), rule_name + "-schema", param_schema));
+                    }
+
+                    // Model may or may not close with </parameter>
+                    auto arg_rule = p.rule(rule_name, p.tool_arg_open(arg_open) + arg_value + p.optional(p.tool_arg_close(arg_close)));
+                    args += p.repeat(arg_rule, /* min = */ is_required ? 1 : 0, /* max = */ 1);
+                });
+
+                tool_choice |= p.rule("tool-" + name, p.tool_open(tool_open) + args + p.tool_close(tool_close));
+            });
+
+            auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+            auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
+            auto tool_call = p.rule("tool-call", "<tool_call>\n" + tool_choice + "</tool_call>" + p.space());
+            auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));
+
+            return reasoning << p.content(p.until("<tool_call>")) << tool_calls;
+        }
+
+        // Content only parser
+        include_grammar = false;
+        return reasoning << p.content(p.rest());
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto schema = function.at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_call>"}
+        };
+    }
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;

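For orientation, the parser constructed above accepts assistant output of roughly this shape (tool and argument names here are illustrative, not taken from the repository): optional reasoning inside <think>...</think>, free-form content, then one or more calls such as:

<tool_call>
<function=get_weather>
<parameter=city>
Paris
</parameter>
</function>
</tool_call>

String-typed parameters are captured as raw text and re-encoded as JSON strings (with trailing whitespace trimmed by the mapper); parameters of any other type are parsed as JSON against the per-parameter schema, and the closing </parameter> tag is optional.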
@@ -2534,6 +2670,10 @@ static common_chat_params common_chat_templates_apply_jinja(
             src.find("<function=") != std::string::npos &&
             src.find("<parameters>") != std::string::npos &&
             src.find("<parameter=") != std::string::npos) {
+            // Nemotron 3 Nano 30B A3B
+            if (src.find("<think>") != std::string::npos) {
+                return common_chat_params_init_nemotron_v3(tmpl, params);
+            }
             return common_chat_params_init_qwen3_coder_xml(tmpl, params);
         }

@@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {

 std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }

-class SchemaConverter {
+class common_schema_converter {
 private:
+    friend class common_schema_info;
     friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
     std::function<json(const std::string &)> _fetch_json;
     bool _dotall;
@@ -729,7 +730,7 @@ private:
     }

 public:
-    SchemaConverter(
+    common_schema_converter(
         const std::function<json(const std::string &)> & fetch_json,
         bool dotall)
         : _fetch_json(fetch_json), _dotall(dotall)
@@ -990,6 +991,134 @@ public:
     }
 };

+// common_schema_info implementation (pimpl)
+
+common_schema_info::common_schema_info()
+    : impl_(std::make_unique<common_schema_converter>(
+        [](const std::string &) { return json(); },
+        false)) {}
+
+common_schema_info::~common_schema_info() = default;
+
+common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
+common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;
+
+void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
+    impl_->resolve_refs(schema, "");
+}
+
+// Determines if a JSON schema can resolve to a string type through any path.
+// Some models emit raw string values rather than JSON-encoded strings for string parameters.
+// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
+// true, allowing callers to handle the value as a raw string for simplicity.
+bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
+    std::unordered_set<std::string> visited_refs;
+
+    std::function<bool(const json &)> check = [&](const json & s) -> bool {
+        if (!s.is_object()) {
+            return false;
+        }
+
+        // Handle $ref
+        if (s.contains("$ref")) {
+            const std::string & ref = s["$ref"];
+            if (visited_refs.find(ref) != visited_refs.end()) {
+                // Circular reference, assume not a string to be safe
+                return false;
+            }
+            visited_refs.insert(ref);
+            auto it = impl_->_refs.find(ref);
+            if (it != impl_->_refs.end()) {
+                return check(it->second);
+            }
+            return false;
+        }
+
+        // Check type field
+        if (s.contains("type")) {
+            const json & schema_type = s["type"];
+            if (schema_type.is_string()) {
+                if (schema_type == "string") {
+                    return true;
+                }
+            } else if (schema_type.is_array()) {
+                // Type can be an array like ["string", "null"]
+                for (const auto & t : schema_type) {
+                    if (t == "string") {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        // Check oneOf/anyOf - if any alternative can be a string
+        if (s.contains("oneOf")) {
+            for (const auto & alt : s["oneOf"]) {
+                if (check(alt)) {
+                    return true;
+                }
+            }
+        }
+        if (s.contains("anyOf")) {
+            for (const auto & alt : s["anyOf"]) {
+                if (check(alt)) {
+                    return true;
+                }
+            }
+        }
+
+        // Check allOf - all components must be compatible with string type
+        if (s.contains("allOf")) {
+            bool all_string = true;
+            for (const auto & component : s["allOf"]) {
+                if (!check(component)) {
+                    all_string = false;
+                    break;
+                }
+            }
+            if (all_string) {
+                return true;
+            }
+        }
+
+        // Check const - if the constant value is a string
+        if (s.contains("const")) {
+            if (s["const"].is_string()) {
+                return true;
+            }
+        }
+
+        // Check enum - if any enum value is a string
+        if (s.contains("enum")) {
+            for (const auto & val : s["enum"]) {
+                if (val.is_string()) {
+                    return true;
+                }
+            }
+        }
+
+        // String-specific keywords imply string type
+        if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
+            return true;
+        }
+
+        // Check format - many formats imply string
+        if (s.contains("format")) {
+            const std::string & fmt = s["format"];
+            if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
+                fmt == "uri" || fmt == "email" || fmt == "hostname" ||
+                fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
+                fmt.find("uuid") == 0) {
+                return true;
+            }
+        }
+
+        return false;
+    };

+    return check(schema);
+}
+
 std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
 #ifdef LLAMA_USE_LLGUIDANCE
     if (!force_gbnf) {
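Since the new probe is public, it can be exercised directly. A minimal sketch, assuming the json-schema-to-grammar header is included; the schemas are made-up examples and expected results are shown in comments:

// Minimal sketch of the common_schema_info probe (hypothetical schemas).
#include <nlohmann/json.hpp>

void schema_info_example() {
    common_schema_info info;

    nlohmann::ordered_json s1 = {{"type", "string"}};
    nlohmann::ordered_json s2 = {{"type", {"string", "null"}}}; // type arrays count too
    nlohmann::ordered_json s3 = {{"type", "integer"}};
    nlohmann::ordered_json s4 = {{"enum", {"red", "green"}}};   // string enum values count

    bool a = info.resolves_to_string(s1); // true
    bool b = info.resolves_to_string(s2); // true
    bool c = info.resolves_to_string(s3); // false
    bool d = info.resolves_to_string(s4); // true
    (void) a; (void) b; (void) c; (void) d;
}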
@@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
 }

 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
-    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
+    common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
     common_grammar_builder builder {
         /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
             return converter._add_rule(name, rule);

@@ -3,11 +3,31 @@
 #include <nlohmann/json_fwd.hpp>

 #include <functional>
+#include <memory>
 #include <string>

 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
                                    bool force_gbnf = false);

+class common_schema_converter;
+
+// Probes a JSON schema to extract information about its structure and type constraints.
+class common_schema_info {
+    std::unique_ptr<common_schema_converter> impl_;
+
+  public:
+    common_schema_info();
+    ~common_schema_info();
+
+    common_schema_info(const common_schema_info &) = delete;
+    common_schema_info & operator=(const common_schema_info &) = delete;
+    common_schema_info(common_schema_info &&) noexcept;
+    common_schema_info & operator=(common_schema_info &&) noexcept;
+
+    void resolve_refs(nlohmann::ordered_json & schema);
+    bool resolves_to_string(const nlohmann::ordered_json & schema);
+};
+
 struct common_grammar_builder {
     std::function<std::string(const std::string &, const std::string &)> add_rule;
     std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;

@@ -425,7 +425,7 @@ struct parser_executor {

         if (result.need_more_input()) {
             // Propagate - need to know what child would match before negating
-            return result;
+            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
         }

         // Child failed, so negation succeeds

@@ -862,6 +862,14 @@ class TextModel(ModelBase):
                 logger.warning(f"Unknown RoPE type: {rope_type}")
             logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}")

+        if "mrope_section" in self.rope_parameters:
+            mrope_section = self.rope_parameters["mrope_section"]
+            # Pad to 4 dimensions [time, height, width, extra]
+            while len(mrope_section) < 4:
+                mrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+            logger.info(f"gguf: mrope sections: {mrope_section[:4]}")
+
         if (rope_theta := rope_params.get("rope_theta")) is not None:
             self.gguf_writer.add_rope_freq_base(rope_theta)
             logger.info(f"gguf: rope theta = {rope_theta}")

@@ -3739,9 +3747,6 @@ class Qwen2VLModel(TextModel):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
-        mrope_section += [0] * max(0, 4 - len(mrope_section))
-        self.gguf_writer.add_rope_dimension_sections(mrope_section)

     def set_vocab(self):
         try:

@@ -4377,6 +4382,30 @@ class Qwen3VLVisionModel(MmprojModel):
         return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration")
+class Glm4VVisionModel(Qwen3VLVisionModel):
+    def set_gguf_parameters(self):
+        MmprojModel.set_gguf_parameters(self)  # skip Qwen3VLVisionModel parameters
+        assert self.hparams_vision is not None
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
+
+        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
+        if hidden_act == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        elif hidden_act == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+
+        rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5)
+        self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):
+            name = name.replace("model.visual.", "visual.")
+        if name.startswith("visual.merger."):
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Qwen3VLForConditionalGeneration")
 class Qwen3VLTextModel(Qwen3Model):
     model_arch = gguf.MODEL_ARCH.QWEN3VL

@@ -4385,20 +4414,6 @@ class Qwen3VLTextModel(Qwen3Model):
         super().set_gguf_parameters()

         # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
-        text_config = self.hparams.get("text_config", {})
-        # rope_scaling is deprecated in V5, use rope_parameters instead
-        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
-
-        if rope_scaling.get("mrope_section"):
-            # mrope_section contains [time, height, width] dimensions
-            mrope_section = rope_scaling["mrope_section"]
-            # Pad to 4 dimensions [time, height, width, extra]
-            while len(mrope_section) < 4:
-                mrope_section.append(0)
-            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
-
-            logger.info(f"MRoPE sections: {mrope_section[:4]}")
-
         vision_config = self.hparams.get("vision_config", {})
         deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
         self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)

@@ -4417,22 +4432,6 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()

-        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
-        text_config = self.hparams.get("text_config", {})
-        # rope_scaling is deprecated in V5, use rope_parameters instead
-        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
-
-        if rope_scaling.get("mrope_section"):
-            # mrope_section contains [time, height, width] dimensions
-            mrope_section = rope_scaling["mrope_section"]
-            # Pad to 4 dimensions [time, height, width, extra]
-            while len(mrope_section) < 4:
-                mrope_section.append(0)
-            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
-
-            logger.info(f"MRoPE sections: {mrope_section[:4]}")
-
         vision_config = self.hparams.get("vision_config", {})
         deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
         self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)

@@ -7795,6 +7794,15 @@ class JaisModel(TextModel):
 @ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
 class Glm4Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GLM4
+    use_mrope = False
+    partial_rotary_factor = 0.5
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5)
+        if "mrope_section" in self.rope_parameters:
+            self.use_mrope = True
+            logger.info("Q/K weight will need to be permuted for M-RoPE")

     def set_vocab(self):
         from transformers import AutoTokenizer

@@ -7816,17 +7824,49 @@ class Glm4Model:
         super().set_gguf_parameters()
         if (rope_dim := self.hparams.get("head_dim")) is None:
             rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor))
+
+    @staticmethod
+    def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor:
+        orig_shape = weights.shape
+        if len(orig_shape) == 1:
+            weights = weights.unsqueeze(1)  # [out_dim, 1]
+        if len(weights.shape) != 2:
+            raise ValueError("Only 1D and 2D tensors are supported.")
+        n_effective_heads = weights.shape[0] // head_dim
+        if n_head_kv is not None and n_effective_heads != n_head:
+            if n_effective_heads != n_head_kv:
+                raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}")
+        rotary_dim = int(head_dim * partial_rotary_factor)
+        if rotary_dim % 2 != 0:
+            raise ValueError("rotary_dim must be even.")
+        reshaped = weights.reshape(n_effective_heads, head_dim, -1)
+        rot_part = reshaped[:, :rotary_dim, :]
+        non_rot_part = reshaped[:, rotary_dim:, :]
+        permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1)
+        combined = torch.cat((permuted_rot, non_rot_part), dim=1)
+        result = combined.reshape(weights.shape)
+        return result if len(orig_shape) != 1 else result.squeeze(1)

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.startswith("model.visual."):  # ignore visual part of Glm4v
             return []
         elif name.startswith("model.language_model."):
             name = name.replace("language_model.", "")  # for Glm4v
+        if self.use_mrope:
+            n_head = self.hparams["num_attention_heads"]
+            n_kv_head = self.hparams["num_key_value_heads"]
+            n_embd = self.hparams["hidden_size"]
+            head_dim = n_embd // n_head
+            # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor)
         return super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Glm4MoeForCausalLM")
+@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration")
 class Glm4MoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GLM4_MOE
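To see what `normal_to_neox` does to row order, here is a minimal sketch on a single head with a fully rotary `head_dim` of 4; shapes and values are invented for illustration:

```python
import torch

head_dim, rotary_dim, n_head = 4, 4, 1
# one row per output dimension of a single head; 2 input columns for brevity
w = torch.arange(head_dim * 2, dtype=torch.float32).reshape(head_dim, 2)

r = w.reshape(n_head, head_dim, -1)
rot = r[:, :rotary_dim, :]
# interleaved pairs (0,1),(2,3) become half-split pairs (0,2),(1,3)
neox = torch.cat((rot[:, ::2, :], rot[:, 1::2, :]), dim=1)
print(neox.reshape(head_dim, 2))  # rows reordered to 0, 2, 1, 3
```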
@@ -7893,6 +7933,7 @@ class Glm4MoeModel(TextModel):
     _experts: list[dict[str, Tensor]] | None = None

+    # note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already
     def modify_tensors(
         self, data_torch: Tensor, name: str, bid: int | None
     ) -> Iterable[tuple[str, Tensor]]:
@@ -8490,8 +8531,18 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
 class NemotronHModel(GraniteHybridModel):
     """Hybrid mamba2/attention model from NVIDIA"""
     model_arch = gguf.MODEL_ARCH.NEMOTRON_H
+    is_moe: bool = False

     def __init__(self, *args, **kwargs):
+        # We have to determine the correct model architecture (MoE vs non-MoE) before
+        # calling the parent __init__. This is because the parent constructor
+        # uses self.model_arch to build the tensor name map, and all MoE-specific
+        # mappings would be missed if it were called with the default non-MoE arch.
+        hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
+        if "num_experts_per_tok" in hparams:
+            self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
+            self.is_moe = True
+
         super().__init__(*args, **kwargs)

         # Save the top-level head_dim for later
@@ -8503,9 +8554,11 @@ class NemotronHModel(GraniteHybridModel):
         # Update the ssm / attn / mlp layers
         # M: Mamba2, *: Attention, -: MLP
+        # MoE:
+        # M: Mamba2, *: Attention, E: Expert
         hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
         self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
-        self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "-"]
+        self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]

     def get_attn_layers(self):
         hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
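The pattern string is just a per-layer type code; a small sketch (with an invented pattern) of how comprehensions like the ones above classify layers:

```python
pattern = "MM*MME*MM-"  # M: Mamba2, *: Attention, E: Expert (MoE), -: dense MLP

ssm_layers  = [i for i, c in enumerate(pattern) if c == "M"]
attn_layers = [i for i, c in enumerate(pattern) if c == "*"]
moe_layers  = [i for i, c in enumerate(pattern) if c == "E"]
print(ssm_layers)   # [0, 1, 3, 4, 7, 8]
print(attn_layers)  # [2, 6]
print(moe_layers)   # [5]
```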
@@ -8521,10 +8574,28 @@ class NemotronHModel(GraniteHybridModel):
         # Set feed_forward_length
         # NOTE: This will trigger an override warning. This is preferable to
         # duplicating all the parent logic
-        n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
-        self.gguf_writer.add_feed_forward_length([
-            n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
-        ])
+        if not self.is_moe:
+            n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
+            self.gguf_writer.add_feed_forward_length([
+                n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
+            ])
+        else:
+            moe_intermediate_size = self.hparams["moe_intermediate_size"]
+            self.gguf_writer.add_feed_forward_length([
+                moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count)
+            ])
+            self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+            self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
+            self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"])
+            self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
+            self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
+            self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+            self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+            self.gguf_writer.add_expert_group_count(self.hparams["n_group"])
+
+        # number of experts used per token (top-k)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)

     def set_vocab(self):
         super().set_vocab()
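As a rough usage sketch, the expert-metadata setters used above come from gguf-py's `GGUFWriter`; the file name and hparam values here are invented, and this only shows the calls, not a full conversion:

```python
from gguf import GGUFWriter

hparams = {  # invented toy values
    "n_routed_experts": 32,
    "num_experts_per_tok": 4,
    "moe_intermediate_size": 1024,
}

writer = GGUFWriter("toy.gguf", "nemotron_h_moe")
writer.add_expert_count(hparams["n_routed_experts"])
writer.add_expert_used_count(hparams["num_experts_per_tok"])
writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
```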
@@ -8532,8 +8603,82 @@ class NemotronHModel(GraniteHybridModel):
         # The tokenizer _does_ add a BOS token (via post_processor type
         # TemplateProcessing) but does not set add_bos_token to true in the
         # config, so we need to explicitly override it here.
-        self.gguf_writer.add_add_bos_token(True)
+        if not self.is_moe:
+            self.gguf_writer.add_add_bos_token(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if self.is_moe and bid is not None:
+            if name.endswith("mixer.gate.e_score_correction_bias"):
+                new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+                mapped_name = self.map_tensor_name(new_name)
+                return [(mapped_name, data_torch)]
+
+            if name.endswith("mixer.dt_bias"):
+                new_name = name.replace("dt_bias", "dt.bias")
+                mapped_name = self.map_tensor_name(new_name)
+                return [(mapped_name, data_torch)]
+
+            if name.endswith("mixer.conv1d.weight"):
+                squeezed_data = data_torch.squeeze()
+                mapped_name = self.map_tensor_name(name)
+                return [(mapped_name, squeezed_data)]
+
+            if name.endswith("mixer.A_log"):
+                transformed_data = -torch.exp(data_torch)
+                reshaped_data = transformed_data.squeeze().reshape(-1, 1)
+                mapped_name = self.map_tensor_name(name)
+                return [(mapped_name, reshaped_data)]
+
+            if name.endswith("mixer.D"):
+                reshaped_data = data_torch.squeeze().reshape(-1, 1)
+                mapped_name = self.map_tensor_name(name)
+                return [(mapped_name, reshaped_data)]
+
+            if name.endswith("mixer.norm.weight"):
+                reshaped_data = data_torch.reshape(8, 512)
+                mapped_name = self.map_tensor_name(name)
+                return [(mapped_name, reshaped_data)]
+
+            if name.find("mixer.experts") != -1:
+                n_experts = self.hparams["n_routed_experts"]
+                assert bid is not None
+
+                if self._experts is None:
+                    self._experts = [{} for _ in range(self.block_count)]
+
+                self._experts[bid][name] = data_torch
+
+                if len(self._experts[bid]) >= n_experts * 2:
+                    # merge the experts into a single tensor
+                    tensors: list[tuple[str, Tensor]] = []
+                    for w_name in ["down_proj", "up_proj"]:
+                        datas: list[Tensor] = []
+
+                        for xid in range(n_experts):
+                            ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight"
+                            datas.append(self._experts[bid][ename])
+                            del self._experts[bid][ename]
+
+                        data_torch = torch.stack(datas, dim=0)
+                        merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                        new_name = self.map_tensor_name(merged_name)
+                        tensors.append((new_name, data_torch))
+
+                    return tensors
+                else:
+                    return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")


 @ModelBase.register("BailingMoeForCausalLM")
 class BailingMoeModel(TextModel):
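The expert-merge branch above buffers `2 * n_experts` tensors per layer, then stacks each projection across experts. A minimal sketch of that step with invented shapes:

```python
import torch

n_experts, n_ff, n_embd = 4, 8, 16
per_expert = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

# one [n_experts, n_ff, n_embd] tensor, matching the merged GGUF layout
merged = torch.stack(per_expert, dim=0)
print(merged.shape)  # torch.Size([4, 8, 16])
```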
@@ -103,6 +103,8 @@ SYCL backend supports Intel GPU Family:
 - Intel Built-in Arc GPU
 - Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).

+On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md), although the performance is not optimal, and some GPUs may not support OpenCL or have any GPGPU capabilities.
+
 #### Verified devices

 | Intel GPU | Status | Verified Model |
@@ -97,7 +97,7 @@ The model params and tensors layout must be defined in `llama.cpp` source files:
 1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
 2. In `src/llama-arch.cpp`:
     - Add the architecture name to the `LLM_ARCH_NAMES` map.
-    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+    - Add the list of model tensors to `llm_get_tensor_names` (you may also need to update `LLM_TENSOR_NAMES`).
 3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
 4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
@@ -7,9 +9,9 @@
 ## Images
 We have three Docker images available for this project:

-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the `llama-cli` and `llama-completion` executables. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the `llama-server` executable. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)

 Additionally, there are the following images, similar to the above:
@@ -44,13 +44,15 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-one
 On completion, you are ready to play!

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run-legacy -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
 ```

 or with a light image:

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models --entrypoint /app/llama-cli ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf
+docker run -v /path/to/models:/models --entrypoint /app/llama-completion ghcr.io/ggml-org/llama.cpp:light -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
 ```

 or with a server image:
@@ -59,6 +61,8 @@ or with a server image:
 docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
 ```

+In the above examples, `--entrypoint /app/llama-cli` is specified for clarity, but you can safely omit it since it's the default entrypoint in the container.
+
 ## Docker With CUDA

 Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@@ -80,9 +84,9 @@ The defaults are:

 The resulting images are essentially the same as the non-CUDA images:

-1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
-3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
+1. `local/llama.cpp:full-cuda`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-cuda`: This image only includes the `llama-cli` and `llama-completion` executables.
+3. `local/llama.cpp:server-cuda`: This image only includes the `llama-server` executable.

 ## Usage
@@ -114,9 +118,9 @@ The defaults are:

 The resulting images are essentially the same as the non-MUSA images:

-1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
-3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
+1. `local/llama.cpp:full-musa`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-musa`: This image only includes the `llama-cli` and `llama-completion` executables.
+3. `local/llama.cpp:server-musa`: This image only includes the `llama-server` executable.

 ## Usage
@@ -48,7 +48,7 @@ static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
     }
 }

-static void export_md(std::string fname, llama_example ex) {
+static void export_md(std::string fname, llama_example ex, std::string name) {
     std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);

     common_params params;
@@ -72,13 +72,14 @@ static void export_md(std::string fname, llama_example ex) {
     write_table(file, common_options);
     file << "\n\n**Sampling params**\n\n";
     write_table(file, sparam_options);
-    file << "\n\n**Example-specific params**\n\n";
+    file << "\n\n**" << name << "-specific params**\n\n";
     write_table(file, specific_options);
 }

 int main(int, char **) {
-    export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION);
-    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
+    // TODO: add CLI
+    export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool");
+    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server");

     return 0;
 }
@@ -10,6 +10,13 @@ and in some cases perplexity checked of the quantized model. And finally the
 model/models need to be uploaded to the ggml-org on Hugging Face. This tool/example tries to
 help with this process.

+> 📝 **Note:** When adding a new model from an existing family, verify that the
+> previous version passes logits verification first. Existing models can have
+> subtle numerical differences that don't affect generation quality but cause
+> logits mismatches. Identifying upfront whether these exist in llama.cpp,
+> the conversion script, or an upstream implementation can save significant
+> debugging time.
+
 ### Overview
 The idea is that the makefile targets and scripts here can be used in the
 development/conversion process assisting with things like:
@@ -7,7 +7,7 @@ base_model:
 Recommended way to run this model:

 ```sh
-llama-server -hf {namespace}/{model_name}-GGUF -c 0 -fa
+llama-server -hf {namespace}/{model_name}-GGUF -c 0
 ```

 Then, access http://localhost:8080
@@ -34,8 +34,11 @@ done
 MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
 MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"

+CONVERTED_MODEL_PATH="${CONVERTED_EMBEDDING_PATH:-"$CONVERTED_EMBEDDING_MODEL"}"
+CONVERTED_MODEL_NAME="${CONVERTED_MODEL_NAME:-$(basename "$CONVERTED_MODEL_PATH" .gguf)}"
+
 if [ -t 0 ]; then
-    CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
+    CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
 else
     # Process piped JSON data and convert to binary (matching logits.cpp format)
     TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
@@ -413,6 +413,7 @@ class MODEL_ARCH(IntEnum):
     JAIS = auto()
     NEMOTRON = auto()
     NEMOTRON_H = auto()
+    NEMOTRON_H_MOE = auto()
     EXAONE = auto()
     EXAONE4 = auto()
     GRANITE = auto()

@@ -642,6 +643,7 @@ class MODEL_TENSOR(IntEnum):
     V_MMPROJ_PEG = auto()
     V_ENC_EMBD_CLS = auto()
     V_ENC_EMBD_PATCH = auto()
+    V_ENC_EMBD_NORM = auto()
     V_ENC_EMBD_POS = auto()
     V_ENC_INPUT_NORM = auto()
     V_ENC_ATTN_QKV = auto()

@@ -660,6 +662,7 @@ class MODEL_TENSOR(IntEnum):
     V_LAYER_SCALE_2 = auto()
     V_PRE_NORM = auto()
     V_POST_NORM = auto()
+    V_MM_POST_NORM = auto()
     V_MM_INP_NORM = auto()
     V_MM_INP_PROJ = auto()  # gemma3
     V_MM_SOFT_EMB_NORM = auto()  # gemma3

@@ -786,6 +789,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.JAIS: "jais",
     MODEL_ARCH.NEMOTRON: "nemotron",
     MODEL_ARCH.NEMOTRON_H: "nemotron_h",
+    MODEL_ARCH.NEMOTRON_H_MOE: "nemotron_h_moe",
     MODEL_ARCH.EXAONE: "exaone",
     MODEL_ARCH.EXAONE4: "exaone4",
     MODEL_ARCH.GRANITE: "granite",

@@ -1014,6 +1018,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}",
     MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
     MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
+    MODEL_TENSOR.V_ENC_EMBD_NORM: "v.norm_embd",
     MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
     MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv",
     MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",

@@ -1032,6 +1037,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
     MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
     MODEL_TENSOR.V_POST_NORM: "v.post_ln",
+    MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
     MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
     MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm",
     MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm",

@@ -1092,6 +1098,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.V_MMPROJ_PEG,
        MODEL_TENSOR.V_ENC_EMBD_CLS,
        MODEL_TENSOR.V_ENC_EMBD_PATCH,
+       MODEL_TENSOR.V_ENC_EMBD_NORM,
        MODEL_TENSOR.V_ENC_EMBD_POS,
        MODEL_TENSOR.V_ENC_INPUT_NORM,
        MODEL_TENSOR.V_ENC_ATTN_QKV,

@@ -1110,6 +1117,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.V_LAYER_SCALE_2,
        MODEL_TENSOR.V_PRE_NORM,
        MODEL_TENSOR.V_POST_NORM,
+       MODEL_TENSOR.V_MM_POST_NORM,
        MODEL_TENSOR.V_MM_INP_PROJ,
        MODEL_TENSOR.V_MM_INP_NORM,
        MODEL_TENSOR.V_MM_SOFT_EMB_NORM,

@@ -2529,6 +2537,33 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.NEMOTRON_H_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.SSM_IN,
+        MODEL_TENSOR.SSM_CONV1D,
+        MODEL_TENSOR.SSM_DT,
+        MODEL_TENSOR.SSM_A,
+        MODEL_TENSOR.SSM_D,
+        MODEL_TENSOR.SSM_NORM,
+        MODEL_TENSOR.SSM_OUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        # experts
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        # shared expert
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+    ],
     MODEL_ARCH.EXAONE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,

@@ -3328,6 +3363,7 @@ class VisionProjectorType:
     LIGHTONOCR = "lightonocr"
     COGVLM = "cogvlm"
     JANUS_PRO = "janus_pro"
+    GLM4V = "glm4v"


 # Items here are (block size, type size)
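A quick sanity check of the new registrations, using the enum and name tables above (the expected output strings are read off the entries shown in this diff, not verified against a running tree):

```python
from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES, MODEL_TENSOR, TENSOR_NAMES

print(MODEL_ARCH_NAMES[MODEL_ARCH.NEMOTRON_H_MOE])  # nemotron_h_moe
print(TENSOR_NAMES[MODEL_TENSOR.V_ENC_EMBD_NORM])   # v.norm_embd
print(TENSOR_NAMES[MODEL_TENSOR.V_MM_POST_NORM])    # mm.post_norm
```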
@@ -379,6 +379,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.gate",  # lfm2moe
            "model.layers.{bid}.mlp.router.gate",    # afmoe
            "layers.{bid}.gate",                     # mistral-large
+           "backbone.layers.{bid}.mixer.gate",      # nemotron-h-moe
        ),

        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (

@@ -392,6 +393,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.expert_bias",                     # afmoe
            "model.layers.{bid}.feed_forward.expert_bias",            # lfm2moe
            "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
+           "backbone.layers.{bid}.mixer.gate.e_score_correction"     # nemotron-h-moe
        ),

        # Feed-forward up

@@ -440,7 +442,7 @@ class TensorNameMap:
            "layers.{bid}.feed_forward.experts.w3",            # mixtral (merged)
            "transformer.decoder_layer.{bid}.moe.linear_v",    # Grok (merged)
            "transformer.blocks.{bid}.ffn.experts.mlp.v1",     # dbrx
-           "model.layers.{bid}.mlp.experts.up_proj",          # qwen2moe olmoe (merged) ernie4.5-moe
+           "model.layers.{bid}.mlp.experts.up_proj",          # qwen2moe olmoe (merged) ernie4.5-moe, nemotron-h-moe (merged)
            "model.layers.{bid}.block_sparse_moe.experts.w3",  # phimoe (merged)
            "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
            "encoder.layers.{bid}.mlp.experts.mlp.w1",         # nomic-bert-moe

@@ -454,6 +456,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.down_proj",
            "model.layers.{bid}.mlp.shared_mlp.up_proj",          # hunyuan
            "layers.{bid}.shared_experts.w3",                     # mistral-large
+           "backbone.layers.{bid}.mixer.shared_experts.up_proj", # nemotron-h-moe
        ),

        MODEL_TENSOR.FFN_UP_CHEXP: (

@@ -548,7 +551,7 @@ class TensorNameMap:
            "layers.{bid}.feed_forward.experts.w2",              # mixtral (merged)
            "transformer.decoder_layer.{bid}.moe.linear_1",      # Grok (merged)
            "transformer.blocks.{bid}.ffn.experts.mlp.w2",       # dbrx
-           "model.layers.{bid}.mlp.experts.down_proj",          # qwen2moe olmoe (merged) ernie4.5-moe
+           "model.layers.{bid}.mlp.experts.down_proj",          # qwen2moe olmoe (merged) ernie4.5-moe nemotron-h-moe (merged)
            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
            "model.layers.{bid}.block_sparse_moe.experts.w2",    # phimoe (merged)
            "model.layers.{bid}.feed_forward.experts.down_proj", # llama4

@@ -563,6 +566,7 @@ class TensorNameMap:
            "model.layers.{bid}.shared_mlp.output_linear",          # granitemoe
            "model.layers.{bid}.mlp.shared_mlp.down_proj",          # hunyuan
            "layers.{bid}.shared_experts.w2",                       # mistral-large
+           "backbone.layers.{bid}.mixer.shared_experts.down_proj", # nemotron-h-moe
        ),

        MODEL_TENSOR.FFN_DOWN_CHEXP: (

@@ -706,6 +710,7 @@ class TensorNameMap:
            "model.layers.{bid}.mamba.dt_proj",        # jamba falcon-h1 granite-hybrid
            "model.layers.layers.{bid}.mixer.dt_proj", # plamo2
            "model.layers.{bid}.linear_attn.dt_proj",  # qwen3next
+           "backbone.layers.{bid}.mixer.dt",          # nemotron-h-moe
        ),

        MODEL_TENSOR.SSM_DT_NORM: (

@@ -1207,6 +1212,7 @@ class TensorNameMap:
        MODEL_TENSOR.V_MMPROJ_FC: (
            "model.connector.modality_projection.proj", # SmolVLM
            "model.vision.linear_proj.linear_proj",     # cogvlm
+           "visual.merger.proj",                       # glm4v
        ),

        MODEL_TENSOR.V_MMPROJ_MLP: (

@@ -1240,6 +1246,10 @@ class TensorNameMap:
            "model.vision.patch_embedding.proj", # cogvlm
        ),

+       MODEL_TENSOR.V_ENC_EMBD_NORM: (
+           "visual.post_conv_layernorm", # glm4v
+       ),
+
        MODEL_TENSOR.V_ENC_EMBD_POS: (
            "vision_tower.vision_model.embeddings.position_embedding",
            "model.vision_tower.embeddings.position_embeddings", # Intern-S1

@@ -1249,6 +1259,7 @@ class TensorNameMap:
            "vision_tower.patch_embed.pos_emb",               # kimi-vl
            "visual.pos_embed",                               # qwen3vl
            "model.vision.patch_embedding.position_embedding", # cogvlm
+           "visual.embeddings.position_embedding",           # glm4v
        ),

        MODEL_TENSOR.V_ENC_ATTN_QKV: (

@@ -1404,6 +1415,11 @@ class TensorNameMap:
            "vision_model.layernorm_post",          # llama4
            "visual.merger.ln_q",                   # qwen2vl
            "vision_tower.encoder.final_layernorm", # kimi-vl
+           "visual.post_layernorm",                # glm4v
+       ),
+
+       MODEL_TENSOR.V_MM_POST_NORM: (
+           "visual.merger.post_projection_norm",   # glm4v
        ),

        MODEL_TENSOR.V_MM_INP_PROJ: (

@@ -1473,6 +1489,7 @@ class TensorNameMap:
        MODEL_TENSOR.V_MM_PATCH_MERGER: (
            "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf
            "patch_merger.merging_layer",                       # mistral
+           "visual.downsample",                                # glm4v
        ),

        MODEL_TENSOR.V_DS_NORM: (

@@ -1493,14 +1510,17 @@ class TensorNameMap:
        MODEL_TENSOR.V_MM_UP: (
            "model.vision.linear_proj.dense_h_to_4h", # cogvlm
+           "visual.merger.up_proj",                  # glm4v
        ),

        MODEL_TENSOR.V_MM_DOWN: (
            "model.vision.linear_proj.dense_4h_to_h", # cogvlm
+           "visual.merger.down_proj",                # glm4v
        ),

        MODEL_TENSOR.V_MM_GATE: (
            "model.vision.linear_proj.gate_proj",     # cogvlm
+           "visual.merger.gate_proj",                # glm4v
        ),

        MODEL_TENSOR.V_TOK_BOI: (
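To check that a new mapping resolves, gguf-py's tensor name map can be queried directly. A hedged sketch, assuming the NEMOTRON_H_MOE arch from this diff; the layer index, suffix handling, and expected result are illustrative:

```python
import gguf

# build the name map for a tiny 2-block model
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.NEMOTRON_H_MOE, n_blocks=2)
name = tmap.get_name("backbone.layers.0.mixer.gate.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # should resolve to the FFN_GATE_INP name for block 0
```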
@@ -0,0 +1,204 @@
+{% macro render_extra_keys(json_dict, handled_keys) %}
+{%- if json_dict is mapping %}
+{%- for json_key in json_dict if json_key not in handled_keys %}
+{%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}
+{{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}
+{%- else %}
+{{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}
+{%- endif %}
+{%- endfor %}
+{%- endif %}
+{% endmacro %}
+{%- set enable_thinking = enable_thinking if enable_thinking is defined else True %}
+{%- set truncate_history_thinking = truncate_history_thinking if truncate_history_thinking is defined else True %}
+
+{%- set ns = namespace(last_user_idx = -1) %}
+{%- set loop_messages = messages %}
+{%- for m in loop_messages %}
+{%- if m["role"] == "user" %}
+{%- set ns.last_user_idx = loop.index0 %}
+{%- endif %}
+{%- endfor %}
+
+{%- if messages[0]["role"] == "system" %}
+{%- set system_message = messages[0]["content"] %}
+{%- set loop_messages = messages[1:] %}
+{%- else %}
+{%- set system_message = "" %}
+{%- set loop_messages = messages %}
+{%- endif %}
+{%- if not tools is defined %}
+{%- set tools = [] %}
+{%- endif %}
+{# Recompute last_user_idx relative to loop_messages after handling system #}
+{%- set ns = namespace(last_user_idx = -1) %}
+{%- for m in loop_messages %}
+{%- if m["role"] == "user" %}
+{%- set ns.last_user_idx = loop.index0 %}
+{%- endif %}
+{%- endfor %}
+{%- if system_message is defined %}
+{{- "<|im_start|>system\n" + system_message }}
+{%- else %}
+{%- if tools is iterable and tools | length > 0 %}
+{{- "<|im_start|>system\n" }}
+{%- endif %}
+{%- endif %}
+{%- if tools is iterable and tools | length > 0 %}
+{%- if system_message is defined and system_message | length > 0 %}
+{{- "\n\n" }}
+{%- endif %}
+{{- "# Tools\n\nYou have access to the following functions:\n\n" }}
+{{- "<tools>" }}
+{%- for tool in tools %}
+{%- if tool.function is defined %}
+{%- set tool = tool.function %}
+{%- endif %}
+{{- "\n<function>\n<name>" ~ tool.name ~ "</name>" }}
+{%- if tool.description is defined %}
+{{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }}
+{%- endif %}
+{{- '\n<parameters>' }}
+{%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}
+{%- for param_name, param_fields in tool.parameters.properties|items %}
+{{- '\n<parameter>' }}
+{{- '\n<name>' ~ param_name ~ '</name>' }}
+{%- if param_fields.type is defined %}
+{{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }}
+{%- endif %}
+{%- if param_fields.description is defined %}
+{{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }}
+{%- endif %}
+{%- if param_fields.enum is defined %}
+{{- '\n<enum>' ~ (param_fields.enum | tojson | safe) ~ '</enum>' }}
+{%- endif %}
+{%- set handled_keys = ['name', 'type', 'description', 'enum'] %}
+{{- render_extra_keys(param_fields, handled_keys) }}
+{{- '\n</parameter>' }}
+{%- endfor %}
+{%- endif %}
+{% set handled_keys = ['type', 'properties', 'required'] %}
+{{- render_extra_keys(tool.parameters, handled_keys) }}
+{%- if tool.parameters is defined and tool.parameters.required is defined %}
+{{- '\n<required>' ~ (tool.parameters.required | tojson | safe) ~ '</required>' }}
+{%- endif %}
+{{- '\n</parameters>' }}
+{%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}
+{{- render_extra_keys(tool, handled_keys) }}
+{{- '\n</function>' }}
+{%- endfor %}
+{{- "\n</tools>" }}
+
+{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+{%- endif %}
+
+
+{%- if system_message is defined %}
+{{- '<|im_end|>\n' }}
+{%- else %}
+{%- if tools is iterable and tools | length > 0 %}
+{{- '<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+
+{%- for message in loop_messages %}
+{%- if message.role == "assistant" %}
+{# Add reasoning content into the content field for unified processing below. #}
+{%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %}
+{%- set content = "<think>\n" ~ message.reasoning_content ~ "\n</think>\n" ~ (message.content | default('', true)) %}
+{%- else %}
+{%- set content = message.content | default('', true) %}
+{%- if content is string -%}
+{# Allow downstream logic to take care of broken thought, only handle coherent reasoning here. #}
+{%- if '<think>' not in content and '</think>' not in content -%}
+{%- set content = "<think></think>" ~ content -%}
+{%- endif -%}
+{%- else -%}
+{%- set content = content -%}
+{%- endif -%}
+{%- endif %}
+{%- if message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}
+{# Assistant message has tool calls. #}
+{{- '<|im_start|>assistant\n' }}
+{%- set include_content = not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %}
+{%- if content is string and content | trim | length > 0 %}
+{%- if include_content %}
+{{- (content | trim) ~ '\n' -}}
+{%- else %}
+{%- set c = (content | string) %}
+{%- if '</think>' in c %}
+{# Keep only content after the last closing think. Also generation prompt causes this. #}
+{%- set c = c.split('</think>')[-1] %}
+{%- elif '<think>' in c %}
+{# If <think> was opened but never closed, drop the trailing think segment #}
+{%- set c = c.split('<think>')[0] %}
+{%- endif %}
+{%- set c = "<think></think>" ~ c | trim %}
+{%- if c | length > 0 %}
+{{- c ~ '\n' -}}
+{%- endif %}
+{%- endif %}
+{%- else %}
+{{- "<think></think>" -}}
+{%- endif %}
+{%- for tool_call in message.tool_calls %}
+{%- if tool_call.function is defined %}
+{%- set tool_call = tool_call.function %}
+{%- endif %}
+{{- '<tool_call>\n<function=' ~ tool_call.name ~ '>\n' -}}
+{%- if tool_call.arguments is defined %}
+{%- for args_name, args_value in tool_call.arguments|items %}
+{{- '<parameter=' ~ args_name ~ '>\n' -}}
+{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+{{- args_value ~ '\n</parameter>\n' -}}
+{%- endfor %}
+{%- endif %}
+{{- '</function>\n</tool_call>\n' -}}
+{%- endfor %}
+{{- '<|im_end|>\n' }}
+{%- else %}
+{# Assistant message doesn't have tool calls. #}
+{%- if not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %}
+{{- '<|im_start|>assistant\n' ~ (content | default('', true) | string | trim) ~ '<|im_end|>\n' }}
+{%- else %}
+{%- set c = (content | default('', true) | string) %}
+{%- if '<think>' in c and '</think>' in c %}
+{%- set c = "<think></think>" ~ c.split('</think>')[-1] %}
+{%- endif %}
+{%- set c = c | trim %}
+{%- if c | length > 0 %}
+{{- '<|im_start|>assistant\n' ~ c ~ '<|im_end|>\n' }}
+{%- else %}
+{{- '<|im_start|>assistant\n<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+{%- endif %}
+{%- elif message.role == "user" or message.role == "system" %}
+{{- '<|im_start|>' + message.role + '\n' }}
+{%- set content = message.content | string %}
+{{- content }}
+{{- '<|im_end|>\n' }}
+{%- elif message.role == "tool" %}
+{%- if loop.previtem and loop.previtem.role != "tool" %}
+{{- '<|im_start|>user\n' }}
+{%- endif %}
+{{- '<tool_response>\n' }}
+{{- message.content }}
+{{- '\n</tool_response>\n' }}
+{%- if not loop.last and loop.nextitem.role != "tool" %}
+{{- '<|im_end|>\n' }}
+{%- elif loop.last %}
+{{- '<|im_end|>\n' }}
+{%- endif %}
+{%- else %}
+{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }}
+{%- endif %}
+{%- endfor %}
+
+{%- if add_generation_prompt %}
+{%- if enable_thinking %}
+{{- '<|im_start|>assistant\n<think>\n' }}
+{%- else %}
+{{- '<|im_start|>assistant\n<think></think>' }}
+{%- endif %}
+{%- endif %}
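The template renders with plain Jinja (3.1+, which ships the `items` filter used above). A hedged smoke test with placeholder messages; the template file name is hypothetical:

```python
import jinja2  # requires jinja2 >= 3.1 for the `items` filter

with open("chat-template.jinja") as f:  # hypothetical file name
    template = jinja2.Template(f.read())

print(template.render(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    add_generation_prompt=True,
    enable_thinking=True,
))
```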
src/llama-arch.cpp (3772 lines changed): file diff suppressed because it is too large.
@@ -3,6 +3,7 @@
 #include "ggml.h" // ggml_op

 #include <string>
+#include <set>

 //
 // gguf constants (sync with gguf.py)

@@ -79,6 +80,7 @@ enum llm_arch {
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_NEMOTRON_H,
+    LLM_ARCH_NEMOTRON_H_MOE,
     LLM_ARCH_EXAONE,
     LLM_ARCH_EXAONE4,
     LLM_ARCH_RWKV6,

@@ -315,6 +317,7 @@ enum llm_tensor {
     LLM_TENSOR_DENSE_3_OUT,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
+    LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
     LLM_TENSOR_ROPE_FREQS,
     LLM_TENSOR_ROPE_FACTORS_LONG,
     LLM_TENSOR_ROPE_FACTORS_SHORT,

@@ -525,6 +528,10 @@ struct LLM_TN_IMPL {
     const int bid;
     const int xid;

+    const std::set<llm_tensor> model_tensors;
+
+    LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
+
     std::string str() const;

     operator std::string() const {

@@ -546,11 +553,11 @@ struct LLM_TN {
     llm_arch arch;

     LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
-        return { arch, tensor, suffix, bid, xid };
+        return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
     }

     LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
-        return { arch, tensor, nullptr, bid, xid };
+        return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
     }
 };
@ -254,6 +254,24 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
|
||||||
|
const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);
|
||||||
|
|
||||||
|
this->mctx = mctx;
|
||||||
|
|
||||||
|
bool res = true;
|
||||||
|
|
||||||
|
res &= s_copy->ne[0] == mctx->get_n_rs();
|
||||||
|
|
||||||
|
res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
|
||||||
|
res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
|
||||||
|
|
||||||
|
res &= head == mctx->get_head();
|
||||||
|
res &= rs_z == mctx->get_rs_z();
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
|
void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
|
||||||
GGML_UNUSED(ubatch);
|
GGML_UNUSED(ubatch);
|
||||||
|
|
||||||
|
|
@ -461,8 +479,46 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
|
void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
|
||||||
inp_attn->set_input(ubatch);
|
mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
|
||||||
inp_rs->set_input(ubatch);
|
mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
|
||||||
|
|
||||||
|
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
|
||||||
|
|
||||||
|
const int64_t n_rs = mctx->get_recr()->get_n_rs();
|
||||||
|
|
||||||
|
if (inp_rs->s_copy) {
|
||||||
|
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
|
||||||
|
int32_t * data = (int32_t *) inp_rs->s_copy->data;
|
||||||
|
|
||||||
|
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
|
||||||
|
for (uint32_t i = 0; i < n_rs; ++i) {
|
||||||
|
data[i] = mctx->get_recr()->s_copy(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
|
||||||
|
const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
|
||||||
|
|
||||||
|
this->mctx = mctx;
|
||||||
|
|
||||||
|
bool res = true;
|
||||||
|
|
||||||
|
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
|
||||||
|
//res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||||
|
|
||||||
|
res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
|
||||||
|
res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
|
||||||
|
|
||||||
|
res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
|
||||||
|
|
||||||
|
res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
|
||||||
|
res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
|
||||||
|
|
||||||
|
res &= inp_rs->head == mctx->get_recr()->get_head();
|
||||||
|
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
|
||||||
|
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
@@ -1089,6 +1145,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                     cur = ggml_relu(ctx0, cur);
                     cb(cur, "ffn_moe_relu", il);
                 } break;
+            case LLM_FFN_RELU_SQR:
+                if (gate_exps) {
+                    // TODO: add support for gated squared relu
+                    GGML_ABORT("fatal error: gated squared relu not implemented");
+                } else {
+                    cur = ggml_relu(ctx0, cur);
+                    cur = ggml_sqr(ctx0, cur);
+                    cb(cur, "ffn_moe_relu_sqr", il);
+                } break;
             default:
                 GGML_ABORT("fatal error");
         }
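For reference, `LLM_FFN_RELU_SQR` composes `ggml_relu` and `ggml_sqr`, i.e. relu(x)^2 applied element-wise. A scalar reference implementation (illustrative only, not part of the change):

    #include <algorithm>
    // relu(x)^2 per element, the activation the new MoE case applies
    static float relu_sqr(float x) {
        const float r = std::max(x, 0.0f);
        return r * r;
    }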
@@ -1841,6 +1906,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
     inp->s_copy_main  = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
     inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
 
+    inp->head = mctx_cur->get_head();
+    inp->rs_z = mctx_cur->get_rs_z();
+
     return inp;
 }
 
@@ -1909,10 +1977,10 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
 llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
 
-    auto inp_rs   = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
+    auto inp_rs   = build_rs_inp_impl     (ctx0, ubatch, mctx_cur->get_recr());
     auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
 
-    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
+    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
 
     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }
 
@@ -225,6 +225,8 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * s_copy; // I32 [n_rs]
 
     // views of s_copy, computed once per graph
@@ -233,6 +235,10 @@ public:
     ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
 
     const llama_memory_recurrent_context * mctx;
+
+    // used in view offsets, need to match for valid graph reuse
+    uint32_t head;
+    int32_t  rs_z;
 };
 
 class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -365,22 +371,28 @@ public:
 class llm_graph_input_mem_hybrid : public llm_graph_input_i {
 public:
     llm_graph_input_mem_hybrid(
+            const llama_cparams & cparams,
             std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
             std::unique_ptr<llm_graph_input_rs>      inp_rs,
             const llama_memory_hybrid_context *      mctx) :
         inp_attn(std::move(inp_attn)),
         inp_rs(std::move(inp_rs)),
+        cparams(cparams),
         mctx(mctx) { }
     virtual ~llm_graph_input_mem_hybrid() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
     std::unique_ptr<llm_graph_input_rs>      inp_rs;
 
     llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
     llm_graph_input_rs      * get_recr() const { return inp_rs.get(); }
 
+    const llama_cparams cparams;
+
     const llama_memory_hybrid_context * mctx;
 };
 
@@ -2,6 +2,7 @@
 
 #include "ggml.h"
 
+#include <algorithm>
 #include <cassert>
 
 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
@@ -230,3 +231,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
 
     return false;
 }
+
+bool llama_hparams::use_mrope() const {
+    return rope_sections[0] > 0 && rope_sections[1] > 0;
+}
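`use_mrope()` keys off the first two rope sections, so GGUFs converted before `LLM_KV_ROPE_DIMENSION_SECTIONS` was written (array left zeroed) keep the old behavior automatically. An illustrative standalone version of the check (a stand-in, not the real struct):

    // Stand-in for llama_hparams::use_mrope(); a model that never wrote
    // rope sections leaves the array zeroed, so mrope stays disabled.
    static bool use_mrope(const int (&rope_sections)[4]) {
        return rope_sections[0] > 0 && rope_sections[1] > 0;
    }
    // {0, 0, 0, 0}   -> false: GLM4 falls back to normal RoPE
    // {24, 20, 0, 0} -> true:  multimodal positions, LLAMA_ROPE_TYPE_MROPE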
@@ -270,6 +270,8 @@ struct llama_hparams {
     // TODO: think of a better place for this function
     // TODO: pack the SWA params in a struct?
     static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
+
+    bool use_mrope() const;
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
 
@@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16B_A1B:   return "16B.A1B";
         case LLM_TYPE_21B_A3B:   return "21B.A3B";
         case LLM_TYPE_30B_A3B:   return "30B.A3B";
+        case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
         case LLM_TYPE_80B_A3B:   return "80B.A3B";
         case LLM_TYPE_100B_A6B:  return "100B.A6B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
@@ -1689,6 +1690,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_GLM4:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
                 switch (hparams.n_layer) {
                     case 40: type = LLM_TYPE_9B; break;
                     case 61: type = LLM_TYPE_32B; break;
@@ -1699,6 +1701,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
 
                 // MoE parameters
                 ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
@@ -1797,6 +1800,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
                 ml.get_key(LLM_KV_SSM_INNER_SIZE,  hparams.ssm_d_inner);
@@ -1812,7 +1816,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp,             false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp,           false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared,      false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm,  false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
+
                 switch (hparams.n_layer) {
+                    case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
                     case 56: type = LLM_TYPE_9B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
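All of the new expert keys are loaded with `required = false`, so a plain Nemotron-H GGUF without MoE metadata still loads and the hparams fields keep their defaults; afterwards only the layer-count switch distinguishes the two arches. A sketch of the loader convention, using the calls visible in this hunk:

    // required key: the loader reports an error if the GGUF lacks it
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
    // optional key: hparams field keeps its default when the key is absent
    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);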
@@ -5159,6 +5170,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 // mamba2 Mixer SSM params
                 // NOTE: int64_t for tensor dimensions
@@ -5169,6 +5181,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 const int64_t n_group   = hparams.ssm_n_group;
                 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
 
+                const int64_t n_ff_exp   = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
                 // embeddings
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
|
||||||
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
|
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
|
||||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
|
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
|
||||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||||
|
} else {
|
||||||
|
if (n_expert != 0) {
|
||||||
|
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
|
||||||
|
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
|
||||||
|
|
||||||
|
// MoE branch
|
||||||
|
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||||
|
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||||
|
|
||||||
|
// Shared expert branch
|
||||||
|
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
|
||||||
|
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// mlp layers
|
// mlp layers
|
||||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
|
||||||
|
|
@@ -5226,6 +5254,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
                     }
                 }
+                }
             } break;
         case LLM_ARCH_EXAONE:
             {
@@ -6207,7 +6236,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
-                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
                 output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
 
                 if (output == NULL) {
@@ -6850,7 +6879,8 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_PLAMO2 ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
         arch == LLM_ARCH_QWEN3NEXT ||
-        arch == LLM_ARCH_NEMOTRON_H) {
+        arch == LLM_ARCH_NEMOTRON_H ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
         LLAMA_LOG_INFO("%s: ssm_d_conv       = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner      = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state      = %u\n", __func__, hparams.ssm_d_state);
@@ -6905,7 +6935,8 @@ void llama_model::print_info() const {
     if (arch == LLM_ARCH_MINICPM ||
         arch == LLM_ARCH_GRANITE ||
         arch == LLM_ARCH_GRANITE_MOE ||
-        arch == LLM_ARCH_GRANITE_HYBRID) {
+        arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7086,7 +7117,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             if (arch == LLM_ARCH_FALCON_H1) {
                 filter_attn = [&](int32_t) { return true; };
                 filter_recr = [&](int32_t) { return true; };
-            } else if (arch == LLM_ARCH_NEMOTRON_H) {
+            } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
                 filter_attn = [&](int32_t il) {
                     return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                 };
@@ -7457,6 +7488,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_nemotron>(*this, params);
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 llm = std::make_unique<llm_build_nemotron_h>(*this, params);
             } break;
@@ -7741,6 +7773,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARWKV7:
         case LLM_ARCH_WAVTOKENIZER_DEC:
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7761,7 +7794,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
-        case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_GRANITE_HYBRID:
@@ -7823,7 +7855,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_SMALLTHINKER:
-        case LLM_ARCH_GLM4_MOE:
         case LLM_ARCH_SEED_OSS:
         case LLM_ARCH_GROVEMOE:
         case LLM_ARCH_APERTUS:
@@ -7840,6 +7871,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3VLMOE:
             return LLAMA_ROPE_TYPE_IMROPE;
 
+        case LLM_ARCH_GLM4:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+        case LLM_ARCH_GLM4_MOE:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");
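With GLM4 and GLM4_MOE moved out of the static case lists above, the rope type for these arches is no longer a compile-time property of the architecture: it depends on the hparams loaded from the GGUF. A hypothetical caller-side check (the function itself is existing API, the surrounding code is illustrative):

    // the rope type can only be decided after the model's hparams are loaded
    const llama_rope_type rt = llama_model_rope_type(model);
    if (rt == LLAMA_ROPE_TYPE_MROPE) {
        // multimodal GLM4 variant: positions carry multiple components per token
    }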
@@ -113,6 +113,7 @@ enum llm_type {
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_31B_A3_5B,
     LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
@@ -241,6 +241,13 @@ static void llama_params_fit_impl(
             global_surplus += memory_reduction;
             LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                 __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
+            if (global_surplus >= 0) {
+                if (nd == 1) {
+                    LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
+                    return;
+                }
+                LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
+            }
         } else {
             LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
                 __func__, hp_nct, n_ctx_min);
@@ -249,10 +256,6 @@ static void llama_params_fit_impl(
             LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
         }
     }
-    if (global_surplus >= 0) {
-        LLAMA_LOG_INFO("%s: entire model can be fit across devices by reducing context\n", __func__);
-        return;
-    }
 }
 
     if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
||||||
|
|
@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
|
||||||
|
int sections[4];
|
||||||
|
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||||
|
|
||||||
ggml_tensor * cur;
|
ggml_tensor * cur;
|
||||||
ggml_tensor * inpL;
|
ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = build_inp_embd(model.tok_embd);
|
inpL = build_inp_embd(model.tok_embd);
|
||||||
|
|
||||||
|
bool use_mrope = hparams.use_mrope();
|
||||||
|
if (ubatch.embd && !use_mrope) {
|
||||||
|
// unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
|
||||||
|
GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
|
||||||
|
}
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
ggml_tensor * inp_pos = build_inp_pos();
|
ggml_tensor * inp_pos = build_inp_pos();
|
||||||
|
|
||||||
|
|
@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
|
||||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||||
cb(Kcur, "Kcur_normed", il);
|
cb(Kcur, "Kcur_normed", il);
|
||||||
}
|
}
|
||||||
Qcur = ggml_rope_ext(
|
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
|
||||||
);
|
|
||||||
|
|
||||||
Kcur = ggml_rope_ext(
|
if (use_mrope) {
|
||||||
ctx0, Kcur, inp_pos, nullptr,
|
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
);
|
|
||||||
|
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
|
||||||
|
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
} else {
|
||||||
|
// Normal RoPE
|
||||||
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
|
||||||
|
rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
|
||||||
|
rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
}
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
|
||||||
|
|
@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
|
||||||
|
int sections[4];
|
||||||
|
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||||
|
|
||||||
ggml_tensor * cur;
|
ggml_tensor * cur;
|
||||||
ggml_tensor * inpL;
|
ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = build_inp_embd(model.tok_embd);
|
inpL = build_inp_embd(model.tok_embd);
|
||||||
|
|
||||||
|
bool use_mrope = hparams.use_mrope();
|
||||||
|
if (ubatch.embd && !use_mrope) {
|
||||||
|
// unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
|
||||||
|
GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
|
||||||
|
}
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
ggml_tensor * inp_pos = build_inp_pos();
|
ggml_tensor * inp_pos = build_inp_pos();
|
||||||
|
|
||||||
|
|
@@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
                 Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                         cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
             }
-            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
 
-            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
 
@@ -441,23 +441,13 @@ private:
             ggml_tensor * cur,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
                     int il);
 
     ggml_tensor * build_layer_ffn(
             ggml_tensor * cur,
                     int il);
 
-    ggml_tensor * build_delta_net_recurrent(
-            ggml_tensor * q,
-            ggml_tensor * k,
-            ggml_tensor * v,
-            ggml_tensor * g,
-            ggml_tensor * beta,
-            ggml_tensor * state,
-            ggml_tensor * causal_mask,
-            ggml_tensor * identity,
-                    int il);
-
     ggml_tensor * build_delta_net_chunking(
             ggml_tensor * q,
             ggml_tensor * k,
@@ -467,6 +457,16 @@ private:
             ggml_tensor * state,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
+                    int il);
+
+    ggml_tensor * build_delta_net_autoregressive(
+            ggml_tensor * q,
+            ggml_tensor * k,
+            ggml_tensor * v,
+            ggml_tensor * g,
+            ggml_tensor * beta,
+            ggml_tensor * state,
                     int il);
 
     ggml_tensor * build_norm_gated(
@@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
 }
 
 ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
+    if (model.layers[il].ffn_gate_inp == nullptr) {
     cur = build_ffn(cur,
             model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
             NULL, NULL, NULL,
             model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-            NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+            NULL,
+            LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
     cb(cur, "ffn_out", il);
+    } else {
+        ggml_tensor * ffn_inp = cur;
+        ggml_tensor * moe_out =
+            build_moe_ffn(ffn_inp,
+                model.layers[il].ffn_gate_inp,
+                model.layers[il].ffn_up_exps,
+                nullptr, // no gate
+                model.layers[il].ffn_down_exps,
+                model.layers[il].ffn_exp_probs_b,
+                n_expert, n_expert_used,
+                LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
+                true, hparams.expert_weights_scale,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+                il);
+        cb(moe_out, "ffn_moe_out", il);
+
+        ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+                model.layers[il].ffn_up_shexp,   NULL, NULL,
+                NULL /* no gate */ , NULL, NULL,
+                model.layers[il].ffn_down_shexp, NULL, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(ffn_shexp, "ffn_shexp", il);
+
+        cur = ggml_add(ctx0, moe_out, ffn_shexp);
+        cb(cur, "ffn_out", il);
+    }
 
     cur = build_cvec(cur, il);
     cb(cur, "l_out", il);
 
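As in the other shared-expert arches in this file, the routed MoE output and the always-on shared expert are computed from the same layer input and summed. Condensed dataflow of the new branch (names from the hunk; a sketch, not additional code):

    // ffn_inp -> moe_out   : sigmoid-gated top-k routed experts, squared-ReLU FFN
    // ffn_inp -> ffn_shexp : dense shared expert, same activation
    // cur = moe_out + ffn_shexp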
@@ -17,13 +17,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     ggml_tensor * causal_mask =
-        ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f),
+        ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
                 GGML_TRI_TYPE_LOWER);
 
-    ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f));
+    ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
+    ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
 
     ggml_build_forward_expand(gf, causal_mask);
     ggml_build_forward_expand(gf, identity);
+    ggml_build_forward_expand(gf, diag_mask);
 
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
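Building the masks at a fixed `CHUNK_SIZE` (instead of `ubatch.n_seq_tokens`) lets every chunk of the delta-net path share the same tensors, and `diag_mask` folds the previously per-call `causal + identity` add into the graph once. For illustration, with CHUNK_SIZE = 4 and assuming `GGML_TRI_TYPE_LOWER` excludes the diagonal (which the separate identity add implies):

    // causal_mask        identity           diag_mask = causal_mask + identity
    // 0 0 0 0            1 0 0 0            1 0 0 0
    // 1 0 0 0            0 1 0 0            1 1 0 0
    // 1 1 0 0            0 0 1 0            1 1 1 0
    // 1 1 1 0            0 0 0 1            1 1 1 1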
@@ -34,7 +36,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
         // Determine layer type and build appropriate attention mechanism
         if (hparams.is_recurrent(il)) {
             // Linear attention layer (gated delta net)
-            cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, il);
+            cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
         } else {
             // Full attention layer
             cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
@@ -93,14 +95,8 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
         ggml_tensor * state,
         ggml_tensor * causal_mask,
         ggml_tensor * identity,
+        ggml_tensor * diag_mask,
                 int il) {
-    GGML_ASSERT(ggml_is_contiguous(q));
-    GGML_ASSERT(ggml_is_contiguous(k));
-    GGML_ASSERT(ggml_is_contiguous(v));
-    GGML_ASSERT(ggml_is_contiguous(g));
-    GGML_ASSERT(ggml_is_contiguous(beta));
-    GGML_ASSERT(ggml_is_contiguous(state));
-
     const int64_t S_k      = q->ne[0];
     const int64_t H_k      = q->ne[1];
     const int64_t n_tokens = q->ne[2];
@@ -120,15 +116,10 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
 
     GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
 
-    // TODO: can this ever be false?
-    const bool use_qk_l2norm = true;
-
-    if (use_qk_l2norm) {
-        const float eps_norm = hparams.f_norm_rms_eps;
+    const float eps_norm = hparams.f_norm_rms_eps;
 
-        q = ggml_l2_norm(ctx0, q, eps_norm);
-        k = ggml_l2_norm(ctx0, k, eps_norm);
-    }
+    q = ggml_l2_norm(ctx0, q, eps_norm);
+    k = ggml_l2_norm(ctx0, k, eps_norm);
 
     const float scale = 1.0f / sqrtf(S_v);
 
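Dropping the always-true `use_qk_l2norm` flag leaves the normalization unconditional. For reference, a scalar sketch of per-row L2 normalization as applied to q and k (the exact eps placement inside ggml's kernel may differ):

    #include <cmath>
    // normalize one row to unit L2 norm; eps handling is illustrative
    static void l2_norm_row(float * x, int n, float eps) {
        float ss = 0.0f;
        for (int i = 0; i < n; ++i) ss += x[i] * x[i];
        const float inv = 1.0f / (std::sqrt(ss) + eps);
        for (int i = 0; i < n; ++i) x[i] *= inv;
    }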
@@ -136,8 +127,6 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
 
     beta = ggml_sigmoid(ctx0, beta);
 
-    ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
-
     cb(q, "q_in", il);
     cb(k, "k_in", il);
     cb(v, "v_in", il);
@@ -188,36 +177,21 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
     cb(v_beta, "v_beta", il);
     cb(k_beta, "k_beta", il);
 
-    ggml_tensor * chunked_mask =
-        ggml_view_4d(ctx0, causal_mask, chunk_size,
-                chunk_size, causal_mask->ne[2], causal_mask->ne[3],
-                causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0);
-
-    ggml_tensor * chunked_diag_mask =
-        ggml_view_4d(ctx0, causal_diag_mask, chunk_size,
-                chunk_size, causal_diag_mask->ne[2], causal_diag_mask->ne[3],
-                causal_diag_mask->nb[1], causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0);
-
-    ggml_tensor * chunked_identity =
-        ggml_view_4d(ctx0, identity, chunk_size,
-                chunk_size, identity->ne[2], identity->ne[3],
-                identity->nb[1], identity->nb[2], identity->nb[3], 0);
-
-    q      = ggml_cont_4d(ctx0, q,      S_k, chunk_size, n_chunks, H_k * n_seqs);
-    k      = ggml_cont_4d(ctx0, k,      S_k, chunk_size, n_chunks, H_k * n_seqs);
-    k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
-    v      = ggml_cont_4d(ctx0, v,      S_v, chunk_size, n_chunks, H_v * n_seqs);
-    v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
-
-    g    = ggml_cont_4d(ctx0, g,    chunk_size, 1, n_chunks, H_k * n_seqs);
-    beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
+    q      = ggml_reshape_4d(ctx0, q,      S_k, chunk_size, n_chunks, H_k * n_seqs);
+    k      = ggml_reshape_4d(ctx0, k,      S_k, chunk_size, n_chunks, H_k * n_seqs);
+    k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
+    v      = ggml_reshape_4d(ctx0, v,      S_v, chunk_size, n_chunks, H_v * n_seqs);
+    v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
+
+    g    = ggml_reshape_4d(ctx0, g,    chunk_size, 1, n_chunks, H_k * n_seqs);
+    beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
 
     ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
 
     cb(g_cumsum, "g_cumsum", il);
 
-    ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
-    ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
+    ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
+    ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
 
     ggml_tensor * gcs_j_broadcast =
         ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
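Swapping `ggml_cont_4d` for `ggml_reshape_4d` removes copy nodes from the graph: a reshape only reinterprets the metadata of an already-contiguous tensor, while `cont` materializes a fresh buffer. It is safe here because the producers (`ggml_mul`, `ggml_cumsum`, ...) already emit contiguous results, and the masks are now pre-sized to `CHUNK_SIZE`, so the per-chunk views are gone as well.

    // ggml_reshape_4d(ctx0, t, a, b, c, d) : view, requires a contiguous input
    // ggml_cont_4d   (ctx0, t, a, b, c, d) : copy into a new contiguous tensor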
@@ -226,23 +200,23 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
 
     cb(decay_mask, "decay_mask", il);
 
-    decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
+    decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
     decay_mask = ggml_exp(ctx0, decay_mask);
-    decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
+    decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
 
     ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
 
     ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
-    ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, chunked_mask));
+    ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
 
     cb(attn, "attn_pre_solve", il);
 
-    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, chunked_mask);
-    ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower);
+    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
+    ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
 
     ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
-    attn = ggml_mul(ctx0, lin_solve, chunked_mask);
-    attn = ggml_add(ctx0, attn, chunked_identity);
+    attn = ggml_mul(ctx0, lin_solve, causal_mask);
+    attn = ggml_add(ctx0, attn, identity);
 
     cb(attn, "attn_solved", il);
 
@@ -291,7 +265,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
         // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
         attn = ggml_mul_mat(ctx0, k_chunk, q_chunk);
         attn = ggml_mul(ctx0, attn, decay_mask_chunk);
-        attn = ggml_mul(ctx0, attn, ggml_add(ctx0, chunked_identity, chunked_mask));
+        attn = ggml_mul(ctx0, attn, diag_mask);
 
         ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
 
|
||||||
return ggml_concat(ctx0, flat_output, flat_state, 0);
|
return ggml_concat(ctx0, flat_output, flat_state, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
|
ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
|
||||||
ggml_tensor * q,
|
ggml_tensor * q,
|
||||||
ggml_tensor * k,
|
ggml_tensor * k,
|
||||||
ggml_tensor * v,
|
ggml_tensor * v,
|
||||||
ggml_tensor * g,
|
ggml_tensor * g,
|
||||||
ggml_tensor * beta,
|
ggml_tensor * beta,
|
||||||
ggml_tensor * state,
|
ggml_tensor * state,
|
||||||
ggml_tensor * causal_mask,
|
|
||||||
ggml_tensor * identity,
|
|
||||||
int il) {
|
int il) {
|
||||||
GGML_ASSERT(ggml_is_contiguous(q));
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(k));
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(v));
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(g));
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(beta));
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(state));
|
|
||||||
|
|
||||||
const int64_t S_k = q->ne[0];
|
const int64_t S_k = q->ne[0];
|
||||||
const int64_t H_k = q->ne[1];
|
const int64_t H_k = q->ne[1];
|
||||||
const int64_t n_tokens = q->ne[2];
|
const int64_t n_tokens = q->ne[2];
|
||||||
|
|
@@ -386,6 +351,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
     const int64_t S_v = v->ne[0];
     const int64_t H_v = v->ne[1];
 
+    GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
     GGML_ASSERT(v->ne[2] == n_tokens);
     GGML_ASSERT(k->ne[2] == n_tokens);
     GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
@@ -397,215 +363,65 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
 
     GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
 
-    // TODO: can this ever be false?
-    const bool use_qk_l2norm = true;
-
-    if (use_qk_l2norm) {
-        const float eps_norm = hparams.f_norm_rms_eps;
+    const float eps_norm = hparams.f_norm_rms_eps;
 
-        q = ggml_l2_norm(ctx0, q, eps_norm);
-        k = ggml_l2_norm(ctx0, k, eps_norm);
-    }
+    q = ggml_l2_norm(ctx0, q, eps_norm);
+    k = ggml_l2_norm(ctx0, k, eps_norm);
 
     const float scale = 1.0f / sqrtf(S_v);
 
     q = ggml_scale(ctx0, q, scale);
 
     beta = ggml_sigmoid(ctx0, beta);
 
-    ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
-
     cb(q, "q_in", il);
     cb(k, "k_in", il);
     cb(v, "v_in", il);
     cb(beta, "beta_in", il);
     cb(g, "g_in", il);
 
-    q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
-    k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
-    v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
-    g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
-
-    beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
     state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
 
-    cb(q, "q_perm", il);
-    cb(k, "k_perm", il);
-    cb(v, "v_perm", il);
-    cb(beta, "beta_perm", il);
-    cb(g, "g_perm", il);
-    cb(state, "state_in", il);
+    ggml_tensor * g_t    = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g),    1, 1, H_k, n_seqs);
+    ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
 
-    GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
-    GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
-    GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
-    GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
+    // Apply exponential to g_t
+    g_t = ggml_exp(ctx0, g_t);
 
-    ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
-    ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
+    // Apply the gated delta rule for the single timestep
+    // last_recurrent_state = last_recurrent_state * g_t
+    state = ggml_mul(ctx0, state, g_t);
 
-    ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
+    // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
+    ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
+    ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
+    // we need to sum over dim=-2, so we transpose, sum, then transpose again
+    kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
 
-    cb(k_beta, "k_beta", il);
-    cb(v_beta, "v_beta", il);
-    cb(g_cumsum, "g_cumsum", il);
+    // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
+    ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
+    // delta = (v_t - kv_mem) * beta_t
+    ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
+    ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
 
-    ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, n_tokens, 1, H_v, n_seqs); // [chunk_size, 1, n_tokens, n_seqs]
-    ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, n_tokens, H_v, n_seqs); // [1, chunk_size, n_tokens, n_seqs]
+    // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
+    ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
+    state = ggml_add(ctx0, state, k_t_delta);
 
-    // Broadcast both tensors to [chunk_size, chunk_size, H_v, n_seqs]
-    // ggml_tensor * gcs_i_broadcast =
-    //     ggml_repeat_4d(ctx0, gcs_i, GGML_DELTA_NET_CHUNK, GGML_DELTA_NET_CHUNK, num_chunks * H_v,
-    //                    n_seqs); // [chunk_size, 1, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]
-    // Don't need this, this one will get auto-broadcast
-    ggml_tensor * gcs_j_broadcast =
-        ggml_repeat_4d(ctx0, gcs_j, n_tokens, n_tokens, H_v, n_seqs); // [1, chunk_size, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]
+    // Compute the attention output
+    // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
+    ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
+    ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
+    // again, since it's over dim = -2, transpose, sum, transpose back
+    ggml_tensor * core_attn_out =
+        ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
 
-    ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
-
-    // Apply lower triangular mask to ensure attention is causal (only past tokens influence current)
-    decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
-    // Apply exponential to get the decay mask values
-    decay_mask = ggml_exp(ctx0, decay_mask);
-    // Apply lower triangular mask again to ensure only lower triangular values remain
-    decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
-
-    cb(decay_mask, "decay_mask", il);
-
-    // attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
-    ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
-
-    cb(kmulkbeta, "kmulkbeta", il);
-
-    ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
-    ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
-
-    cb(attn, "attn_pre_rec", il);
-
-    // for i in range(1, chunk_size):
-    //     row = attn[..., i, :i].clone()
-    //     sub = attn[..., :i, :i].clone()
-    //     attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
-    // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
-    //
-    // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A)
-    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
-    ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
-
-    ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
-    attn = ggml_mul(ctx0, lin_solve, causal_mask);
-    attn = ggml_add(ctx0, attn, identity);
-
-    // value = attn @ v_beta
-    v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
-
-    cb(v, "value_beta", il);
-
-    // k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
-    ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
-    ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
-
-    cb(gexp, "g_cum_exp", il);
-
-    ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
-
-    cb(kbeta_gexp, "kbeta_gexp", il);
-
-    ggml_tensor * k_cumdecay =
-        ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
-
-    cb(k_cumdecay, "k_cumdecay", il);
-
-    // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
-    attn = ggml_mul_mat(ctx0, k, q);
-    attn = ggml_mul(ctx0, attn, decay_mask);
-    attn = ggml_mul(ctx0, attn, ggml_add(ctx0, identity, causal_mask));
-
-    cb(attn, "attn_decay_key", il);
-
-    ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
-
-    // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
-    ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay);
-
-    cb(v_prime, "v_prime", il);
-
-    // v_new = v_i - v_prime
-    ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v, v_prime), v_prime);
-
-    ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
-
-    cb(v_new, "v_new", il);
-
-    // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
-    ggml_tensor * q_g_exp = ggml_mul(ctx0, q, gexp);
-    ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
-
-    cb(attn_inter, "attn_inter", il);
-
-    // core_attn_out[:, :, i] = attn_inter + attn @ v_new
-    ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);
-
-    cb(v_attn, "v_attn", il);
-
-    ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn);
-
-    cb(core_attn_out, "core_attn_out", il);
-
-    // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
-    // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
-    // key_gdiff = key * g_diff.unsqueeze(-1)
-    // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
-    // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
-
-    ggml_tensor * g_cum_last =
-        ggml_cont(ctx0, ggml_view_4d(ctx0, g_cumsum_t, g_cumsum_t->ne[0], 1, g_cumsum_t->ne[2], g_cumsum_t->ne[3],
-                g_cumsum_t->nb[1], g_cumsum_t->nb[2], g_cumsum_t->nb[3],
-                g_cumsum_t->nb[0] * (g_cumsum_t->ne[1] - 1)));
-
-    cb(g_cum_last, "g_cum_last", il);
-
-    ggml_tensor * gexp_last =
-        ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);
-
-    cb(gexp_last, "gexp_last", il);
-
-    ggml_tensor * g_cum_last_3d =
-        ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);
-
-    cb(g_cum_last_3d, "g_cum_last_3d", il);
-
-    ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cumsum, g_cumsum->ne[0], g_cumsum->ne[2], g_cumsum->ne[3]);
-
-    cb(g_cumsum_3d, "g_cumsum_3d", il);
-
-    ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));
-
-    cb(g_diff, "g_diff", il);
-
-    ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
-
-    cb(g_diff_exp, "g_diff_exp", il);
-
-    ggml_tensor * key_gdiff = ggml_mul(ctx0, k,
-            ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
-                g_diff_exp->ne[2] * g_diff_exp->ne[3]));
-
-    cb(key_gdiff, "key_gdiff", il);
-
-    ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));
-
-    cb(kgdmulvnew, "kgdmulvnew", il);
-
-    state = ggml_add(ctx0, ggml_mul(ctx0, state, gexp_last), kgdmulvnew);
-
+    // core_attn_out should be [S_v, 1, H_v, n_seqs] after this
+    cb(core_attn_out, "output_tokens", il);
     cb(state, "new_state", il);
 
-    // flatten output
-    ggml_tensor * flat_output =
-        ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs);
-
-    ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
+    // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise
+    ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs);
+    ggml_tensor * flat_state  = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
 
     return ggml_concat(ctx0, flat_output, flat_state, 0);
 }
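The rewritten single-token path applies the gated delta rule directly instead of driving the chunked machinery with a chunk of one. Per head, with state matrix S and the current q, k, v vectors, the recurrence it implements is (a sketch, up to transposition conventions; beta is already passed through the sigmoid above):

    // S      <- exp(g_t) * S                     decay the whole state
    // kv_mem <- sum over the state dim of S ⊙ k  what S already predicts for k
    // delta  <- (v - kv_mem) * beta_t            gated correction toward v
    // S      <- S + outer(k, delta)              rank-1 delta update
    // out    <- sum over the state dim of S ⊙ q  readout
    // build_delta_net_chunking computes the same recurrence for CHUNK_SIZE
    // tokens at once via the triangular solve seen earlier.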
@@ -712,6 +528,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
         ggml_tensor * cur,
         ggml_tensor * causal_mask,
         ggml_tensor * identity,
+        ggml_tensor * diag_mask,
                 int il) {
     const auto * mctx_cur = inp->mctx;
 
@ -737,11 +554,11 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
|
||||||
cb(mixed_ba, "linear_attn_mixed_ba", il);
|
cb(mixed_ba, "linear_attn_mixed_ba", il);
|
||||||
|
|
||||||
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
|
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
|
||||||
ggml_tensor * mixed_qkvz_reshaped = ggml_cont_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
|
ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
|
||||||
|
|
||||||
// Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
|
// Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
|
||||||
int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
|
int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
|
||||||
ggml_tensor * mixed_ba_reshaped = ggml_cont_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
|
ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
|
||||||
|
|
||||||
// Split mixed_ba into b and a (beta and alpha parameters)
|
// Split mixed_ba into b and a (beta and alpha parameters)
|
||||||
int64_t split_sizes_ba[2] = {
|
int64_t split_sizes_ba[2] = {
|
||||||
|
|
@ -762,8 +579,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
|
||||||
ggml_tensor * beta = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
|
ggml_tensor * beta = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
|
||||||
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
|
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
|
||||||
|
|
||||||
GGML_ASSERT(ggml_nelements(beta) + ggml_nelements(alpha) == ggml_nelements(mixed_ba));
|
|
||||||
|
|
||||||
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
|
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
|
||||||
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
|
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
|
||||||
cb(alpha_softplus, "a_softplus", il);
|
cb(alpha_softplus, "a_softplus", il);
|
||||||
|
|
@ -799,9 +614,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
|
||||||
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
|
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
|
||||||
cb(z, "z", il);
|
cb(z, "z", il);
|
||||||
|
|
||||||
GGML_ASSERT(ggml_nelements(query) + ggml_nelements(key) + ggml_nelements(value) + ggml_nelements(z) ==
|
|
||||||
ggml_nelements(mixed_qkvz));
|
|
||||||
|
|
||||||
// After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
|
// After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
|
||||||
// query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
|
// query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
|
||||||
ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
|
ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
|
||||||
|
|
@ -925,10 +737,13 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
|
||||||
cb(k_conv, "k_conv_predelta", il);
|
cb(k_conv, "k_conv_predelta", il);
|
||||||
cb(v_conv, "v_conv_predelta", il);
|
cb(v_conv, "v_conv_predelta", il);
|
||||||
|
|
||||||
// Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
|
// Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
|
||||||
ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ?
|
ggml_tensor * attn_out;
|
||||||
build_delta_net_chunking (q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il) :
|
if (n_seq_tokens == 1) {
|
||||||
build_delta_net_recurrent(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il);
|
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
|
||||||
|
} else {
|
||||||
|
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
|
||||||
|
}
|
||||||
cb(attn_out, "attn_out", il);
|
cb(attn_out, "attn_out", il);
|
||||||
|
|
||||||
// The tensors were concatenated 1d, so we need to extract them 1d as well
|
// The tensors were concatenated 1d, so we need to extract them 1d as well
|
||||||
|
|
|
||||||
|
|
@@ -3588,6 +3588,163 @@ static void test_template_output_peg_parsers() {
            t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})";
        });
    }

    {
        // NVIDIA Nemotron-3 Nano
        auto tmpls = read_templates("models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja");

        // Test basic message
        test_peg_parser(tmpls.get(), [&](auto & t) {
            t.input = "Hello, world!\nWhat's up?";
            t.expect = message_assist;
        });

        // Test basic message and reasoning with reasoning_format = none
        test_peg_parser(tmpls.get(), [&](auto & t) {
            t.input = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
            t.expect.content = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
        });

        // Test basic message and reasoning with reasoning_format = auto
        test_peg_parser(tmpls.get(), [&](auto & t) {
            t.input = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
            t.params.enable_thinking = true;
            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;

            t.expect = message_assist_thoughts;
        });

        // Test tool call
        test_peg_parser(tmpls.get(), [&](auto & t) {
            t.input =
                "<tool_call>\n"
                "<function=special_function>\n"
                "<parameter=arg1>\n"
                "1\n"
                "</parameter>\n"
                "</function>\n"
                "</tool_call>";
            t.params.enable_thinking = false;
            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
            t.params.tools = {special_function_tool};

            t.expect = message_assist_call;
        });

        // Test tool call with reasoning
        test_peg_parser(tmpls.get(), [&](auto & t) {
            t.input =
                "I'm\nthinking\n</think>\n"
                "<tool_call>\n"
                "<function=special_function>\n"
                "<parameter=arg1>\n"
                "1\n"
                "</parameter>\n"
                "</function>\n"
                "</tool_call>";
            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
            t.params.tools = {special_function_tool};

            t.expect = message_assist_call_thoughts;
        });

        // Test parallel tool calls
        test_peg_parser(tmpls.get(), [&](auto & t) {
            t.input =
                "<tool_call>\n"
                "<function=special_function>\n"
                "<parameter=arg1>\n"
                "1\n"
                "</parameter>\n"
                "</function>\n"
                "</tool_call>\n"
                "<tool_call>\n"
                "<function=special_function_with_opt>\n"
                "<parameter=arg1>\n"
                "1\n"
                "</parameter>\n"
                "<parameter=arg2>\n"
                "2\n"
                "</parameter>\n"
                "</function>\n"
                "</tool_call>";
            t.params.enable_thinking = false;
            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
            t.params.parallel_tool_calls = true;
            t.params.tools = {special_function_tool, special_function_tool_with_optional_param};

            t.expect.tool_calls = {{
                /* .name = */ "special_function",
                /* .arguments = */ R"({"arg1": 1})",
                /* .id = */ {},
            }, {
                /* .name = */ "special_function_with_opt",
                /* .arguments = */ R"({"arg1": 1, "arg2": 2})",
                /* .id = */ {},
            }};
        });

        // Test tool call with string parameter
        test_peg_parser(tmpls.get(), [&](auto & t) {
            t.input =
                "<tool_call>\n"
                "<function=python>\n"
                "<parameter=code>\n"
                "def hello():\n"
                " print(\"Hello, world!\")\n"
                "\n"
                "hello()\n"
                "</parameter>\n"
                "</function>\n"
                "</tool_call>";
            t.params.enable_thinking = false;
            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
            t.params.tools = {python_tool};

            t.expect.tool_calls = {{
                /* .name = */ "python",
                /* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
                /* .id = */ {},
            }};
        });

        // Test tool call with string parameter and no closing </parameter> tag
        test_peg_parser(tmpls.get(), [&](auto & t) {
            t.input =
                "<tool_call>\n"
                "<function=python>\n"
                "<parameter=code>\n"
                "def hello():\n"
                " print(\"Hello, world!\")\n"
                "\n"
                "hello()\n"
                "</function>\n"
                "</tool_call>";
            t.params.enable_thinking = false;
            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
            t.params.tools = {python_tool};

            t.expect.tool_calls = {{
                /* .name = */ "python",
                /* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
                /* .id = */ {},
            }};
        });

        // Test response format
        test_peg_parser(tmpls.get(), [&](auto & t) {
            t.input =
                "I need to output the invoice details in JSON\n"
                "</think>\n"
                R"({"amount": 123.45, "date": "2025-12-03"})";
            t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
            t.params.json_schema = invoice_schema;

            t.expect.reasoning_content = "I need to output the invoice details in JSON";
            t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})";
        });
    }
}

static void test_msg_diffs_compute() {
@@ -1367,10 +1367,85 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
    });
}

static void test_resolves_to_string() {
    fprintf(stderr, "#\n# Testing resolves_to_string\n#\n");

    auto test = [](const std::string & name, const std::string & schema_str, bool expected) {
        fprintf(stderr, "- %s\n", name.c_str());
        common_schema_info info;
        auto schema = nlohmann::ordered_json::parse(schema_str);
        info.resolve_refs(schema);
        bool result = info.resolves_to_string(schema);
        if (result != expected) {
            fprintf(stderr, "#\n# Test '%s' failed.\n#\n", name.c_str());
            fprintf(stderr, "Schema: %s\n", schema_str.c_str());
            fprintf(stderr, "Expected: %s, Got: %s\n", expected ? "true" : "false", result ? "true" : "false");
            assert(false);
        }
    };

    // Basic type checks
    test("type string", R"({"type": "string"})", true);
    test("type integer", R"({"type": "integer"})", false);
    test("type number", R"({"type": "number"})", false);
    test("type boolean", R"({"type": "boolean"})", false);
    test("type object", R"({"type": "object"})", false);
    test("type array", R"({"type": "array"})", false);

    // Type array (nullable string)
    test("type array with string", R"({"type": ["string", "null"]})", true);
    test("type array without string", R"({"type": ["integer", "null"]})", false);

    // String-specific keywords
    test("minLength implies string", R"({"minLength": 1})", true);
    test("maxLength implies string", R"({"maxLength": 10})", true);
    test("pattern implies string", R"({"pattern": "^[a-z]+$"})", true);

    // Format
    test("format date", R"({"format": "date"})", true);
    test("format uuid", R"({"format": "uuid"})", true);
    test("format email", R"({"format": "email"})", true);

    // Const
    test("const string", R"({"const": "hello"})", true);
    test("const number", R"({"const": 123})", false);

    // Enum
    test("enum with strings", R"({"enum": ["a", "b", "c"]})", true);
    test("enum with numbers", R"({"enum": [1, 2, 3]})", false);
    test("enum mixed with string", R"({"enum": [1, "a", null]})", true);

    // anyOf
    test("anyOf with string", R"({"anyOf": [{"type": "string"}, {"type": "integer"}]})", true);
    test("anyOf without string", R"({"anyOf": [{"type": "integer"}, {"type": "boolean"}]})", false);

    // oneOf
    test("oneOf with string", R"({"oneOf": [{"type": "string"}, {"type": "number"}]})", true);
    test("oneOf without string", R"({"oneOf": [{"type": "object"}, {"type": "array"}]})", false);

    // allOf - all must be strings
    test("allOf all strings", R"({"allOf": [{"type": "string"}, {"minLength": 1}]})", true);
    test("allOf mixed types", R"({"allOf": [{"type": "string"}, {"type": "integer"}]})", false);

    // $ref
    test("$ref to string",
         R"({"$ref": "#/$defs/str", "$defs": {"str": {"type": "string"}}})", true);
    test("$ref to integer",
         R"({"$ref": "#/$defs/num", "$defs": {"num": {"type": "integer"}}})", false);

    // Nested
    test("nested anyOf with string",
         R"({"anyOf": [{"anyOf": [{"type": "integer"}, {"type": "string"}]}, {"type": "boolean"}]})", true);

    fprintf(stderr, "All resolves_to_string tests passed!\n");
}

int main() {
    fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
    fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");

    test_resolves_to_string();

    test_all("C++", [](const TestCase & tc) {
        try {
            tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
@@ -87,9 +87,6 @@ int main(int argc, char ** argv) {
    common_params params;
    g_params = &params;

    // disable jinja by default
    params.use_jinja = false;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
        return 1;
    }
@@ -15,6 +15,7 @@ add_library(mtmd
    clip-graph.h
    models/models.h
    models/cogvlm.cpp
    models/glm4v.cpp
    models/internvl.cpp
    models/kimivl.cpp
    models/llama4.cpp
@@ -9,6 +9,8 @@
#include <vector>
#include <functional>

#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)

struct clip_graph {
    const clip_model & model;
    const clip_hparams & hparams;

@@ -49,7 +51,7 @@ struct clip_graph {
    void cb(ggml_tensor * cur0, const char * name, int il) const;

    // siglip2 naflex
    ggml_tensor * resize_position_embeddings();
    ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE);

    // build vision transformer (ViT) cgraph
    // this function should cover most of the models
@@ -68,6 +68,7 @@
#define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename tensor with ".0" postfix for backward compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_NORM_EMBD "v.norm_embd.%s"
#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"

@@ -86,6 +87,10 @@
#define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s"
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MM_UP "mm.up.%s"
#define TN_MM_GATE "mm.gate.%s"
#define TN_MM_DOWN "mm.down.%s"
#define TN_MM_POST_NORM "mm.post_norm.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"

@@ -95,7 +100,7 @@
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
@@ -165,6 +170,7 @@ enum projector_type {
    PROJECTOR_TYPE_LIGHTONOCR,
    PROJECTOR_TYPE_COGVLM,
    PROJECTOR_TYPE_JANUS_PRO,
    PROJECTOR_TYPE_GLM4V,
    PROJECTOR_TYPE_UNKNOWN,
};

@@ -192,6 +198,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
    { PROJECTOR_TYPE_COGVLM,    "cogvlm"},
    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
    { PROJECTOR_TYPE_GLM4V,     "glm4v"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -495,6 +502,8 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
    }
}

void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value);

//
// API used internally with mtmd
//
@@ -158,6 +158,8 @@ struct clip_model {
    ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
    ggml_tensor * patch_bias = nullptr;
    ggml_tensor * position_embeddings = nullptr;
    ggml_tensor * norm_embd_w = nullptr;
    ggml_tensor * norm_embd_b = nullptr;

    ggml_tensor * pre_ln_w = nullptr;
    ggml_tensor * pre_ln_b = nullptr;

@@ -172,6 +174,14 @@ struct clip_model {
    ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
    ggml_tensor * mm_fc_w;
    ggml_tensor * mm_fc_b;
    ggml_tensor * mm_ffn_up_w = nullptr;
    ggml_tensor * mm_ffn_up_b = nullptr;
    ggml_tensor * mm_ffn_gate_w = nullptr;
    ggml_tensor * mm_ffn_gate_b = nullptr;
    ggml_tensor * mm_ffn_down_w = nullptr;
    ggml_tensor * mm_ffn_down_b = nullptr;
    ggml_tensor * mm_post_norm_w = nullptr;
    ggml_tensor * mm_post_norm_b = nullptr;

    // LLaVA projection
    ggml_tensor * mm_input_norm_w = nullptr;

@@ -253,9 +263,10 @@ struct clip_model {
    ggml_tensor * mm_input_proj_w = nullptr;
    ggml_tensor * mm_soft_emb_norm_w = nullptr;

    // pixtral
    // pixtral, glm4v
    ggml_tensor * token_embd_img_break = nullptr;
    ggml_tensor * mm_patch_merger_w = nullptr;
    ggml_tensor * mm_patch_merger_b = nullptr;

    // ultravox / whisper encoder
    ggml_tensor * conv1d_1_w = nullptr;
@@ -264,11 +264,11 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
}

// siglip2 naflex
ggml_tensor * clip_graph::resize_position_embeddings() {
ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
    ggml_tensor * pos_embd = model.position_embeddings;
    const int height = img.ny / patch_size;
    const int width = img.nx / patch_size;
    const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS;
    const uint32_t mode = interpolation_mode;
    const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);

    GGML_ASSERT(pos_embd);

@@ -485,19 +485,14 @@ ggml_tensor * clip_graph::build_norm(
        ? ggml_rms_norm(ctx0, cur, norm_eps)
        : ggml_norm(ctx0, cur, norm_eps);

    if (mw || mb) {
        cb(cur, "norm", il);
    }

    if (mw) {
        cur = ggml_mul(ctx0, cur, mw);
        if (mb) {
            cb(cur, "norm_w", il);
        }
    }

    if (mb) {
        cur = ggml_add(ctx0, cur, mb);
        cb(cur, "norm_b", il);
    }

    return cur;
@@ -842,6 +837,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_llava>(ctx, img);
            } break;
        case PROJECTOR_TYPE_GLM4V:
            {
                builder = std::make_unique<clip_graph_glm4v>(ctx, img);
            } break;
        default:
            GGML_ABORT("missing cgraph builder");
    }
@@ -1155,6 +1154,14 @@ struct clip_model_loader {
                    LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                }
            } break;
        case PROJECTOR_TYPE_GLM4V:
            {
                hparams.rope_theta = 10000.0f;
                hparams.n_merge = 2; // default value for GLM4-V
                get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                hparams.set_limit_image_tokens(8, 4096);
                hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
            } break;
        case PROJECTOR_TYPE_LLAMA4:
            {
                hparams.rope_theta = 10000.0f;
@@ -1282,6 +1289,9 @@ struct clip_model_loader {
        model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
        model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);

        model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false);
        model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false);

        model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);

        // layers
@@ -1470,6 +1480,20 @@ struct clip_model_loader {
                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
            } break;
        case PROJECTOR_TYPE_GLM4V:
            {
                model.projection = get_tensor(TN_MM_PROJECTOR);
                model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight"));
                model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false);
                model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
                model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false);
                model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight"));
                model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false);
                model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
                model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false);
                model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"));
                model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias"));
            } break;
        case PROJECTOR_TYPE_GEMMA3:
            {
                model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
@@ -1499,7 +1523,7 @@ struct clip_model_loader {
                model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
                // for mistral small 3.1
                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
                model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
                model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
            } break;
        case PROJECTOR_TYPE_LIGHTONOCR:
            {

@@ -1508,7 +1532,7 @@ struct clip_model_loader {
                model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
                model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
                model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
            } break;
        case PROJECTOR_TYPE_ULTRAVOX:
            {
@@ -1873,6 +1897,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
        if (ctx_params.warmup) {
            loader.warmup(*ctx_vision);
        }

        // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
    }

    if (loader.has_audio) {
@@ -2582,6 +2608,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
            {
                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                clip_image_u8 resized;
@@ -2824,16 +2851,30 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const auto & params = ctx->model.hparams;
    const int n_total = clip_n_output_tokens(ctx, img);
    if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
        return img->nx / (params.patch_size * 2);
    const auto & proj = ctx->proj_type();
    switch (proj) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
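            // these projectors merge 2x2 patches into one output token, so the
            // per-axis token count is half the per-axis patch count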
            return (img->nx / params.patch_size) / 2;
        default:
            break;
    }
    return n_total;
}

int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const auto & params = ctx->model.hparams;
    if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
        return img->ny / (params.patch_size * 2);
    const auto & proj = ctx->proj_type();
    switch (proj) {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
            return (img->ny / params.patch_size) / 2;
        default:
            break;
    }
    return 1;
}
@@ -2890,6 +2931,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
            {
                // dynamic size (2 conv, so double patch size)
                int x_patch = img->nx / (params.patch_size * 2);

@@ -3137,6 +3179,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            } break;
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_GLM4V:
            {
                const int merge_ratio = hparams.n_merge;
                const int pw = image_size_width / patch_size;
@@ -3363,7 +3406,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    }

    // copy the embeddings to the location passed by the user
    if (vec != nullptr) {
        ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
    }

    return true;
}
@@ -3411,6 +3456,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_COGVLM:
            return ctx->model.mm_4h_to_h_w->ne[1];
        case PROJECTOR_TYPE_GLM4V:
            return ctx->model.mm_ffn_down_w->ne[1];
        default:
            GGML_ABORT("Unknown projector type");
    }
@@ -3427,10 +3474,11 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
    return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
}

bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
bool clip_is_mrope(const struct clip_ctx * ctx) {
    return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
        || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL;
        || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
        || ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
}

bool clip_is_llava(const struct clip_ctx * ctx) {
@@ -3491,3 +3539,22 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
    return &ctx->model.hparams;
}

//
// API for debugging
//

void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
    clip_image_f32 img;
    img.nx = w;
    img.ny = h;
    img.buf.resize(h * w * 3);
    for (int i = 0; i < h * w * 3; i++) {
        img.buf[i] = static_cast<float>(fill_value);
    }
    bool cur_debug_graph = ctx->debug_graph;
    ctx->debug_graph = true;
    clip_image_encode(ctx, 1, &img, nullptr);
    ctx->debug_graph = cur_debug_graph;
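    // img.buf is never cleared by the encode call, so this assert fires every time
    // by design: the helper exists only to dump the debug graph and then stop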
    GGML_ASSERT(img.buf.empty() && "expected, always stop here");
}
@@ -104,7 +104,7 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_is_gemma3(const struct clip_ctx * ctx);
@@ -0,0 +1,120 @@
#include "models.h"

ggml_cgraph * clip_graph_glm4v::build() {
    GGML_ASSERT(model.patch_bias != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);

    const int batch_size = 1;

    norm_type norm_t = NORM_TYPE_RMS;

    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
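    // the "positions" input below carries 4 position components per patch, hence
    // the head dimension is split into four equal mrope sections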
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    // second conv dimension
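    // note: the cont/reshape/permute sequence below appears to regroup the patch
    // sequence so that each 2x2 block of neighboring patches becomes contiguous,
    // matching the 2x2 spatial merge applied later by the projector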
    {
        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_cont_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        inp = ggml_reshape_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
        inp = ggml_cont_3d(
            ctx0, inp,
            n_embd, n_patches_x * n_patches_y, batch_size);
    }

    // add patch bias
    inp = ggml_add(ctx0, inp, model.patch_bias);
    cb(inp, "patch_bias", -1);

    // pos-conv norm
    inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);

    // calculate absolute position embedding and apply
    ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
    learned_pos_embd = ggml_cont_4d(
        ctx0, learned_pos_embd,
        n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
    learned_pos_embd = ggml_reshape_4d(
        ctx0, learned_pos_embd,
        n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
    learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
    learned_pos_embd = ggml_cont_3d(
        ctx0, learned_pos_embd,
        n_embd, n_patches_x * n_patches_y, batch_size);
    cb(learned_pos_embd, "learned_pos_embd", -1);

    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
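        // ggml_rope_multi arguments below, read by position in its signature:
        // n_dims = d_head/2, sections, mode, then n_ctx_orig = 32768,
        // freq_base = rope_theta, freq_scale = 1, ext_factor = 0, attn_factor = 1,
        // beta_fast = 32, beta_slow = 1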
        return ggml_rope_multi(
            ctx0, cur, positions, nullptr,
            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
            32768, hparams.rope_theta, 1, 0, 1, 32, 1);
    };

    ggml_tensor * cur = build_vit(
        inp, n_patches,
        norm_t,
        hparams.ffn_op,
        learned_pos_embd,
        add_pos);

    cb(cur, "vit_out", -1);
    // cb(ggml_sum(ctx0, cur), "vit_out_sum", -1);

    // GLM4V projector
    // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130

    // patch merger (downsample)
    {
        int n_merge = hparams.n_merge;
        GGML_ASSERT(n_merge > 0);

        int n_token_out = n_patches / n_merge / n_merge;
        cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out]
        cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1);
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out]

        cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
    }

    // FC projector
    {
        cur = ggml_mul_mat(ctx0, model.projection, cur);
        // default LayerNorm (post_projection_norm)
        cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
        cur = ggml_gelu_erf(ctx0, cur);
        cb(cur, "after_fc_proj", -1);
    }

    // FFN projector
    {
        cur = build_ffn(cur,
            model.mm_ffn_up_w, model.mm_ffn_up_b,
            model.mm_ffn_gate_w, model.mm_ffn_gate_b,
            model.mm_ffn_down_w, model.mm_ffn_down_b,
            hparams.ffn_op, -1);
        cb(cur, "after_ffn_proj", -1);
        // cb(ggml_sum(ctx0, cur), "merged_sum", -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
@@ -56,3 +56,8 @@ struct clip_graph_whisper_enc : clip_graph {
    clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};

struct clip_graph_glm4v : clip_graph {
    clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
};
@@ -270,8 +270,6 @@ int main(int argc, char ** argv) {
    ggml_time_init();

    common_params params;
    params.use_jinja = false; // disable jinja by default
    params.sampling.temp = 0.2; // lower temp by default for better quality

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
        return 1;
@@ -217,7 +217,7 @@ struct mtmd_context {

    void init_vision() {
        GGML_ASSERT(ctx_v != nullptr);
        use_mrope = clip_is_qwen2vl(ctx_v);
        use_mrope = clip_is_mrope(ctx_v);

        projector_type proj = clip_get_projector_type(ctx_v);
        int minicpmv_version = clip_is_minicpmv(ctx_v);

@@ -309,6 +309,10 @@ struct mtmd_context {
            img_beg = "<|image_start|>";
            img_end = "<|image_end|>";

        } else if (proj == PROJECTOR_TYPE_GLM4V) {
            img_beg = "<|begin_of_image|>";
            img_end = "<|end_of_image|>";

        }
    }
@ -52,7 +52,6 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||||
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||||
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
|
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
|
||||||
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
|
|
||||||
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||||
| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
|
| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
|
||||||
| `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
| `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||||
|
|
@ -67,11 +66,10 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||||
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
||||||
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
|
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
|
||||||
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
|
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
|
||||||
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_HOST) |
|
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
|
||||||
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
||||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
|
||||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||||
| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
||||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
|
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
|
||||||
|
|
@ -150,19 +148,20 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||||
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
|
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
|
||||||
|
|
||||||
|
|
||||||
**Example-specific params**
|
**Server-specific params**
|
||||||
|
|
||||||
| Argument | Explanation |
|
| Argument | Explanation |
|
||||||
| -------- | ----------- |
|
| -------- | ----------- |
|
||||||
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
|
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
|
||||||
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
|
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
|
||||||
|
| `--kv-unified, -kvu` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
|
||||||
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
|
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
|
||||||
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
|
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
|
||||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||||
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
|
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
|
||||||
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||||
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
|
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
|
||||||
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
| `-np, --parallel N` | number of server slots (default: -1, -1 = auto)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||||
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||||
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
|
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
|
||||||
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
||||||
|
|
@@ -1430,7 +1429,7 @@ Model presets allow advanced users to define custom configurations using an `.ini` file

```
llama-server --models-preset ./my-models.ini
```

-Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layer 123` is written as `n-gpu-layer = 123`.
+Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layers 123` is written as `n-gpu-layers = 123`.

Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys.

@@ -1445,7 +1444,7 @@ version = 1

; string value
chat-template = chatml

; numeric value
-n-gpu-layer = 123
+n-gpu-layers = 123

; flag value (for certain flags, you need to use the "no-" prefix for negation)
jinja = true

; shorthand argument (for example, context size)
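To make the key conventions above concrete, here is a minimal sketch of what a full presets file could look like. The section name, model path, and all values are illustrative assumptions, not something this commit ships:

```ini
; my-models.ini - hypothetical example (all names and values assumed)
version = 1

; each section defines one preset
[coding-assistant]
m = /models/example-7b.gguf   ; short argument form of --model (assumed path)
n-gpu-layers = 123            ; long argument name without leading dashes
c = 8192                      ; shorthand key (here: context size)
LLAMA_ARG_N_PARALLEL = 4      ; environment-variable name used as a key
jinja = true                  ; flag value
no-warmup = true              ; negated flag via the "no-" prefix
```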
Binary file not shown.
@@ -73,12 +73,17 @@ int main(int argc, char ** argv, char ** envp) {
         return 1;
     }

-    // TODO: should we have a separate n_parallel parameter for the server?
-    // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177
-    // TODO: this is a common configuration that is suitable for most local use cases
-    // however, overriding the parameters is a bit confusing - figure out something more intuitive
-    if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) {
-        LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__);
+    // validate batch size for embeddings
+    // embeddings require all tokens to be processed in a single ubatch
+    // see https://github.com/ggml-org/llama.cpp/issues/12836
+    if (params.embedding && params.n_batch > params.n_ubatch) {
+        LOG_WRN("%s: embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", __func__, params.n_batch, params.n_ubatch);
+        LOG_WRN("%s: setting n_batch = n_ubatch = %d to avoid assertion failure\n", __func__, params.n_ubatch);
+        params.n_batch = params.n_ubatch;
+    }
+
+    if (params.n_parallel < 0) {
+        LOG_INF("%s: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n", __func__);
         params.n_parallel = 4;
         params.kv_unified = true;
@@ -620,10 +620,11 @@ flowchart TB
### Test Types

| Type          | Tool               | Location         | Command               |
-| ------------- | ------------------ | -------------------------------- | ------------------- |
-| **E2E**       | Playwright         | `tests/e2e/`     | `npm run test:e2e`    |
-| **Unit**      | Vitest             | `tests/client/`, `tests/server/` | `npm run test:unit` |
+| ------------- | ------------------ | ---------------- | --------------------- |
+| **Unit**      | Vitest             | `tests/unit/`    | `npm run test:unit`   |
| **UI/Visual** | Storybook + Vitest | `tests/stories/` | `npm run test:ui`     |
+| **E2E**       | Playwright         | `tests/e2e/`     | `npm run test:e2e`    |
+| **Client**    | Vitest             | `tests/client/`  | `npm run test:client` |

### Running Tests
@@ -13,12 +13,11 @@
    "reset": "rm -rf .svelte-kit node_modules",
    "format": "prettier --write .",
    "lint": "prettier --check . && eslint .",
-   "test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:server -- --run && npm run test:e2e",
+   "test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:unit -- --run && npm run test:e2e",
    "test:e2e": "playwright test",
    "test:client": "vitest --project=client",
-   "test:server": "vitest --project=server",
+   "test:unit": "vitest --project=unit",
    "test:ui": "vitest --project=ui",
-   "test:unit": "vitest",
    "storybook": "storybook dev -p 6006",
    "build-storybook": "storybook build",
    "cleanup": "rm -rf .svelte-kit build node_modules test-results"
@@ -241,7 +241,7 @@
        </div>
    {/if}
{:else if (isText || (isPdf && pdfViewMode === 'text')) && displayTextContent}
-   <SyntaxHighlightedCode code={displayTextContent} {language} maxWidth="69rem" />
+   <SyntaxHighlightedCode code={displayTextContent} {language} maxWidth="calc(69rem - 2rem)" />
{:else if isAudio}
    <div class="flex items-center justify-center p-8">
        <div class="w-full max-w-md text-center">
@@ -24,7 +24,7 @@
        MimeTypeImage,
        MimeTypeText
    } from '$lib/enums';
-   import { isIMEComposing } from '$lib/utils';
+   import { isIMEComposing, parseClipboardContent } from '$lib/utils';
    import {
        AudioRecorder,
        convertToWav,
@@ -191,7 +191,6 @@
        if ((!message.trim() && uploadedFiles.length === 0) || disabled || isLoading) return;

-       // Check if model is selected first
        if (!checkModelSelected()) return;

        const messageToSend = message.trim();
@@ -228,6 +227,31 @@
        const text = event.clipboardData.getData(MimeTypeText.PLAIN);

+       if (text.startsWith('"')) {
+           const parsed = parseClipboardContent(text);
+
+           if (parsed.textAttachments.length > 0) {
+               event.preventDefault();
+
+               message = parsed.message;
+
+               const attachmentFiles = parsed.textAttachments.map(
+                   (att) =>
+                       new File([att.content], att.name, {
+                           type: MimeTypeText.PLAIN
+                       })
+               );
+
+               onFileUpload?.(attachmentFiles);
+
+               setTimeout(() => {
+                   textareaRef?.focus();
+               }, 10);
+
+               return;
+           }
+       }
+
        if (
            text.length > 0 &&
            pasteLongTextToFileLength > 0 &&
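For reference, a small sketch of what this paste path does end to end, using `parseClipboardContent` from the new clipboard utility introduced later in this diff; the payload string and file name are made up for illustration:

```ts
import { parseClipboardContent } from '$lib/utils';

// A pasted payload in the special copy format: a JSON string, then a JSON array of attachments.
const pasted =
    '"Refactor this"\n[{"type":"TEXT","name":"notes.txt","content":"line 1\\nline 2"}]';

const parsed = parseClipboardContent(pasted);
// parsed.message === 'Refactor this'
// parsed.textAttachments[0].name === 'notes.txt'

// The handler above then rebuilds File objects so the attachments reappear in the upload list:
const files = parsed.textAttachments.map(
    (att) => new File([att.content], att.name, { type: 'text/plain' }) // MIME type assumed
);
console.log(files.length); // 1
```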
@@ -35,7 +35,7 @@
<div class="flex items-center gap-1 {className}">
    <DropdownMenu.Root>
-       <DropdownMenu.Trigger name="Attach files">
+       <DropdownMenu.Trigger name="Attach files" {disabled}>
            <Tooltip.Root>
                <Tooltip.Trigger>
                    <Button
@@ -173,6 +173,7 @@
/>

<ModelsSelector
+   {disabled}
    bind:this={selectorModelRef}
    currentModel={conversationModel}
    forceForegroundText={true}
@@ -1,6 +1,7 @@
<script lang="ts">
    import { chatStore } from '$lib/stores/chat.svelte';
-   import { copyToClipboard, isIMEComposing } from '$lib/utils';
+   import { config } from '$lib/stores/settings.svelte';
+   import { copyToClipboard, isIMEComposing, formatMessageForClipboard } from '$lib/utils';
    import ChatMessageAssistant from './ChatMessageAssistant.svelte';
    import ChatMessageUser from './ChatMessageUser.svelte';
    import ChatMessageSystem from './ChatMessageSystem.svelte';
@@ -87,7 +88,9 @@
    }

    async function handleCopy() {
-       await copyToClipboard(message.content, 'Message copied to clipboard');
+       const asPlainText = Boolean(config().copyTextAttachmentsAsPlainText);
+       const clipboardContent = formatMessageForClipboard(message.content, message.extra, asPlainText);
+       await copyToClipboard(clipboardContent, 'Message copied to clipboard');
        onCopy?.(message);
    }
@@ -57,6 +57,11 @@
            label: 'Paste long text to file length',
            type: 'input'
        },
+       {
+           key: 'copyTextAttachmentsAsPlainText',
+           label: 'Copy text attachments as plain text',
+           type: 'checkbox'
+       },
        {
            key: 'enableContinueGeneration',
            label: 'Enable "Continue" button',
@@ -109,6 +114,16 @@
            key: 'disableAutoScroll',
            label: 'Disable automatic scroll',
            type: 'checkbox'
+       },
+       {
+           key: 'alwaysShowSidebarOnDesktop',
+           label: 'Always show sidebar on desktop',
+           type: 'checkbox'
+       },
+       {
+           key: 'autoShowSidebarOnNewChat',
+           label: 'Auto-show sidebar on new chat',
+           type: 'checkbox'
        }
    ]
},
@@ -404,7 +419,7 @@
</div>

<!-- Mobile Header with Horizontal Scrollable Menu -->
-<div class="flex flex-col md:hidden">
+<div class="flex flex-col pt-6 md:hidden">
    <div class="border-b border-border/30 py-4">
        <!-- Horizontal Scrollable Category Menu with Navigation -->
        <div class="relative flex items-center" style="scroll-padding: 1rem;">
@@ -72,9 +72,10 @@
<div
    class="code-preview-wrapper overflow-auto rounded-lg border border-border bg-muted {className}"
-   style="max-height: {maxHeight};"
+   style="max-height: {maxHeight}; max-width: {maxWidth};"
>
-   <pre class="m-0 overflow-x-auto p-4 max-w-[{maxWidth}]"><code class="hljs text-sm leading-relaxed"
+   <!-- Needs to be formatted as single line for proper rendering -->
+   <pre class="m-0 overflow-x-auto p-4"><code class="hljs text-sm leading-relaxed"
        >{@html highlightedHtml}</code
    ></pre>
</div>
@@ -179,9 +179,12 @@
        });
    });

+   // Handle changes to the model selector pop-down or the model dialog, depending on whether the
+   // server is in router mode or not.
    function handleOpenChange(open: boolean) {
        if (loading || updating) return;

+       if (isRouter) {
            if (open) {
                isOpen = true;
                searchTerm = '';
@@ -192,38 +195,21 @@
                    requestAnimationFrame(() => searchInputRef?.focus());
                });

-               if (isRouter) {
                    modelsStore.fetchRouterModels().then(() => {
                        modelsStore.fetchModalitiesForLoadedModels();
                    });
-               }
            } else {
                isOpen = false;
                searchTerm = '';
                highlightedIndex = -1;
            }
+       } else {
+           showModelDialog = open;
        }
    }

-   function handleTriggerClick() {
-       if (loading || updating) return;
-
-       if (!isRouter) {
-           // Single model mode: show dialog instead of popover
-           showModelDialog = true;
-       }
-       // For router mode, the Popover handles open/close
-   }
-
    export function open() {
-       if (isRouter) {
            handleOpenChange(true);
-       } else {
-           showModelDialog = true;
-       }
    }

-   function closeMenu() {
-       handleOpenChange(false);
-   }
-
    function handleSearchKeyDown(event: KeyboardEvent) {
@@ -292,7 +278,7 @@
    }

    if (shouldCloseMenu) {
-       closeMenu();
+       handleOpenChange(false);

        // Focus the chat textarea after model selection
        requestAnimationFrame(() => {
@@ -360,6 +346,7 @@
{:else}
    {@const selectedOption = getDisplayOption()}

+   {#if isRouter}
    <Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}>
        <Popover.Trigger
            class={cn(
@@ -374,8 +361,7 @@
                isOpen ? 'text-foreground' : ''
            )}
            style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
-           onclick={handleTriggerClick}
-           disabled={disabled || updating || !isRouter}
+           disabled={disabled || updating}
        >
            <Package class="h-3.5 w-3.5" />
@@ -385,7 +371,7 @@
            {#if updating}
                <Loader2 class="h-3 w-3.5 animate-spin" />
-           {:else if isRouter}
+           {:else}
                <ChevronDown class="h-3 w-3.5" />
            {/if}
        </Popover.Trigger>
@@ -405,7 +391,7 @@
            placeholder="Search models..."
            bind:value={searchTerm}
            bind:ref={searchInputRef}
-           onClose={closeMenu}
+           onClose={() => handleOpenChange(false)}
            onKeyDown={handleSearchKeyDown}
        />
    </div>
@@ -533,6 +519,34 @@
            </div>
        </Popover.Content>
    </Popover.Root>
+   {:else}
+       <button
+           class={cn(
+               `inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
+               !isCurrentModelInCache()
+                   ? 'bg-red-400/10 !text-red-400 hover:bg-red-400/20 hover:text-red-400'
+                   : forceForegroundText
+                       ? 'text-foreground'
+                       : isHighlightedCurrentModelActive
+                           ? 'text-foreground'
+                           : 'text-muted-foreground',
+               isOpen ? 'text-foreground' : ''
+           )}
+           style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
+           onclick={() => handleOpenChange(true)}
+           disabled={disabled || updating}
+       >
+           <Package class="h-3.5 w-3.5" />
+
+           <span class="truncate font-medium">
+               {selectedOption?.model}
+           </span>
+
+           {#if updating}
+               <Loader2 class="h-3 w-3.5 animate-spin" />
+           {/if}
+       </button>
+   {/if}
{/if}
</div>
@@ -12,9 +12,12 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> = {
    showMessageStats: true,
    askForTitleConfirmation: false,
    pasteLongTextToFileLen: 2500,
+   copyTextAttachmentsAsPlainText: false,
    pdfAsImage: false,
    disableAutoScroll: false,
    renderUserContentAsMarkdown: false,
+   alwaysShowSidebarOnDesktop: false,
+   autoShowSidebarOnNewChat: true,
    autoMicOnEmpty: false,
    // make sure these default values are in sync with `common.h`
    samplers: 'top_k;typ_p;top_p;min_p;temperature',
@@ -50,6 +53,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
        'Choose the color theme for the interface. You can choose between System (follows your device settings), Light, or Dark.',
    pasteLongTextToFileLen:
        'On pasting long text, it will be converted to a file. You can control the file length by setting the value of this parameter. A value of 0 disables it.',
+   copyTextAttachmentsAsPlainText:
+       'When copying a message with text attachments, combine them into a single plain text string instead of a special format that can be pasted back as attachments.',
    samplers:
        'The order in which samplers are applied, in a simplified way. Default is "top_k;typ_p;top_p;min_p;temperature": top_k->typ_p->top_p->min_p->temperature',
    temperature:
@@ -96,6 +101,10 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
    disableAutoScroll:
        'Disable automatic scrolling while messages stream so you can control the viewport position manually.',
    renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.',
+   alwaysShowSidebarOnDesktop:
+       'Always keep the sidebar visible on desktop instead of auto-hiding it.',
+   autoShowSidebarOnNewChat:
+       'Automatically show the sidebar when starting a new chat. Disable to keep the sidebar hidden until you open it yourself.',
    autoMicOnEmpty:
        'Automatically show the microphone button instead of the send button when the textarea is empty, for models with audio modality support.',
    pyInterpreterEnabled:
@@ -0,0 +1,262 @@ (new file: webui clipboard utilities)
import { toast } from 'svelte-sonner';
import { AttachmentType } from '$lib/enums';
import type {
    DatabaseMessageExtra,
    DatabaseMessageExtraTextFile,
    DatabaseMessageExtraLegacyContext
} from '$lib/types/database';

/**
 * Copy text to clipboard with toast notification
 * Uses modern clipboard API when available, falls back to legacy method for non-secure contexts
 * @param text - Text to copy to clipboard
 * @param successMessage - Custom success message (optional)
 * @param errorMessage - Custom error message (optional)
 * @returns Promise<boolean> - True if successful, false otherwise
 */
export async function copyToClipboard(
    text: string,
    successMessage = 'Copied to clipboard',
    errorMessage = 'Failed to copy to clipboard'
): Promise<boolean> {
    try {
        // Try modern clipboard API first (secure contexts only)
        if (navigator.clipboard && navigator.clipboard.writeText) {
            await navigator.clipboard.writeText(text);
            toast.success(successMessage);
            return true;
        }

        // Fallback for non-secure contexts
        const textArea = document.createElement('textarea');
        textArea.value = text;
        textArea.style.position = 'fixed';
        textArea.style.left = '-999999px';
        textArea.style.top = '-999999px';
        document.body.appendChild(textArea);
        textArea.focus();
        textArea.select();

        const successful = document.execCommand('copy');
        document.body.removeChild(textArea);

        if (successful) {
            toast.success(successMessage);
            return true;
        } else {
            throw new Error('execCommand failed');
        }
    } catch (error) {
        console.error('Failed to copy to clipboard:', error);
        toast.error(errorMessage);
        return false;
    }
}

/**
 * Copy code with HTML entity decoding and toast notification
 * @param rawCode - Raw code string that may contain HTML entities
 * @param successMessage - Custom success message (optional)
 * @param errorMessage - Custom error message (optional)
 * @returns Promise<boolean> - True if successful, false otherwise
 */
export async function copyCodeToClipboard(
    rawCode: string,
    successMessage = 'Code copied to clipboard',
    errorMessage = 'Failed to copy code'
): Promise<boolean> {
    const doc = new DOMParser().parseFromString(rawCode, 'text/html');
    const decodedCode = doc.body.textContent ?? rawCode;

    return copyToClipboard(decodedCode, successMessage, errorMessage);
}

/**
 * Format for text attachments when copied to clipboard
 */
export interface ClipboardTextAttachment {
    type: typeof AttachmentType.TEXT;
    name: string;
    content: string;
}

/**
 * Parsed result from clipboard content
 */
export interface ParsedClipboardContent {
    message: string;
    textAttachments: ClipboardTextAttachment[];
}

/**
 * Formats a message with text attachments for clipboard copying.
 *
 * Default format (asPlainText = false):
 * ```
 * "Text message content"
 * [
 *   {"type":"TEXT","name":"filename.txt","content":"..."},
 *   {"type":"TEXT","name":"another.txt","content":"..."}
 * ]
 * ```
 *
 * Plain text format (asPlainText = true):
 * ```
 * Text message content
 *
 * file content here
 *
 * another file content
 * ```
 *
 * @param content - The message text content
 * @param extras - Optional array of message attachments
 * @param asPlainText - If true, format as plain text without JSON structure
 * @returns Formatted string for clipboard
 */
export function formatMessageForClipboard(
    content: string,
    extras?: DatabaseMessageExtra[],
    asPlainText: boolean = false
): string {
    // Filter only text attachments (TEXT type and legacy CONTEXT type)
    const textAttachments =
        extras?.filter(
            (extra): extra is DatabaseMessageExtraTextFile | DatabaseMessageExtraLegacyContext =>
                extra.type === AttachmentType.TEXT || extra.type === AttachmentType.LEGACY_CONTEXT
        ) ?? [];

    if (textAttachments.length === 0) {
        return content;
    }

    if (asPlainText) {
        const parts = [content];
        for (const att of textAttachments) {
            parts.push(att.content);
        }
        return parts.join('\n\n');
    }

    const clipboardAttachments: ClipboardTextAttachment[] = textAttachments.map((att) => ({
        type: AttachmentType.TEXT,
        name: att.name,
        content: att.content
    }));

    return `${JSON.stringify(content)}\n${JSON.stringify(clipboardAttachments, null, 2)}`;
}

/**
 * Parses clipboard content to extract message and text attachments.
 * Supports both plain text and the special format with attachments.
 *
 * @param clipboardText - Raw text from clipboard
 * @returns Parsed content with message and attachments
 */
export function parseClipboardContent(clipboardText: string): ParsedClipboardContent {
    const defaultResult: ParsedClipboardContent = {
        message: clipboardText,
        textAttachments: []
    };

    if (!clipboardText.startsWith('"')) {
        return defaultResult;
    }

    try {
        let stringEndIndex = -1;
        let escaped = false;

        for (let i = 1; i < clipboardText.length; i++) {
            const char = clipboardText[i];

            if (escaped) {
                escaped = false;
                continue;
            }

            if (char === '\\') {
                escaped = true;
                continue;
            }

            if (char === '"') {
                stringEndIndex = i;
                break;
            }
        }

        if (stringEndIndex === -1) {
            return defaultResult;
        }

        const jsonStringPart = clipboardText.substring(0, stringEndIndex + 1);
        const remainingPart = clipboardText.substring(stringEndIndex + 1).trim();

        const message = JSON.parse(jsonStringPart) as string;

        if (!remainingPart || !remainingPart.startsWith('[')) {
            return {
                message,
                textAttachments: []
            };
        }

        const attachments = JSON.parse(remainingPart) as unknown[];

        const validAttachments: ClipboardTextAttachment[] = [];

        for (const att of attachments) {
            if (isValidTextAttachment(att)) {
                validAttachments.push({
                    type: AttachmentType.TEXT,
                    name: att.name,
                    content: att.content
                });
            }
        }

        return {
            message,
            textAttachments: validAttachments
        };
    } catch {
        return defaultResult;
    }
}

/**
 * Type guard to validate a text attachment object
 * @param obj The object to validate
 * @returns true if the object is a valid text attachment
 */
function isValidTextAttachment(
    obj: unknown
): obj is { type: string; name: string; content: string } {
    if (typeof obj !== 'object' || obj === null) {
        return false;
    }

    const record = obj as Record<string, unknown>;

    return (
        (record.type === AttachmentType.TEXT || record.type === 'TEXT') &&
        typeof record.name === 'string' &&
        typeof record.content === 'string'
    );
}

/**
 * Checks if clipboard content contains our special format with attachments
 * @param clipboardText - Raw text from clipboard
 * @returns true if the clipboard content contains our special format with attachments
 */
export function hasClipboardAttachments(clipboardText: string): boolean {
    if (!clipboardText.startsWith('"')) {
        return false;
    }

    const parsed = parseClipboardContent(clipboardText);
    return parsed.textAttachments.length > 0;
}
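A quick usage sketch of the round trip these helpers implement; the message and attachment are invented for illustration, and the attachment literal assumes the minimal `{ type, name, content }` shape of `DatabaseMessageExtraTextFile`:

```ts
import { AttachmentType } from '$lib/enums';
import {
    formatMessageForClipboard,
    parseClipboardContent,
    hasClipboardAttachments
} from '$lib/utils/clipboard';

// Hypothetical attachment; real extras come from the message database.
const extras = [{ type: AttachmentType.TEXT as const, name: 'a.txt', content: 'alpha' }];

// Default mode produces the re-pasteable format: JSON string + JSON array.
const payload = formatMessageForClipboard('Hello "world"', extras);
hasClipboardAttachments(payload); // true

const back = parseClipboardContent(payload);
// back.message === 'Hello "world"'
// back.textAttachments[0].content === 'alpha'

// Plain-text mode just joins the message and attachment bodies with blank lines.
formatMessageForClipboard('Hello', extras, true); // 'Hello\n\nalpha'
```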
@@ -1,71 +0,0 @@ (old clipboard helper file removed)
The previous copy utilities file is deleted; `copyToClipboard` moves verbatim into the new clipboard module above. The only behavioral difference is in `copyCodeToClipboard`, which previously decoded HTML entities with chained regex replaces and now uses `DOMParser`:

-    // Decode HTML entities
-    const decodedCode = rawCode
-        .replace(/&amp;/g, '&')
-        .replace(/&lt;/g, '<')
-        .replace(/&gt;/g, '>')
-        .replace(/&quot;/g, '"')
-        .replace(/&#39;/g, "'");
-
-    return copyToClipboard(decodedCode, successMessage, errorMessage);
@@ -40,7 +40,15 @@ export { setConfigValue, getConfigValue, configToParameterRecord } from './confi
export { createMessageCountMap, getMessageCount } from './conversation-utils';

// Clipboard utilities
-export { copyToClipboard, copyCodeToClipboard } from './copy';
+export {
+   copyToClipboard,
+   copyCodeToClipboard,
+   formatMessageForClipboard,
+   parseClipboardContent,
+   hasClipboardAttachments,
+   type ClipboardTextAttachment,
+   type ParsedClipboardContent
+} from './clipboard';

// File preview utilities
export { getFileTypeLabel } from './file-preview';
@@ -14,6 +14,7 @@
    import { goto } from '$app/navigation';
    import { modelsStore } from '$lib/stores/models.svelte';
    import { TOOLTIP_DELAY_DURATION } from '$lib/constants/tooltip-config';
+   import { IsMobile } from '$lib/hooks/is-mobile.svelte';

    let { children } = $props();
@@ -21,6 +22,10 @@
    let isHomeRoute = $derived(page.route.id === '/');
    let isNewChatMode = $derived(page.url.searchParams.get('new_chat') === 'true');
    let showSidebarByDefault = $derived(activeMessages().length > 0 || isLoading());
+   let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop);
+   let autoShowSidebarOnNewChat = $derived(config().autoShowSidebarOnNewChat);
+   let isMobile = new IsMobile();
+   let isDesktop = $derived(!isMobile.current);
    let sidebarOpen = $state(false);
    let innerHeight = $state<number | undefined>();
    let chatSidebar:
@@ -76,6 +81,11 @@
    }

    $effect(() => {
+       if (alwaysShowSidebarOnDesktop && isDesktop) {
+           sidebarOpen = true;
+           return;
+       }
+
        if (isHomeRoute && !isNewChatMode) {
            // Auto-collapse sidebar when navigating to home route (but not in new chat mode)
            sidebarOpen = false;
@@ -83,8 +93,11 @@
            // Keep sidebar open in new chat mode
            sidebarOpen = true;
        } else if (isChatRoute) {
-           // On chat routes, show sidebar by default
+           // On chat routes, only auto-show the sidebar if the setting is enabled
+           if (autoShowSidebarOnNewChat) {
                sidebarOpen = true;
+           }
+           // If the setting is disabled, don't change the sidebar state - let the user control it manually
        } else {
            // Other routes follow default behavior
            sidebarOpen = showSidebarByDefault;
@@ -190,12 +203,14 @@
<ChatSidebar bind:this={chatSidebar} />
</Sidebar.Root>

+{#if !(alwaysShowSidebarOnDesktop && isDesktop)}
    <Sidebar.Trigger
        class="transition-left absolute left-0 z-[900] h-8 w-8 duration-200 ease-linear {sidebarOpen
            ? 'md:left-[var(--sidebar-width)]'
            : ''}"
        style="translate: 1rem 1rem;"
    />
+{/if}

<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden">
    {@render children?.()}
@@ -1,7 +0,0 @@ (placeholder unit test removed)
-import { describe, it, expect } from 'vitest';
-
-describe('sum test', () => {
-   it('adds 1 + 2 to equal 3', () => {
-       expect(1 + 2).toBe(3);
-   });
-});
@@ -0,0 +1,423 @@ (new file: clipboard utility tests)
import { describe, it, expect } from 'vitest';
import { AttachmentType } from '$lib/enums';
import {
    formatMessageForClipboard,
    parseClipboardContent,
    hasClipboardAttachments
} from '$lib/utils/clipboard';

describe('formatMessageForClipboard', () => {
    it('returns plain content when no extras', () => {
        const result = formatMessageForClipboard('Hello world', undefined);
        expect(result).toBe('Hello world');
    });

    it('returns plain content when extras is empty array', () => {
        const result = formatMessageForClipboard('Hello world', []);
        expect(result).toBe('Hello world');
    });

    it('handles empty string content', () => {
        const result = formatMessageForClipboard('', undefined);
        expect(result).toBe('');
    });

    it('returns plain content when extras has only non-text attachments', () => {
        const extras = [
            {
                type: AttachmentType.IMAGE as const,
                name: 'image.png',
                base64Url: 'data:image/png;base64,...'
            }
        ];
        const result = formatMessageForClipboard('Hello world', extras);
        expect(result).toBe('Hello world');
    });

    it('filters non-text attachments and keeps only text ones', () => {
        const extras = [
            {
                type: AttachmentType.IMAGE as const,
                name: 'image.png',
                base64Url: 'data:image/png;base64,...'
            },
            {
                type: AttachmentType.TEXT as const,
                name: 'file.txt',
                content: 'Text content'
            },
            {
                type: AttachmentType.PDF as const,
                name: 'doc.pdf',
                base64Data: 'data:application/pdf;base64,...',
                content: 'PDF content',
                processedAsImages: false
            }
        ];
        const result = formatMessageForClipboard('Hello', extras);

        expect(result).toContain('"file.txt"');
        expect(result).not.toContain('image.png');
        expect(result).not.toContain('doc.pdf');
    });

    it('formats message with text attachments', () => {
        const extras = [
            {
                type: AttachmentType.TEXT as const,
                name: 'file1.txt',
                content: 'File 1 content'
            },
            {
                type: AttachmentType.TEXT as const,
                name: 'file2.txt',
                content: 'File 2 content'
            }
        ];
        const result = formatMessageForClipboard('Hello world', extras);

        expect(result).toContain('"Hello world"');
        expect(result).toContain('"type": "TEXT"');
        expect(result).toContain('"name": "file1.txt"');
        expect(result).toContain('"content": "File 1 content"');
        expect(result).toContain('"name": "file2.txt"');
    });

    it('handles content with quotes and special characters', () => {
        const content = 'Hello "world" with\nnewline';
        const extras = [
            {
                type: AttachmentType.TEXT as const,
                name: 'test.txt',
                content: 'Test content'
            }
        ];
        const result = formatMessageForClipboard(content, extras);

        // Should be valid JSON
        expect(result.startsWith('"')).toBe(true);
        // The content should be properly escaped
        const parsed = JSON.parse(result.split('\n')[0]);
        expect(parsed).toBe(content);
    });

    it('converts legacy context type to TEXT type', () => {
        const extras = [
            {
                type: AttachmentType.LEGACY_CONTEXT as const,
                name: 'legacy.txt',
                content: 'Legacy content'
            }
        ];
        const result = formatMessageForClipboard('Hello', extras);

        expect(result).toContain('"type": "TEXT"');
        expect(result).not.toContain('"context"');
    });

    it('handles attachment content with special characters', () => {
        const extras = [
            {
                type: AttachmentType.TEXT as const,
                name: 'code.js',
                content: 'const x = "hello\\nworld";\nconst y = `template ${var}`;'
            }
        ];
        const formatted = formatMessageForClipboard('Check this code', extras);
        const parsed = parseClipboardContent(formatted);

        expect(parsed.textAttachments[0].content).toBe(
            'const x = "hello\\nworld";\nconst y = `template ${var}`;'
        );
    });

    it('handles unicode characters in content and attachments', () => {
        const extras = [
            {
                type: AttachmentType.TEXT as const,
                name: 'unicode.txt',
                content: '日本語テスト 🎉 émojis'
            }
        ];
        const formatted = formatMessageForClipboard('Привет мир 👋', extras);
        const parsed = parseClipboardContent(formatted);

        expect(parsed.message).toBe('Привет мир 👋');
        expect(parsed.textAttachments[0].content).toBe('日本語テスト 🎉 émojis');
    });

    it('formats as plain text when asPlainText is true', () => {
        const extras = [
            {
                type: AttachmentType.TEXT as const,
                name: 'file1.txt',
                content: 'File 1 content'
            },
            {
                type: AttachmentType.TEXT as const,
                name: 'file2.txt',
                content: 'File 2 content'
            }
        ];
        const result = formatMessageForClipboard('Hello world', extras, true);

        expect(result).toBe('Hello world\n\nFile 1 content\n\nFile 2 content');
    });

    it('returns plain content when asPlainText is true but no attachments', () => {
        const result = formatMessageForClipboard('Hello world', [], true);
        expect(result).toBe('Hello world');
    });

    it('plain text mode does not use JSON format', () => {
        const extras = [
            {
                type: AttachmentType.TEXT as const,
                name: 'test.txt',
                content: 'Test content'
            }
        ];
        const result = formatMessageForClipboard('Hello', extras, true);

        expect(result).not.toContain('"type"');
        expect(result).not.toContain('[');
        expect(result).toBe('Hello\n\nTest content');
    });
});

describe('parseClipboardContent', () => {
    it('returns plain text as message when not in special format', () => {
        const result = parseClipboardContent('Hello world');

        expect(result.message).toBe('Hello world');
        expect(result.textAttachments).toHaveLength(0);
    });

    it('handles empty string input', () => {
        const result = parseClipboardContent('');

        expect(result.message).toBe('');
        expect(result.textAttachments).toHaveLength(0);
    });

    it('handles whitespace-only input', () => {
        const result = parseClipboardContent(' \n\t ');

        expect(result.message).toBe(' \n\t ');
        expect(result.textAttachments).toHaveLength(0);
    });

    it('returns plain text as message when starts with quote but invalid format', () => {
        const result = parseClipboardContent('"Unclosed quote');

        expect(result.message).toBe('"Unclosed quote');
        expect(result.textAttachments).toHaveLength(0);
    });

    it('returns original text when JSON array is malformed', () => {
        const input = '"Hello"\n[invalid json';

        const result = parseClipboardContent(input);

        expect(result.message).toBe('"Hello"\n[invalid json');
        expect(result.textAttachments).toHaveLength(0);
    });

    it('parses message with text attachments', () => {
        const input = `"Hello world"
[
  {"type":"TEXT","name":"file1.txt","content":"File 1 content"},
  {"type":"TEXT","name":"file2.txt","content":"File 2 content"}
]`;

        const result = parseClipboardContent(input);

        expect(result.message).toBe('Hello world');
        expect(result.textAttachments).toHaveLength(2);
        expect(result.textAttachments[0].name).toBe('file1.txt');
        expect(result.textAttachments[0].content).toBe('File 1 content');
        expect(result.textAttachments[1].name).toBe('file2.txt');
        expect(result.textAttachments[1].content).toBe('File 2 content');
    });

    it('handles escaped quotes in message', () => {
        const input = `"Hello \\"world\\" with quotes"
[
  {"type":"TEXT","name":"file.txt","content":"test"}
]`;

        const result = parseClipboardContent(input);

        expect(result.message).toBe('Hello "world" with quotes');
        expect(result.textAttachments).toHaveLength(1);
    });

    it('handles newlines in message', () => {
        const input = `"Hello\\nworld"
[
  {"type":"TEXT","name":"file.txt","content":"test"}
]`;

        const result = parseClipboardContent(input);

        expect(result.message).toBe('Hello\nworld');
        expect(result.textAttachments).toHaveLength(1);
    });

    it('returns message only when no array follows', () => {
        const input = '"Just a quoted string"';

        const result = parseClipboardContent(input);

        expect(result.message).toBe('Just a quoted string');
        expect(result.textAttachments).toHaveLength(0);
    });

    it('filters out invalid attachment objects', () => {
        const input = `"Hello"
[
  {"type":"TEXT","name":"valid.txt","content":"valid"},
  {"type":"INVALID","name":"invalid.txt","content":"invalid"},
  {"name":"missing-type.txt","content":"missing"},
  {"type":"TEXT","content":"missing name"}
]`;

        const result = parseClipboardContent(input);

        expect(result.message).toBe('Hello');
        expect(result.textAttachments).toHaveLength(1);
        expect(result.textAttachments[0].name).toBe('valid.txt');
    });

    it('handles empty attachments array', () => {
        const input = '"Hello"\n[]';

        const result = parseClipboardContent(input);

        expect(result.message).toBe('Hello');
        expect(result.textAttachments).toHaveLength(0);
    });

    it('roundtrips correctly with formatMessageForClipboard', () => {
        const originalContent = 'Hello "world" with\nspecial characters';
        const originalExtras = [
            {
                type: AttachmentType.TEXT as const,
                name: 'file1.txt',
                content: 'Content with\nnewlines and "quotes"'
            },
            {
                type: AttachmentType.TEXT as const,
                name: 'file2.txt',
                content: 'Another file'
            }
        ];

        const formatted = formatMessageForClipboard(originalContent, originalExtras);
        const parsed = parseClipboardContent(formatted);

        expect(parsed.message).toBe(originalContent);
        expect(parsed.textAttachments).toHaveLength(2);
        expect(parsed.textAttachments[0].name).toBe('file1.txt');
        expect(parsed.textAttachments[0].content).toBe('Content with\nnewlines and "quotes"');
        expect(parsed.textAttachments[1].name).toBe('file2.txt');
        expect(parsed.textAttachments[1].content).toBe('Another file');
    });
});

describe('hasClipboardAttachments', () => {
    it('returns false for plain text', () => {
        expect(hasClipboardAttachments('Hello world')).toBe(false);
    });

    it('returns false for empty string', () => {
        expect(hasClipboardAttachments('')).toBe(false);
    });

    it('returns false for quoted string without attachments', () => {
        expect(hasClipboardAttachments('"Hello world"')).toBe(false);
    });

    it('returns true for valid format with attachments', () => {
        const input = `"Hello"
[{"type":"TEXT","name":"file.txt","content":"test"}]`;

        expect(hasClipboardAttachments(input)).toBe(true);
    });

    it('returns false for format with empty attachments array', () => {
        const input = '"Hello"\n[]';

        expect(hasClipboardAttachments(input)).toBe(false);
    });

    it('returns false for malformed JSON', () => {
        expect(hasClipboardAttachments('"Hello"\n[broken')).toBe(false);
    });
});

describe('roundtrip edge cases', () => {
    it('preserves empty message with attachments', () => {
        const extras = [
            {
                type: AttachmentType.TEXT as const,
                name: 'file.txt',
                content: 'Content only'
            }
        ];
        const formatted = formatMessageForClipboard('', extras);
        const parsed = parseClipboardContent(formatted);

        expect(parsed.message).toBe('');
        expect(parsed.textAttachments).toHaveLength(1);
        expect(parsed.textAttachments[0].content).toBe('Content only');
    });

    it('preserves attachment with empty content', () => {
        const extras = [
            {
                type: AttachmentType.TEXT as const,
                name: 'empty.txt',
                content: ''
            }
        ];
        const formatted = formatMessageForClipboard('Message', extras);
        const parsed = parseClipboardContent(formatted);

        expect(parsed.message).toBe('Message');
        expect(parsed.textAttachments).toHaveLength(1);
        expect(parsed.textAttachments[0].content).toBe('');
    });

    it('preserves multiple backslashes', () => {
        const content = 'Path: C:\\\\Users\\\\test\\\\file.txt';
        const extras = [
            {
                type: AttachmentType.TEXT as const,
                name: 'path.txt',
                content: 'D:\\\\Data\\\\file'
            }
        ];
        const formatted = formatMessageForClipboard(content, extras);
        const parsed = parseClipboardContent(formatted);

        expect(parsed.message).toBe(content);
        expect(parsed.textAttachments[0].content).toBe('D:\\\\Data\\\\file');
    });

    it('preserves tabs and various whitespace', () => {
        const content = 'Line1\t\tTabbed\n Spaced\r\nCRLF';
        const extras = [
            {
                type: AttachmentType.TEXT as const,
                name: 'whitespace.txt',
                content: '\t\t\n\n '
            }
        ];
        const formatted = formatMessageForClipboard(content, extras);
        const parsed = parseClipboardContent(formatted);

        expect(parsed.message).toBe(content);
        expect(parsed.textAttachments[0].content).toBe('\t\t\n\n ');
    });
});
@@ -1,6 +1,6 @@
/* eslint-disable no-irregular-whitespace */
import { describe, it, expect, test } from 'vitest';
-import { maskInlineLaTeX, preprocessLaTeX } from './latex-protection';
+import { maskInlineLaTeX, preprocessLaTeX } from '$lib/utils/latex-protection';

describe('maskInlineLaTeX', () => {
    it('should protect LaTeX $x + y$ but not money $3.99', () => {
@@ -1,5 +1,5 @@
import { describe, expect, it } from 'vitest';
-import { isValidModelName, normalizeModelName } from './model-names';
+import { isValidModelName, normalizeModelName } from '$lib/utils/model-names';

describe('normalizeModelName', () => {
    it('preserves Hugging Face org/model format (single slash)', () => {
@@ -125,9 +125,9 @@ export default defineConfig({
        {
            extends: './vite.config.ts',
            test: {
-               name: 'server',
+               name: 'unit',
                environment: 'node',
-               include: ['tests/server/**/*.{test,spec}.{js,ts}']
+               include: ['tests/unit/**/*.{test,spec}.{js,ts}']
            }
        },
        {