Merge branch 'master' into xsn/arch_refactor_llm_names
commit 942ddbe900
@@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ENTRYPOINT ["/app/llama-server"]

 ### Target: light
-# Lightweight image containing only llama-cli
+# Lightweight image containing only llama-cli and llama-completion
 # ==============================================================================
 FROM base AS light

@@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
     source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
     cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
-    cmake --build build --config Release --target llama-cli
+    cmake --build build --config Release --target llama-cli && \
+    cmake --build build --config Release --target llama-completion

 # TODO: use image with NNRT
 FROM ascendai/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /

 ENV LC_ALL=C.utf8

@@ -37,6 +37,7 @@ make -j GGML_CUDA=1
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

@@ -68,6 +69,7 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-completion
 %{_bindir}/llama-cuda-server
 %{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service

@@ -39,6 +39,7 @@ make -j
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

@@ -70,6 +71,7 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cli
+%{_bindir}/llama-completion
 %{_bindir}/llama-server
 %{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service

@@ -11,7 +11,7 @@ body:
        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
-       The `llama-cli` binary can be used for simple and reproducible model inference.
+       The `llama-completion` binary can be used for simple and reproducible model inference.
   - type: textarea
     id: version
     attributes:
@@ -74,9 +74,12 @@ body:
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.

+       If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
+       If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
+
     placeholder: >
-       e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
-       When I use -ngl 0 it works correctly.
+       e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
+       With short prompts or `-fa off` it works correctly.
        Here are the exact commands that I used: ...
     validations:
       required: true

@@ -0,0 +1,295 @@
+# Server WebUI build and tests
+name: Server WebUI
+
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+      slow_tests:
+        description: 'Run slow tests'
+        required: true
+        type: boolean
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+
+env:
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_LOG_VERBOSITY: 10
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  webui-setup:
+    name: WebUI Setup
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Cache node_modules
+        uses: actions/cache@v4
+        id: cache-node-modules
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Install dependencies
+        if: steps.cache-node-modules.outputs.cache-hit != 'true'
+        run: npm ci
+        working-directory: tools/server/webui
+
+  webui-check:
+    needs: webui-setup
+    name: WebUI Check
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+
+      - name: Restore node_modules cache
+        uses: actions/cache@v4
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Run type checking
+        run: npm run check
+        working-directory: tools/server/webui
+
+      - name: Run linting
+        run: npm run lint
+        working-directory: tools/server/webui
+
+  webui-build:
+    needs: webui-check
+    name: WebUI Build
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+
+      - name: Restore node_modules cache
+        uses: actions/cache@v4
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Build application
+        run: npm run build
+        working-directory: tools/server/webui
+
+  webui-tests:
+    needs: webui-build
+    name: Run WebUI tests
+    permissions:
+      contents: read
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+
+      - name: Restore node_modules cache
+        uses: actions/cache@v4
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Install Playwright browsers
+        run: npx playwright install --with-deps
+        working-directory: tools/server/webui
+
+      - name: Build Storybook
+        run: npm run build-storybook
+        working-directory: tools/server/webui
+
+      - name: Run Client tests
+        run: npm run test:client
+        working-directory: tools/server/webui
+
+      - name: Run Server tests
+        run: npm run test:server
+        working-directory: tools/server/webui
+
+      - name: Run UI tests
+        run: npm run test:ui -- --testTimeout=60000
+        working-directory: tools/server/webui
+
+      - name: Run E2E tests
+        run: npm run test:e2e
+        working-directory: tools/server/webui
+
+  server-build:
+    needs: [webui-tests]
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
+        build_type: [RelWithDebInfo]
+        include:
+          - build_type: Release
+            sanitizer: ""
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
+
+    steps:
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install \
+            build-essential \
+            xxd \
+            git \
+            cmake \
+            curl \
+            wget \
+            language-pack-en \
+            libssl-dev
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r tools/server/tests/requirements.txt
+
+      - name: Setup Node.js for WebUI
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Install WebUI dependencies
+        run: npm ci
+        working-directory: tools/server/webui
+
+      - name: Build WebUI
+        run: npm run build
+        working-directory: tools/server/webui
+
+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DGGML_NATIVE=OFF \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DGGML_OPENMP=OFF ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Build (sanitizers)
+        id: cmake_build_sanitizers
+        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DGGML_NATIVE=OFF \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Build (sanitizers)
+        id: cmake_build
+        if: ${{ matrix.sanitizer == '' }}
+        run: |
+          cmake -B build \
+            -DGGML_NATIVE=OFF \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Tests
+        id: server_integration_tests
+        if: ${{ matrix.sanitizer == '' }}
+        env:
+          GITHUB_ACTIONS: "true"
+        run: |
+          cd tools/server/tests
+          ./tests.sh
+
+      - name: Tests (sanitizers)
+        id: server_integration_tests_sanitizers
+        if: ${{ matrix.sanitizer != '' }}
+        run: |
+          cd tools/server/tests
+          LLAMA_SANITIZE=1 ./tests.sh
+
+      - name: Slow tests
+        id: server_integration_tests_slow
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
+        run: |
+          cd tools/server/tests
+          SLOW_TESTS=1 ./tests.sh

@@ -76,270 +76,6 @@ jobs:
         run: |
           pip install -r tools/server/tests/requirements.txt

-  webui-setup:
-    name: WebUI Setup
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-          cache: "npm"
-          cache-dependency-path: "tools/server/webui/package-lock.json"
-
-      - name: Cache node_modules
-        uses: actions/cache@v4
-        id: cache-node-modules
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Install dependencies
-        if: steps.cache-node-modules.outputs.cache-hit != 'true'
-        run: npm ci
-        working-directory: tools/server/webui
-
-  webui-check:
-    needs: webui-setup
-    name: WebUI Check
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Run type checking
-        run: npm run check
-        working-directory: tools/server/webui
-
-      - name: Run linting
-        run: npm run lint
-        working-directory: tools/server/webui
-
-  webui-build:
-    needs: webui-check
-    name: WebUI Build
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Build application
-        run: npm run build
-        working-directory: tools/server/webui
-
-  webui-tests:
-    needs: webui-build
-    name: Run WebUI tests
-    permissions:
-      contents: read
-
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Install Playwright browsers
-        run: npx playwright install --with-deps
-        working-directory: tools/server/webui
-
-      - name: Build Storybook
-        run: npm run build-storybook
-        working-directory: tools/server/webui
-
-      - name: Run Client tests
-        run: npm run test:client
-        working-directory: tools/server/webui
-
-      - name: Run Server tests
-        run: npm run test:server
-        working-directory: tools/server/webui
-
-      - name: Run UI tests
-        run: npm run test:ui -- --testTimeout=60000
-        working-directory: tools/server/webui
-
-      - name: Run E2E tests
-        run: npm run test:e2e
-        working-directory: tools/server/webui
-
-  server-build:
-    needs: [webui-tests]
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
-        build_type: [RelWithDebInfo]
-        include:
-          - build_type: Release
-            sanitizer: ""
-      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
-
-    steps:
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libssl-dev
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r tools/server/tests/requirements.txt
-
-      - name: Setup Node.js for WebUI
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-          cache: "npm"
-          cache-dependency-path: "tools/server/webui/package-lock.json"
-
-      - name: Install WebUI dependencies
-        run: npm ci
-        working-directory: tools/server/webui
-
-      - name: Build WebUI
-        run: npm run build
-        working-directory: tools/server/webui
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_OPENMP=OFF ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build_sanitizers
-        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build
-        if: ${{ matrix.sanitizer == '' }}
-        run: |
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_OPENSSL=ON \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ matrix.sanitizer == '' }}
-        env:
-          GITHUB_ACTIONS: "true"
-        run: |
-          cd tools/server/tests
-          ./tests.sh
-
-      - name: Tests (sanitizers)
-        id: server_integration_tests_sanitizers
-        if: ${{ matrix.sanitizer != '' }}
-        run: |
-          cd tools/server/tests
-          LLAMA_SANITIZE=1 ./tests.sh
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd tools/server/tests
-          SLOW_TESTS=1 ./tests.sh
-
   server-windows:
     runs-on: windows-2022

@@ -87,7 +87,8 @@
 /tests/ @ggerganov
 /tests/test-chat-.* @pwilkin
 /tools/batched-bench/ @ggerganov
-/tools/main/ @ggerganov
+/tools/cli/ @ngxson
+/tools/completion/ @ggerganov
 /tools/mtmd/ @ngxson
 /tools/perplexity/ @ggerganov
 /tools/quantize/ @ggerganov

@@ -313,7 +313,7 @@ The Hugging Face platform provides a variety of online tools for converting, qua

 To learn more about model quantization, [read this documentation](tools/quantize/README.md)

-## [`llama-cli`](tools/main)
+## [`llama-cli`](tools/cli)

 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.

@@ -525,7 +525,8 @@ To learn more about model quantization, [read this documentation](tools/quantize

 ## Other documentation

-- [main (cli)](tools/main/README.md)
+- [cli](tools/cli/README.md)
+- [completion](tools/completion/README.md)
 - [server](tools/server/README.md)
 - [GBNF grammars](grammars/README.md)

@@ -68,3 +68,6 @@ Please disclose it as a private [security advisory](https://github.com/ggml-org/
 Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.

 A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
+
+> [!IMPORTANT]
+> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080

@@ -398,6 +398,8 @@ function gg_run_qwen3_0_6b {
     ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
     ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     (time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
     (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

@@ -523,6 +525,8 @@ function gg_run_embd_bge_small {

     ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

@@ -563,6 +567,8 @@ function gg_run_rerank_tiny {

     model_f16="${path_models}/ggml-model-f16.gguf"

+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     # for this model, the SEP token is "</s>"
     (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

@@ -20,6 +20,7 @@
 #include <nlohmann/json.hpp>

 #include <algorithm>
+#include <cinttypes>
 #include <climits>
 #include <cstdarg>
 #include <fstream>

@@ -529,7 +530,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.kv_overrides.back().key[0] = 0;
     }

-    if (!params.tensor_buft_overrides.empty()) {
+    // pad tensor_buft_overrides for llama_params_fit:
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    while (params.tensor_buft_overrides.size() < ntbo) {
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }

@@ -2153,6 +2156,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_MAIN_GPU"));
+    add_opt(common_arg(
+        { "-fit", "--fit" }, "[on|off]",
+        string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.fit_params = true;
+            } else if (is_falsey(value)) {
+                params.fit_params = false;
+            } else {
+                throw std::runtime_error(
+                    string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_env("LLAMA_ARG_FIT"));
+    add_opt(common_arg(
+        { "-fitt", "--fit-target" }, "MiB",
+        string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
+        [](common_params & params, int value) {
+            params.fit_params_target = value * size_t(1024*1024);
+        }
+    ).set_env("LLAMA_ARG_FIT_TARGET"));
+    add_opt(common_arg(
+        { "-fitc", "--fit-ctx" }, "N",
+        string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
+        [](common_params & params, int value) {
+            params.fit_params_min_ctx = value;
+        }
+    ).set_env("LLAMA_ARG_FIT_CTX"));
     add_opt(common_arg(
         {"--check-tensors"},
         string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),

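Note: the new -fit, -fitt, and -fitc flags above simply fill plain fields on common_params (an on/off boolean, a per-device margin given in MiB but stored in bytes, and a minimum context size). The snippet below is a standalone sketch of that same value handling, not code from this commit; parse_fit_flag and mib_to_bytes are hypothetical helper names used only for illustration.

// Standalone sketch of the on/off and MiB-to-bytes conversions performed by
// the new -fit / -fitt option callbacks. Not part of llama.cpp.
#include <cstddef>
#include <cstdio>
#include <stdexcept>
#include <string>

static bool parse_fit_flag(const std::string & value) {
    if (value == "on"  || value == "1" || value == "true")  { return true;  }
    if (value == "off" || value == "0" || value == "false") { return false; }
    throw std::runtime_error("unknown value for --fit: '" + value + "'");
}

static size_t mib_to_bytes(int mib) {
    return size_t(mib) * 1024 * 1024; // -fitt takes MiB, the field stores bytes
}

int main() {
    bool   fit_params        = parse_fit_flag("off");
    size_t fit_params_target = mib_to_bytes(1024); // e.g. a 1 GiB margin per device
    std::printf("fit=%d target=%zu bytes\n", fit_params ? 1 : 0, fit_params_target);
    return 0;
}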
@@ -4,9 +4,14 @@
 using json = nlohmann::json;

-static std::string_view trim_trailing_space(std::string_view sv) {
+static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
+    int count = 0;
     while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
+        if (max != -1 && count <= max) {
+            break;
+        }
         sv.remove_suffix(1);
+        count++;
     }
     return sv;
 }

@@ -93,7 +98,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {

     if (is_arg_string && current_tool) {
         // Serialize to JSON, but exclude the end quote
-        std::string dumped = json(node.text).dump();
+        std::string dumped = json(trim_trailing_space(node.text)).dump();
         current_tool->arguments += dumped.substr(0, dumped.size() - 1);
         needs_closing_quote = true;
     }

@@ -101,6 +106,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
     if (is_arg_close && current_tool) {
         if (needs_closing_quote) {
             current_tool->arguments += "\"";
+            needs_closing_quote = false;
         }
     }

@@ -109,6 +115,10 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
     }

     if (is_tool_close && current_tool) {
+        if (needs_closing_quote) {
+            current_tool->arguments += "\"";
+            needs_closing_quote = false;
+        }
         current_tool->arguments += "}";
     }
 }

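Note: the mapper change above trims trailing whitespace from a raw string argument before it is JSON-encoded and appended to the tool call. Below is a minimal, self-contained sketch of that idea; it is a simplified stand-in written for illustration, not the exact helper from the file.

// Minimal illustration: drop trailing whitespace from a string_view before
// serializing it as a JSON string value.
#include <cctype>
#include <cstdio>
#include <string>
#include <string_view>

static std::string_view trim_trailing_space(std::string_view sv) {
    while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
        sv.remove_suffix(1);
    }
    return sv;
}

int main() {
    std::string_view text = "San Francisco \n";
    std::string trimmed(trim_trailing_space(text));
    std::printf("[%s]\n", trimmed.c_str()); // prints [San Francisco]
    return 0;
}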
common/chat.cpp (140)
@@ -711,6 +711,25 @@ static void foreach_function(const json & tools, const std::function<void(const
     }
 }

+static void foreach_parameter(const json & function, const std::function<void(const std::string &, const json &, bool)> & fn) {
+    if (!function.contains("parameters") || !function.at("parameters").is_object()) {
+        return;
+    }
+    const auto & params = function.at("parameters");
+    if (!params.contains("properties") || !params.at("properties").is_object()) {
+        return;
+    }
+    const auto & props = params.at("properties");
+    std::set<std::string> required;
+    if (params.contains("required") && params.at("required").is_array()) {
+        params.at("required").get_to(required);
+    }
+    for (const auto & [name, prop] : props.items()) {
+        bool is_required = (required.find(name) != required.end());
+        fn(name, prop, is_required);
+    }
+}
+
 static std::string apply(
     const common_chat_template & tmpl,
     const struct templates_params & inputs,

@@ -1409,6 +1428,123 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
     return data;
 }

+static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    data.preserved_tokens = {
+        "<think>",
+        "</think>",
+        "<tool_call>",
+        "</tool_call>",
+    };
+
+    auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar = true;
+
+    auto parser = build_chat_peg_constructed_parser([&](auto & p) {
+        auto reasoning = p.eps();
+        if (inputs.enable_thinking && extract_reasoning) {
+            auto reasoning_content = p.reasoning(p.until("</think>")) + ("</think>" | p.end());
+            if (data.thinking_forced_open) {
+                reasoning = reasoning_content;
+            }
+        }
+
+        // Response format parser
+        if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
+            return reasoning << p.content(p.schema(p.json(), "response-format", inputs.json_schema));
+        }
+
+        // Tool call parser
+        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+            auto tool_choice = p.choice();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+
+                auto schema_info = common_schema_info();
+                schema_info.resolve_refs(parameters);
+
+                auto tool_open = "<function=" + p.tool_name(p.literal(name)) + ">\n";
+                auto tool_close = p.literal("</function>\n");
+                auto args = p.sequence();
+                auto arg_string = p.rule("xml-arg-string", p.until_one_of({
+                    "\n</parameter>",
+                    "\n<parameter=",
+                    "\n</function>"
+                }));
+
+                foreach_parameter(function, [&](const auto & param_name, const json & param_schema, bool is_required) {
+                    auto rule_name = "tool-" + name + "-arg-" + param_name;
+
+                    auto arg_open = "<parameter=" + p.tool_arg_name(p.literal(param_name)) + ">\n";
+                    auto arg_close = p.literal("</parameter>\n");
+                    auto arg_value = p.eps();
+
+                    if (schema_info.resolves_to_string(param_schema)) {
+                        arg_value = p.tool_arg_string_value(arg_string) + "\n";
+                    } else {
+                        arg_value = p.tool_arg_json_value(p.schema(p.json(), rule_name + "-schema", param_schema));
+                    }
+
+                    // Model may or may not close with </parameter>
+                    auto arg_rule = p.rule(rule_name, p.tool_arg_open(arg_open) + arg_value + p.optional(p.tool_arg_close(arg_close)));
+                    args += p.repeat(arg_rule, /* min = */ is_required ? 1 : 0, /* max = */ 1);
+                });
+
+                tool_choice |= p.rule("tool-" + name, p.tool_open(tool_open) + args + p.tool_close(tool_close));
+            });
+
+            auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+            auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
+            auto tool_call = p.rule("tool-call", "<tool_call>\n" + tool_choice + "</tool_call>" + p.space());
+            auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));
+
+            return reasoning << p.content(p.until("<tool_call>")) << tool_calls;
+        }
+
+        // Content only parser
+        include_grammar = false;
+        return reasoning << p.content(p.rest());
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto schema = function.at("parameters");
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+
+        data.grammar_triggers = {
+            {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_call>"}
+        };
+    }
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;

@@ -2534,6 +2670,10 @@ static common_chat_params common_chat_templates_apply_jinja(
         src.find("<function=") != std::string::npos &&
         src.find("<parameters>") != std::string::npos &&
         src.find("<parameter=") != std::string::npos) {
+        // Nemotron 3 Nano 30B A3B
+        if (src.find("<think>") != std::string::npos) {
+            return common_chat_params_init_nemotron_v3(tmpl, params);
+        }
         return common_chat_params_init_qwen3_coder_xml(tmpl, params);
     }

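Note: the new foreach_parameter helper above walks a tool's JSON-schema "parameters" block and reports each property together with whether it is listed as required. The sketch below shows the same traversal pattern as a standalone program (it assumes nlohmann/json is available and is not the code from common/chat.cpp).

// Standalone sketch: enumerate a tool's schema properties and whether they
// are required, mirroring the traversal done by foreach_parameter.
#include <cstdio>
#include <set>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    json function = json::parse(R"({
        "name": "get_weather",
        "parameters": {
            "type": "object",
            "properties": {
                "city": { "type": "string" },
                "days": { "type": "integer" }
            },
            "required": ["city"]
        }
    })");

    const auto & params = function.at("parameters");
    std::set<std::string> required;
    if (params.contains("required") && params.at("required").is_array()) {
        params.at("required").get_to(required);
    }
    for (const auto & [name, prop] : params.at("properties").items()) {
        const bool is_required = required.count(name) > 0;
        std::printf("%s: type=%s required=%d\n",
            name.c_str(), prop.value("type", "?").c_str(), is_required ? 1 : 0);
    }
    return 0;
}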
@@ -1088,7 +1088,15 @@ struct common_init_result::impl {

 common_init_result::common_init_result(common_params & params) :
     pimpl(new impl{}) {
-    const auto mparams = common_model_params_to_llama(params);
+    auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+
+    if (params.fit_params) {
+        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+    }

     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {

@@ -1103,8 +1111,6 @@ common_init_result::common_init_result(common_params & params) :
     // TODO: fix naming
     common_init_sampler_from_model(model, params.sampling);

-    auto cparams = common_context_params_to_llama(params);
-
     if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
         LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;

@@ -1143,8 +1149,7 @@ common_init_result::common_init_result(common_params & params) :

     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
         return;
     }

@@ -1176,15 +1181,13 @@ common_init_result_ptr common_init_from_params(common_params & params) {

     llama_model * model = res->model();
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
         return res;
     }

     llama_context * lctx = res->context();
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
         return res;
     }

@@ -99,6 +99,7 @@ enum llama_example {
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
     LLAMA_EXAMPLE_FINETUNE,
+    LLAMA_EXAMPLE_FIT_PARAMS,

     LLAMA_EXAMPLE_COUNT,
 };

@@ -306,8 +307,8 @@ struct lr_opt {
 struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

 struct common_params {
-    int32_t n_predict = -1;   // new tokens to predict
-    int32_t n_ctx     = 4096; // context size
+    int32_t n_predict = -1;   // max. number of new tokens to predict, -1 == no limit
+    int32_t n_ctx     = 0;    // context size, 0 == context the model was trained with
     int32_t n_batch   = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch  = 512;  // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep    = 0;    // number of tokens to keep from initial prompt

@@ -328,9 +329,12 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

     int32_t n_gpu_layers = -1;     // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu = 0;          // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    bool fit_params = true;                      // whether to fit unset model/context parameters to free device memory
+    size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096;           // minimum context size to set when trying to reduce memory use

     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

@@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {

 std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }

-class SchemaConverter {
+class common_schema_converter {
 private:
+    friend class common_schema_info;
     friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
     std::function<json(const std::string &)> _fetch_json;
     bool _dotall;

@@ -729,7 +730,7 @@ private:
     }

 public:
-    SchemaConverter(
+    common_schema_converter(
         const std::function<json(const std::string &)> & fetch_json,
         bool dotall)
         : _fetch_json(fetch_json), _dotall(dotall)

@@ -990,6 +991,134 @@ public:
     }
 };

+// common_schema_info implementation (pimpl)
+
+common_schema_info::common_schema_info()
+    : impl_(std::make_unique<common_schema_converter>(
+        [](const std::string &) { return json(); },
+        false)) {}
+
+common_schema_info::~common_schema_info() = default;
+
+common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
+common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;
+
+void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
+    impl_->resolve_refs(schema, "");
+}
+
+// Determines if a JSON schema can resolve to a string type through any path.
+// Some models emit raw string values rather than JSON-encoded strings for string parameters.
+// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
+// true, allowing callers to handle the value as a raw string for simplicity.
+bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
+    std::unordered_set<std::string> visited_refs;
+
+    std::function<bool(const json &)> check = [&](const json & s) -> bool {
+        if (!s.is_object()) {
+            return false;
+        }
+
+        // Handle $ref
+        if (s.contains("$ref")) {
+            const std::string & ref = s["$ref"];
+            if (visited_refs.find(ref) != visited_refs.end()) {
+                // Circular reference, assume not a string to be safe
+                return false;
+            }
+            visited_refs.insert(ref);
+            auto it = impl_->_refs.find(ref);
+            if (it != impl_->_refs.end()) {
+                return check(it->second);
+            }
+            return false;
+        }
+
+        // Check type field
+        if (s.contains("type")) {
+            const json & schema_type = s["type"];
+            if (schema_type.is_string()) {
+                if (schema_type == "string") {
+                    return true;
+                }
+            } else if (schema_type.is_array()) {
+                // Type can be an array like ["string", "null"]
+                for (const auto & t : schema_type) {
+                    if (t == "string") {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        // Check oneOf/anyOf - if any alternative can be a string
+        if (s.contains("oneOf")) {
+            for (const auto & alt : s["oneOf"]) {
+                if (check(alt)) {
+                    return true;
+                }
+            }
+        }
+        if (s.contains("anyOf")) {
+            for (const auto & alt : s["anyOf"]) {
+                if (check(alt)) {
+                    return true;
+                }
+            }
+        }
+
+        // Check allOf - all components must be compatible with string type
+        if (s.contains("allOf")) {
+            bool all_string = true;
+            for (const auto & component : s["allOf"]) {
+                if (!check(component)) {
+                    all_string = false;
+                    break;
+                }
+            }
+            if (all_string) {
+                return true;
+            }
+        }
+
+        // Check const - if the constant value is a string
+        if (s.contains("const")) {
+            if (s["const"].is_string()) {
+                return true;
+            }
+        }
+
+        // Check enum - if any enum value is a string
+        if (s.contains("enum")) {
+            for (const auto & val : s["enum"]) {
+                if (val.is_string()) {
+                    return true;
+                }
+            }
+        }
+
+        // String-specific keywords imply string type
+        if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
+            return true;
+        }
+
+        // Check format - many formats imply string
+        if (s.contains("format")) {
+            const std::string & fmt = s["format"];
+            if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
+                fmt == "uri" || fmt == "email" || fmt == "hostname" ||
+                fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
+                fmt.find("uuid") == 0) {
+                return true;
+            }
+        }
+
+        return false;
+    };
+
+    return check(schema);
+}
+
 std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
 #ifdef LLAMA_USE_LLGUIDANCE
     if (!force_gbnf) {

@@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
 }

 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
-    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
+    common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
     common_grammar_builder builder {
         /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
             return converter._add_rule(name, rule);

|
|
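To make the intent of `resolves_to_string()` concrete, here is a small illustration (not part of the diff) of schemas it would and would not treat as string-resolvable; the expected results follow directly from the checks above:

```cpp
// Illustration only: which schemas the helper above classifies as string-resolvable.
#include <nlohmann/json.hpp>

static void resolves_to_string_examples(common_schema_info & info) {
    using json = nlohmann::ordered_json;

    json nullable_string = json::parse(R"({ "type": ["string", "null"] })");                               // true: "string" appears in the type array
    json any_of_string   = json::parse(R"({ "anyOf": [ { "type": "integer" }, { "type": "string" } ] })"); // true: one anyOf branch is a string
    json uuid_format     = json::parse(R"({ "format": "uuid" })");                                         // true: string-implying format
    json plain_object    = json::parse(R"({ "type": "object" })");                                         // false: no branch permits a string

    bool r1 = info.resolves_to_string(nullable_string);
    bool r2 = info.resolves_to_string(any_of_string);
    bool r3 = info.resolves_to_string(uuid_format);
    bool r4 = info.resolves_to_string(plain_object);
    (void) r1; (void) r2; (void) r3; (void) r4;
}
```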
@@ -3,11 +3,31 @@
 #include <nlohmann/json_fwd.hpp>

 #include <functional>
+#include <memory>
 #include <string>

 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
                                    bool force_gbnf = false);

+class common_schema_converter;
+
+// Probes a JSON schema to extract information about its structure and type constraints.
+class common_schema_info {
+    std::unique_ptr<common_schema_converter> impl_;
+
+  public:
+    common_schema_info();
+    ~common_schema_info();
+
+    common_schema_info(const common_schema_info &) = delete;
+    common_schema_info & operator=(const common_schema_info &) = delete;
+    common_schema_info(common_schema_info &&) noexcept;
+    common_schema_info & operator=(common_schema_info &&) noexcept;
+
+    void resolve_refs(nlohmann::ordered_json & schema);
+    bool resolves_to_string(const nlohmann::ordered_json & schema);
+};
+
 struct common_grammar_builder {
     std::function<std::string(const std::string &, const std::string &)> add_rule;
     std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
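A minimal usage sketch for the new class (not part of the diff); it assumes `resolve_refs()` is the call that populates the internal `$ref` table consulted by `resolves_to_string()`:

```cpp
// Sketch: decide whether a tool-call parameter may be passed through as a raw
// (non-JSON-quoted) string. Assumes the declarations above are visible.
#include <nlohmann/json.hpp>

static bool param_accepts_raw_string(nlohmann::ordered_json schema) {
    common_schema_info info;
    info.resolve_refs(schema);               // resolve/collect $ref targets first (assumed call order)
    return info.resolves_to_string(schema);  // true if any branch of the schema permits a string
}
```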
@@ -425,7 +425,7 @@ struct parser_executor {

         if (result.need_more_input()) {
             // Propagate - need to know what child would match before negating
-            return result;
+            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
         }

         // Child failed, so negation succeeds
@@ -136,19 +136,11 @@ class ModelBase:
         self.remote_hf_model_id = remote_hf_model_id
         self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
         self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
-        self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
         self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py

-        # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
-        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
-            if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None:
-                self.rope_parameters["rope_theta"] = rope_theta
-            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
-                self.rope_parameters["rope_type"] = rope_type
-
         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
             # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
@@ -765,6 +757,15 @@ class TextModel(ModelBase):
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

+        self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
+
+        # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
+        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
+            if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None:
+                self.rope_parameters["rope_theta"] = rope_theta
+            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
+                self.rope_parameters["rope_type"] = rope_type
+
     @classmethod
     def __init_subclass__(cls):
         # can't use an abstract property, because overriding it without type errors
@@ -861,6 +862,14 @@ class TextModel(ModelBase):
             logger.warning(f"Unknown RoPE type: {rope_type}")
         logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}")

+        if "mrope_section" in self.rope_parameters:
+            mrope_section = self.rope_parameters["mrope_section"]
+            # Pad to 4 dimensions [time, height, width, extra]
+            while len(mrope_section) < 4:
+                mrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+            logger.info(f"gguf: mrope sections: {mrope_section[:4]}")
+
         if (rope_theta := rope_params.get("rope_theta")) is not None:
             self.gguf_writer.add_rope_freq_base(rope_theta)
             logger.info(f"gguf: rope theta = {rope_theta}")
@@ -1203,6 +1212,9 @@ class TextModel(ModelBase):
         if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
             # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
             res = "minimax-m2"
+        if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
+            # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
+            res = "kormo"
         if res is None:
             logger.warning("\n")
@@ -3398,7 +3410,7 @@ class QwenModel(TextModel):
         self._set_vocab_qwen()


-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
+@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM")
 class Qwen2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2
@@ -3735,9 +3747,6 @@ class Qwen2VLModel(TextModel):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
-        mrope_section += [0] * max(0, 4 - len(mrope_section))
-        self.gguf_writer.add_rope_dimension_sections(mrope_section)

     def set_vocab(self):
         try:
@@ -4373,6 +4382,30 @@ class Qwen3VLVisionModel(MmprojModel):
         return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration")
+class Glm4VVisionModel(Qwen3VLVisionModel):
+    def set_gguf_parameters(self):
+        MmprojModel.set_gguf_parameters(self)  # skip Qwen3VLVisionModel parameters
+        assert self.hparams_vision is not None
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
+
+        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
+        if hidden_act == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        elif hidden_act == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+
+        rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5)
+        self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):
+            name = name.replace("model.visual.", "visual.")
+        if name.startswith("visual.merger."):
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Qwen3VLForConditionalGeneration")
 class Qwen3VLTextModel(Qwen3Model):
     model_arch = gguf.MODEL_ARCH.QWEN3VL
@@ -4381,20 +4414,6 @@ class Qwen3VLTextModel(Qwen3Model):
         super().set_gguf_parameters()

         # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
-        text_config = self.hparams.get("text_config", {})
-        # rope_scaling is deprecated in V5, use rope_parameters instead
-        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
-
-        if rope_scaling.get("mrope_section"):
-            # mrope_section contains [time, height, width] dimensions
-            mrope_section = rope_scaling["mrope_section"]
-            # Pad to 4 dimensions [time, height, width, extra]
-            while len(mrope_section) < 4:
-                mrope_section.append(0)
-            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
-
-            logger.info(f"MRoPE sections: {mrope_section[:4]}")
-
         vision_config = self.hparams.get("vision_config", {})
         deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
         self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
@@ -4413,22 +4432,6 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()

-        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
-        text_config = self.hparams.get("text_config", {})
-        # rope_scaling is deprecated in V5, use rope_parameters instead
-        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
-
-        if rope_scaling.get("mrope_section"):
-            # mrope_section contains [time, height, width] dimensions
-            mrope_section = rope_scaling["mrope_section"]
-            # Pad to 4 dimensions [time, height, width, extra]
-            while len(mrope_section) < 4:
-                mrope_section.append(0)
-            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
-
-            logger.info(f"MRoPE sections: {mrope_section[:4]}")
-
         vision_config = self.hparams.get("vision_config", {})
         deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
         self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
@@ -7791,6 +7794,15 @@ class JaisModel(TextModel):
 @ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
 class Glm4Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GLM4
+    use_mrope = False
+    partial_rotary_factor = 0.5
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5)
+        if "mrope_section" in self.rope_parameters:
+            self.use_mrope = True
+            logger.info("Q/K weight will need to be permuted for M-RoPE")

     def set_vocab(self):
         from transformers import AutoTokenizer
@@ -7812,17 +7824,49 @@ class Glm4Model(TextModel):
         super().set_gguf_parameters()
         if (rope_dim := self.hparams.get("head_dim")) is None:
             rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor))

+    @staticmethod
+    def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor:
+        orig_shape = weights.shape
+        if len(orig_shape) == 1:
+            weights = weights.unsqueeze(1)  # [out_dim, 1]
+        if len(weights.shape) != 2:
+            raise ValueError("Only 1D and 2D tensors are supported.")
+        n_effective_heads = weights.shape[0] // head_dim
+        if n_head_kv is not None and n_effective_heads != n_head:
+            if n_effective_heads != n_head_kv:
+                raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}")
+        rotary_dim = int(head_dim * partial_rotary_factor)
+        if rotary_dim % 2 != 0:
+            raise ValueError("rotary_dim must be even.")
+        reshaped = weights.reshape(n_effective_heads, head_dim, -1)
+        rot_part = reshaped[:, :rotary_dim, :]
+        non_rot_part = reshaped[:, rotary_dim:, :]
+        permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1)
+        combined = torch.cat((permuted_rot, non_rot_part), dim=1)
+        result = combined.reshape(weights.shape)
+        return result if len(orig_shape) != 1 else result.squeeze(1)
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.startswith("model.visual."):  # ignore visual part of Glm4v
             return []
         elif name.startswith("model.language_model."):
             name = name.replace("language_model.", "")  # for Glm4v
+        if self.use_mrope:
+            n_head = self.hparams["num_attention_heads"]
+            n_kv_head = self.hparams["num_key_value_heads"]
+            n_embd = self.hparams["hidden_size"]
+            head_dim = n_embd // n_head
+            # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor)
         return super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Glm4MoeForCausalLM")
+@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration")
 class Glm4MoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GLM4_MOE
@@ -7889,6 +7933,7 @@ class Glm4MoeModel(TextModel):

     _experts: list[dict[str, Tensor]] | None = None

+    # note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already
     def modify_tensors(
         self, data_torch: Tensor, name: str, bid: int | None
     ) -> Iterable[tuple[str, Tensor]]:
@@ -8486,8 +8531,18 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
 class NemotronHModel(GraniteHybridModel):
     """Hybrid mamba2/attention model from NVIDIA"""
     model_arch = gguf.MODEL_ARCH.NEMOTRON_H
+    is_moe: bool = False

     def __init__(self, *args, **kwargs):
+        # We have to determine the correct model architecture (MoE vs non-MoE) before
+        # calling the parent __init__. This is because the parent constructor
+        # uses self.model_arch to build the tensor name map, and all MoE-specific
+        # mappings would be missed if it were called with the default non-MoE arch.
+        hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
+        if "num_experts_per_tok" in hparams:
+            self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
+            self.is_moe = True
+
         super().__init__(*args, **kwargs)

         # Save the top-level head_dim for later
@@ -8499,9 +8554,11 @@ class NemotronHModel(GraniteHybridModel):

         # Update the ssm / attn / mlp layers
         # M: Mamba2, *: Attention, -: MLP
+        # MoE:
+        # M: Mamba2, *: Attention, E: Expert
         hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
         self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
-        self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "-"]
+        self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]

     def get_attn_layers(self):
         hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
@@ -8517,10 +8574,28 @@ class NemotronHModel(GraniteHybridModel):
         # Set feed_forward_length
         # NOTE: This will trigger an override warning. This is preferrable to
         # duplicating all the parent logic
-        n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
-        self.gguf_writer.add_feed_forward_length([
-            n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
-        ])
+        if not self.is_moe:
+            n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
+            self.gguf_writer.add_feed_forward_length([
+                n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
+            ])
+        else:
+            moe_intermediate_size = self.hparams["moe_intermediate_size"]
+            self.gguf_writer.add_feed_forward_length([
+                moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count)
+            ])
+            self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+            self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
+            self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"])
+            self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
+            self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
+            self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+            self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+            self.gguf_writer.add_expert_group_count(self.hparams["n_group"])
+
+        # number of experts used per token (top-k)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)

     def set_vocab(self):
         super().set_vocab()
@@ -8528,7 +8603,81 @@ class NemotronHModel(GraniteHybridModel):
         # The tokenizer _does_ add a BOS token (via post_processor type
         # TemplateProcessing) but does not set add_bos_token to true in the
         # config, so we need to explicitly override it here.
-        self.gguf_writer.add_add_bos_token(True)
+        if not self.is_moe:
+            self.gguf_writer.add_add_bos_token(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if self.is_moe and bid is not None:
+            if name.endswith("mixer.gate.e_score_correction_bias"):
+                new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+                mapped_name = self.map_tensor_name(new_name)
+                return [(mapped_name, data_torch)]
+
+            if name.endswith("mixer.dt_bias"):
+                new_name = name.replace("dt_bias", "dt.bias")
+                mapped_name = self.map_tensor_name(new_name)
+                return [(mapped_name, data_torch)]
+
+            if name.endswith("mixer.conv1d.weight"):
+                squeezed_data = data_torch.squeeze()
+                mapped_name = self.map_tensor_name(name)
+                return [(mapped_name, squeezed_data)]
+
+            if name.endswith("mixer.A_log"):
+                transformed_data = -torch.exp(data_torch)
+                reshaped_data = transformed_data.squeeze().reshape(-1, 1)
+                mapped_name = self.map_tensor_name(name)
+                return [(mapped_name, reshaped_data)]
+
+            if name.endswith("mixer.D"):
+                reshaped_data = data_torch.squeeze().reshape(-1, 1)
+                mapped_name = self.map_tensor_name(name)
+                return [(mapped_name, reshaped_data)]
+
+            if name.endswith("mixer.norm.weight"):
+                reshaped_data = data_torch.reshape(8, 512)
+                mapped_name = self.map_tensor_name(name)
+                return [(mapped_name, reshaped_data)]
+
+            if name.find("mixer.experts") != -1:
+                n_experts = self.hparams["n_routed_experts"]
+                assert bid is not None
+
+                if self._experts is None:
+                    self._experts = [{} for _ in range(self.block_count)]
+
+                self._experts[bid][name] = data_torch
+
+                if len(self._experts[bid]) >= n_experts * 2:
+                    # merge the experts into a single tensor
+                    tensors: list[tuple[str, Tensor]] = []
+                    for w_name in ["down_proj", "up_proj"]:
+                        datas: list[Tensor] = []
+
+                        for xid in range(n_experts):
+                            ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight"
+                            datas.append(self._experts[bid][ename])
+                            del self._experts[bid][ename]
+
+                        data_torch = torch.stack(datas, dim=0)
+                        merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                        new_name = self.map_tensor_name(merged_name)
+                        tensors.append((new_name, data_torch))
+
+                    return tensors
+                else:
+                    return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+

 @ModelBase.register("BailingMoeForCausalLM")
@@ -143,6 +143,7 @@ models = [
     {"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
     {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
     {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
+    {"name": "kormo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -103,6 +103,8 @@ SYCL backend supports Intel GPU Family:
 - Intel Built-in Arc GPU
 - Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).

+On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md), although performance is not optimal and some GPUs may not support OpenCL or have any GPGPU capability at all.
+
 #### Verified devices

 | Intel GPU | Status | Verified Model |
@@ -9,7 +9,8 @@ Adding a model requires few steps:
 After following these steps, you can open PR.

 Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
-- [main](/tools/main/)
+- [cli](/tools/cli/)
+- [completion](/tools/completion/)
 - [imatrix](/tools/imatrix/)
 - [quantize](/tools/quantize/)
 - [server](/tools/server/)
@@ -7,9 +7,9 @@
 ## Images
 We have three Docker images available for this project:

-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the `llama-cli` and `llama-completion` executables. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the `llama-server` executable. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)

 Additionally, there are the following images, similar to the above:
@@ -44,13 +44,15 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-o
 On completion, you are ready to play!

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run-legacy -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
 ```

 or with a light image:

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models --entrypoint /app/llama-cli ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf
+docker run -v /path/to/models:/models --entrypoint /app/llama-completion ghcr.io/ggml-org/llama.cpp:light -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
 ```

 or with a server image:

@@ -59,6 +61,8 @@ or with a server image:
 docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
 ```

+In the above examples, `--entrypoint /app/llama-cli` is specified for clarity, but you can safely omit it since it's the default entrypoint in the container.
+
 ## Docker With CUDA

 Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@@ -80,9 +84,9 @@ The defaults are:

 The resulting images are essentially the same as the non-CUDA images:

-1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
-3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
+1. `local/llama.cpp:full-cuda`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-cuda`: This image only includes the `llama-cli` and `llama-completion` executables.
+3. `local/llama.cpp:server-cuda`: This image only includes the `llama-server` executable.

 ## Usage
@@ -114,9 +118,9 @@ The defaults are:

 The resulting images are essentially the same as the non-MUSA images:

-1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
-3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
+1. `local/llama.cpp:full-musa`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-musa`: This image only includes the `llama-cli` and `llama-completion` executables.
+3. `local/llama.cpp:server-musa`: This image only includes the `llama-server` executable.

 ## Usage
@@ -10,6 +10,13 @@ and in some cases perplexity checked of the quantized model. And finally the
 model/models need to be uploaded to the ggml-org on Hugging Face. This tool/example tries to
 help with this process.

+> 📝 **Note:** When adding a new model from an existing family, verify that the
+> previous version passes logits verification first. Existing models can have
+> subtle numerical differences that don't affect generation quality but cause
+> logits mismatches. Identifying upfront whether these exist in llama.cpp,
+> the conversion script, or an upstream implementation can save significant
+> debugging time.
+
 ### Overview
 The idea is that the makefile targets and scripts here can be used in the
 development/conversion process assisting with things like:
@@ -34,8 +34,11 @@ done
 MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
 MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"

+CONVERTED_MODEL_PATH="${CONVERTED_EMBEDDING_PATH:-"$CONVERTED_EMBEDDING_MODEL"}"
+CONVERTED_MODEL_NAME="${CONVERTED_MODEL_NAME:-$(basename "$CONVERTED_MODEL_PATH" .gguf)}"
+
 if [ -t 0 ]; then
-    CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
+    CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
 else
     # Process piped JSON data and convert to binary (matching logits.cpp format)
     TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
 GGML_API bool ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,

@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i

 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
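A hedged usage sketch for the new size-query helper (not part of the diff): it measures how large a buffer the tensors of a context would need before committing to the allocation. It relies only on the declarations above plus the existing `ggml_backend_buft_name()`:

```cpp
#include <stdio.h>
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch: report the required size, then perform the real allocation.
static ggml_backend_buffer_t alloc_ctx_tensors_with_report(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    size_t need = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft); // measurement-only pass
    fprintf(stderr, "tensors in this context need %zu bytes on %s\n", need, ggml_backend_buft_name(buft));
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);             // actual allocation
}
```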
@@ -307,6 +307,7 @@ extern "C" {
    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

    // Initialize backend buffers from a measure graph
+   GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

    GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
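A sketch (not from the diff) of how the new measure-only entry point might be combined with the existing `ggml_backend_sched_reserve()`; the fixed-size `sizes` array is an assumption, indexed up to `ggml_backend_sched_get_n_backends()`:

```cpp
#include <stddef.h>
#include "ggml-backend.h"

// Sketch: measure per-backend compute buffer sizes for a worst-case graph and
// only commit the reservation if the total fits a caller-provided budget.
static bool sched_reserve_within_budget(ggml_backend_sched_t sched, struct ggml_cgraph * graph, size_t max_bytes) {
    size_t sizes[16] = {0}; // assumed upper bound on the number of scheduler buffers
    ggml_backend_sched_reserve_size(sched, graph, sizes);

    size_t total = 0;
    for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
        total += sizes[i];
    }
    if (total > max_bytes) {
        return false; // caller can shrink the graph/context or choose a different split
    }
    return ggml_backend_sched_reserve(sched, graph);
}
```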
@@ -2615,7 +2615,8 @@ extern "C" {

    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
+   GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);

    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
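A small sketch (not part of the diff) of the save/restore pattern the new `ggml_log_get()` enables; the quiet callback is hypothetical:

```cpp
#include "ggml.h"

// Hypothetical callback that drops all messages.
static void quiet_logger(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level; (void) text; (void) user_data;
}

// Sketch: run a callback with logging suppressed, then restore the previous sink.
static void run_quietly(void (*body)(void)) {
    ggml_log_callback prev_cb   = NULL;
    void *            prev_data = NULL;
    ggml_log_get(&prev_cb, &prev_data); // save whatever is currently installed
    ggml_log_set(quiet_logger, NULL);
    body();
    ggml_log_set(prev_cb, prev_data);   // restore the previous callback
}
```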
@@ -594,7 +594,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 }

 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
+    return t->data != NULL                 // tensor data already set externally
+        || t->buffer                       // tensor on external buffer (but not yet allocated)
+        || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
 }

 // free the extra space at the end if the new tensor is smaller

@@ -823,7 +825,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }

-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+static bool ggml_gallocr_reserve_n_impl(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;

@@ -928,16 +931,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
             if (cur_size > 0) {
                 GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
-                    __func__, ggml_backend_buft_name(galloc->bufts[i]),
-                    cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+                    __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
             }
         }
 #endif
         ggml_vbuffer_free(galloc->buffers[i]);
-        galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
-        if (galloc->buffers[i] == NULL) {
-            GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-            return false;
+        if (no_alloc) {
+            galloc->buffers[i] = NULL;
+        } else {
+            galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+            if (galloc->buffers[i] == NULL) {
+                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                return false;
+            }
         }
     }
 }

@@ -945,6 +951,21 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     return true;
 }

+void ggml_gallocr_reserve_n_size(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
+    GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        sizes[i] = 0;
+        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+            sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
+        }
+    }
+}
+
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
+}
+
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }

@@ -1147,7 +1168,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     return true;
 }

-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
+        struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
     GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

     size_t alignment = ggml_backend_buft_get_alignment(buft);

@@ -1155,6 +1177,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

     ggml_backend_buffer_t * buffers = NULL;
     size_t n_buffers = 0;
+    *nbytes_total = 0;

     size_t cur_buf_size = 0;
     struct ggml_tensor * first = ggml_get_first_tensor(ctx);

@@ -1166,10 +1189,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

         if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
-            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+            if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
             }
             first = t;
+            *nbytes_total += cur_buf_size;
             cur_buf_size = this_size;
         } else {
             cur_buf_size += this_size;

@@ -1178,15 +1202,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

     // allocate remaining tensors
     if (cur_buf_size > 0) {
-        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+        *nbytes_total += cur_buf_size;
+        if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
             return NULL;
         }
     }

+    if (no_alloc) {
+        return NULL;
+    }
+
     if (n_buffers == 0) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
+        GGML_ASSERT(!buffers);
         return NULL;
     }

@@ -1196,10 +1226,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     } else {
         buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
     }
-    free(buffers);
+    if (buffers) {
+        free(buffers); // can be NULL if context is empty or no_alloc
+    }
     return buffer;
 }

+size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
+    GGML_ASSERT(!buf);
+    return nbytes_total;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
+}
+
 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
     return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
 }
@@ -36,12 +36,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }

 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    GGML_ASSERT(buft);
     if (size == 0) {
         // return a dummy buffer for zero-sized allocations
         return ggml_backend_buffer_init(buft, {}, NULL, 0);
     }

-    GGML_ASSERT(buft);
-
     return buft->iface.alloc_buffer(buft, size);
 }

@@ -128,6 +127,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
         return NULL;
     }

+    // FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
+    // I don't know whether the above comment is correct
+    if (!buffer->iface.get_base) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);

     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");

@@ -1727,6 +1732,20 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched->is_alloc = false;
 }

+void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
+    GGML_ASSERT(sched);
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+    GGML_ASSERT(sizes);
+
+    ggml_backend_sched_reset(sched);
+
+    ggml_backend_sched_synchronize(sched);
+
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
+    ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
+}
+
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     GGML_ASSERT(sched);
     GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
@@ -1976,9 +1976,6 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
             break;

         case GGML_TYPE_F16:
-            if (!opt_experimental) {
-                return false;
-            }
             break;

         default:
@@ -903,7 +903,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
         const float * restrict vy = (const float * restrict) y;
 
         for (uint32_t i = 0; i < n; i++) {
-            rsum += vx[i] * (__fp16) vy[i];
+            rsum += (float)vx[i] * vy[i];
         }
         *s = rsum;
         return;
 
@@ -917,7 +917,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
 
    // for some reason we need volatile here so that the compiler doesn't try anything funky
    volatile HVX_Vector rsum = Q6_V_vsplat_R(0);
+   float r_sum_scalar = 0.0f;
    uint32_t i = 0;
 
    for (i = 0; i < nv0; i++) {
 
@@ -926,31 +926,42 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
        HVX_Vector x = vx[i];
        HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0
 
-       HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-       HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+       //NOTE: need volatile here to prevent compiler optimization
+       // Seem compiler cannot guarantee read-after-write??
+       volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
+       volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
 
        HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo);
        rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
    }
 
    if (nv1) {
-       HVX_VectorPair yp = vy[i];
+       // HVX_VectorPair yp = vy[i];
 
-       HVX_Vector x = vx[i];
-       HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0
+       // HVX_Vector x = vx[i];
+       // HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0
 
-       if (nv1 >= 32) {
-           HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
-           rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
-           nv1 -= 32;
-       }
+       // if (nv1 >= 32) {
+       //     volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
+       //     rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
+       //     nv1 -= 32;
+       // }
 
+       // rsum = hvx_vec_qf32_reduce_sum(rsum);
+
+       // if (nv1) {
+       //     volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
+       //     HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
+       //     rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
+       // }
+
+       //process the remainder using scalar loop
        rsum = hvx_vec_qf32_reduce_sum(rsum);
+       const __fp16 * restrict sx = (const __fp16 * restrict) x;
+       const float * restrict sy = (const float * restrict) y;
 
-       if (nv1) {
-           HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
-           HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
-           rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
-       }
+       for (uint32_t i = nv0 * 64; i < n; i++) {
+           r_sum_scalar += (float) sx[i] * sy[i];
+       }
 
        // hvx_vec_dump_fp16("X", x);
 
@@ -961,7 +972,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
        rsum = hvx_vec_qf32_reduce_sum(rsum);
    }
 
-   *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum));
+   *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum)) + r_sum_scalar;
 
 # ifdef HTP_DEBUG
    {
 
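The hunks above drop the partial-vector HVX path in favour of a plain scalar tail: full vectors are accumulated in rsum, and the leftover elements go into r_sum_scalar, which is added after the final reduction. A stripped-down sketch of that pattern in plain C (no HVX intrinsics; requires a compiler with __fp16 support; the helper name is illustrative, and n_full corresponds to nv0 * 64 in the hunk):

    // Illustrative only: "vector body + scalar tail" accumulation, mirroring
    // *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum)) + r_sum_scalar;
    static float dot_f16_f32_tail(const __fp16 * x, const float * y, uint32_t n, uint32_t n_full, float vec_sum) {
        float r_sum_scalar = 0.0f;
        for (uint32_t i = n_full; i < n; i++) {   // elements that did not fill a whole vector
            r_sum_scalar += (float) x[i] * y[i];
        }
        return vec_sum + r_sum_scalar;
    }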
@@ -1498,9 +1509,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();
 
-   const size_t src0_row_size = sizeof(__fp16) * ne00;
-   const size_t src1_row_size = sizeof(float) * ne10;
-
    assert(ne12 % ne02 == 0);
    assert(ne13 % ne03 == 0);
 
@@ -1510,8 +1518,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
    // This is the size of the rest of the dimensions of the result
    const uint32_t nr1 = ne1 * ne2 * ne3;
 
-   uint32_t chunk_size = 64;
-
    // distribute the thread work across the inner or outer loop based on which one is larger
    uint32_t nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
    uint32_t nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
 
@@ -1544,11 +1550,11 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
    const uint32_t blck_0 = 64;
    const uint32_t blck_1 = 64;
 
-   float tmp[32];
+   __attribute__((aligned(128))) float tmp[64];
 
    for (uint32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
        for (uint32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
-           for (uint32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1++) {
+           for (uint32_t ir1 = iir1; ir1 < MIN(iir1 + blck_1, ir1_end); ir1++) {
                const uint32_t i13 = (ir1 / (ne12 * ne1));
                const uint32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
                const uint32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
 
@@ -1561,13 +1567,16 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
                const uint32_t i2 = i12;
                const uint32_t i3 = i13;
 
-               const uint8_t * restrict src0_row = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
+               const uint8_t * restrict src0_base = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
                const uint8_t * restrict src1_col =
-                   (const uint8_t *) src1->data + (i11 + i12 * ne11 + i13 * ne12 * ne11) * src1_row_size;
+                   (const uint8_t *) src1->data + (i11 * nb11 + i12 * nb12 + i13 * nb13);
                float * dst_col = (float *) ((uint8_t * restrict) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
 
-               for (uint32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0++) {
-                   vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row + ir0 * src0_row_size, src1_col);
+               const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end);
+               for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) {
+                   // Use nb01 stride for non-contiguous src0 support
+                   const uint8_t * restrict src0_row = src0_base + ir0 * nb01;
+                   vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row, src1_col);
                }
 
                hvx_copy_fp32_ua((uint8_t *) &dst_col[iir0], (uint8_t *) tmp, MIN(iir0 + blck_0, ir0_end) - iir0);
 
@@ -769,9 +769,16 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 #endif
 
    dev->props.use_shared_buffers = dev->props.has_unified_memory;
+#if TARGET_OS_OSX
+   // In case of eGPU, shared memory may be preferable.
+   dev->props.use_shared_buffers |= [dev->mtl_device location] == MTLDeviceLocationExternal;
+#endif
    if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
        dev->props.use_shared_buffers = false;
    }
+   if (getenv("GGML_METAL_SHARED_BUFFERS_ENABLE") != NULL) {
+       dev->props.use_shared_buffers = true;
+   }
 
    dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
@@ -7566,6 +7566,11 @@ size_t ggml_quantize_chunk(
 
 ////////////////////////////////////////////////////////////////////////////////
 
+void ggml_log_get(ggml_log_callback * log_callback, void ** user_data) {
+    *log_callback = g_logger_state.log_callback;
+    *user_data    = g_logger_state.log_callback_user_data;
+}
+
 void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
     g_logger_state.log_callback_user_data = user_data;
 
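ggml_log_get() makes it possible to save the current logger and restore it later, for example around a noisy call. A small sketch; quiet_logger is a user-provided callback, and since the logger state is global this is not thread safe:

    static void quiet_logger(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level; (void) text; (void) user_data;   // drop all output
    }

    // save, override, restore
    ggml_log_callback prev_cb = NULL;
    void * prev_ud = NULL;
    ggml_log_get(&prev_cb, &prev_ud);
    ggml_log_set(quiet_logger, NULL);
    // ... noisy work ...
    ggml_log_set(prev_cb, prev_ud);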
@@ -413,6 +413,7 @@ class MODEL_ARCH(IntEnum):
     JAIS = auto()
     NEMOTRON = auto()
     NEMOTRON_H = auto()
+    NEMOTRON_H_MOE = auto()
     EXAONE = auto()
     EXAONE4 = auto()
     GRANITE = auto()
 
@@ -642,6 +643,7 @@ class MODEL_TENSOR(IntEnum):
     V_MMPROJ_PEG = auto()
     V_ENC_EMBD_CLS = auto()
     V_ENC_EMBD_PATCH = auto()
+    V_ENC_EMBD_NORM = auto()
     V_ENC_EMBD_POS = auto()
     V_ENC_INPUT_NORM = auto()
     V_ENC_ATTN_QKV = auto()
 
@@ -660,6 +662,7 @@ class MODEL_TENSOR(IntEnum):
     V_LAYER_SCALE_2 = auto()
     V_PRE_NORM = auto()
     V_POST_NORM = auto()
+    V_MM_POST_NORM = auto()
     V_MM_INP_NORM = auto()
     V_MM_INP_PROJ = auto() # gemma3
     V_MM_SOFT_EMB_NORM = auto() # gemma3
 
@@ -786,6 +789,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.JAIS: "jais",
     MODEL_ARCH.NEMOTRON: "nemotron",
     MODEL_ARCH.NEMOTRON_H: "nemotron_h",
+    MODEL_ARCH.NEMOTRON_H_MOE: "nemotron_h_moe",
     MODEL_ARCH.EXAONE: "exaone",
     MODEL_ARCH.EXAONE4: "exaone4",
     MODEL_ARCH.GRANITE: "granite",
 
@@ -1014,6 +1018,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}",
     MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
     MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
+    MODEL_TENSOR.V_ENC_EMBD_NORM: "v.norm_embd",
     MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
     MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv",
     MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
 
@@ -1032,6 +1037,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
     MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
     MODEL_TENSOR.V_POST_NORM: "v.post_ln",
+    MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
     MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
     MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm",
     MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm",
 
@@ -1092,6 +1098,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_MMPROJ_PEG,
         MODEL_TENSOR.V_ENC_EMBD_CLS,
         MODEL_TENSOR.V_ENC_EMBD_PATCH,
+        MODEL_TENSOR.V_ENC_EMBD_NORM,
         MODEL_TENSOR.V_ENC_EMBD_POS,
         MODEL_TENSOR.V_ENC_INPUT_NORM,
         MODEL_TENSOR.V_ENC_ATTN_QKV,
 
@@ -1110,6 +1117,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_LAYER_SCALE_2,
         MODEL_TENSOR.V_PRE_NORM,
         MODEL_TENSOR.V_POST_NORM,
+        MODEL_TENSOR.V_MM_POST_NORM,
         MODEL_TENSOR.V_MM_INP_PROJ,
         MODEL_TENSOR.V_MM_INP_NORM,
         MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
 
@@ -2529,6 +2537,33 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.NEMOTRON_H_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.SSM_IN,
+        MODEL_TENSOR.SSM_CONV1D,
+        MODEL_TENSOR.SSM_DT,
+        MODEL_TENSOR.SSM_A,
+        MODEL_TENSOR.SSM_D,
+        MODEL_TENSOR.SSM_NORM,
+        MODEL_TENSOR.SSM_OUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        # experts
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        # shared expert
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+    ],
     MODEL_ARCH.EXAONE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
 
@@ -3328,6 +3363,7 @@ class VisionProjectorType:
     LIGHTONOCR = "lightonocr"
     COGVLM = "cogvlm"
     JANUS_PRO = "janus_pro"
+    GLM4V = "glm4v"
 
 
 # Items here are (block size, type size)
 
@@ -154,7 +154,8 @@ class TensorNameMap:
            "model.layers.{bid}.operator_norm", # lfm2
            "model.transformer.blocks.{bid}.attn_norm", # llada
            "layers.{bid}.input_layernorm", # qwen3-embedding
-           "model.layers.{bid}.attention_layernorm" # apertus
+           "model.layers.{bid}.attention_layernorm", # apertus
+           "model.layers.{bid}.pre_attention_layernorm", # kormo
        ),
 
        # Attention norm 2
 
@@ -342,6 +343,7 @@ class TensorNameMap:
            "model.transformer.blocks.{bid}.ff_norm", # llada
            "layers.{bid}.post_attention_layernorm", # qwen3-embedding
            "model.layers.{bid}.feedforward_layernorm", # apertus
+           "model.layers.{bid}.pre_mlp_layernorm", # kormo
        ),
 
        # Pre feed-forward norm
 
@@ -377,6 +379,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.gate", # lfm2moe
            "model.layers.{bid}.mlp.router.gate", # afmoe
            "layers.{bid}.gate", # mistral-large
+           "backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
        ),
 
        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
 
@@ -390,6 +393,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.expert_bias", # afmoe
            "model.layers.{bid}.feed_forward.expert_bias", # lfm2moe
            "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
+           "backbone.layers.{bid}.mixer.gate.e_score_correction" # nemotron-h-moe
        ),
 
        # Feed-forward up
 
@@ -438,7 +442,7 @@ class TensorNameMap:
            "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
            "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
            "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
-           "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe
+           "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe, nemotron-h-moe (merged)
            "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
            "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
            "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
 
@@ -452,6 +456,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.down_proj",
            "model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
            "layers.{bid}.shared_experts.w3", # mistral-large
+           "backbone.layers.{bid}.mixer.shared_experts.up_proj", # nemotron-h-moe
        ),
 
        MODEL_TENSOR.FFN_UP_CHEXP: (
 
@@ -546,7 +551,7 @@ class TensorNameMap:
            "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
            "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
            "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
-           "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe
+           "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe nemotron-h-moe (merged)
            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
            "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
            "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
 
@@ -561,6 +566,7 @@ class TensorNameMap:
            "model.layers.{bid}.shared_mlp.output_linear", # granitemoe
            "model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan
            "layers.{bid}.shared_experts.w2", # mistral-large
+           "backbone.layers.{bid}.mixer.shared_experts.down_proj", # nemotron-h-moe
        ),
 
        MODEL_TENSOR.FFN_DOWN_CHEXP: (
 
@@ -704,6 +710,7 @@ class TensorNameMap:
            "model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
            "model.layers.layers.{bid}.mixer.dt_proj", # plamo2
            "model.layers.{bid}.linear_attn.dt_proj", # qwen3next
+           "backbone.layers.{bid}.mixer.dt", # nemotron-h-moe
        ),
 
        MODEL_TENSOR.SSM_DT_NORM: (
 
@@ -1205,6 +1212,7 @@ class TensorNameMap:
        MODEL_TENSOR.V_MMPROJ_FC: (
            "model.connector.modality_projection.proj", # SmolVLM
            "model.vision.linear_proj.linear_proj", # cogvlm
+           "visual.merger.proj", # glm4v
        ),
 
        MODEL_TENSOR.V_MMPROJ_MLP: (
 
@@ -1238,6 +1246,10 @@ class TensorNameMap:
            "model.vision.patch_embedding.proj", # cogvlm
        ),
 
+       MODEL_TENSOR.V_ENC_EMBD_NORM: (
+           "visual.post_conv_layernorm", # glm4v
+       ),
+
        MODEL_TENSOR.V_ENC_EMBD_POS: (
            "vision_tower.vision_model.embeddings.position_embedding",
            "model.vision_tower.embeddings.position_embeddings", # Intern-S1
 
@@ -1247,6 +1259,7 @@ class TensorNameMap:
            "vision_tower.patch_embed.pos_emb", # kimi-vl
            "visual.pos_embed", # qwen3vl
            "model.vision.patch_embedding.position_embedding", # cogvlm
+           "visual.embeddings.position_embedding", # glm4v
        ),
 
        MODEL_TENSOR.V_ENC_ATTN_QKV: (
 
@@ -1402,6 +1415,11 @@ class TensorNameMap:
            "vision_model.layernorm_post", # llama4
            "visual.merger.ln_q", # qwen2vl
            "vision_tower.encoder.final_layernorm", # kimi-vl
+           "visual.post_layernorm", # glm4v
+       ),
+
+       MODEL_TENSOR.V_MM_POST_NORM: (
+           "visual.merger.post_projection_norm", # glm4v
        ),
 
        MODEL_TENSOR.V_MM_INP_PROJ: (
 
@@ -1471,6 +1489,7 @@ class TensorNameMap:
        MODEL_TENSOR.V_MM_PATCH_MERGER: (
            "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf
            "patch_merger.merging_layer", # mistral
+           "visual.downsample", # glm4v
        ),
 
        MODEL_TENSOR.V_DS_NORM: (
 
@@ -1491,14 +1510,17 @@ class TensorNameMap:
 
        MODEL_TENSOR.V_MM_UP: (
            "model.vision.linear_proj.dense_h_to_4h", # cogvlm
+           "visual.merger.up_proj", # glm4v
        ),
 
        MODEL_TENSOR.V_MM_DOWN: (
            "model.vision.linear_proj.dense_4h_to_h", # cogvlm
+           "visual.merger.down_proj", # glm4v
        ),
 
        MODEL_TENSOR.V_MM_GATE: (
            "model.vision.linear_proj.gate_proj", # cogvlm
+           "visual.merger.gate_proj", # glm4v
        ),
 
        MODEL_TENSOR.V_TOK_BOI: (
 
@@ -1,6 +1,6 @@
 # GBNF Guide
 
-GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `tools/main` and `tools/server`.
+GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `tools/cli`, `tools/completion` and `tools/server`.
 
 ## Background
 
@@ -135,7 +135,7 @@ While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) ma
 You can use GBNF grammars:
 
 - In [llama-server](../tools/server)'s completion endpoints, passed as the `grammar` body field
-- In [llama-cli](../tools/main), passed as the `--grammar` & `--grammar-file` flags
+- In [llama-cli](../tools/cli) and [llama-completion](../tools/completion), passed as the `--grammar` & `--grammar-file` flags
 - With [test-gbnf-validator](../tests/test-gbnf-validator.cpp), to test them against strings.
 
 ## JSON Schemas → GBNF
 
@@ -145,7 +145,7 @@ You can use GBNF grammars:
 - In [llama-server](../tools/server):
     - For any completion endpoints, passed as the `json_schema` body field
     - For the `/chat/completions` endpoint, passed inside the `response_format` body field (e.g. `{"type", "json_object", "schema": {"items": {}}}` or `{ type: "json_schema", json_schema: {"schema": ...} }`)
-- In [llama-cli](../tools/main), passed as the `--json` / `-j` flag
+- In [llama-cli](../tools/cli) and [llama-completion](../tools/completion), passed as the `--json` / `-j` flag
 - To convert to a grammar ahead of time:
     - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
     - in JavaScript with [json-schema-to-grammar.mjs](../tools/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../tools/server)'s Web UI)
 
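Besides the CLI flags and server fields listed above, grammars can also be applied programmatically through the sampler API. The sketch below assumes a llama.h entry point llama_sampler_init_grammar(vocab, grammar_str, root) and a sampler chain created with llama_sampler_chain_init(); neither is part of this diff, so treat the exact signatures as assumptions:

    // Sketch (assumed API): constrain sampling with a GBNF grammar held in `grammar_str`.
    const llama_vocab * vocab = llama_model_get_vocab(model);                         // `model` assumed loaded
    llama_sampler * grammar_smpl = llama_sampler_init_grammar(vocab, grammar_str, "root");
    llama_sampler_chain_add(chain, grammar_smpl);                                     // `chain` created elsewhere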
@@ -313,6 +313,7 @@ extern "C" {
        bool check_tensors;   // validate model tensor data
        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
        bool no_host;         // bypass host buffer allowing extra buffers to be used
+       bool no_alloc;        // only load metadata and simulate memory allocations
    };
 
    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
 
@@ -466,10 +467,24 @@ extern "C" {
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);
 
+   // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+   // returns true if the parameters could be successfully modified to fit device memory
+   // this function is NOT thread safe because it modifies the global llama logger state
+   LLAMA_API bool llama_params_fit(
+           const char * path_model,
+           struct llama_model_params * mparams,
+           struct llama_context_params * cparams,
+           float * tensor_split,                                            // writable buffer for tensor split, needs at least llama_max_devices elements
+           struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+           size_t margin,                                                   // margin of memory to leave per device in bytes
+           uint32_t n_ctx_min,                                              // minimum context size to set when trying to reduce memory use
+           enum ggml_log_level log_level);                                  // minimum log level to print during fitting, lower levels go to debug log
+
    LLAMA_API int64_t llama_time_us(void);
 
    LLAMA_API size_t llama_max_devices(void);
    LLAMA_API size_t llama_max_parallel_sequences(void);
+   LLAMA_API size_t llama_max_tensor_buft_overrides(void);
 
    LLAMA_API bool llama_supports_mmap (void);
    LLAMA_API bool llama_supports_mlock (void);
 
@@ -1354,7 +1369,9 @@ extern "C" {
 
    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
-   LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
+   // The logger state is global so these functions are NOT thread safe.
+   LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+   LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
    //
    // Performance utils
 
@@ -0,0 +1,204 @@
+{% macro render_extra_keys(json_dict, handled_keys) %}
+{%- if json_dict is mapping %}
+{%- for json_key in json_dict if json_key not in handled_keys %}
+{%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}
+{{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}
+{%- else %}
+{{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}
+{%- endif %}
+{%- endfor %}
+{%- endif %}
+{% endmacro %}
+{%- set enable_thinking = enable_thinking if enable_thinking is defined else True %}
+{%- set truncate_history_thinking = truncate_history_thinking if truncate_history_thinking is defined else True %}
+
+{%- set ns = namespace(last_user_idx = -1) %}
+{%- set loop_messages = messages %}
+{%- for m in loop_messages %}
+{%- if m["role"] == "user" %}
+{%- set ns.last_user_idx = loop.index0 %}
+{%- endif %}
+{%- endfor %}
+
+{%- if messages[0]["role"] == "system" %}
+{%- set system_message = messages[0]["content"] %}
+{%- set loop_messages = messages[1:] %}
+{%- else %}
+{%- set system_message = "" %}
+{%- set loop_messages = messages %}
+{%- endif %}
+{%- if not tools is defined %}
+{%- set tools = [] %}
+{%- endif %}
+{# Recompute last_user_idx relative to loop_messages after handling system #}
+{%- set ns = namespace(last_user_idx = -1) %}
+{%- for m in loop_messages %}
+{%- if m["role"] == "user" %}
+{%- set ns.last_user_idx = loop.index0 %}
+{%- endif %}
+{%- endfor %}
+{%- if system_message is defined %}
+{{- "<|im_start|>system\n" + system_message }}
+{%- else %}
+{%- if tools is iterable and tools | length > 0 %}
+{{- "<|im_start|>system\n" }}
+{%- endif %}
+{%- endif %}
+{%- if tools is iterable and tools | length > 0 %}
+{%- if system_message is defined and system_message | length > 0 %}
+{{- "\n\n" }}
+{%- endif %}
+{{- "# Tools\n\nYou have access to the following functions:\n\n" }}
+{{- "<tools>" }}
+{%- for tool in tools %}
+{%- if tool.function is defined %}
+{%- set tool = tool.function %}
+{%- endif %}
+{{- "\n<function>\n<name>" ~ tool.name ~ "</name>" }}
+{%- if tool.description is defined %}
+{{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }}
+{%- endif %}
+{{- '\n<parameters>' }}
+{%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}
+{%- for param_name, param_fields in tool.parameters.properties|items %}
+{{- '\n<parameter>' }}
+{{- '\n<name>' ~ param_name ~ '</name>' }}
+{%- if param_fields.type is defined %}
+{{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }}
+{%- endif %}
+{%- if param_fields.description is defined %}
+{{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }}
+{%- endif %}
+{%- if param_fields.enum is defined %}
+{{- '\n<enum>' ~ (param_fields.enum | tojson | safe) ~ '</enum>' }}
+{%- endif %}
+{%- set handled_keys = ['name', 'type', 'description', 'enum'] %}
+{{- render_extra_keys(param_fields, handled_keys) }}
+{{- '\n</parameter>' }}
+{%- endfor %}
+{%- endif %}
+{% set handled_keys = ['type', 'properties', 'required'] %}
+{{- render_extra_keys(tool.parameters, handled_keys) }}
+{%- if tool.parameters is defined and tool.parameters.required is defined %}
+{{- '\n<required>' ~ (tool.parameters.required | tojson | safe) ~ '</required>' }}
+{%- endif %}
+{{- '\n</parameters>' }}
+{%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}
+{{- render_extra_keys(tool, handled_keys) }}
+{{- '\n</function>' }}
+{%- endfor %}
+{{- "\n</tools>" }}
+
+{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+{%- endif %}
+
+
+{%- if system_message is defined %}
+{{- '<|im_end|>\n' }}
+{%- else %}
+{%- if tools is iterable and tools | length > 0 %}
+{{- '<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+
+{%- for message in loop_messages %}
+{%- if message.role == "assistant" %}
+{# Add reasoning content in to content field for unified processing below. #}
+{%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %}
+{%- set content = "<think>\n" ~ message.reasoning_content ~ "\n</think>\n" ~ (message.content | default('', true)) %}
+{%- else %}
+{%- set content = message.content | default('', true) %}
+{%- if content is string -%}
+{# Allow downstream logic to to take care of broken thought, only handle coherent reasoning here. #}
+{%- if '<think>' not in content and '</think>' not in content -%}
+{%- set content = "<think></think>" ~ content -%}
+{%- endif -%}
+{%- else -%}
+{%- set content = content -%}
+{%- endif -%}
+{%- endif %}
+{%- if message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}
+{# Assistant message has tool calls. #}
+{{- '<|im_start|>assistant\n' }}
+{%- set include_content = not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %}
+{%- if content is string and content | trim | length > 0 %}
+{%- if include_content %}
+{{- (content | trim) ~ '\n' -}}
+{%- else %}
+{%- set c = (content | string) %}
+{%- if '</think>' in c %}
+{# Keep only content after the last closing think. Also generation prompt causes this. #}
+{%- set c = c.split('</think>')[-1] %}
+{%- elif '<think>' in c %}
+{# If <think> was opened but never closed, drop the trailing think segment #}
+{%- set c = c.split('<think>')[0] %}
+{%- endif %}
+{%- set c = "<think></think>" ~ c | trim %}
+{%- if c | length > 0 %}
+{{- c ~ '\n' -}}
+{%- endif %}
+{%- endif %}
+{%- else %}
+{{- "<think></think>" -}}
+{%- endif %}
+{%- for tool_call in message.tool_calls %}
+{%- if tool_call.function is defined %}
+{%- set tool_call = tool_call.function %}
+{%- endif %}
+{{- '<tool_call>\n<function=' ~ tool_call.name ~ '>\n' -}}
+{%- if tool_call.arguments is defined %}
+{%- for args_name, args_value in tool_call.arguments|items %}
+{{- '<parameter=' ~ args_name ~ '>\n' -}}
+{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+{{- args_value ~ '\n</parameter>\n' -}}
+{%- endfor %}
+{%- endif %}
+{{- '</function>\n</tool_call>\n' -}}
+{%- endfor %}
+{{- '<|im_end|>\n' }}
+{%- else %}
+{# Assistant message doesn't have tool calls. #}
+{%- if not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %}
+{{- '<|im_start|>assistant\n' ~ (content | default('', true) | string | trim) ~ '<|im_end|>\n' }}
+{%- else %}
+{%- set c = (content | default('', true) | string) %}
+{%- if '<think>' in c and '</think>' in c %}
+{%- set c = "<think></think>" ~ c.split('</think>')[-1] %}
+{%- endif %}
+{%- set c = c | trim %}
+{%- if c | length > 0 %}
+{{- '<|im_start|>assistant\n' ~ c ~ '<|im_end|>\n' }}
+{%- else %}
+{{- '<|im_start|>assistant\n<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+{%- endif %}
+{%- elif message.role == "user" or message.role == "system" %}
+{{- '<|im_start|>' + message.role + '\n' }}
+{%- set content = message.content | string %}
+{{- content }}
+{{- '<|im_end|>\n' }}
+{%- elif message.role == "tool" %}
+{%- if loop.previtem and loop.previtem.role != "tool" %}
+{{- '<|im_start|>user\n' }}
+{%- endif %}
+{{- '<tool_response>\n' }}
+{{- message.content }}
+{{- '\n</tool_response>\n' }}
+{%- if not loop.last and loop.nextitem.role != "tool" %}
+{{- '<|im_end|>\n' }}
+{%- elif loop.last %}
+{{- '<|im_end|>\n' }}
+{%- endif %}
+{%- else %}
+{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }}
+{%- endif %}
+{%- endfor %}
+
+{%- if add_generation_prompt %}
+{%- if enable_thinking %}
+{{- '<|im_start|>assistant\n<think>\n' }}
+{%- else %}
+{{- '<|im_start|>assistant\n<think></think>' }}
+{%- endif %}
+{%- endif %}
 
@@ -0,0 +1,65 @@
+#!/bin/sh
+#
+
+# Basedir on device
+basedir=/data/local/tmp/llama.cpp
+
+cli_opts=
+
+branch=.
+[ "$B" != "" ] && branch=$B
+
+adbserial=
+[ "$S" != "" ] && adbserial="-s $S"
+
+model="gemma-3-4b-it-Q4_0.gguf"
+[ "$M" != "" ] && model="$M"
+
+mmproj="mmproj-F16.gguf"
+[ "$MMPROJ" != "" ] && mmproj="$MMPROJ"
+
+image=
+[ "$IMG" != "" ] && image="$IMG"
+
+device="HTP0"
+[ "$D" != "" ] && device="$D"
+
+verbose=
+[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V"
+
+experimental="GGML_HEXAGON_EXPERIMENTAL=1"
+[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
+
+sched=
+[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
+
+profile=
+[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
+
+opmask=
+[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
+
+nhvx=
+[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
+
+ndev=
+[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"
+
+# MTMD backend device for vision model (defaults to CPU if not set)
+mtmd_backend=
+[ "$MTMD_DEVICE" != "" ] && mtmd_backend="MTMD_BACKEND_DEVICE=$MTMD_DEVICE"
+
+set -x
+
+adb $adbserial shell " \
+   cd $basedir; ulimit -c unlimited; \
+   LD_LIBRARY_PATH=$basedir/$branch/lib \
+   ADSP_LIBRARY_PATH=$basedir/$branch/lib \
+   $verbose $experimental $sched $opmask $profile $nhvx $ndev $mtmd_backend \
+   ./$branch/bin/llama-mtmd-cli --no-mmap -m $basedir/../gguf/$model \
+      --mmproj $basedir/../gguf/$mmproj \
+      --image $basedir/../gguf/$image \
+      --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
+      --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
+      -ngl 99 --device $device -v $cli_opts $@ \
+"
 
@@ -76,6 +76,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_JAIS, "jais" },
    { LLM_ARCH_NEMOTRON, "nemotron" },
    { LLM_ARCH_NEMOTRON_H, "nemotron_h" },
+   { LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
    { LLM_ARCH_EXAONE, "exaone" },
    { LLM_ARCH_EXAONE4, "exaone4" },
    { LLM_ARCH_RWKV6, "rwkv6" },
 
@@ -2422,6 +2423,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_NEMOTRON_H:
+       case LLM_ARCH_NEMOTRON_H_MOE:
        case LLM_ARCH_QWEN3NEXT:
            return true;
        default:
 
@@ -80,6 +80,7 @@ enum llm_arch {
    LLM_ARCH_JAIS,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_NEMOTRON_H,
+   LLM_ARCH_NEMOTRON_H_MOE,
    LLM_ARCH_EXAONE,
    LLM_ARCH_EXAONE4,
    LLM_ARCH_RWKV6,
 
@@ -258,6 +258,7 @@ llama_context::llama_context(
 
    backend_buft.clear();
    backend_ptrs.clear();
+   backend_buf_exp_size.clear();
 
    for (auto & backend : backends) {
        auto * buft = ggml_backend_get_default_buffer_type(backend.get());
 
@@ -274,6 +275,7 @@ llama_context::llama_context(
 
        backend_buft.push_back(buft);
        backend_ptrs.push_back(backend.get());
+       backend_buf_exp_size.push_back(0);
    }
 
    LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
 
@@ -389,7 +391,8 @@ llama_context::llama_context(
 
        // reserve pp (prompt processing) graph first so that buffers are only allocated once
        {
-           auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+           auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+               model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
            if (!gf) {
                if (pipeline_parallel) {
                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
 
@@ -407,7 +410,7 @@ llama_context::llama_context(
 
        // reserve with tg (token generation) graph to get the number of splits and nodes
        {
-           auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
+           auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
            if (!gf) {
                throw std::runtime_error("failed to allocate compute tg buffers");
            }
 
@@ -422,7 +425,7 @@ llama_context::llama_context(
            //
            // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
            //
-           auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+           auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
            if (!gf) {
                throw std::runtime_error("failed to allocate compute pp buffers");
            }
 
@@ -431,11 +434,13 @@ llama_context::llama_context(
        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
            ggml_backend_t backend = backend_ptrs[i];
            ggml_backend_buffer_type_t buft = backend_buft[i];
-           size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
-           if (size > 1) {
+           if (!model.hparams.no_alloc) {
+               backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+           }
+           if (backend_buf_exp_size[i] > 1) {
                LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                    ggml_backend_buft_name(buft),
-                   size / 1024.0 / 1024.0);
+                   backend_buf_exp_size[i] / 1024.0 / 1024.0);
            }
        }
 
@@ -454,6 +459,23 @@ llama_context::llama_context(
 }
 
 llama_context::~llama_context() {
+   // FIXME this currently results in a use-after-free bug if the model is freed before the context
+   // if (!model.hparams.no_alloc) {
+   //     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+   //         ggml_backend_t backend = backend_ptrs[i];
+   //         ggml_backend_buffer_type_t buft = backend_buft[i];
+
+   //         const size_t size_exp = backend_buf_exp_size[i];
+   //         const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+   //         if (size_exp == size_act) {
+   //             LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+   //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+   //         } else {
+   //             LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+   //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+   //         }
+   //     }
+   // }
    ggml_opt_free(opt_ctx);
 }
 
@@ -1428,7 +1450,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
    return static_cast<llm_graph_result *>(gf_res_reserve.get());
 }
 
-ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
+ggml_cgraph * llama_context::graph_reserve(
+       uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
    LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
    GGML_ASSERT(n_outputs >= 1);
 
@@ -1465,8 +1488,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
 
    // initialize scheduler with the specified graph
    if (split_only) {
-       ggml_backend_sched_split_graph(sched.get(), gf);
+       if (sizes) {
+           ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
+       } else {
+           ggml_backend_sched_split_graph(sched.get(), gf);
+       }
    } else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+       GGML_ASSERT(!sizes);
        LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
        return nullptr;
    }
 
@@ -2088,15 +2116,26 @@ void llama_context::perf_reset() {
 
 std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
-   for (const auto & buft_size : model.memory_breakdown()) {
-       ret[buft_size.first].model += buft_size.second;
+   for (const auto & [buft, size] : model.memory_breakdown()) {
+       ret[buft].model += size;
    }
-   for (const auto & buft_size : memory->memory_breakdown()) {
-       ret[buft_size.first].context += buft_size.second;
+   if (memory) {
+       for (const auto & [buft, size] : memory->memory_breakdown()) {
+           ret[buft].context += size;
+       }
    }
-   for (const auto & backend_ptr : backends) {
-       ggml_backend_t backend = backend_ptr.get();
-       ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+   if (model.hparams.no_alloc) {
+       for (size_t i = 0; i < backends.size(); ++i) {
+           ggml_backend_t backend = backends[i].get();
+           ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+           ret[buft].compute += backend_buf_exp_size[i];
+       }
+   } else {
+       for (const auto & backend_ptr : backends) {
+           ggml_backend_t backend = backend_ptr.get();
+           ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+           ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+       }
    }
    return ret;
 }
 
@@ -26,6 +26,10 @@ struct llama_memory_breakdown_data {
    size_t model = 0;   // memory allocated for the model
    size_t context = 0; // memory allocated for the context
    size_t compute = 0; // memory allocated for temporary compute buffers
+
+   size_t total() const {
+       return model + context + compute;
+   }
 };
 
 struct llama_context {
 
@ -206,7 +210,8 @@ public:
ggml_status graph_compute(ggml_cgraph * gf, bool batched);

// reserve a graph with a dummy ubatch of the specified size
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
ggml_cgraph * graph_reserve(
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);

private:
llm_graph_params graph_params(
@ -281,9 +286,10 @@ private:
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;

// buffer types used for the compute buffer of each backend
// pointers and buffer types used for the compute buffer of each backend
std::vector<ggml_backend_t> backend_ptrs;
std::vector<ggml_backend_buffer_type_t> backend_buft;
std::vector<size_t> backend_buf_exp_size; // expected buffer sizes

llm_graph_result_ptr gf_res_prev;
llm_graph_result_ptr gf_res_reserve;
@ -254,6 +254,24 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
}
}

bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);

this->mctx = mctx;

bool res = true;

res &= s_copy->ne[0] == mctx->get_n_rs();

res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;

res &= head == mctx->get_head();
res &= rs_z == mctx->get_rs_z();

return res;
}

void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
GGML_UNUSED(ubatch);
@ -461,8 +479,46 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}

void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
inp_attn->set_input(ubatch);
mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
inp_rs->set_input(ubatch);
mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);

mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);

const int64_t n_rs = mctx->get_recr()->get_n_rs();

if (inp_rs->s_copy) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;

// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for (uint32_t i = 0; i < n_rs; ++i) {
data[i] = mctx->get_recr()->s_copy(i);
}
}
}

bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);

this->mctx = mctx;

bool res = true;

res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;

res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();

res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;

res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();

return res;
}

//
@ -1089,6 +1145,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cur = ggml_relu(ctx0, cur);
cb(cur, "ffn_moe_relu", il);
} break;
case LLM_FFN_RELU_SQR:
if (gate_exps) {
// TODO: add support for gated squared relu
GGML_ABORT("fatal error: gated squared relu not implemented");
} else {
cur = ggml_relu(ctx0, cur);
cur = ggml_sqr(ctx0, cur);
cb(cur, "ffn_moe_relu_sqr", il);
} break;
default:
GGML_ABORT("fatal error");
}
@ -1841,6 +1906,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);

inp->head = mctx_cur->get_head();
inp->rs_z = mctx_cur->get_rs_z();

return inp;
}
@ -1909,10 +1977,10 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);

auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());

auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);

return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
}
@ -225,6 +225,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;

bool can_reuse(const llm_graph_params & params) override;

ggml_tensor * s_copy; // I32 [n_rs]

// views of s_copy, computed once per graph

@ -233,6 +235,10 @@ public:
ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]

const llama_memory_recurrent_context * mctx;

// used in view offsets, need to match for valid graph reuse
uint32_t head;
int32_t rs_z;
};

class llm_graph_input_cross_embd : public llm_graph_input_i {
@ -365,22 +371,28 @@ public:
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
public:
llm_graph_input_mem_hybrid(
const llama_cparams & cparams,
std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
std::unique_ptr<llm_graph_input_rs> inp_rs,
const llama_memory_hybrid_context * mctx) :
inp_attn(std::move(inp_attn)),
inp_rs(std::move(inp_rs)),
cparams(cparams),
mctx(mctx) { }
virtual ~llm_graph_input_mem_hybrid() = default;

void set_input(const llama_ubatch * ubatch) override;

bool can_reuse(const llm_graph_params & params) override;

std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
std::unique_ptr<llm_graph_input_rs> inp_rs;

llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }

const llama_cparams cparams;

const llama_memory_hybrid_context * mctx;
};
@ -2,6 +2,7 @@

#include "ggml.h"

#include <algorithm>
#include <cassert>

void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {

@ -230,3 +231,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
return false;
}

bool llama_hparams::use_mrope() const {
return rope_sections[0] > 0 && rope_sections[1] > 0;
}
@ -34,6 +34,7 @@ struct llama_hparams_convnext {

struct llama_hparams {
bool vocab_only;
bool no_alloc;
bool rope_finetuned;
bool use_par_res;
bool swin_norm;

@ -269,6 +270,8 @@ struct llama_hparams {
// TODO: think of a better place for this function
// TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);

bool use_mrope() const;
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@ -25,6 +25,10 @@ time_meas::~time_meas() {
}
}

void llama_log_get(ggml_log_callback * log_callback, void ** user_data) {
ggml_log_get(log_callback, user_data);
}

void llama_log_set(ggml_log_callback log_callback, void * user_data) {
ggml_log_set(log_callback, user_data);
g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache(

// allocate tensors and initialize the buffers to avoid NaNs in the padding
for (auto & [buft, ctx] : ctx_map) {
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
ggml_backend_buffer_t buf;
if (model.hparams.no_alloc) {
buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
}
} else {
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
}
if (!buf) {
throw std::runtime_error("failed to allocate buffer for kv cache");
}
@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {

std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [_, buf] : ctxs_bufs) {
for (const auto & [ctx, buf] : ctxs_bufs) {
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());

if (hparams.no_alloc) {
GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
} else {
// GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
ret[buft] += ggml_backend_buffer_get_size(buf.get());
}
}

return ret;
}
@ -1544,9 +1561,11 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama
const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];

slot_info sinfo;

bool res = true;
res = res && state_read_meta(io, strm, cell_count, seq_id);
res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
res = res && state_read_data(io, strm, cell_count);
res = res && state_read_data(io, strm, cell_count, sinfo);

if (!res) {
if (seq_id == -1) {
@ -1685,7 +1704,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
}
}

bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
auto & cells = v_cells[strm];
auto & head = v_heads[strm];
@ -1722,7 +1741,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
ubatch.seq_id[i] = &dest_seq_id;
}

const auto sinfo = find_slot(ubatch, true);
sinfo = find_slot(ubatch, false);
if (sinfo.empty()) {
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
return false;
@ -1732,20 +1751,16 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
// see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
apply_ubatch(sinfo, ubatch);

const auto head_cur = sinfo.head();
LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id);

// keep the head at the old position because we will read the KV data into it in state_read_data()
// DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values
head = head_cur;
GGML_ASSERT(sinfo.n_stream() == 1);
GGML_ASSERT(sinfo.idxs[0].size() == cell_count);
LLAMA_LOG_DEBUG("%s: head_cur = %d, head = %d, cell_count = %d, dest_seq_id = %d\n", __func__, head_cur, head, cell_count, dest_seq_id);
for (uint32_t i = 0; i < cell_count; ++i) {
const uint32_t idx = sinfo.idxs[0][i];
// DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]);
// Assume that this is one contiguous block of cells
GGML_ASSERT(cells.seq_has(idx, dest_seq_id));
GGML_ASSERT(head_cur + cell_count <= cells.size());
}
GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
} else {
// whole KV cache restore
@ -1778,15 +1793,24 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
}
}

// Create contiguous slot_info for whole cache restore
sinfo.s0 = strm;
sinfo.s1 = strm;
sinfo.resize(1);
sinfo.strm[0] = strm;
sinfo.idxs[0].resize(cell_count);
for (uint32_t i = 0; i < cell_count; ++i) {
sinfo.idxs[0][i] = i;
}

head = 0;
}

return true;
}

bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) {
auto & cells = v_cells[strm];
auto & head = v_heads[strm];

uint32_t v_trans;
uint32_t n_layer;
@ -1836,8 +1860,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
}

if (cell_count) {
// Read and set the keys for the whole cell range
if (sinfo.is_contiguous()) {
ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
// Fast path: contiguous cells, single memcpy
ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row);
} else {
// Slow path: scatter to non-contiguous positions
const void * src = io.read(cell_count * k_size_row);
for (uint32_t i = 0; i < cell_count; ++i) {
const size_t dst_offset = sinfo.idxs[0][i] * k_size_row;
ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row);
}
}
}
}
@ -1868,8 +1901,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
}

if (cell_count) {
// Read and set the values for the whole cell range
if (sinfo.is_contiguous()) {
ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
// Fast path: contiguous cells, single memcpy
ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row);
} else {
// Slow path: scatter to non-contiguous positions
const void * src = io.read(cell_count * v_size_row);
for (uint32_t i = 0; i < cell_count; ++i) {
const size_t dst_offset = sinfo.idxs[0][i] * v_size_row;
ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row);
}
}
}
}
} else {
@ -1908,10 +1950,22 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
}

if (cell_count) {
// For each row in the transposed matrix, read the values for the whole cell range
if (sinfo.is_contiguous()) {
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
// Fast path: contiguous cells
const size_t dst_offset = (head + j * cells.size()) * v_size_el;
const uint32_t h = sinfo.head();
ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
const size_t dst_offset = (h + j * cells.size()) * v_size_el;
ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
}
} else {
// Slow path: scatter to non-contiguous positions
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
const void * src = io.read(cell_count * v_size_el);
for (uint32_t i = 0; i < cell_count; ++i) {
const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el;
ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, dst_offset, v_size_el);
}
}
}
}
}
@ -72,6 +72,23 @@ public:
void clear() {
idxs.clear();
}

// check if indices are contiguous starting from head()
bool is_contiguous() const {
if (idxs.empty() || idxs[0].empty()) {
return true;
}
if (idxs.size() > 1) {
return false;
}
const uint32_t h = idxs[0][0];
for (size_t i = 0; i < idxs[0].size(); ++i) {
if (idxs[0][i] != h + i) {
return false;
}
}
return true;
}
};

using slot_info_vec_t = std::vector<slot_info>;
@ -264,8 +281,8 @@ private:
void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;

bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1);
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
};

class llama_kv_cache_context : public llama_memory_context_i {
@ -222,7 +222,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}
@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
std::vector<std::string> & splits,
bool use_mmap,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
int trace = 0;

@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(
this->use_mmap = use_mmap;
this->check_tensors = check_tensors;
this->no_alloc = no_alloc;
}

std::string llama_model_loader::get_arch_name() const {
@ -71,6 +71,7 @@ struct llama_model_loader {

bool use_mmap = false;
bool check_tensors;
bool no_alloc;

llama_files files;
llama_ftype ftype;

@ -97,6 +98,7 @@ struct llama_model_loader {
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_16B_A1B: return "16B.A1B";
case LLM_TYPE_21B_A3B: return "21B.A3B";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
case LLM_TYPE_80B_A3B: return "80B.A3B";
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
@ -1688,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_GLM4:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
switch (hparams.n_layer) {
case 40: type = LLM_TYPE_9B; break;
case 61: type = LLM_TYPE_32B; break;

@ -1697,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_GLM4_MOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);

// MoE parameters
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
@ -1797,6 +1800,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
} break;
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
{
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);

@ -1812,7 +1816,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);

switch (hparams.n_layer) {
case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
case 56: type = LLM_TYPE_9B; break;
default: type = LLM_TYPE_UNKNOWN;
}
@ -3388,9 +3399,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

// optional bias tensors
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@ -5159,6 +5170,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
} break;
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
{
// mamba2 Mixer SSM params
// NOTE: int64_t for tensor dimensions

@ -5169,6 +5181,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp;

// embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -5218,12 +5233,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
} else {
// mlp layers
if (n_expert != 0) {
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
// MoE branch
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);

// Shared expert branch
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);

} else {
// mlp layers
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
}
}
}
} break;
@ -6606,9 +6635,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
std::vector<ggml_backend_buffer_ptr> bufs;
if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
GGML_ASSERT(!ml.no_alloc);
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
// only the mmap region containing the tensors in the model is mapped to the backend buffer
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
// then we could just use metal for all layers
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
void * addr = nullptr;
size_t first, last; // NOLINT
@ -6624,9 +6655,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
bufs.emplace_back(buf);
buf_map.emplace(idx, buf);
}
}
} else {
else {
ggml_backend_buffer_t buf;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (ml.no_alloc) {
buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
}
} else {
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
}
if (buf == nullptr) {
throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
}
@ -6681,6 +6719,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}

if (ml.no_alloc) {
return true;
}

// load tensor data
for (auto & [ctx, buf_map] : ctx_buf_maps) {
if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
@ -6723,9 +6765,18 @@ size_t llama_model::n_devices() const {

std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
for (const auto & buf : bufs) {
if (hparams.no_alloc) {
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
GGML_ASSERT(bufs.size() == 1);
ggml_backend_buffer_t buf = bufs[0].get();
GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
} else {
for (const auto & buf : bufs) {
// GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
}
}
}
return ret;
@ -6770,6 +6821,7 @@ void llama_model::print_info() const {
// hparams
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);

if (!hparams.vocab_only) {
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);

@ -6827,7 +6879,8 @@ void llama_model::print_info() const {
arch == LLM_ARCH_PLAMO2 ||
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_QWEN3NEXT ||
arch == LLM_ARCH_NEMOTRON_H) {
arch == LLM_ARCH_NEMOTRON_H ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@ -6882,7 +6935,8 @@ void llama_model::print_info() const {
if (arch == LLM_ARCH_MINICPM ||
arch == LLM_ARCH_GRANITE ||
arch == LLM_ARCH_GRANITE_MOE ||
arch == LLM_ARCH_GRANITE_HYBRID) {
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@ -7063,7 +7117,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
if (arch == LLM_ARCH_FALCON_H1) {
filter_attn = [&](int32_t) { return true; };
filter_recr = [&](int32_t) { return true; };
} else if (arch == LLM_ARCH_NEMOTRON_H) {
} else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
filter_attn = [&](int32_t il) {
return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
};
@ -7434,6 +7488,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
llm = std::make_unique<llm_build_nemotron>(*this, params);
} break;
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
{
llm = std::make_unique<llm_build_nemotron_h>(*this, params);
} break;
@ -7618,6 +7673,7 @@ llama_model_params llama_model_default_params() {
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
/*.no_host =*/ false,
/*.no_alloc =*/ false,
};

return result;
@ -7717,6 +7773,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_ARWKV7:
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_NEMOTRON_H_MOE:
return LLAMA_ROPE_TYPE_NONE;

// use what we call a normal RoPE, operating on pairs of consecutive head values
@ -7737,7 +7794,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_PLM:
case LLM_ARCH_CHATGLM:
case LLM_ARCH_GLM4:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
@ -7799,7 +7855,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_LFM2:
case LLM_ARCH_LFM2MOE:
case LLM_ARCH_SMALLTHINKER:
case LLM_ARCH_GLM4_MOE:
case LLM_ARCH_SEED_OSS:
case LLM_ARCH_GROVEMOE:
case LLM_ARCH_APERTUS:
@ -7816,6 +7871,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_QWEN3VLMOE:
return LLAMA_ROPE_TYPE_IMROPE;

case LLM_ARCH_GLM4:
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
case LLM_ARCH_GLM4_MOE:
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;

// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");
@ -113,6 +113,7 @@ enum llm_type {
LLM_TYPE_16B_A1B,
LLM_TYPE_21B_A3B, // Ernie MoE small
LLM_TYPE_30B_A3B,
LLM_TYPE_31B_A3_5B,
LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B,
LLM_TYPE_106B_A12B, // GLM-4.5-Air
@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}

std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching

llama_model model(llama_model_default_params());
@ -1895,7 +1895,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
clean_spaces = false;
} else if (
tokenizer_pre == "qwen2" ||
tokenizer_pre == "deepseek-r1-qwen") {
tokenizer_pre == "deepseek-r1-qwen" ||
tokenizer_pre == "kormo") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
clean_spaces = false;
} else if (
650
src/llama.cpp
@ -1,6 +1,9 @@
#include "llama.h"

#include "llama-impl.h"

#include "llama-chat.h"
#include "llama-context.h"
#include "llama-mmap.h"
#include "llama-vocab.h"
#include "llama-model-loader.h"
@ -11,11 +14,14 @@
#include "ggml-backend.h"

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <stdexcept>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@ -37,6 +43,643 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct llama_device_memory_data {
|
||||||
|
int64_t total;
|
||||||
|
int64_t free;
|
||||||
|
llama_memory_breakdown_data mb;
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
||||||
|
const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
|
||||||
|
std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
|
||||||
|
const ggml_log_level log_level) {
|
||||||
|
struct user_data_t {
|
||||||
|
struct {
|
||||||
|
ggml_log_callback callback;
|
||||||
|
void * user_data;
|
||||||
|
} original_logger;
|
||||||
|
ggml_log_level min_level; // prints below this log level go to debug log
|
||||||
|
};
|
||||||
|
user_data_t ud;
|
||||||
|
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
||||||
|
ud.min_level = log_level;
|
||||||
|
|
||||||
|
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
||||||
|
const user_data_t * ud = (const user_data_t *) user_data;
|
||||||
|
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||||
|
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
||||||
|
}, &ud);
|
||||||
|
|
||||||
|
llama_model_params mparams_copy = *mparams;
|
||||||
|
mparams_copy.no_alloc = true;
|
||||||
|
mparams_copy.use_mmap = false;
|
||||||
|
|
||||||
|
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
||||||
|
if (model == nullptr) {
|
||||||
|
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||||
|
throw std::runtime_error("failed to load model");
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_context * ctx = llama_init_from_model(model, *cparams);
|
||||||
|
if (ctx == nullptr) {
|
||||||
|
llama_model_free(model);
|
||||||
|
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||||
|
throw std::runtime_error("failed to create llama_context from model");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_device_memory_data> ret(model->devices.size());
|
||||||
|
|
||||||
|
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
||||||
|
|
||||||
|
for (const auto & [buft, mb] : memory_breakdown) {
|
||||||
|
if (ggml_backend_buft_is_host(buft)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||||
|
if (!dev) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < ret.size(); i++) {
|
||||||
|
if (model->devices[i] == dev) {
|
||||||
|
ret[i].mb.model += mb.model;
|
||||||
|
ret[i].mb.context += mb.context;
|
||||||
|
ret[i].mb.compute += mb.compute;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < ret.size(); i++) {
|
||||||
|
size_t free, total;
|
||||||
|
ggml_backend_dev_memory(model->devices[i], &free, &total);
|
||||||
|
ret[i].free = free;
|
||||||
|
ret[i].total = total;
|
||||||
|
}
|
||||||
|
|
||||||
|
devs = model->devices;
|
||||||
|
hp_ngl = model->hparams.n_layer;
|
||||||
|
hp_n_ctx_train = model->hparams.n_ctx_train;
|
||||||
|
hp_n_expert = model->hparams.n_expert;
|
||||||
|
|
||||||
|
llama_memory_breakdown_print(ctx); // goes to debug log
|
||||||
|
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_model_free(model);
|
||||||
|
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// enum to identify part of a layer for distributing its tensors:
|
||||||
|
enum layer_fraction_t {
|
||||||
|
LAYER_FRACTION_NONE = 0, // nothing
|
||||||
|
LAYER_FRACTION_ATTN = 1, // attention
|
||||||
|
LAYER_FRACTION_UP = 2, // attention + up
|
||||||
|
LAYER_FRACTION_GATE = 3, // attention + up + gate
|
||||||
|
LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
|
||||||
|
};
|
||||||
|
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
|
||||||
|
|
||||||
|
static void llama_params_fit_impl(
|
||||||
|
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||||
|
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||||
|
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||||
|
constexpr int64_t MiB = 1024*1024;
|
||||||
|
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||||
|
typedef std::vector<llama_device_memory_data> dmds_t;
|
||||||
|
const llama_model_params default_mparams = llama_model_default_params();
|
||||||
|
|
||||||
|
std::vector<ggml_backend_dev_t> devs;
|
||||||
|
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
||||||
|
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||||
|
uint32_t hp_nex = 0; // hparams.n_expert
|
||||||
|
|
||||||
|
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
||||||
|
|
    LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
    const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
    const size_t nd = devs.size(); // number of devices
    if (nd == 0) {
        LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
        return;
    }

    std::vector<std::string> dev_names;
    {
        dev_names.reserve(nd);
        size_t max_length = 0;
        for (ggml_backend_dev_t dev : devs) {
            std::string name = ggml_backend_dev_name(dev);
            name += " (";
            name += ggml_backend_dev_description(dev);
            name += ")";
            dev_names.push_back(name);
            max_length = std::max(max_length, name.length());
        }
        for (std::string & dn : dev_names) {
            dn.insert(dn.end(), max_length - dn.length(), ' ');
        }
    }

    int64_t sum_total          = 0;
    int64_t sum_projected_free = 0;
    int64_t min_projected_free = INT64_MAX;
    int64_t sum_projected_used = 0;
    int64_t sum_projected_ctx  = 0;

    if (nd > 1) {
        LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
    }
    for (size_t id = 0; id < nd; id++) {
        const llama_device_memory_data & dmd = dmds_full[id];

        const int64_t projected_used = dmd.mb.total();
        const int64_t projected_free = dmd.free - projected_used;

        sum_total          += dmd.total;
        sum_projected_used += projected_used;
        sum_projected_free += projected_free;
        min_projected_free  = std::min(min_projected_free, projected_free);
        sum_projected_ctx  += dmd.mb.context;

        if (nd > 1) {
            LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
                projected_free >= 0 ? "surplus" : "deficit");
        }
    }
    assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
    assert(sum_projected_used >= sum_projected_ctx);
    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
        __func__, sum_projected_used/MiB, sum_total/MiB);
    if (min_projected_free >= margin) {
        if (nd == 1) {
            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
                __func__, min_projected_free/MiB, margin/MiB);
            return;
        }
        LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
            __func__, min_projected_free/MiB, margin/MiB);
        return;
    }

    // step 2: try reducing memory use by reducing the context size

    {
        int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
        if (global_surplus < 0) {
            LLAMA_LOG_INFO(nd == 1 ?
                "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
                "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
                __func__, margin/MiB, -global_surplus/MiB);
            if (cparams->n_ctx == 0) {
                if (hp_nct > n_ctx_min) {
                    const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
                    const uint32_t ctx_reduction = std::min(
                        uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
                    cparams->n_ctx = hp_nct - ctx_reduction;
                    const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
                    global_surplus += memory_reduction;
                    LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                        __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                } else {
                    LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
                        __func__, hp_nct, n_ctx_min);
                }
            } else {
                LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
            }
        }
        if (global_surplus >= 0) {
            LLAMA_LOG_INFO("%s: entire model can be fit across devices by reducing context\n", __func__);
            return;
        }
    }

    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
        throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
    }
    if (nd > 1) {
        if (!tensor_split) {
            throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
        }
        if (mparams->tensor_split) {
            for (size_t id = 0; id < nd; id++) {
                if (mparams->tensor_split[id] != 0.0f) {
                    throw std::runtime_error("model_params::tensor_split already set by user, abort");
                }
            }
        }
        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
            throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
        }
        if (hp_ngl < 2*nd) {
            throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
                + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
        }
    }
    if (!tensor_buft_overrides) {
        throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
    }
    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
        throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
    }

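To make the step-2 arithmetic concrete, here is a minimal standalone sketch of the same ceiling-division logic; the figures are invented and only illustrate how the context reduction is derived from the deficit and the per-token KV-cache cost.

```cpp
// Illustration only: how the context reduction in step 2 is derived.
// bytes_per_ctx corresponds to sum_projected_ctx / n_ctx_train above.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t  deficit       = 1536ll * 1024 * 1024; // memory that must be freed (bytes)
    const int64_t  bytes_per_ctx = 98304;                // KV-cache bytes per context token
    const uint32_t n_ctx_train   = 131072;
    const uint32_t n_ctx_min     = 4096;

    // ceiling division so the reduction always covers the deficit,
    // clamped so the context never drops below n_ctx_min
    const uint32_t ctx_reduction = std::min(
        uint32_t((deficit + bytes_per_ctx - 1) / bytes_per_ctx), n_ctx_train - n_ctx_min);

    printf("n_ctx: %u -> %u (frees %lld MiB)\n",
        n_ctx_train, n_ctx_train - ctx_reduction,
        (long long) (int64_t(ctx_reduction) * bytes_per_ctx / (1024*1024)));
    return 0;
}
```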
    // step 3: iteratively fill the back to front with "dense" layers
    // - for a dense model simply fill full layers, giving each device a contiguous slice of the model
    // - for a MoE model, same as dense model but with all MoE tensors in system memory

    // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
    auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
        constexpr size_t n_strings = 1000;
        if (il >= n_strings) {
            throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
        }
        switch (lf) {
            case LAYER_FRACTION_ATTN: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_UP: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_GATE: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_MOE: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
                }
                return patterns[il].c_str();
            }
            default:
                GGML_ABORT("fatal error");
        }
    };

    struct ngl_t {
        uint32_t n_layer = 0; // number of total layers
        uint32_t n_part  = 0; // number of partial layers, <= n_layer

        // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
        layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
    };

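For orientation, these are the override regexes the lambda above yields for an arbitrary layer index (12 here), together with what stays on the device when each fraction overflows:

```
LAYER_FRACTION_ATTN -> "blk\.12\.ffn_(up|gate|down).*"         (only attention stays on device)
LAYER_FRACTION_UP   -> "blk\.12\.ffn_(gate|down).*"            (attention + up stay on device)
LAYER_FRACTION_GATE -> "blk\.12\.ffn_down.*"                   (only the down projection overflows)
LAYER_FRACTION_MOE  -> "blk\.12\.ffn_(up|down|gate)_(ch|)exps" (only the sparse expert tensors overflow)
```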
    const size_t ntbo = llama_max_tensor_buft_overrides();

    // utility function to set n_gpu_layers and tensor_split
    auto set_ngl_tensor_split_tbo = [&](
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
            llama_model_params & mparams,
            const bool add_nonrepeating) {
        mparams.n_gpu_layers = 0;
        for (size_t id = 0; id < nd; id++) {
            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
            if (nd > 1) {
                tensor_split[id] = ngl_per_device[id].n_layer;
            }
        }
        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
        uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides

        if (add_nonrepeating) {
            mparams.n_gpu_layers += 1;
            tensor_split[nd - 1] += 1;
        }
        mparams.tensor_split = tensor_split;

        size_t itbo = 0;
        for (size_t id = 0; id < nd; id++) {
            il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
                if (itbo + 1 >= ntbo) {
                    tensor_buft_overrides[itbo].pattern = nullptr;
                    tensor_buft_overrides[itbo].buft = nullptr;
                    itbo++;
                    mparams.tensor_buft_overrides = tensor_buft_overrides;
                    throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
                        + std::to_string(ntbo) + " is insufficient for model\n");
                }
                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
                tensor_buft_overrides[itbo].buft = overflow_bufts[id];
                itbo++;
            }
            il0 += ngl_per_device[id].n_part;
        }
        tensor_buft_overrides[itbo].pattern = nullptr;
        tensor_buft_overrides[itbo].buft = nullptr;
        itbo++;
        mparams.tensor_buft_overrides = tensor_buft_overrides;
    };

    // utility function that returns the memory use per device for given numbers of layers per device
    auto get_memory_for_layers = [&](
            const char * func_name,
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
            const bool add_nonrepeating) -> std::vector<int64_t> {
        llama_model_params mparams_copy = *mparams;
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);

        const dmds_t dmd_nl = llama_get_device_memory_data(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
        for (size_t id = 0; id < nd; id++) {
            const ngl_t & n = ngl_per_device[id];
            LLAMA_LOG_DEBUG(
                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
        }

        std::vector<int64_t> ret;
        ret.reserve(nd);
        for (const llama_device_memory_data & dmd : dmd_nl) {
            ret.push_back(dmd.mb.total());
        }
        return ret;
    };

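As a concrete picture of the bookkeeping above: for a hypothetical 2-device setup with hp_ngl = 40, ngl_per_device = {n_layer 10 / n_part 2, n_layer 28 / n_part 1}, overflow_type = LAYER_FRACTION_MOE and add_nonrepeating = false, set_ngl_tensor_split_tbo would produce roughly the following (the device names are invented; layers 0-1 stay in system memory, device 0 holds layers 2-11, device 1 holds layers 12-39):

```
n_gpu_layers          = 38
tensor_split          = { 10, 28 }
tensor_buft_overrides = {
    { "blk\.10\.ffn_(up|down|gate)_(ch|)exps", <buft of device 1> }, // partial layers of device 0
    { "blk\.11\.ffn_(up|down|gate)_(ch|)exps", <buft of device 1> },
    { "blk\.39\.ffn_(up|down|gate)_(ch|)exps", <CPU buffer type>  }, // partial layer of device 1
    { nullptr, nullptr },                                            // terminator
};
```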
    int64_t global_surplus_cpu_moe = 0;
    if (hp_nex > 0) {
        const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
        ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
        tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
        tensor_buft_overrides[1] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;

        LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
        const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
            global_surplus_cpu_moe += dmd.free;
            global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
        }

        if (global_surplus_cpu_moe > 0) {
            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
                __func__, global_surplus_cpu_moe/MiB);
        } else {
            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
                __func__, -global_surplus_cpu_moe/MiB);
        }

        // reset
        tensor_buft_overrides[0] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;
    }

    std::vector<int64_t> targets; // maximum acceptable memory use per device
    targets.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
        targets.push_back(dmds_full[id].free - margin);
        LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }

    // whether for the optimal memory use we expect to load at least some MoE tensors:
    const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;

    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
    overflow_bufts.reserve(nd);
    for (size_t id = 0; id < nd - 1; ++id) {
        overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
    }
    overflow_bufts.push_back(ggml_backend_cpu_buffer_type());

    std::vector<ngl_t> ngl_per_device(nd);
    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
    if (hp_nex > 0) {
        for (size_t id = 0; id < nd; id++) {
            ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
        }
    }

    // optimize the number of layers per device using the method of false position:
    // - ngl_per_device has 0 layers for each device, lower bound
    // - try a "high" configuration where a device is given all unassigned layers
    // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
    // - check memory use of our guess, replace either the low or high bound
    // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
    if (hp_nex == 0) {
        LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
    } else {
        LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
    }
    uint32_t n_unassigned = hp_ngl;
    for (int id = nd - 1; id >= 0; id--) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        ngl_per_device_high[id].n_layer = n_unassigned;
        if (hp_nex > 0) {
            ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
        }
        if (ngl_per_device_high[id].n_layer > 0) {
            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
            if (mem_high[id] > targets[id]) {
                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                while (delta > 1) {
                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                    step_size = std::max(step_size, uint32_t(1));
                    step_size = std::min(step_size, delta - 1);

                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                    ngl_per_device_test[id].n_layer += step_size;
                    if (hp_nex) {
                        ngl_per_device_test[id].n_part += step_size;
                    }
                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);

                    if (mem_test[id] <= targets[id]) {
                        ngl_per_device = ngl_per_device_test;
                        mem = mem_test;
                        n_unassigned -= ngl_per_device[id].n_layer;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    } else {
                        ngl_per_device_high = ngl_per_device_test;
                        mem_high = mem_test;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
                    }
                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                }
            } else {
                ngl_per_device = ngl_per_device_high;
                n_unassigned -= ngl_per_device[id].n_layer;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
    }
    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
        return;
    }

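The same search, stripped of the llama.cpp plumbing, can be written as a small standalone routine. This is only an illustration of the false-position idea described above, under the assumption that memory use is roughly linear in the number of layers; the toy memory model and numbers are invented.

```cpp
// Standalone sketch of the integer false-position search used for fitting layers.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>

// returns the largest n in [lo, hi] with mem(n) <= target, assuming mem is non-decreasing
static uint32_t fit_layers(uint32_t lo, uint32_t hi, int64_t target,
                           const std::function<int64_t(uint32_t)> & mem) {
    int64_t mem_lo = mem(lo);
    int64_t mem_hi = mem(hi);
    if (mem_hi <= target) {
        return hi;
    }
    uint32_t delta = hi - lo;
    while (delta > 1) {
        // interpolate where the target is expected to be met, then clamp so progress is always made
        uint32_t step = uint32_t(int64_t(delta) * (target - mem_lo) / (mem_hi - mem_lo));
        step = std::max(step, uint32_t(1));
        step = std::min(step, delta - 1);

        const uint32_t probe = lo + step;
        const int64_t  m     = mem(probe);
        if (m <= target) {
            lo = probe; mem_lo = m; // probe still fits -> new lower bound
        } else {
            hi = probe; mem_hi = m; // probe too big   -> new upper bound
        }
        delta = hi - lo;
    }
    return lo;
}

int main() {
    // toy model: 100 MiB fixed overhead + 350 MiB per layer, target of 8000 MiB
    auto mem = [](uint32_t n) { return int64_t(100) + int64_t(350) * n; };
    printf("layers that fit: %u\n", fit_layers(0, 48, 8000, mem)); // prints 22
    return 0;
}
```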
    // step 4: for a MoE model where all dense tensors fit,
    // convert the dense-only layers in the back to full layers in the front until all devices are full
    // essentially the same procedure as for the dense-only layers except front-to-back
    // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM

    size_t id_dense_start = nd;
    for (int id = nd - 1; id >= 0; id--) {
        if (ngl_per_device[id].n_layer > 0) {
            id_dense_start = id;
            continue;
        }
        break;
    }
    assert(id_dense_start < nd);

    LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
    for (size_t id = 0; id <= id_dense_start; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
            const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
            ngl_per_device_high[id].n_layer += n_layer_move;
            ngl_per_device_high[jd].n_layer -= n_layer_move;
            ngl_per_device_high[jd].n_part = 0;
        }
        size_t id_dense_start_high = nd - 1;
        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);

        if (mem_high[id] > targets[id]) {
            assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
            assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
            assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
                >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
                - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            while (delta > 1) {
                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                step_size = std::max(step_size, uint32_t(1));
                step_size = std::min(step_size, delta - 1);

                std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                size_t id_dense_start_test = id_dense_start;
                uint32_t n_converted_test = 0;
                for (; id_dense_start_test < nd; id_dense_start_test++) {
                    const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
                    ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
                    ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
                    ngl_per_device_test[id].n_layer += n_convert_jd;
                    n_converted_test += n_convert_jd;

                    if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
                        break;
                    }
                }
                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);

                if (mem_test[id] <= targets[id]) {
                    ngl_per_device = ngl_per_device_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                } else {
                    ngl_per_device_high = ngl_per_device_test;
                    mem_high = mem_test;
                    id_dense_start_high = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                }
                delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
                    - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            }
        } else {
            ngl_per_device = ngl_per_device_high;
            id_dense_start = id_dense_start_high;
            LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
        }

        // try to fit at least part of one more layer
        if (ngl_per_device[id_dense_start].n_layer > 0) {
            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
            size_t id_dense_start_test = id_dense_start;
            ngl_per_device_test[id_dense_start_test].n_layer--;
            ngl_per_device_test[id_dense_start_test].n_part--;
            ngl_per_device_test[id].n_layer++;
            ngl_per_device_test[id].n_part++;
            if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
                id_dense_start_test++;
            }
            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
            LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
            if (mem_test[id] < targets[id]) {
                ngl_per_device = ngl_per_device_test;
                mem = mem_test;
                id_dense_start = id_dense_start_test;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);

                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
                if (mem_test[id] < targets[id]) {
                    ngl_per_device = ngl_per_device_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
                if (mem_test[id] < targets[id]) {
                    ngl_per_device = ngl_per_device_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }

    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
}

bool llama_params_fit(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    const int64_t t0_us = llama_time_us();
    bool ok = true;
    try {
        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
        LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
    } catch (const std::runtime_error & e) {
        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
        ok = false;
    }
    const int64_t t1_us = llama_time_us();
    LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
    return ok;
}

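For reference, a caller would be expected to use the new entry point roughly as sketched below. This is only an illustration based on the signatures added in this commit (llama_params_fit, llama_max_devices, llama_max_tensor_buft_overrides) together with existing llama.h functions; it is not code from the PR and the margin and minimum context values are arbitrary.

```cpp
// Hypothetical caller-side sketch for llama_params_fit().
#include "llama.h"
#include <vector>

int main(int argc, char ** argv) {
    const char * path_model = argc > 1 ? argv[1] : "model.gguf";

    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    // scratch buffers that llama_params_fit may fill in
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());

    // leave ~1 GiB free per device, never go below a 4096-token context
    const size_t   margin    = 1024ull * 1024ull * 1024ull;
    const uint32_t n_ctx_min = 4096;

    if (!llama_params_fit(path_model, &mparams, &cparams,
                          tensor_split.data(), tbo.data(),
                          margin, n_ctx_min, GGML_LOG_LEVEL_INFO)) {
        // fitting failed -> fall back to the default parameters
    }

    llama_model *   model = llama_model_load_from_file(path_model, mparams);
    llama_context * ctx   = model ? llama_init_from_model(model, cparams) : nullptr;

    // ... use ctx ...

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}
```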
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,
        /*.no_perf =*/ true,

@@ -49,6 +692,10 @@ size_t llama_max_devices(void) {
    return 16;
    return 16;
}
}

size_t llama_max_tensor_buft_overrides() {
    return 4096;
}

bool llama_supports_mmap(void) {
bool llama_supports_mmap(void) {
    return llama_mmap::SUPPORTED;
    return llama_mmap::SUPPORTED;
}
}

@@ -108,11 +755,12 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
    model.t_start_us = tm.t_start_us;
    model.t_start_us = tm.t_start_us;

    try {
    try {
        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();
        ml.print_info();

        model.hparams.vocab_only = params.vocab_only;
        model.hparams.vocab_only = params.vocab_only;
        model.hparams.no_alloc = params.no_alloc;

        try {
        try {
            model.load_arch(ml);
            model.load_arch(ml);

@@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
|
||||||
|
int sections[4];
|
||||||
|
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||||
|
|
||||||
ggml_tensor * cur;
|
ggml_tensor * cur;
|
||||||
ggml_tensor * inpL;
|
ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = build_inp_embd(model.tok_embd);
|
inpL = build_inp_embd(model.tok_embd);
|
||||||
|
|
||||||
|
bool use_mrope = hparams.use_mrope();
|
||||||
|
if (ubatch.embd && !use_mrope) {
|
||||||
|
// unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
|
||||||
|
GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
|
||||||
|
}
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
ggml_tensor * inp_pos = build_inp_pos();
|
ggml_tensor * inp_pos = build_inp_pos();
|
||||||
|
|
||||||
|
|
@@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
|
||||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||||
cb(Kcur, "Kcur_normed", il);
|
cb(Kcur, "Kcur_normed", il);
|
||||||
}
|
}
|
||||||
Qcur = ggml_rope_ext(
|
|
||||||
ctx0, Qcur, inp_pos, nullptr,
|
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
|
||||||
);
|
|
||||||
|
|
||||||
Kcur = ggml_rope_ext(
|
if (use_mrope) {
|
||||||
ctx0, Kcur, inp_pos, nullptr,
|
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
);
|
|
||||||
|
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
|
||||||
|
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
} else {
|
||||||
|
// Normal RoPE
|
||||||
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
|
||||||
|
rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
|
||||||
|
rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
}
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
|
||||||
|
|
@@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
|
||||||
|
int sections[4];
|
||||||
|
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||||
|
|
||||||
ggml_tensor * cur;
|
ggml_tensor * cur;
|
||||||
ggml_tensor * inpL;
|
ggml_tensor * inpL;
|
||||||
|
|
||||||
inpL = build_inp_embd(model.tok_embd);
|
inpL = build_inp_embd(model.tok_embd);
|
||||||
|
|
||||||
|
bool use_mrope = hparams.use_mrope();
|
||||||
|
if (ubatch.embd && !use_mrope) {
|
||||||
|
// unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
|
||||||
|
GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
|
||||||
|
}
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
// inp_pos - contains the positions
|
||||||
ggml_tensor * inp_pos = build_inp_pos();
|
ggml_tensor * inp_pos = build_inp_pos();
|
||||||
|
|
||||||
|
|
@@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
|
||||||
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
|
||||||
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
|
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
|
||||||
}
|
}
|
||||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
||||||
|
|
||||||
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
if (use_mrope) {
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
|
||||||
|
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
|
||||||
|
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
} else {
|
||||||
|
// Normal RoPE
|
||||||
|
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
|
||||||
|
rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
|
||||||
|
rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
}
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
|
||||||
|
|
@@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
|
ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
|
||||||
cur = build_ffn(cur,
|
if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
cur = build_ffn(cur,
|
||||||
NULL, NULL, NULL,
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
NULL, NULL, NULL,
|
||||||
NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||||
cb(cur, "ffn_out", il);
|
NULL,
|
||||||
|
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
||||||
|
cb(cur, "ffn_out", il);
|
||||||
|
} else {
|
||||||
|
ggml_tensor * ffn_inp = cur;
|
||||||
|
ggml_tensor * moe_out =
|
||||||
|
build_moe_ffn(ffn_inp,
|
||||||
|
model.layers[il].ffn_gate_inp,
|
||||||
|
model.layers[il].ffn_up_exps,
|
||||||
|
nullptr, // no gate
|
||||||
|
model.layers[il].ffn_down_exps,
|
||||||
|
model.layers[il].ffn_exp_probs_b,
|
||||||
|
n_expert, n_expert_used,
|
||||||
|
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
|
||||||
|
true, hparams.expert_weights_scale,
|
||||||
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
|
||||||
|
il);
|
||||||
|
cb(moe_out, "ffn_moe_out", il);
|
||||||
|
|
||||||
|
ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
|
||||||
|
model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||||
|
NULL /* no gate */ , NULL, NULL,
|
||||||
|
model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||||
|
NULL,
|
||||||
|
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
||||||
|
cb(ffn_shexp, "ffn_shexp", il);
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
||||||
|
cb(cur, "ffn_out", il);
|
||||||
|
}
|
||||||
|
|
||||||
cur = build_cvec(cur, il);
|
cur = build_cvec(cur, il);
|
||||||
cb(cur, "l_out", il);
|
cb(cur, "l_out", il);
|
||||||
|
|
|
||||||
|
|
@@ -31,16 +31,25 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
|
||||||
{
|
{
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
if (model.layers[il].bq) {
|
||||||
|
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
if (model.layers[il].bk) {
|
||||||
|
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
if (model.layers[il].bv) {
|
||||||
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||||
|
|
|
||||||
|
|
@@ -222,6 +222,14 @@ llama_build_and_test(test-backend-ops.cpp)
llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
llama_build_and_test(test-autorelease.cpp LABEL "model")
llama_build_and_test(test-autorelease.cpp LABEL "model")

# Test for state restore with fragmented KV cache
# Requires a model, uses same args pattern as test-thread-safety
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
    llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf)
else()
    llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf)
endif()

if (NOT GGML_BACKEND_DL)
if (NOT GGML_BACKEND_DL)
    # these tests use the backends directly and cannot be built with dynamic loading
    # these tests use the backends directly and cannot be built with dynamic loading
    llama_build_and_test(test-barrier.cpp)
    llama_build_and_test(test-barrier.cpp)

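Once the project is built with tests enabled, the new check can be run on its own. The invocation below is an assumed example: it presumes the usual `build/` directory layout, that the registered test name follows the source file name, and that the model referenced via `-hf`/`-hff` can be downloaded at test time.

```bash
cmake -B build -DLLAMA_BUILD_TESTS=ON
cmake --build build --config Release -j
ctest --test-dir build -R test-state-restore-fragmented --output-on-failure
```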
@@ -3588,6 +3588,163 @@ static void test_template_output_peg_parsers() {
|
||||||
t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})";
|
t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})";
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// NVIDIA Nemotron-3 Nano
|
||||||
|
auto tmpls = read_templates("models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja");
|
||||||
|
|
||||||
|
// Test basic message
|
||||||
|
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||||
|
t.input = "Hello, world!\nWhat's up?";
|
||||||
|
t.expect = message_assist;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Test basic message and reasoning with reasoning_format = none
|
||||||
|
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||||
|
t.input = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
|
||||||
|
t.expect.content = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
|
||||||
|
});
|
||||||
|
|
||||||
|
// Test basic message and reasoning with reasoning_format = auto
|
||||||
|
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||||
|
t.input = "I'm\nthinking\n</think>\nHello, world!\nWhat's up?";
|
||||||
|
t.params.enable_thinking = true;
|
||||||
|
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||||
|
|
||||||
|
t.expect = message_assist_thoughts;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Test tool call
|
||||||
|
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||||
|
t.input =
|
||||||
|
"<tool_call>\n"
|
||||||
|
"<function=special_function>\n"
|
||||||
|
"<parameter=arg1>\n"
|
||||||
|
"1\n"
|
||||||
|
"</parameter>\n"
|
||||||
|
"</function>\n"
|
||||||
|
"</tool_call>";
|
||||||
|
t.params.enable_thinking = false;
|
||||||
|
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||||
|
t.params.tools = {special_function_tool};
|
||||||
|
|
||||||
|
t.expect = message_assist_call;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Test tool call with reasoning
|
||||||
|
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||||
|
t.input =
|
||||||
|
"I'm\nthinking\n</think>\n"
|
||||||
|
"<tool_call>\n"
|
||||||
|
"<function=special_function>\n"
|
||||||
|
"<parameter=arg1>\n"
|
||||||
|
"1\n"
|
||||||
|
"</parameter>\n"
|
||||||
|
"</function>\n"
|
||||||
|
"</tool_call>";
|
||||||
|
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||||
|
t.params.tools = {special_function_tool};
|
||||||
|
|
||||||
|
t.expect = message_assist_call_thoughts;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Test parallel tool calls
|
||||||
|
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||||
|
t.input =
|
||||||
|
"<tool_call>\n"
|
||||||
|
"<function=special_function>\n"
|
||||||
|
"<parameter=arg1>\n"
|
||||||
|
"1\n"
|
||||||
|
"</parameter>\n"
|
||||||
|
"</function>\n"
|
||||||
|
"</tool_call>\n"
|
||||||
|
"<tool_call>\n"
|
||||||
|
"<function=special_function_with_opt>\n"
|
||||||
|
"<parameter=arg1>\n"
|
||||||
|
"1\n"
|
||||||
|
"</parameter>\n"
|
||||||
|
"<parameter=arg2>\n"
|
||||||
|
"2\n"
|
||||||
|
"</parameter>\n"
|
||||||
|
"</function>\n"
|
||||||
|
"</tool_call>";
|
||||||
|
t.params.enable_thinking = false;
|
||||||
|
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||||
|
t.params.parallel_tool_calls = true;
|
||||||
|
t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
|
||||||
|
|
||||||
|
t.expect.tool_calls = {{
|
||||||
|
/* .name = */ "special_function",
|
||||||
|
/* .arguments = */ R"({"arg1": 1})",
|
||||||
|
/* .id = */ {},
|
||||||
|
}, {
|
||||||
|
/* .name = */ "special_function_with_opt",
|
||||||
|
/* .arguments = */ R"({"arg1": 1, "arg2": 2})",
|
||||||
|
/* .id = */ {},
|
||||||
|
}};
|
||||||
|
});
|
||||||
|
|
||||||
|
// Test tool call with string parameter
|
||||||
|
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||||
|
t.input =
|
||||||
|
"<tool_call>\n"
|
||||||
|
"<function=python>\n"
|
||||||
|
"<parameter=code>\n"
|
||||||
|
"def hello():\n"
|
||||||
|
" print(\"Hello, world!\")\n"
|
||||||
|
"\n"
|
||||||
|
"hello()\n"
|
||||||
|
"</parameter>\n"
|
||||||
|
"</function>\n"
|
||||||
|
"</tool_call>";
|
||||||
|
t.params.enable_thinking = false;
|
||||||
|
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||||
|
t.params.tools = {python_tool};
|
||||||
|
|
||||||
|
t.expect.tool_calls = {{
|
||||||
|
/* .name = */ "python",
|
||||||
|
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
|
||||||
|
/* .id = */ {},
|
||||||
|
}};
|
||||||
|
});
|
||||||
|
|
||||||
|
// Test tool call with string parameter and no closing </parameter> tag
|
||||||
|
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||||
|
t.input =
|
||||||
|
"<tool_call>\n"
|
||||||
|
"<function=python>\n"
|
||||||
|
"<parameter=code>\n"
|
||||||
|
"def hello():\n"
|
||||||
|
" print(\"Hello, world!\")\n"
|
||||||
|
"\n"
|
||||||
|
"hello()\n"
|
||||||
|
"</function>\n"
|
||||||
|
"</tool_call>";
|
||||||
|
t.params.enable_thinking = false;
|
||||||
|
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||||
|
t.params.tools = {python_tool};
|
||||||
|
|
||||||
|
t.expect.tool_calls = {{
|
||||||
|
/* .name = */ "python",
|
||||||
|
/* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
|
||||||
|
/* .id = */ {},
|
||||||
|
}};
|
||||||
|
});
|
||||||
|
|
||||||
|
// Test response format
|
||||||
|
test_peg_parser(tmpls.get(), [&](auto & t) {
|
||||||
|
t.input =
|
||||||
|
"I need to output the invoice details in JSON\n"
|
||||||
|
"</think>\n"
|
||||||
|
R"({"amount": 123.45, "date": "2025-12-03"})";
|
||||||
|
t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||||
|
t.params.json_schema = invoice_schema;
|
||||||
|
|
||||||
|
t.expect.reasoning_content = "I need to output the invoice details in JSON";
|
||||||
|
t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})";
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void test_msg_diffs_compute() {
|
static void test_msg_diffs_compute() {
|
||||||
|
|
|
||||||
|
|
@@ -1367,10 +1367,85 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void test_resolves_to_string() {
|
||||||
|
fprintf(stderr, "#\n# Testing resolves_to_string\n#\n");
|
||||||
|
|
||||||
|
auto test = [](const std::string & name, const std::string & schema_str, bool expected) {
|
||||||
|
fprintf(stderr, "- %s\n", name.c_str());
|
||||||
|
common_schema_info info;
|
||||||
|
auto schema = nlohmann::ordered_json::parse(schema_str);
|
||||||
|
info.resolve_refs(schema);
|
||||||
|
bool result = info.resolves_to_string(schema);
|
||||||
|
if (result != expected) {
|
||||||
|
fprintf(stderr, "#\n# Test '%s' failed.\n#\n", name.c_str());
|
||||||
|
fprintf(stderr, "Schema: %s\n", schema_str.c_str());
|
||||||
|
fprintf(stderr, "Expected: %s, Got: %s\n", expected ? "true" : "false", result ? "true" : "false");
|
||||||
|
assert(false);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Basic type checks
|
||||||
|
test("type string", R"({"type": "string"})", true);
|
||||||
|
test("type integer", R"({"type": "integer"})", false);
|
||||||
|
test("type number", R"({"type": "number"})", false);
|
||||||
|
test("type boolean", R"({"type": "boolean"})", false);
|
||||||
|
test("type object", R"({"type": "object"})", false);
|
||||||
|
test("type array", R"({"type": "array"})", false);
|
||||||
|
|
||||||
|
// Type array (nullable string)
|
||||||
|
test("type array with string", R"({"type": ["string", "null"]})", true);
|
||||||
|
test("type array without string", R"({"type": ["integer", "null"]})", false);
|
||||||
|
|
||||||
|
// String-specific keywords
|
||||||
|
test("minLength implies string", R"({"minLength": 1})", true);
|
||||||
|
test("maxLength implies string", R"({"maxLength": 10})", true);
|
||||||
|
test("pattern implies string", R"({"pattern": "^[a-z]+$"})", true);
|
||||||
|
|
||||||
|
// Format
|
||||||
|
test("format date", R"({"format": "date"})", true);
|
||||||
|
test("format uuid", R"({"format": "uuid"})", true);
|
||||||
|
test("format email", R"({"format": "email"})", true);
|
||||||
|
|
||||||
|
// Const
|
||||||
|
test("const string", R"({"const": "hello"})", true);
|
||||||
|
test("const number", R"({"const": 123})", false);
|
||||||
|
|
||||||
|
// Enum
|
||||||
|
test("enum with strings", R"({"enum": ["a", "b", "c"]})", true);
|
||||||
|
test("enum with numbers", R"({"enum": [1, 2, 3]})", false);
|
||||||
|
test("enum mixed with string", R"({"enum": [1, "a", null]})", true);
|
||||||
|
|
||||||
|
// anyOf
|
||||||
|
test("anyOf with string", R"({"anyOf": [{"type": "string"}, {"type": "integer"}]})", true);
|
||||||
|
test("anyOf without string", R"({"anyOf": [{"type": "integer"}, {"type": "boolean"}]})", false);
|
||||||
|
|
||||||
|
// oneOf
|
||||||
|
test("oneOf with string", R"({"oneOf": [{"type": "string"}, {"type": "number"}]})", true);
|
||||||
|
test("oneOf without string", R"({"oneOf": [{"type": "object"}, {"type": "array"}]})", false);
|
||||||
|
|
||||||
|
// allOf - all must be strings
|
||||||
|
test("allOf all strings", R"({"allOf": [{"type": "string"}, {"minLength": 1}]})", true);
|
||||||
|
test("allOf mixed types", R"({"allOf": [{"type": "string"}, {"type": "integer"}]})", false);
|
||||||
|
|
||||||
|
// $ref
|
||||||
|
test("$ref to string",
|
||||||
|
R"({"$ref": "#/$defs/str", "$defs": {"str": {"type": "string"}}})", true);
|
||||||
|
test("$ref to integer",
|
||||||
|
R"({"$ref": "#/$defs/num", "$defs": {"num": {"type": "integer"}}})", false);
|
||||||
|
|
||||||
|
// Nested
|
||||||
|
test("nested anyOf with string",
|
||||||
|
R"({"anyOf": [{"anyOf": [{"type": "integer"}, {"type": "string"}]}, {"type": "boolean"}]})", true);
|
||||||
|
|
||||||
|
fprintf(stderr, "All resolves_to_string tests passed!\n");
|
||||||
|
}
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
|
fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
|
||||||
fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
|
fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
|
||||||
|
|
||||||
|
test_resolves_to_string();
|
||||||
|
|
||||||
test_all("C++", [](const TestCase & tc) {
|
test_all("C++", [](const TestCase & tc) {
|
||||||
try {
|
try {
|
||||||
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
|
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
|
||||||
|
|
|
||||||
|
|
@@ -0,0 +1,122 @@
|
||||||
|
// Test for state restore with fragmented KV cache
|
||||||
|
// This tests the fix for: https://github.com/ggml-org/llama.cpp/issues/17527
|
||||||
|
// The issue was that state restore required contiguous KV cache slots,
|
||||||
|
// which fails when the cache is fragmented.
|
||||||
|
//
|
||||||
|
// The fix changes find_slot(ubatch, true) to find_slot(ubatch, false)
|
||||||
|
// in state_read_meta(), allowing non-contiguous slot allocation.
|
||||||
|
|
||||||
|
#include "arg.h"
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
common_params params;
|
||||||
|
|
||||||
|
params.sampling.seed = 1234;
|
||||||
|
params.kv_unified = true;
|
||||||
|
params.n_parallel = 3;
|
||||||
|
params.n_ctx = 256;
|
||||||
|
|
||||||
|
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
common_init();
|
||||||
|
|
||||||
|
// init
|
||||||
|
common_init_result_ptr llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
|
llama_model * model = llama_init->model();
|
||||||
|
llama_context * ctx = llama_init->context();
|
||||||
|
|
||||||
|
if (model == nullptr || ctx == nullptr) {
|
||||||
|
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_UNUSED(model);
|
||||||
|
|
||||||
|
// tokenize prompt
|
||||||
|
std::vector<llama_token> tokens(70, 1);
|
||||||
|
|
||||||
|
// interleave the 3 sequences:
|
||||||
|
// 01201230123...
|
||||||
|
llama_batch batch = llama_batch_init(params.n_parallel*tokens.size(), 0, 1);
|
||||||
|
for (size_t i = 0; i < tokens.size(); i++) {
|
||||||
|
for (int s = 0; s < params.n_parallel; ++s) {
|
||||||
|
common_batch_add(batch, tokens[i], i, {s}, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
batch.logits[batch.n_tokens - 1] = true;
|
||||||
|
|
||||||
|
if (llama_decode(ctx, batch)) {
|
||||||
|
fprintf(stderr, "%s : failed to decode seq 0\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : processed prompt on seq 0, 1, 2 (%zu tokens each)\n", __func__, tokens.size());
|
||||||
|
|
||||||
|
// Save state of seq 1
|
||||||
|
std::vector<uint8_t> seq_state(llama_state_seq_get_size(ctx, 1));
|
||||||
|
const size_t ncopy = llama_state_seq_get_data(ctx, seq_state.data(), seq_state.size(), 1);
|
||||||
|
if (ncopy != seq_state.size()) {
|
||||||
|
fprintf(stderr, "%s : failed to save seq 1 state\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
fprintf(stderr, "%s : saved seq 1 state, %zu bytes\n", __func__, ncopy);
|
||||||
|
|
||||||
|
// clear seq 1 to create a "hole" in the KV cache (fragmentation)
|
||||||
|
// 0.20.20.20.2....
|
||||||
|
llama_memory_t mem = llama_get_memory(ctx);
|
||||||
|
llama_memory_seq_rm(mem, 1, -1, -1);
|
||||||
|
fprintf(stderr, "%s : cleared seq 1 to create fragmentation\n", __func__);
|
||||||
|
|
||||||
|
// Now the cache has holes where seq 1 was
|
||||||
|
// This creates fragmentation - there's no contiguous block large enough
|
||||||
|
// for the seq 1 state if we only look for contiguous slots
|
||||||
|
|
||||||
|
// Restore seq 1 state into seq 1 (should work with non-contiguous allocation)
|
||||||
|
// We use seq 1 since it's a valid sequence ID (0 to n_parallel-1)
|
||||||
|
// Before the fix, this would fail with "failed to find available cells in kv cache"
|
||||||
|
const size_t nset = llama_state_seq_set_data(ctx, seq_state.data(), seq_state.size(), 1);
|
||||||
|
if (nset != seq_state.size()) {
|
||||||
|
fprintf(stderr, "%s : FAILED to restore seq state into fragmented cache (got %zu, expected %zu)\n",
|
||||||
|
__func__, nset, seq_state.size());
|
||||||
|
fprintf(stderr, "%s : This is the bug - state restore fails with fragmented KV cache\n", __func__);
|
||||||
|
llama_batch_free(batch);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
fprintf(stderr, "%s : restored state into seq 1, %zu bytes\n", __func__, nset);
|
||||||
|
|
||||||
|
// Verify we can decode with the restored state
|
||||||
|
// Generate one token to verify the restored state is usable
|
||||||
|
auto sparams = llama_sampler_chain_default_params();
|
||||||
|
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));
|
||||||
|
|
||||||
|
auto next_token = llama_sampler_sample(smpl, ctx, -1);
|
||||||
|
auto next_token_str = common_token_to_piece(ctx, next_token);
|
||||||
|
|
||||||
|
common_batch_clear(batch);
|
||||||
|
common_batch_add(batch, next_token, (int)tokens.size(), {1}, true);
|
||||||
|
|
||||||
|
if (llama_decode(ctx, batch)) {
|
||||||
|
fprintf(stderr, "%s : failed to decode with restored state\n", __func__);
|
||||||
|
llama_sampler_free(smpl);
|
||||||
|
llama_batch_free(batch);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : successfully decoded with restored state, generated: '%s'\n", __func__, next_token_str.c_str());
|
||||||
|
fprintf(stderr, "%s : SUCCESS - state restore works with fragmented KV cache\n", __func__);
|
||||||
|
|
||||||
|
llama_sampler_free(smpl);
|
||||||
|
llama_batch_free(batch);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
@@ -37,4 +37,5 @@ else()
    add_subdirectory(cvector-generator)
    add_subdirectory(cvector-generator)
    add_subdirectory(export-lora)
    add_subdirectory(export-lora)
endif()
endif()
    add_subdirectory(fit-params)
endif()
endif()

@@ -0,0 +1 @@
TODO

@@ -1,4 +1,4 @@
|
||||||
# llama.cpp/tools/main
|
# llama.cpp/tools/completion
|
||||||
|
|
||||||
This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggml-org/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
|
This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggml-org/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
|
||||||
|
|
||||||
|
|
@ -27,64 +27,64 @@ Once downloaded, place your model in the models folder in llama.cpp.
|
||||||
##### Input prompt (One-and-done)
|
##### Input prompt (One-and-done)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
|
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
|
||||||
```
|
```
|
||||||
##### Conversation mode (Allow for continuous interaction with the model)
|
##### Conversation mode (Allow for continuous interaction with the model)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
|
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
|
||||||
```
|
```
|
||||||
|
|
||||||
##### Conversation mode using built-in jinja chat template
|
##### Conversation mode using built-in jinja chat template
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja
|
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja
|
||||||
```
|
```
|
||||||
|
|
||||||
##### One-and-done query using jinja with custom system prompt and a starting prompt
|
##### One-and-done query using jinja with custom system prompt and a starting prompt
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
|
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
|
||||||
```
|
```
|
||||||
|
|
||||||
##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
|
##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||||
```bash
|
```bash
|
||||||
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
||||||
```
|
```
|
||||||
|
|
||||||
### Windows:
|
### Windows:
|
||||||
|
|
||||||
##### Input prompt (One-and-done)
|
##### Input prompt (One-and-done)
|
||||||
```powershell
|
```powershell
|
||||||
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
|
./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
|
||||||
```
|
```
|
||||||
##### Conversation mode (Allow for continuous interaction with the model)
|
##### Conversation mode (Allow for continuous interaction with the model)
|
||||||
|
|
||||||
```powershell
|
```powershell
|
||||||
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
|
./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
|
||||||
```
|
```
|
||||||
|
|
||||||
##### Conversation mode using built-in jinja chat template
|
##### Conversation mode using built-in jinja chat template
|
||||||
|
|
||||||
```powershell
|
```powershell
|
||||||
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja
|
./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja
|
||||||
```
|
```
|
||||||
|
|
||||||
##### One-and-done query using jinja with custom system prompt and a starting prompt
|
##### One-and-done query using jinja with custom system prompt and a starting prompt
|
||||||
|
|
||||||
```powershell
|
```powershell
|
||||||
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
|
./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
|
||||||
```
|
```
|
||||||
|
|
||||||
##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
|
##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||||
|
|
||||||
```powershell
|
```powershell
|
||||||
llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
||||||
```
|
```
|
||||||
|
|
||||||
## Common Options
|
## Common Options
|
||||||
|
|
||||||
In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models:
|
In this section, we cover the most commonly used options for running the `llama-completion` program with the LLaMA models:
|
||||||
|
|
||||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/gemma-1.1-7b-it.Q4_K_M.gguf`; inferred from `--model-url` if set).
|
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/gemma-1.1-7b-it.Q4_K_M.gguf`; inferred from `--model-url` if set).
|
||||||
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote HTTP URL to download the file (e.g. [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
|
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote HTTP URL to download the file (e.g. [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
|
||||||
|
|
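
For instance, a one-and-done run that pulls the model straight from the URL above could look like the following sketch (the prompt is just a placeholder):

```bash
# the model path is inferred from --model-url, so -m is not required here
./llama-completion -mu "https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true" -no-cnv -p "Hello"
```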
@ -97,7 +97,7 @@ In this section, we cover the most commonly used options for running the `llama-
|
||||||
|
|
||||||
## Input Prompts
|
## Input Prompts
|
||||||
|
|
||||||
The `llama-cli` program provides several ways to interact with the LLaMA models using input prompts:
|
The `llama-completion` program provides several ways to interact with the LLaMA models using input prompts:
|
||||||
|
|
||||||
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
|
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
|
||||||
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
|
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
|
||||||
|
|
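
For example, a prompt stored in a text file can be supplied like this (`prompt.txt` is just a placeholder name):

```bash
# prompt.txt contains the text you want the model to continue
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --file prompt.txt -no-cnv
```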
@ -107,7 +107,7 @@ The `llama-cli` program provides several ways to interact with the LLaMA models
|
||||||
|
|
||||||
## Interaction
|
## Interaction
|
||||||
|
|
||||||
The `llama-cli` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
|
The `llama-completion` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
|
||||||
|
|
||||||
In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
|
In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
|
||||||
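
A minimal sketch of starting an interactive session with the options mentioned above (the prompt is illustrative):

```bash
# --interactive-first waits for your input before generating anything
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --interactive-first -p "Below is a conversation between a user and a helpful assistant."
```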
|
|
||||||
|
|
@ -136,7 +136,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o
|
||||||
The `--in-prefix` flag is used to add a prefix to your input; primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
|
The `--in-prefix` flag is used to add a prefix to your input; primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./llama-cli -r "User:" --in-prefix " "
|
./llama-completion -r "User:" --in-prefix " "
|
||||||
```
|
```
|
||||||
|
|
||||||
### In-Suffix
|
### In-Suffix
|
||||||
|
|
@ -144,7 +144,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
|
||||||
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
|
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
|
./llama-completion -r "User:" --in-prefix " " --in-suffix "Assistant:"
|
||||||
```
|
```
|
||||||
When the `--in-prefix` or `--in-suffix` options are enabled, the chat template (`--chat-template`) is disabled.
|
When the `--in-prefix` or `--in-suffix` options are enabled, the chat template (`--chat-template`) is disabled.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,8 @@
|
||||||
|
set(TARGET llama-fit-params)
|
||||||
|
add_executable(${TARGET} fit-params.cpp)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||||
|
|
||||||
|
if(LLAMA_TOOLS_INSTALL)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
endif()
|
||||||
|
|
@ -0,0 +1,55 @@
|
||||||
|
# fit-params
|
||||||
|
|
||||||
|
llama.cpp binaries can automatically fit the projected memory use of a model to the free device memory available at runtime.
|
||||||
|
This is controlled using the CLI arguments starting with `-fit`/`--fit`.
|
||||||
|
Internally, the code calls `llama_params_fit` to adjust the `llama_model_params` and `llama_context_params` structs.
|
||||||
|
`llama-fit-params` is a simple utility that prints the CLI arguments corresponding to these adjustments to stdout.
|
||||||
|
Example usage:
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
# First, run llama-fit-params and store the results in a file:
|
||||||
|
> ./build/bin/llama-fit-params --model /opt/models/qwen_3-30b3a-f16.gguf | tee args.txt
|
||||||
|
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
||||||
|
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
||||||
|
ggml_cuda_init: found 1 CUDA devices:
|
||||||
|
Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
|
||||||
|
build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu
|
||||||
|
llama_params_fit_impl: projected to use 61807 MiB of device memory vs. 24077 MiB of free device memory
|
||||||
|
llama_params_fit_impl: cannot fulfill margin of 1024 MiB, need to reduce device memory by 42444 MiB
|
||||||
|
llama_params_fit_impl: context size reduced from 40960 to 4096 -> need 3456 MiB less memory in total
|
||||||
|
llama_params_fit_impl: with only dense weights in device memory there is a total surplus of 16164 MiB
|
||||||
|
llama_params_fit_impl: distributing layers across devices with overflow to next device/system memory:
|
||||||
|
llama_params_fit_impl: - CUDA0 (NVIDIA GeForce RTX 4090): 48 layers (34 overflowing), 19187 MiB used, 1199 MiB free
|
||||||
|
llama_params_fit: successfully fit params to free device memory
|
||||||
|
llama_params_fit: fitting params to free memory took 1.15 seconds
|
||||||
|
Printing fitted CLI arguments to stdout...
|
||||||
|
-c 4096 -ngl 48 -ot blk\.14\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.15\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.16\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.17\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.18\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.19\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.20\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.21\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.22\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.23\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.24\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.25\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.26\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.27\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.28\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.29\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.30\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.31\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.32\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.33\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.34\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.35\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.36\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.37\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.38\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.39\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.40\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.41\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.42\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.43\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.44\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.45\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.46\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.47\.ffn_(up|down|gate)_(ch|)exps=CPU
|
||||||
|
|
||||||
|
# Next, use those results for a llama.cpp binary:
|
||||||
|
> cat args.txt | xargs ./build/bin/llama-server --model /opt/models/qwen_3-30b3a-f16.gguf
|
||||||
|
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
||||||
|
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
|
||||||
|
ggml_cuda_init: found 1 CUDA devices:
|
||||||
|
Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
|
||||||
|
build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu
|
||||||
|
system info: n_threads = 16, n_threads_batch = 16, total_threads = 32
|
||||||
|
|
||||||
|
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 890 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
|
||||||
|
|
||||||
|
main: binding port with default address family
|
||||||
|
main: HTTP server is listening, hostname: 127.0.0.1, port: 8080, http threads: 31
|
||||||
|
main: loading model
|
||||||
|
srv load_model: loading model '/opt/models/qwen_3-30b3a-f16.gguf'
|
||||||
|
llama_params_fit_impl: projected to use 19187 MiB of device memory vs. 24077 MiB of free device memory
|
||||||
|
llama_params_fit_impl: will leave 1199 >= 1024 MiB of free device memory, no changes needed
|
||||||
|
llama_params_fit: successfully fit params to free device memory
|
||||||
|
llama_params_fit: fitting params to free memory took 0.28 seconds
|
||||||
|
[...]
|
||||||
|
main: server is listening on http://127.0.0.1:8080 - starting the main loop
|
||||||
|
srv update_slots: all slots are idle
|
||||||
|
^Csrv operator(): operator(): cleaning up before exit...
|
||||||
|
|
||||||
|
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
||||||
|
llama_memory_breakdown_print: | - CUDA0 (RTX 4090) | 24077 = 945 + (19187 = 17904 + 384 + 898) + 3945 |
|
||||||
|
llama_memory_breakdown_print: | - Host | 58271 = 58259 + 0 + 12 |
|
||||||
|
```
|
||||||
|
|
@ -0,0 +1,62 @@
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include "arg.h"
|
||||||
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
common_params params;
|
||||||
|
|
||||||
|
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
common_init();
|
||||||
|
llama_backend_init();
|
||||||
|
llama_numa_init(params.numa);
|
||||||
|
auto mparams = common_model_params_to_llama(params);
|
||||||
|
auto cparams = common_context_params_to_llama(params);
|
||||||
|
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||||
|
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
||||||
|
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||||
|
|
||||||
|
LOG_INF("Printing fitted CLI arguments to stdout...\n");
|
||||||
|
std::cout << "-c " << cparams.n_ctx;
|
||||||
|
std::cout << " -ngl " << mparams.n_gpu_layers;
|
||||||
|
|
||||||
|
size_t nd = llama_max_devices();
|
||||||
|
while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) {
|
||||||
|
nd--;
|
||||||
|
}
|
||||||
|
if (nd > 1) {
|
||||||
|
for (size_t id = 0; id < nd; id++) {
|
||||||
|
if (id == 0) {
|
||||||
|
std::cout << " -ts ";
|
||||||
|
}
|
||||||
|
if (id > 0) {
|
||||||
|
std::cout << ",";
|
||||||
|
}
|
||||||
|
std::cout << mparams.tensor_split[id];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t ntbo = llama_max_tensor_buft_overrides();
|
||||||
|
for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) {
|
||||||
|
if (itbo == 0) {
|
||||||
|
std::cout << " -ot ";
|
||||||
|
}
|
||||||
|
if (itbo > 0) {
|
||||||
|
std::cout << ",";
|
||||||
|
}
|
||||||
|
std::cout << mparams.tensor_buft_overrides[itbo].pattern << "=" << ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft);
|
||||||
|
}
|
||||||
|
std::cout << "\n";
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
@ -80,7 +80,7 @@ Each test is repeated the number of times given by `-r`, and the results are ave
|
||||||
|
|
||||||
Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens.
|
Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens.
|
||||||
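
For example, a sketch of measuring performance at a context depth of 4096 tokens, averaged over 5 repetitions (the model path is illustrative):

```sh
./llama-bench -m models/model.gguf -d 4096 -r 5
```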
|
|
||||||
For a description of the other options, see the [main example](../main/README.md).
|
For a description of the other options, see the [completion example](../completion/README.md).
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> The measurements with `llama-bench` do not include the times for tokenization and for sampling.
|
> The measurements with `llama-bench` do not include the times for tokenization and for sampling.
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ add_library(mtmd
|
||||||
clip-graph.h
|
clip-graph.h
|
||||||
models/models.h
|
models/models.h
|
||||||
models/cogvlm.cpp
|
models/cogvlm.cpp
|
||||||
|
models/glm4v.cpp
|
||||||
models/internvl.cpp
|
models/internvl.cpp
|
||||||
models/kimivl.cpp
|
models/kimivl.cpp
|
||||||
models/llama4.cpp
|
models/llama4.cpp
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,8 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
|
||||||
|
#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)
|
||||||
|
|
||||||
struct clip_graph {
|
struct clip_graph {
|
||||||
const clip_model & model;
|
const clip_model & model;
|
||||||
const clip_hparams & hparams;
|
const clip_hparams & hparams;
|
||||||
|
|
@ -49,7 +51,7 @@ struct clip_graph {
|
||||||
void cb(ggml_tensor * cur0, const char * name, int il) const;
|
void cb(ggml_tensor * cur0, const char * name, int il) const;
|
||||||
|
|
||||||
// siglip2 naflex
|
// siglip2 naflex
|
||||||
ggml_tensor * resize_position_embeddings();
|
ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE);
|
||||||
|
|
||||||
// build vision transformer (ViT) cgraph
|
// build vision transformer (ViT) cgraph
|
||||||
// this function should cover most of the models
|
// this function should cover most of the models
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,7 @@
|
||||||
#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
|
#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
|
||||||
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
|
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
|
||||||
#define TN_PATCH_BIAS "v.patch_embd.bias"
|
#define TN_PATCH_BIAS "v.patch_embd.bias"
|
||||||
|
#define TN_NORM_EMBD "v.norm_embd.%s"
|
||||||
#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
|
#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
|
||||||
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
|
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
|
||||||
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
||||||
|
|
@ -86,6 +87,10 @@
|
||||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||||
#define TN_LN_POST "%s.post_ln.%s"
|
#define TN_LN_POST "%s.post_ln.%s"
|
||||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||||
|
#define TN_MM_UP "mm.up.%s"
|
||||||
|
#define TN_MM_GATE "mm.gate.%s"
|
||||||
|
#define TN_MM_DOWN "mm.down.%s"
|
||||||
|
#define TN_MM_POST_NORM "mm.post_norm.%s"
|
||||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||||
|
|
@ -95,7 +100,7 @@
|
||||||
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||||
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||||
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
|
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
|
||||||
#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
|
#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v
|
||||||
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
|
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
|
||||||
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
|
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
|
||||||
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
|
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
|
||||||
|
|
@ -165,6 +170,7 @@ enum projector_type {
|
||||||
PROJECTOR_TYPE_LIGHTONOCR,
|
PROJECTOR_TYPE_LIGHTONOCR,
|
||||||
PROJECTOR_TYPE_COGVLM,
|
PROJECTOR_TYPE_COGVLM,
|
||||||
PROJECTOR_TYPE_JANUS_PRO,
|
PROJECTOR_TYPE_JANUS_PRO,
|
||||||
|
PROJECTOR_TYPE_GLM4V,
|
||||||
PROJECTOR_TYPE_UNKNOWN,
|
PROJECTOR_TYPE_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -192,6 +198,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||||
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
||||||
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
|
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
|
||||||
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
||||||
|
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
|
||||||
};
|
};
|
||||||
|
|
||||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||||
|
|
@ -495,6 +502,8 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value);
|
||||||
|
|
||||||
//
|
//
|
||||||
// API used internally with mtmd
|
// API used internally with mtmd
|
||||||
//
|
//
|
||||||
|
|
|
||||||
|
|
@ -65,6 +65,13 @@ struct clip_hparams {
|
||||||
int32_t n_mel_bins = 0; // whisper preprocessor
|
int32_t n_mel_bins = 0; // whisper preprocessor
|
||||||
int32_t proj_stack_factor = 0; // ultravox
|
int32_t proj_stack_factor = 0; // ultravox
|
||||||
|
|
||||||
|
// audio-to-mel preprocessor params
|
||||||
|
int32_t audio_chunk_len = -1; // in seconds
|
||||||
|
int32_t audio_sample_rate = -1;
|
||||||
|
int32_t audio_n_fft = -1;
|
||||||
|
int32_t audio_window_len = -1;
|
||||||
|
int32_t audio_hop_len = -1;
|
||||||
|
|
||||||
// legacy
|
// legacy
|
||||||
bool has_llava_projector = false;
|
bool has_llava_projector = false;
|
||||||
int minicpmv_version = 0;
|
int minicpmv_version = 0;
|
||||||
|
|
@ -151,6 +158,8 @@ struct clip_model {
|
||||||
ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
|
ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
|
||||||
ggml_tensor * patch_bias = nullptr;
|
ggml_tensor * patch_bias = nullptr;
|
||||||
ggml_tensor * position_embeddings = nullptr;
|
ggml_tensor * position_embeddings = nullptr;
|
||||||
|
ggml_tensor * norm_embd_w = nullptr;
|
||||||
|
ggml_tensor * norm_embd_b = nullptr;
|
||||||
|
|
||||||
ggml_tensor * pre_ln_w = nullptr;
|
ggml_tensor * pre_ln_w = nullptr;
|
||||||
ggml_tensor * pre_ln_b = nullptr;
|
ggml_tensor * pre_ln_b = nullptr;
|
||||||
|
|
@ -165,6 +174,14 @@ struct clip_model {
|
||||||
ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
|
ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
|
||||||
ggml_tensor * mm_fc_w;
|
ggml_tensor * mm_fc_w;
|
||||||
ggml_tensor * mm_fc_b;
|
ggml_tensor * mm_fc_b;
|
||||||
|
ggml_tensor * mm_ffn_up_w = nullptr;
|
||||||
|
ggml_tensor * mm_ffn_up_b = nullptr;
|
||||||
|
ggml_tensor * mm_ffn_gate_w = nullptr;
|
||||||
|
ggml_tensor * mm_ffn_gate_b = nullptr;
|
||||||
|
ggml_tensor * mm_ffn_down_w = nullptr;
|
||||||
|
ggml_tensor * mm_ffn_down_b = nullptr;
|
||||||
|
ggml_tensor * mm_post_norm_w = nullptr;
|
||||||
|
ggml_tensor * mm_post_norm_b = nullptr;
|
||||||
|
|
||||||
// LLaVA projection
|
// LLaVA projection
|
||||||
ggml_tensor * mm_input_norm_w = nullptr;
|
ggml_tensor * mm_input_norm_w = nullptr;
|
||||||
|
|
@ -246,9 +263,10 @@ struct clip_model {
|
||||||
ggml_tensor * mm_input_proj_w = nullptr;
|
ggml_tensor * mm_input_proj_w = nullptr;
|
||||||
ggml_tensor * mm_soft_emb_norm_w = nullptr;
|
ggml_tensor * mm_soft_emb_norm_w = nullptr;
|
||||||
|
|
||||||
// pixtral
|
// pixtral, glm4v
|
||||||
ggml_tensor * token_embd_img_break = nullptr;
|
ggml_tensor * token_embd_img_break = nullptr;
|
||||||
ggml_tensor * mm_patch_merger_w = nullptr;
|
ggml_tensor * mm_patch_merger_w = nullptr;
|
||||||
|
ggml_tensor * mm_patch_merger_b = nullptr;
|
||||||
|
|
||||||
// ultravox / whisper encoder
|
// ultravox / whisper encoder
|
||||||
ggml_tensor * conv1d_1_w = nullptr;
|
ggml_tensor * conv1d_1_w = nullptr;
|
||||||
|
|
@ -278,3 +296,5 @@ struct clip_model {
|
||||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx);
|
||||||
|
|
|
||||||
|
|
@ -264,11 +264,11 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const {
|
||||||
}
|
}
|
||||||
|
|
||||||
// siglip2 naflex
|
// siglip2 naflex
|
||||||
ggml_tensor * clip_graph::resize_position_embeddings() {
|
ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
|
||||||
ggml_tensor * pos_embd = model.position_embeddings;
|
ggml_tensor * pos_embd = model.position_embeddings;
|
||||||
const int height = img.ny / patch_size;
|
const int height = img.ny / patch_size;
|
||||||
const int width = img.nx / patch_size;
|
const int width = img.nx / patch_size;
|
||||||
const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS;
|
const uint32_t mode = interpolation_mode;
|
||||||
const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
|
const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
|
||||||
|
|
||||||
GGML_ASSERT(pos_embd);
|
GGML_ASSERT(pos_embd);
|
||||||
|
|
@ -485,19 +485,14 @@ ggml_tensor * clip_graph::build_norm(
|
||||||
? ggml_rms_norm(ctx0, cur, norm_eps)
|
? ggml_rms_norm(ctx0, cur, norm_eps)
|
||||||
: ggml_norm(ctx0, cur, norm_eps);
|
: ggml_norm(ctx0, cur, norm_eps);
|
||||||
|
|
||||||
if (mw || mb) {
|
|
||||||
cb(cur, "norm", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (mw) {
|
if (mw) {
|
||||||
cur = ggml_mul(ctx0, cur, mw);
|
cur = ggml_mul(ctx0, cur, mw);
|
||||||
if (mb) {
|
cb(cur, "norm_w", il);
|
||||||
cb(cur, "norm_w", il);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mb) {
|
if (mb) {
|
||||||
cur = ggml_add(ctx0, cur, mb);
|
cur = ggml_add(ctx0, cur, mb);
|
||||||
|
cb(cur, "norm_b", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
return cur;
|
return cur;
|
||||||
|
|
@ -842,6 +837,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
{
|
{
|
||||||
builder = std::make_unique<clip_graph_llava>(ctx, img);
|
builder = std::make_unique<clip_graph_llava>(ctx, img);
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_GLM4V:
|
||||||
|
{
|
||||||
|
builder = std::make_unique<clip_graph_glm4v>(ctx, img);
|
||||||
|
} break;
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("missing cgraph builder");
|
GGML_ABORT("missing cgraph builder");
|
||||||
}
|
}
|
||||||
|
|
@ -1155,6 +1154,14 @@ struct clip_model_loader {
|
||||||
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
|
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_GLM4V:
|
||||||
|
{
|
||||||
|
hparams.rope_theta = 10000.0f;
|
||||||
|
hparams.n_merge = 2; // default value for GLM4-V
|
||||||
|
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
||||||
|
hparams.set_limit_image_tokens(8, 4096);
|
||||||
|
hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_LLAMA4:
|
case PROJECTOR_TYPE_LLAMA4:
|
||||||
{
|
{
|
||||||
hparams.rope_theta = 10000.0f;
|
hparams.rope_theta = 10000.0f;
|
||||||
|
|
@ -1170,11 +1177,15 @@ struct clip_model_loader {
|
||||||
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
|
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
|
||||||
model.proj_type == PROJECTOR_TYPE_GLMA;
|
model.proj_type == PROJECTOR_TYPE_GLMA;
|
||||||
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
|
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
|
||||||
if (hparams.n_mel_bins != 128) {
|
|
||||||
throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
|
|
||||||
}
|
|
||||||
hparams.ffn_op = FFN_GELU_ERF;
|
hparams.ffn_op = FFN_GELU_ERF;
|
||||||
log_ffn_op = "gelu_erf"; // temporary solution for logging
|
log_ffn_op = "gelu_erf"; // temporary solution for logging
|
||||||
|
|
||||||
|
// audio preprocessing params
|
||||||
|
hparams.audio_chunk_len = 30; // in seconds
|
||||||
|
hparams.audio_sample_rate = 16000;
|
||||||
|
hparams.audio_n_fft = 400;
|
||||||
|
hparams.audio_window_len = 400;
|
||||||
|
hparams.audio_hop_len = 160;
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
|
|
@ -1212,6 +1223,11 @@ struct clip_model_loader {
|
||||||
LOG_INF("\n--- audio hparams ---\n");
|
LOG_INF("\n--- audio hparams ---\n");
|
||||||
LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
|
LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
|
||||||
LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
|
LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
|
||||||
|
LOG_INF("%s: audio_chunk_len: %d\n", __func__, hparams.audio_chunk_len);
|
||||||
|
LOG_INF("%s: audio_sample_rate: %d\n", __func__, hparams.audio_sample_rate);
|
||||||
|
LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft);
|
||||||
|
LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len);
|
||||||
|
LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len);
|
||||||
}
|
}
|
||||||
LOG_INF("\n");
|
LOG_INF("\n");
|
||||||
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
|
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||||
|
|
@ -1273,6 +1289,9 @@ struct clip_model_loader {
|
||||||
model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
|
model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
|
||||||
model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
|
model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
|
||||||
|
|
||||||
|
model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false);
|
||||||
|
model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false);
|
||||||
|
|
||||||
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
|
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
|
||||||
|
|
||||||
// layers
|
// layers
|
||||||
|
|
@ -1461,6 +1480,20 @@ struct clip_model_loader {
|
||||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_GLM4V:
|
||||||
|
{
|
||||||
|
model.projection = get_tensor(TN_MM_PROJECTOR);
|
||||||
|
model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight"));
|
||||||
|
model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false);
|
||||||
|
model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
|
||||||
|
model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false);
|
||||||
|
model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight"));
|
||||||
|
model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false);
|
||||||
|
model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
|
||||||
|
model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false);
|
||||||
|
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"));
|
||||||
|
model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias"));
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_GEMMA3:
|
case PROJECTOR_TYPE_GEMMA3:
|
||||||
{
|
{
|
||||||
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
|
||||||
|
|
@ -1489,8 +1522,8 @@ struct clip_model_loader {
|
||||||
// [IMG_BREAK] token embedding
|
// [IMG_BREAK] token embedding
|
||||||
model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
|
model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
|
||||||
// for mistral small 3.1
|
// for mistral small 3.1
|
||||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
||||||
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
|
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||||
{
|
{
|
||||||
|
|
@ -1498,8 +1531,8 @@ struct clip_model_loader {
|
||||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
|
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
|
||||||
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||||
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
|
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
|
||||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
||||||
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
|
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_ULTRAVOX:
|
case PROJECTOR_TYPE_ULTRAVOX:
|
||||||
{
|
{
|
||||||
|
|
@ -1864,6 +1897,8 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||||
if (ctx_params.warmup) {
|
if (ctx_params.warmup) {
|
||||||
loader.warmup(*ctx_vision);
|
loader.warmup(*ctx_vision);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (loader.has_audio) {
|
if (loader.has_audio) {
|
||||||
|
|
@ -2573,6 +2608,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||||
case PROJECTOR_TYPE_QWEN2VL:
|
case PROJECTOR_TYPE_QWEN2VL:
|
||||||
case PROJECTOR_TYPE_QWEN25VL:
|
case PROJECTOR_TYPE_QWEN25VL:
|
||||||
case PROJECTOR_TYPE_QWEN3VL:
|
case PROJECTOR_TYPE_QWEN3VL:
|
||||||
|
case PROJECTOR_TYPE_GLM4V:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
|
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
|
||||||
clip_image_u8 resized;
|
clip_image_u8 resized;
|
||||||
|
|
@ -2815,16 +2851,30 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
|
||||||
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
||||||
const auto & params = ctx->model.hparams;
|
const auto & params = ctx->model.hparams;
|
||||||
const int n_total = clip_n_output_tokens(ctx, img);
|
const int n_total = clip_n_output_tokens(ctx, img);
|
||||||
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
|
const auto & proj = ctx->proj_type();
|
||||||
return img->nx / (params.patch_size * 2);
|
switch (proj) {
|
||||||
|
case PROJECTOR_TYPE_QWEN2VL:
|
||||||
|
case PROJECTOR_TYPE_QWEN25VL:
|
||||||
|
case PROJECTOR_TYPE_QWEN3VL:
|
||||||
|
case PROJECTOR_TYPE_GLM4V:
|
||||||
|
return (img->nx / params.patch_size) / 2;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
return n_total;
|
return n_total;
|
||||||
}
|
}
|
||||||
|
|
||||||
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
||||||
const auto & params = ctx->model.hparams;
|
const auto & params = ctx->model.hparams;
|
||||||
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
|
const auto & proj = ctx->proj_type();
|
||||||
return img->ny / (params.patch_size * 2);
|
switch (proj) {
|
||||||
|
case PROJECTOR_TYPE_QWEN2VL:
|
||||||
|
case PROJECTOR_TYPE_QWEN25VL:
|
||||||
|
case PROJECTOR_TYPE_QWEN3VL:
|
||||||
|
case PROJECTOR_TYPE_GLM4V:
|
||||||
|
return (img->ny / params.patch_size) / 2;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
@ -2881,6 +2931,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||||
case PROJECTOR_TYPE_QWEN2VL:
|
case PROJECTOR_TYPE_QWEN2VL:
|
||||||
case PROJECTOR_TYPE_QWEN25VL:
|
case PROJECTOR_TYPE_QWEN25VL:
|
||||||
case PROJECTOR_TYPE_QWEN3VL:
|
case PROJECTOR_TYPE_QWEN3VL:
|
||||||
|
case PROJECTOR_TYPE_GLM4V:
|
||||||
{
|
{
|
||||||
// dynamic size (2 conv, so double patch size)
|
// dynamic size (2 conv, so double patch size)
|
||||||
int x_patch = img->nx / (params.patch_size * 2);
|
int x_patch = img->nx / (params.patch_size * 2);
|
||||||
|
|
@ -3128,6 +3179,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_QWEN2VL:
|
case PROJECTOR_TYPE_QWEN2VL:
|
||||||
case PROJECTOR_TYPE_QWEN3VL:
|
case PROJECTOR_TYPE_QWEN3VL:
|
||||||
|
case PROJECTOR_TYPE_GLM4V:
|
||||||
{
|
{
|
||||||
const int merge_ratio = hparams.n_merge;
|
const int merge_ratio = hparams.n_merge;
|
||||||
const int pw = image_size_width / patch_size;
|
const int pw = image_size_width / patch_size;
|
||||||
|
|
@ -3354,7 +3406,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy the embeddings to the location passed by the user
|
// copy the embeddings to the location passed by the user
|
||||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
if (vec != nullptr) {
|
||||||
|
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
@ -3402,6 +3456,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||||
return ctx->model.mm_2_w->ne[1];
|
return ctx->model.mm_2_w->ne[1];
|
||||||
case PROJECTOR_TYPE_COGVLM:
|
case PROJECTOR_TYPE_COGVLM:
|
||||||
return ctx->model.mm_4h_to_h_w->ne[1];
|
return ctx->model.mm_4h_to_h_w->ne[1];
|
||||||
|
case PROJECTOR_TYPE_GLM4V:
|
||||||
|
return ctx->model.mm_ffn_down_w->ne[1];
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("Unknown projector type");
|
GGML_ABORT("Unknown projector type");
|
||||||
}
|
}
|
||||||
|
|
@ -3418,10 +3474,11 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
|
||||||
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
|
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
|
bool clip_is_mrope(const struct clip_ctx * ctx) {
|
||||||
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|
||||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|
||||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL;
|
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
|
||||||
|
|| ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_is_llava(const struct clip_ctx * ctx) {
|
bool clip_is_llava(const struct clip_ctx * ctx) {
|
||||||
|
|
@ -3478,3 +3535,26 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
|
||||||
batch->entries.push_back(clip_image_f32_ptr(audio));
|
batch->entries.push_back(clip_image_f32_ptr(audio));
|
||||||
batch->is_audio = true;
|
batch->is_audio = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
|
||||||
|
return &ctx->model.hparams;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// API for debugging
|
||||||
|
//
|
||||||
|
|
||||||
|
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
|
||||||
|
clip_image_f32 img;
|
||||||
|
img.nx = w;
|
||||||
|
img.ny = h;
|
||||||
|
img.buf.resize(h * w * 3);
|
||||||
|
for (int i = 0; i < h * w * 3; i++) {
|
||||||
|
img.buf[i] = static_cast<float>(fill_value);
|
||||||
|
}
|
||||||
|
bool cur_debug_graph = ctx->debug_graph;
|
||||||
|
ctx->debug_graph = true;
|
||||||
|
clip_image_encode(ctx, 1, &img, nullptr);
|
||||||
|
ctx->debug_graph = cur_debug_graph;
|
||||||
|
GGML_ASSERT(img.buf.empty() && "expected, always stop here");
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -104,7 +104,7 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
|
||||||
|
|
||||||
int clip_is_minicpmv(const struct clip_ctx * ctx);
|
int clip_is_minicpmv(const struct clip_ctx * ctx);
|
||||||
bool clip_is_glm(const struct clip_ctx * ctx);
|
bool clip_is_glm(const struct clip_ctx * ctx);
|
||||||
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
bool clip_is_mrope(const struct clip_ctx * ctx);
|
||||||
bool clip_is_llava(const struct clip_ctx * ctx);
|
bool clip_is_llava(const struct clip_ctx * ctx);
|
||||||
bool clip_is_gemma3(const struct clip_ctx * ctx);
|
bool clip_is_gemma3(const struct clip_ctx * ctx);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,120 @@
|
||||||
|
#include "models.h"
|
||||||
|
|
||||||
|
ggml_cgraph * clip_graph_glm4v::build() {
|
||||||
|
GGML_ASSERT(model.patch_bias != nullptr);
|
||||||
|
GGML_ASSERT(model.position_embeddings != nullptr);
|
||||||
|
GGML_ASSERT(model.class_embedding == nullptr);
|
||||||
|
|
||||||
|
const int batch_size = 1;
|
||||||
|
|
||||||
|
norm_type norm_t = NORM_TYPE_RMS;
|
||||||
|
|
||||||
|
ggml_tensor * inp_raw = build_inp_raw();
|
||||||
|
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||||
|
|
||||||
|
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
||||||
|
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4);
|
||||||
|
ggml_set_name(positions, "positions");
|
||||||
|
ggml_set_input(positions);
|
||||||
|
|
||||||
|
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
|
||||||
|
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
|
||||||
|
|
||||||
|
// second conv dimension
|
||||||
|
{
|
||||||
|
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||||
|
inp = ggml_add(ctx0, inp, inp_1);
|
||||||
|
|
||||||
|
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
|
||||||
|
inp = ggml_cont_4d(
|
||||||
|
ctx0, inp,
|
||||||
|
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
|
||||||
|
inp = ggml_reshape_4d(
|
||||||
|
ctx0, inp,
|
||||||
|
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
|
||||||
|
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
|
||||||
|
inp = ggml_cont_3d(
|
||||||
|
ctx0, inp,
|
||||||
|
n_embd, n_patches_x * n_patches_y, batch_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
// add patch bias
|
||||||
|
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||||
|
cb(inp, "patch_bias", -1);
|
||||||
|
|
||||||
|
// pos-conv norm
|
||||||
|
inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);
|
||||||
|
|
||||||
|
// calculate absolute position embedding and apply
|
||||||
|
ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
|
||||||
|
learned_pos_embd = ggml_cont_4d(
|
||||||
|
ctx0, learned_pos_embd,
|
||||||
|
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
|
||||||
|
learned_pos_embd = ggml_reshape_4d(
|
||||||
|
ctx0, learned_pos_embd,
|
||||||
|
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
|
||||||
|
learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
|
||||||
|
learned_pos_embd = ggml_cont_3d(
|
||||||
|
ctx0, learned_pos_embd,
|
||||||
|
n_embd, n_patches_x * n_patches_y, batch_size);
|
||||||
|
cb(learned_pos_embd, "learned_pos_embd", -1);
|
||||||
|
|
||||||
|
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
||||||
|
return ggml_rope_multi(
|
||||||
|
ctx0, cur, positions, nullptr,
|
||||||
|
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
|
||||||
|
32768, hparams.rope_theta, 1, 0, 1, 32, 1);
|
||||||
|
};
|
||||||
|
|
||||||
|
ggml_tensor * cur = build_vit(
|
||||||
|
inp, n_patches,
|
||||||
|
norm_t,
|
||||||
|
hparams.ffn_op,
|
||||||
|
learned_pos_embd,
|
||||||
|
add_pos);
|
||||||
|
|
||||||
|
cb(cur, "vit_out", -1);
|
||||||
|
// cb(ggml_sum(ctx0, cur), "vit_out_sum", -1);
|
||||||
|
|
||||||
|
// GLM4V projector
|
||||||
|
// ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130
|
||||||
|
|
||||||
|
// patch merger (downsample)
|
||||||
|
{
|
||||||
|
int n_merge = hparams.n_merge;
|
||||||
|
GGML_ASSERT(n_merge > 0);
|
||||||
|
|
||||||
|
int n_token_out = n_patches / n_merge / n_merge;
|
||||||
|
cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out);
|
||||||
|
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out]
|
||||||
|
cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1);
|
||||||
|
cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out]
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
// FC projector
|
||||||
|
{
|
||||||
|
cur = ggml_mul_mat(ctx0, model.projection, cur);
|
||||||
|
// default LayerNorm (post_projection_norm)
|
||||||
|
cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||||
|
cur = ggml_gelu_erf(ctx0, cur);
|
||||||
|
cb(cur, "after_fc_proj", -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// FFN projector
|
||||||
|
{
|
||||||
|
cur = build_ffn(cur,
|
||||||
|
model.mm_ffn_up_w, model.mm_ffn_up_b,
|
||||||
|
model.mm_ffn_gate_w, model.mm_ffn_gate_b,
|
||||||
|
model.mm_ffn_down_w, model.mm_ffn_down_b,
|
||||||
|
hparams.ffn_op, -1);
|
||||||
|
cb(cur, "after_ffn_proj", -1);
|
||||||
|
// cb(ggml_sum(ctx0, cur), "merged_sum", -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// build the graph
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
@ -56,3 +56,8 @@ struct clip_graph_whisper_enc : clip_graph {
|
||||||
clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||||
ggml_cgraph * build() override;
|
ggml_cgraph * build() override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct clip_graph_glm4v : clip_graph {
|
||||||
|
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||||
|
ggml_cgraph * build() override;
|
||||||
|
};
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
|
|
@ -1,6 +1,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "clip-model.h"
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
@ -8,18 +9,7 @@
|
||||||
|
|
||||||
#define MTMD_INTERNAL_HEADER
|
#define MTMD_INTERNAL_HEADER
|
||||||
|
|
||||||
#define WHISPER_ASSERT GGML_ASSERT
|
struct mtmd_audio_mel {
|
||||||
|
|
||||||
#define WHISPER_SAMPLE_RATE 16000
|
|
||||||
#define WHISPER_N_FFT 400
|
|
||||||
#define WHISPER_HOP_LENGTH 160
|
|
||||||
#define WHISPER_CHUNK_SIZE 30
|
|
||||||
|
|
||||||
#define COMMON_SAMPLE_RATE 16000
|
|
||||||
|
|
||||||
namespace whisper_preprocessor {
|
|
||||||
|
|
||||||
struct whisper_mel {
|
|
||||||
int n_len;
|
int n_len;
|
||||||
int n_len_org;
|
int n_len_org;
|
||||||
int n_mel;
|
int n_mel;
|
||||||
|
|
@ -27,23 +17,18 @@ struct whisper_mel {
|
||||||
std::vector<float> data;
|
std::vector<float> data;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct whisper_filters {
|
struct mtmd_audio_preprocessor {
|
||||||
int32_t n_mel;
|
const clip_hparams & hparams;
|
||||||
int32_t n_fft;
|
|
||||||
|
|
||||||
std::vector<float> data;
|
mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
|
||||||
|
|
||||||
|
virtual ~mtmd_audio_preprocessor() = default;
|
||||||
|
virtual void initialize() = 0; // NOT thread-safe
|
||||||
|
virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
bool preprocess_audio(
|
struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
|
||||||
const float * samples,
|
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||||||
size_t n_samples,
|
void initialize() override;
|
||||||
const whisper_filters & filters,
|
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||||||
std::vector<whisper_mel> & output);
|
};
|
||||||
|
|
||||||
} // namespace whisper_preprocessor
|
|
||||||
|
|
||||||
namespace whisper_precalc_filters {
|
|
||||||
|
|
||||||
whisper_preprocessor::whisper_filters get_128_bins();
|
|
||||||
|
|
||||||
} // namespace whisper_precalc_filters
|
|
||||||
|
|
|
||||||
|
|
@ -151,8 +151,7 @@ struct mtmd_context {
|
||||||
// string template for slice image delimiters with row/col (idefics3)
|
// string template for slice image delimiters with row/col (idefics3)
|
||||||
std::string sli_img_start_tmpl;
|
std::string sli_img_start_tmpl;
|
||||||
|
|
||||||
// for whisper, we pre-calculate the mel filter bank
|
std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
|
||||||
whisper_preprocessor::whisper_filters w_filters;
|
|
||||||
|
|
||||||
// TODO @ngxson : add timings
|
// TODO @ngxson : add timings
|
||||||
|
|
||||||
|
|
@ -218,7 +217,7 @@ struct mtmd_context {
|
||||||
|
|
||||||
void init_vision() {
|
void init_vision() {
|
||||||
GGML_ASSERT(ctx_v != nullptr);
|
GGML_ASSERT(ctx_v != nullptr);
|
||||||
use_mrope = clip_is_qwen2vl(ctx_v);
|
use_mrope = clip_is_mrope(ctx_v);
|
||||||
|
|
||||||
projector_type proj = clip_get_projector_type(ctx_v);
|
projector_type proj = clip_get_projector_type(ctx_v);
|
||||||
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
||||||
|
|
@ -310,6 +309,10 @@ struct mtmd_context {
|
||||||
img_beg = "<|image_start|>";
|
img_beg = "<|image_start|>";
|
||||||
img_end = "<|image_end|>";
|
img_end = "<|image_end|>";
|
||||||
|
|
||||||
|
} else if (proj == PROJECTOR_TYPE_GLM4V) {
|
||||||
|
img_beg = "<|begin_of_image|>";
|
||||||
|
img_end = "<|end_of_image|>";
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -317,14 +320,25 @@ struct mtmd_context {
|
||||||
GGML_ASSERT(ctx_a != nullptr);
|
GGML_ASSERT(ctx_a != nullptr);
|
||||||
projector_type proj = clip_get_projector_type(ctx_a);
|
projector_type proj = clip_get_projector_type(ctx_a);
|
||||||
|
|
||||||
if (clip_has_whisper_encoder(ctx_a)) {
|
|
||||||
// TODO @ngxson : check if model n_mel is 128 or 80
|
|
||||||
w_filters = whisper_precalc_filters::get_128_bins();
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
||||||
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
|
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
|
||||||
|
|
||||||
|
// set preprocessor
|
||||||
|
switch (proj) {
|
||||||
|
case PROJECTOR_TYPE_QWEN2A:
|
||||||
|
case PROJECTOR_TYPE_QWEN25O:
|
||||||
|
case PROJECTOR_TYPE_ULTRAVOX:
|
||||||
|
case PROJECTOR_TYPE_VOXTRAL:
|
||||||
|
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ABORT("unsupported audio projector type");
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize audio preprocessor
|
||||||
|
audio_preproc->initialize();
|
||||||
|
|
||||||
|
// set special tokens
|
||||||
if (proj == PROJECTOR_TYPE_QWEN2A) {
|
if (proj == PROJECTOR_TYPE_QWEN2A) {
|
||||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||||
aud_beg = "<|audio_bos|>";
|
aud_beg = "<|audio_bos|>";
|
||||||
|
|
@ -653,11 +667,10 @@ struct mtmd_tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
// preprocess audio
|
// preprocess audio
|
||||||
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
|
std::vector<mtmd_audio_mel> mel_spec_chunks;
|
||||||
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
|
|
||||||
const float * samples = (const float *)bitmap->data.data();
|
const float * samples = (const float *)bitmap->data.data();
|
||||||
size_t n_samples = bitmap->data.size() / sizeof(float);
|
size_t n_samples = bitmap->data.size() / sizeof(float);
|
||||||
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
|
bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
LOG_ERR("Unable to preprocess audio\n");
|
LOG_ERR("Unable to preprocess audio\n");
|
||||||
return 2;
|
return 2;
|
||||||
|
|
@@ -863,8 +876,7 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
     if (!ctx->ctx_a) {
         return -1;
     }
-    // for now, we assume that all audio models have the same bitrate
-    return 16000; // 16kHz
+    return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
 }

 //
@@ -1430,7 +1430,7 @@ Model presets allow advanced users to define custom configurations using an `.in
 llama-server --models-preset ./my-models.ini
 ```

-Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layer 123` is written as `n-gpu-layer = 123`.
+Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layers 123` is written as `n-gpu-layers = 123`.

 Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys.

@@ -1445,7 +1445,7 @@ version = 1
 ; string value
 chat-template = chatml
 ; numeric value
-n-gpu-layer = 123
+n-gpu-layers = 123
 ; flag value (for certain flags, you need to use the "no-" prefix for negation)
 jinja = true
 ; shorthand argument (for example, context size)
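To make the preset key forms concrete, here is a short sketch of what a complete file could look like. It is hypothetical: the section name and values are invented, and it only assumes what the documentation above states (long-form keys such as `n-gpu-layers`, shorthand keys such as `c`, and the `no-` prefix for negating certain flags). The file would be loaded with `llama-server --models-preset ./my-models.ini` as shown earlier.

```ini
; hypothetical my-models.ini -- section name and values are illustrative only
version = 1

[example-preset]
; long-form key, same as --n-gpu-layers 123
n-gpu-layers = 123
; shorthand key (context size)
c = 4096
; flag value; use the "no-" prefix to negate where applicable
jinja = true
```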
Binary file not shown.
@@ -619,11 +619,12 @@ flowchart TB

 ### Test Types

 | Type | Tool | Location | Command |
-| ------------- | ------------------ | -------------------------------- | ------------------- |
-| **E2E** | Playwright | `tests/e2e/` | `npm run test:e2e` |
-| **Unit** | Vitest | `tests/client/`, `tests/server/` | `npm run test:unit` |
-| **UI/Visual** | Storybook + Vitest | `tests/stories/` | `npm run test:ui` |
+| ------------- | ------------------ | ---------------- | ------------------- |
+| **Unit** | Vitest | `tests/unit/` | `npm run test:unit` |
+| **UI/Visual** | Storybook + Vitest | `tests/stories/` | `npm run test:ui` |
+| **E2E** | Playwright | `tests/e2e/` | `npm run test:e2e` |
+| **Client** | Vitest | `tests/client/`. | `npm run test:unit` |

 ### Running Tests

@@ -13,12 +13,11 @@
     "reset": "rm -rf .svelte-kit node_modules",
     "format": "prettier --write .",
     "lint": "prettier --check . && eslint .",
-    "test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:server -- --run && npm run test:e2e",
+    "test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:unit -- --run && npm run test:e2e",
     "test:e2e": "playwright test",
     "test:client": "vitest --project=client",
-    "test:server": "vitest --project=server",
+    "test:unit": "vitest --project=unit",
     "test:ui": "vitest --project=ui",
-    "test:unit": "vitest",
     "storybook": "storybook dev -p 6006",
     "build-storybook": "storybook build",
     "cleanup": "rm -rf .svelte-kit build node_modules test-results"
@@ -241,7 +241,7 @@
             </div>
         {/if}
     {:else if (isText || (isPdf && pdfViewMode === 'text')) && displayTextContent}
-        <SyntaxHighlightedCode code={displayTextContent} {language} maxWidth="69rem" />
+        <SyntaxHighlightedCode code={displayTextContent} {language} maxWidth="calc(69rem - 2rem)" />
     {:else if isAudio}
         <div class="flex items-center justify-center p-8">
             <div class="w-full max-w-md text-center">
|
@ -1,6 +1,6 @@
|
||||||
<script lang="ts">
|
<script lang="ts">
|
||||||
import { RemoveButton } from '$lib/components/app';
|
import { RemoveButton } from '$lib/components/app';
|
||||||
import { getFileTypeLabel, getPreviewText, formatFileSize, isTextFile } from '$lib/utils';
|
import { formatFileSize, getFileTypeLabel, getPreviewText, isTextFile } from '$lib/utils';
|
||||||
import { AttachmentType } from '$lib/enums';
|
import { AttachmentType } from '$lib/enums';
|
||||||
|
|
||||||
interface Props {
|
interface Props {
|
||||||
|
|
|
||||||
|
|
@@ -24,7 +24,7 @@
         MimeTypeImage,
         MimeTypeText
     } from '$lib/enums';
-    import { isIMEComposing } from '$lib/utils';
+    import { isIMEComposing, parseClipboardContent } from '$lib/utils';
     import {
         AudioRecorder,
         convertToWav,
@@ -191,7 +191,6 @@

         if ((!message.trim() && uploadedFiles.length === 0) || disabled || isLoading) return;

-        // Check if model is selected first
         if (!checkModelSelected()) return;

         const messageToSend = message.trim();
@@ -228,6 +227,31 @@

         const text = event.clipboardData.getData(MimeTypeText.PLAIN);

+        if (text.startsWith('"')) {
+            const parsed = parseClipboardContent(text);
+
+            if (parsed.textAttachments.length > 0) {
+                event.preventDefault();
+
+                message = parsed.message;
+
+                const attachmentFiles = parsed.textAttachments.map(
+                    (att) =>
+                        new File([att.content], att.name, {
+                            type: MimeTypeText.PLAIN
+                        })
+                );
+
+                onFileUpload?.(attachmentFiles);
+
+                setTimeout(() => {
+                    textareaRef?.focus();
+                }, 10);
+
+                return;
+            }
+        }
+
         if (
             text.length > 0 &&
             pasteLongTextToFileLength > 0 &&
@@ -35,7 +35,7 @@

 <div class="flex items-center gap-1 {className}">
     <DropdownMenu.Root>
-        <DropdownMenu.Trigger name="Attach files">
+        <DropdownMenu.Trigger name="Attach files" {disabled}>
             <Tooltip.Root>
                 <Tooltip.Trigger>
                     <Button
@@ -173,6 +173,7 @@
         />

         <ModelsSelector
+            {disabled}
             bind:this={selectorModelRef}
             currentModel={conversationModel}
             forceForegroundText={true}
@@ -1,6 +1,7 @@
 <script lang="ts">
     import { chatStore } from '$lib/stores/chat.svelte';
-    import { copyToClipboard, isIMEComposing } from '$lib/utils';
+    import { config } from '$lib/stores/settings.svelte';
+    import { copyToClipboard, isIMEComposing, formatMessageForClipboard } from '$lib/utils';
     import ChatMessageAssistant from './ChatMessageAssistant.svelte';
     import ChatMessageUser from './ChatMessageUser.svelte';
     import ChatMessageSystem from './ChatMessageSystem.svelte';
@@ -87,7 +88,9 @@
     }

     async function handleCopy() {
-        await copyToClipboard(message.content, 'Message copied to clipboard');
+        const asPlainText = Boolean(config().copyTextAttachmentsAsPlainText);
+        const clipboardContent = formatMessageForClipboard(message.content, message.extra, asPlainText);
+        await copyToClipboard(clipboardContent, 'Message copied to clipboard');
         onCopy?.(message);
     }

@@ -57,6 +57,11 @@
             label: 'Paste long text to file length',
             type: 'input'
         },
+        {
+            key: 'copyTextAttachmentsAsPlainText',
+            label: 'Copy text attachments as plain text',
+            type: 'checkbox'
+        },
         {
             key: 'enableContinueGeneration',
             label: 'Enable "Continue" button',
@@ -109,6 +114,16 @@
             key: 'disableAutoScroll',
             label: 'Disable automatic scroll',
             type: 'checkbox'
+        },
+        {
+            key: 'alwaysShowSidebarOnDesktop',
+            label: 'Always show sidebar on desktop',
+            type: 'checkbox'
+        },
+        {
+            key: 'autoShowSidebarOnNewChat',
+            label: 'Auto-show sidebar on new chat',
+            type: 'checkbox'
         }
     ]
 },
@@ -404,7 +419,7 @@
     </div>

     <!-- Mobile Header with Horizontal Scrollable Menu -->
-    <div class="flex flex-col md:hidden">
+    <div class="flex flex-col pt-6 md:hidden">
         <div class="border-b border-border/30 py-4">
             <!-- Horizontal Scrollable Category Menu with Navigation -->
             <div class="relative flex items-center" style="scroll-padding: 1rem;">
@@ -1,9 +1,11 @@
 <script lang="ts">
-    import { Download, Upload } from '@lucide/svelte';
+    import { Download, Upload, Trash2 } from '@lucide/svelte';
     import { Button } from '$lib/components/ui/button';
     import { DialogConversationSelection } from '$lib/components/app';
     import { createMessageCountMap } from '$lib/utils';
     import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
+    import { toast } from 'svelte-sonner';
+    import DialogConfirmation from '$lib/components/app/dialogs/DialogConfirmation.svelte';

     let exportedConversations = $state<DatabaseConversation[]>([]);
     let importedConversations = $state<DatabaseConversation[]>([]);
@@ -18,11 +20,14 @@
         []
     );

+    // Delete functionality state
+    let showDeleteDialog = $state(false);
+
     async function handleExportClick() {
         try {
             const allConversations = conversations();
             if (allConversations.length === 0) {
-                alert('No conversations to export');
+                toast.info('No conversations to export');
                 return;
             }

@@ -145,6 +150,36 @@
             alert('Failed to import conversations. Please check the file format.');
         }
     }
+
+    async function handleDeleteAllClick() {
+        try {
+            const allConversations = conversations();
+
+            if (allConversations.length === 0) {
+                toast.info('No conversations to delete');
+                return;
+            }
+
+            showDeleteDialog = true;
+        } catch (err) {
+            console.error('Failed to load conversations for deletion:', err);
+            toast.error('Failed to load conversations');
+        }
+    }
+
+    async function handleDeleteAllConfirm() {
+        try {
+            await conversationsStore.deleteAll();
+
+            showDeleteDialog = false;
+        } catch (err) {
+            console.error('Failed to delete conversations:', err);
+        }
+    }
+
+    function handleDeleteAllCancel() {
+        showDeleteDialog = false;
+    }
 </script>

 <div class="space-y-6">
@@ -229,6 +264,25 @@
                 </div>
             {/if}
         </div>
+
+        <div class="grid border-t border-border/30 pt-4">
+            <h4 class="mb-2 text-sm font-medium text-destructive">Delete All Conversations</h4>
+
+            <p class="mb-4 text-sm text-muted-foreground">
+                Permanently delete all conversations and their messages. This action cannot be undone.
+                Consider exporting your conversations first if you want to keep a backup.
+            </p>
+
+            <Button
+                class="text-destructive-foreground w-full justify-start justify-self-start bg-destructive hover:bg-destructive/80 md:w-auto"
+                onclick={handleDeleteAllClick}
+                variant="destructive"
+            >
+                <Trash2 class="mr-2 h-4 w-4" />
+
+                Delete all conversations
+            </Button>
+        </div>
     </div>
 </div>

|
||||||
onCancel={() => (showImportDialog = false)}
|
onCancel={() => (showImportDialog = false)}
|
||||||
onConfirm={handleImportConfirm}
|
onConfirm={handleImportConfirm}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
|
<DialogConfirmation
|
||||||
|
bind:open={showDeleteDialog}
|
||||||
|
title="Delete all conversations"
|
||||||
|
description="Are you sure you want to delete all conversations? This action cannot be undone and will permanently remove all your conversations and messages."
|
||||||
|
confirmText="Delete All"
|
||||||
|
cancelText="Cancel"
|
||||||
|
variant="destructive"
|
||||||
|
icon={Trash2}
|
||||||
|
onConfirm={handleDeleteAllConfirm}
|
||||||
|
onCancel={handleDeleteAllCancel}
|
||||||
|
/>
|
||||||
|
|
|
||||||
|
|
@@ -9,6 +9,7 @@
     import Input from '$lib/components/ui/input/input.svelte';
     import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
     import { chatStore } from '$lib/stores/chat.svelte';
+    import { getPreviewText } from '$lib/utils/text';
     import ChatSidebarActions from './ChatSidebarActions.svelte';

     const sidebar = Sidebar.useSidebar();
@@ -20,6 +21,9 @@
     let showEditDialog = $state(false);
     let selectedConversation = $state<DatabaseConversation | null>(null);
     let editedName = $state('');
+    let selectedConversationNamePreview = $derived.by(() =>
+        selectedConversation ? getPreviewText(selectedConversation.name) : ''
+    );

     let filteredConversations = $derived.by(() => {
         if (searchQuery.trim().length > 0) {
@@ -162,7 +166,7 @@
     bind:open={showDeleteDialog}
     title="Delete Conversation"
     description={selectedConversation
-        ? `Are you sure you want to delete "${selectedConversation.name}"? This action cannot be undone and will permanently remove all messages in this conversation.`
+        ? `Are you sure you want to delete "${selectedConversationNamePreview}"? This action cannot be undone and will permanently remove all messages in this conversation.`
         : ''}
     confirmText="Delete"
     cancelText="Cancel"
@@ -504,6 +504,14 @@
         background: hsl(var(--muted) / 0.1);
     }

+    /* User message markdown should keep table borders visible on light primary backgrounds */
+    div.markdown-user-content :global(table),
+    div.markdown-user-content :global(th),
+    div.markdown-user-content :global(td),
+    div.markdown-user-content :global(.table-wrapper) {
+        border-color: currentColor;
+    }
+
     /* Horizontal rules */
     div :global(hr) {
         border: none;
@@ -642,6 +650,21 @@
         background: var(--muted);
     }

+    /* Disable hover effects when rendering user messages */
+    .markdown-user-content :global(a),
+    .markdown-user-content :global(a:hover) {
+        color: var(--primary-foreground);
+    }
+
+    .markdown-user-content :global(table:hover) {
+        box-shadow: none;
+    }
+
+    .markdown-user-content :global(th:hover),
+    .markdown-user-content :global(td:hover) {
+        background: inherit;
+    }
+
     /* Enhanced blockquotes */
     div :global(blockquote) {
         transition: all 0.2s ease;
Some files were not shown because too many files have changed in this diff.