Merge branch 'ggml-org:master' into power-law-sampler

This commit is contained in:
ddh0 2025-12-13 01:17:53 -06:00 committed by GitHub
commit 1879fc6dc6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
62 changed files with 3133 additions and 2521 deletions

View File

@ -4,7 +4,7 @@
# Define the CANN base image for easier version updates later
ARG CHIP_TYPE=910b
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
# ==============================================================================
# BUILD STAGE
@ -111,7 +111,7 @@ ENTRYPOINT ["/app/tools.sh"]
# ==============================================================================
FROM base AS light
COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
ENTRYPOINT [ "/app/llama-cli" ]

View File

@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
FROM base AS light
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -94,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -105,7 +105,7 @@ WORKDIR /llama.cpp/bin
# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

View File

@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
exec ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
exec ./llama-cli "$@"
elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
exec ./llama-completion "$@"
elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
exec ./llama-bench "$@"
elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
@ -32,8 +34,10 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
else
echo "Unknown command: $arg1"
echo "Available commands: "
echo " --run (-r): Run a model previously converted into ggml"
echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
echo " --run (-r): Run a model (chat) previously converted into ggml"
echo " ex: -m /models/7B/ggml-model-q4_0.bin"
echo " --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
echo " ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
echo " --bench (-b): Benchmark the performance of the inference for various parameters."
echo " ex: -m model.gguf"
echo " --perplexity (-p): Measure the perplexity of a model over a given text."

View File

@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app

View File

@ -1400,25 +1400,54 @@ jobs:
chip_type: ['910b', '310p']
build: ['Release']
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
container: ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Dependencies
- name: Free up disk space
uses: ggml-org/free-disk-space@v1.3.1
with:
tool-cache: true
- name: Set container image
id: cann-image
run: |
yum update -y
yum install -y git gcc gcc-c++ make cmake libcurl-devel
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
echo "image=${image}" >> "${GITHUB_OUTPUT}"
- name: Pull container image
run: docker pull "${{ steps.cann-image.outputs.image }}"
- name: Build
env:
BUILD_TYPE: ${{ matrix.build }}
SOC_TYPE: ascend${{ matrix.chip_type }}
run: |
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
HOST_UID=$(id -u)
HOST_GID=$(id -g)
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-DGGML_CANN=on \
-DSOC_TYPE=ascend${{ matrix.chip_type }}
cmake --build build -j $(nproc)
docker run --rm \
-v "${PWD}:/workspace" \
-w /workspace \
-e SOC_TYPE=${SOC_TYPE} \
-e BUILD_TYPE=${BUILD_TYPE} \
"${{ steps.cann-image.outputs.image }}" \
bash -lc '
set -e
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
yum clean all && rm -rf /var/cache/yum
git config --global --add safe.directory "/workspace"
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DGGML_CANN=on \
-DSOC_TYPE=${SOC_TYPE}
cmake --build build -j $(nproc)
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
'
# TODO: simplify the following workflows using a matrix
# TODO: run lighter CI on PRs and the full CI only on master (if needed)

View File

@ -731,6 +731,78 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
openEuler-cann:
strategy:
matrix:
arch: [x86, aarch64]
chip_type: ['910b', '310p']
build: ['Release']
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Free up disk space
uses: ggml-org/free-disk-space@v1.3.1
with:
tool-cache: true
- name: Set container image
id: cann-image
run: |
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
echo "image=${image}" >> "${GITHUB_OUTPUT}"
- name: Pull container image
run: docker pull "${{ steps.cann-image.outputs.image }}"
- name: Build
env:
BUILD_TYPE: ${{ matrix.build }}
SOC_TYPE: ascend${{ matrix.chip_type }}
run: |
HOST_UID=$(id -u)
HOST_GID=$(id -g)
docker run --rm \
-v "${PWD}:/workspace" \
-w /workspace \
-e SOC_TYPE=${SOC_TYPE} \
-e BUILD_TYPE=${BUILD_TYPE} \
"${{ steps.cann-image.outputs.image }}" \
bash -lc '
set -e
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
yum clean all && rm -rf /var/cache/yum
git config --global --add safe.directory "/workspace"
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DGGML_CANN=on \
-DSOC_TYPE=${SOC_TYPE}
cmake --build build -j $(nproc)
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
'
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -752,6 +824,7 @@ jobs:
- macOS-arm64
- macOS-x64
- ios-xcode-build
- openEuler-cann
steps:
- name: Clone
@ -844,6 +917,12 @@ jobs:
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
**openEuler:**
- [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
- [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
- [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
- [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)
- name: Upload release
id: upload_release
uses: actions/github-script@v3

View File

@ -105,6 +105,16 @@ bool common_arg::is_exclude(enum llama_example ex) {
bool common_arg::get_value_from_env(std::string & output) const {
if (env == nullptr) return false;
if (!args_neg.empty()) {
// for compatibility, we need to check LLAMA_ARG_NO_ env as well
std::string neg_env = env;
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
char * neg_value = std::getenv(neg_env.c_str());
if (neg_value) {
output = "0"; // falsey
return true;
}
}
char * value = std::getenv(env);
if (value) {
output = value;
@ -114,6 +124,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
}
bool common_arg::has_value_from_env() const {
if (env != nullptr && !args_neg.empty()) {
// for compatibility, we need to check LLAMA_ARG_NO_ env as well
std::string neg_env = env;
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
if (std::getenv(neg_env.c_str())) {
return true;
}
}
return env != nullptr && std::getenv(env);
}
@ -151,9 +169,10 @@ std::string common_arg::to_string() const {
std::string leading_spaces(n_leading_spaces, ' ');
std::ostringstream ss;
for (const auto arg : args) {
if (arg == args.front()) {
if (args.size() == 1) {
auto all_args = get_args(); // also contains args_neg
for (const auto & arg : all_args) {
if (arg == all_args.front()) {
if (all_args.size() == 1) {
ss << arg;
} else {
// first arg is usually abbreviation, we need padding to make it more beautiful
@ -162,7 +181,7 @@ std::string common_arg::to_string() const {
ss << tmp << spaces;
}
} else {
ss << arg << (arg != args.back() ? ", " : "");
ss << arg << (arg != all_args.back() ? ", " : "");
}
}
if (value_hint) ss << " " << value_hint;
@ -181,6 +200,31 @@ std::string common_arg::to_string() const {
return ss.str();
}
std::vector<std::string> common_arg::get_args() const {
std::vector<std::string> result;
for (const auto & arg : args) {
result.push_back(std::string(arg));
}
for (const auto & arg : args_neg) {
result.push_back(std::string(arg));
}
return result;
}
std::vector<std::string> common_arg::get_env() const {
std::vector<std::string> result;
if (env) {
result.push_back(std::string(env));
}
if (!args_neg.empty() && env) {
// for compatibility, we need to add LLAMA_ARG_NO_ variant
std::string neg_env = env;
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
result.push_back(neg_env);
}
return result;
}
//
// utils
//
@ -316,6 +360,16 @@ static std::string get_all_kv_cache_types() {
return msg.str();
}
static bool parse_bool_value(const std::string & value) {
if (is_truthy(value)) {
return true;
} else if (is_falsey(value)) {
return false;
} else {
throw std::invalid_argument("invalid boolean value");
}
}
//
// CLI argument parsing functions
//
@ -323,10 +377,13 @@ static std::string get_all_kv_cache_types() {
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
common_params & params = ctx_arg.params;
std::unordered_map<std::string, common_arg *> arg_to_options;
std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
for (auto & opt : ctx_arg.options) {
for (const auto & arg : opt.args) {
arg_to_options[arg] = &opt;
arg_to_options[arg] = {&opt, /* is_positive */ true};
}
for (const auto & arg : opt.args_neg) {
arg_to_options[arg] = {&opt, /* is_positive */ false};
}
}
@ -335,12 +392,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
std::string value;
if (opt.get_value_from_env(value)) {
try {
if (opt.handler_void && (value == "1" || value == "true")) {
if (opt.handler_void && is_truthy(value)) {
opt.handler_void(params);
}
if (opt.handler_int) {
opt.handler_int(params, std::stoi(value));
}
if (opt.handler_bool) {
opt.handler_bool(params, parse_bool_value(value));
}
if (opt.handler_string) {
opt.handler_string(params, value);
continue;
@ -369,7 +429,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
if (arg_to_options.find(arg) == arg_to_options.end()) {
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
}
auto opt = *arg_to_options[arg];
auto & tmp = arg_to_options[arg];
auto opt = *tmp.first;
bool is_positive = tmp.second;
if (opt.has_value_from_env()) {
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
}
@ -378,6 +440,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
opt.handler_void(params);
continue;
}
if (opt.handler_bool) {
opt.handler_bool(params, is_positive);
continue;
}
// arg with single value
check_arg(i);
@ -402,7 +468,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument(string_format(
"error while handling argument \"%s\": %s\n\n"
"usage:\n%s\n\nto show complete usage, run with -h",
arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
arg.c_str(), e.what(), opt.to_string().c_str()));
}
}
@ -750,11 +816,11 @@ static std::string list_builtin_chat_templates() {
}
bool common_arg_utils::is_truthy(const std::string & value) {
return value == "on" || value == "enabled" || value == "1";
return value == "on" || value == "enabled" || value == "true" || value == "1";
}
bool common_arg_utils::is_falsey(const std::string & value) {
return value == "off" || value == "disabled" || value == "0";
return value == "off" || value == "disabled" || value == "false" || value == "0";
}
bool common_arg_utils::is_autoy(const std::string & value) {
@ -839,10 +905,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
));
add_opt(common_arg(
{"--display-prompt"},
{"--no-display-prompt"},
string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
[](common_params & params) {
params.display_prompt = false;
string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
[](common_params & params, bool value) {
params.display_prompt = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
@ -1055,18 +1122,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.kv_unified = true;
}
).set_env("LLAMA_ARG_KV_UNIFIED"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
[](common_params & params) {
params.ctx_shift = false;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg(
{"--context-shift"},
string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
[](common_params & params) {
params.ctx_shift = true;
{"--no-context-shift"},
string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.ctx_shift = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
add_opt(common_arg(
@ -1106,20 +1167,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--perf"},
{"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
[](common_params & params) {
params.no_perf = true;
params.sampling.no_perf = true;
string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
[](common_params & params, bool value) {
params.no_perf = !value;
params.sampling.no_perf = !value;
}
).set_env("LLAMA_ARG_NO_PERF"));
).set_env("LLAMA_ARG_PERF"));
add_opt(common_arg(
{"--show-timings"},
{"--no-show-timings"},
string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
[](common_params & params) {
params.show_timings = false;
string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
[](common_params & params, bool value) {
params.show_timings = value;
}
).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
add_opt(common_arg(
{"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)",
@ -1171,16 +1234,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-e", "--escape"},
string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
[](common_params & params) {
params.escape = true;
}
));
add_opt(common_arg(
{"--no-escape"},
"do not process escape sequences",
[](common_params & params) {
params.escape = false;
string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
[](common_params & params, bool value) {
params.escape = value;
}
));
add_opt(common_arg(
@ -1227,19 +1284,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-cnv", "--conversation"},
"run in conversation mode:\n"
{"-no-cnv", "--no-conversation"},
"whether to run in conversation mode:\n"
"- does not print special tokens and suffix/prefix\n"
"- interactive mode is also enabled\n"
"(default: auto enabled if chat template is available)",
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-no-cnv", "--no-conversation"},
"force disable conversation mode (default: false)",
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
[](common_params & params, bool value) {
params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
@ -1297,10 +1348,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--warmup"},
{"--no-warmup"},
"skip warming up the model with an empty run",
[](common_params & params) {
params.warmup = false;
string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.warmup = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
@ -1718,19 +1770,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-kvo", "--kv-offload"},
{"-nkvo", "--no-kv-offload"},
"disable KV offload",
[](common_params & params) {
params.no_kv_offload = true;
string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
[](common_params & params, bool value) {
params.no_kv_offload = !value;
}
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
).set_env("LLAMA_ARG_KV_OFFLOAD"));
add_opt(common_arg(
{"--repack"},
{"-nr", "--no-repack"},
"disable weight repacking",
[](common_params & params) {
params.no_extra_bufts = true;
string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
[](common_params & params, bool value) {
params.no_extra_bufts = !value;
}
).set_env("LLAMA_ARG_NO_REPACK"));
).set_env("LLAMA_ARG_REPACK"));
add_opt(common_arg(
{"--no-host"},
"bypass host buffer allowing extra buffers to be used",
@ -1859,20 +1913,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg(
{"-cb", "--cont-batching"},
string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
[](common_params & params) {
params.cont_batching = true;
{"-nocb", "--no-cont-batching"},
string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.cont_batching = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
add_opt(common_arg(
{"-nocb", "--no-cont-batching"},
"disable continuous batching",
[](common_params & params) {
params.cont_batching = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
add_opt(common_arg(
{"--mmproj"}, "FILE",
{"-mm", "--mmproj"}, "FILE",
"path to a multimodal projector file. see tools/mtmd/README.md\n"
"note: if -hf is used, this argument can be omitted",
[](common_params & params, const std::string & value) {
@ -1880,26 +1928,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
add_opt(common_arg(
{"--mmproj-url"}, "URL",
{"-mmu", "--mmproj-url"}, "URL",
"URL to a multimodal projector file. see tools/mtmd/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.url = value;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
add_opt(common_arg(
{"--no-mmproj"},
"explicitly disable multimodal projector, useful when using -hf",
[](common_params & params) {
params.no_mmproj = true;
{"--mmproj-auto"},
{"--no-mmproj", "--no-mmproj-auto"},
string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
[](common_params & params, bool value) {
params.no_mmproj = !value;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
add_opt(common_arg(
{"--mmproj-offload"},
{"--no-mmproj-offload"},
"do not offload multimodal projector to GPU",
[](common_params & params) {
params.mmproj_use_gpu = false;
string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.mmproj_use_gpu = value;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
add_opt(common_arg(
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
@ -1939,12 +1989,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_MLOCK"));
add_opt(common_arg(
{"--mmap"},
{"--no-mmap"},
"do not memory-map model (slower load but may reduce pageouts if not using mlock)",
[](common_params & params) {
params.use_mmap = false;
string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_mmap = value;
}
).set_env("LLAMA_ARG_NO_MMAP"));
).set_env("LLAMA_ARG_MMAP"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
@ -2132,10 +2183,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
));
add_opt(common_arg(
{"--op-offload"},
{"--no-op-offload"},
string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
[](common_params & params) {
params.no_op_offload = true;
string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
[](common_params & params, bool value) {
params.no_op_offload = !value;
}
));
add_opt(common_arg(
@ -2331,10 +2383,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--ppl"},
{"--no-ppl"},
string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
[](common_params & params) {
params.compute_ppl = false;
string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
[](common_params & params, bool value) {
params.compute_ppl = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
@ -2453,12 +2506,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg(
{"--webui"},
{"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
[](common_params & params) {
params.webui = false;
string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.webui = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
add_opt(common_arg(
{"--embedding", "--embeddings"},
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@ -2563,18 +2617,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
add_opt(common_arg(
{"--slots"},
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
[](common_params & params) {
params.endpoint_slots = true;
{"--no-slots"},
string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.endpoint_slots = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
add_opt(common_arg(
{"--no-slots"},
"disables slots monitoring endpoint",
[](common_params & params) {
params.endpoint_slots = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
add_opt(common_arg(
{"--slot-save-path"}, "PATH",
"path to save slot kv cache (default: disabled)",
@ -2625,26 +2673,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
add_opt(common_arg(
{"--models-autoload"},
{"--no-models-autoload"},
"disables automatic loading of models (default: enabled)",
[](common_params & params) {
params.models_autoload = false;
string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.models_autoload = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
add_opt(common_arg(
{"--jinja"},
string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
[](common_params & params) {
params.use_jinja = true;
{"--no-jinja"},
string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_jinja = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--no-jinja"},
string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
[](common_params & params) {
params.use_jinja = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@ -2689,15 +2732,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(
{"--prefill-assistant"},
{"--no-prefill-assistant"},
string_format(
"whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
"when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
),
[](common_params & params) {
params.prefill_assistant = false;
[](common_params & params, bool value) {
params.prefill_assistant = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),

View File

@ -16,6 +16,7 @@ struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::set<enum llama_example> excludes = {};
std::vector<const char *> args;
std::vector<const char *> args_neg; // for negated args like --no-xxx
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr;
@ -25,6 +26,7 @@ struct common_arg {
void (*handler_string) (common_params & params, const std::string &) = nullptr;
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (common_params & params, int) = nullptr;
void (*handler_bool) (common_params & params, bool) = nullptr;
common_arg() = default;
@ -48,6 +50,13 @@ struct common_arg {
void (*handler)(common_params & params)
) : args(args), help(help), handler_void(handler) {}
common_arg(
const std::initializer_list<const char *> & args,
const std::initializer_list<const char *> & args_neg,
const std::string & help,
void (*handler)(common_params & params, bool)
) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
// support 2 values for arg
common_arg(
const std::initializer_list<const char *> & args,
@ -80,6 +89,10 @@ struct common_arg {
}
return strcmp(args[0], other.args[0]) == 0;
}
// get all args and env vars (including negated args/env)
std::vector<std::string> get_args() const;
std::vector<std::string> get_env() const;
};
namespace common_arg_utils {

View File

@ -12,6 +12,8 @@
#include <filesystem>
#include <fstream>
#include <future>
#include <map>
#include <mutex>
#include <regex>
#include <string>
#include <thread>
@ -472,36 +474,79 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
#elif defined(LLAMA_USE_HTTPLIB)
static bool is_output_a_tty() {
class ProgressBar {
static inline std::mutex mutex;
static inline std::map<const ProgressBar *, int> lines;
static inline int max_line = 0;
static void cleanup(const ProgressBar * line) {
lines.erase(line);
if (lines.empty()) {
max_line = 0;
}
}
static bool is_output_a_tty() {
#if defined(_WIN32)
return _isatty(_fileno(stdout));
return _isatty(_fileno(stdout));
#else
return isatty(1);
return isatty(1);
#endif
}
static void print_progress(size_t current, size_t total) {
if (!is_output_a_tty()) {
return;
}
if (!total) {
return;
public:
ProgressBar() = default;
~ProgressBar() {
std::lock_guard<std::mutex> lock(mutex);
cleanup(this);
}
size_t width = 50;
size_t pct = (100 * current) / total;
size_t pos = (width * current) / total;
void update(size_t current, size_t total) {
if (!is_output_a_tty()) {
return;
}
std::cout << "["
<< std::string(pos, '=')
<< (pos < width ? ">" : "")
<< std::string(width - pos, ' ')
<< "] " << std::setw(3) << pct << "% ("
<< current / (1024 * 1024) << " MB / "
<< total / (1024 * 1024) << " MB)\r";
std::cout.flush();
}
if (!total) {
return;
}
std::lock_guard<std::mutex> lock(mutex);
if (lines.find(this) == lines.end()) {
lines[this] = max_line++;
std::cout << "\n";
}
int lines_up = max_line - lines[this];
size_t width = 50;
size_t pct = (100 * current) / total;
size_t pos = (width * current) / total;
std::cout << "\033[s";
if (lines_up > 0) {
std::cout << "\033[" << lines_up << "A";
}
std::cout << "\033[2K\r["
<< std::string(pos, '=')
<< (pos < width ? ">" : "")
<< std::string(width - pos, ' ')
<< "] " << std::setw(3) << pct << "% ("
<< current / (1024 * 1024) << " MB / "
<< total / (1024 * 1024) << " MB) "
<< "\033[u";
std::cout.flush();
if (current == total) {
cleanup(this);
}
}
ProgressBar(const ProgressBar &) = delete;
ProgressBar & operator=(const ProgressBar &) = delete;
};
static bool common_pull_file(httplib::Client & cli,
const std::string & resolve_path,
@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli,
const char * func = __func__; // avoid __func__ inside a lambda
size_t downloaded = existing_size;
size_t progress_step = 0;
ProgressBar bar;
auto res = cli.Get(resolve_path, headers,
[&](const httplib::Response &response) {
@ -554,7 +600,7 @@ static bool common_pull_file(httplib::Client & cli,
progress_step += len;
if (progress_step >= total_size / 1000 || downloaded == total_size) {
print_progress(downloaded, total_size);
bar.update(downloaded, total_size);
progress_step = 0;
}
return true;
@ -562,8 +608,6 @@ static bool common_pull_file(httplib::Client & cli,
nullptr
);
std::cout << "\n";
if (!res) {
LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
return false;

View File

@ -23,8 +23,14 @@ std::vector<std::string> common_preset::to_args() const {
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
// flag option, no value
if (common_arg_utils::is_falsey(value)) {
// skip the flag
args.pop_back();
// use negative arg if available
if (!opt.args_neg.empty()) {
args.back() = opt.args_neg.back();
} else {
// otherwise, skip the flag
// TODO: maybe throw an error instead?
args.pop_back();
}
}
}
if (opt.value_hint != nullptr) {
@ -141,10 +147,10 @@ static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_
static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
std::map<std::string, common_arg> mapping;
for (const auto & opt : ctx_params.options) {
if (opt.env != nullptr) {
mapping[opt.env] = opt;
for (const auto & env : opt.get_env()) {
mapping[env] = opt;
}
for (const auto & arg : opt.args) {
for (const auto & arg : opt.get_args()) {
mapping[rm_leading_dashes(arg)] = opt;
}
}

View File

@ -7286,6 +7286,10 @@ class DeepseekV2Model(TextModel):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
# [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
# note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
# ref https://github.com/ggml-org/llama.cpp/pull/17945
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
_experts: list[dict[str, Tensor]] | None = None
@ -10041,6 +10045,10 @@ class MistralMoeModel(DeepseekV2Model):
MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
yarn_params = self.hparams["yarn"]
self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
# [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
# note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
# ref https://github.com/ggml-org/llama.cpp/pull/17945
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):

View File

@ -14,12 +14,13 @@ static void write_table_header(std::ofstream & file) {
static void write_table_entry(std::ofstream & file, const common_arg & opt) {
file << "| `";
// args
for (const auto & arg : opt.args) {
if (arg == opt.args.front()) {
auto all_args = opt.get_args();
for (const auto & arg : all_args) {
if (arg == all_args.front()) {
file << arg;
if (opt.args.size() > 1) file << ", ";
if (all_args.size() > 1) file << ", ";
} else {
file << arg << (arg != opt.args.back() ? ", " : "");
file << arg << (arg != all_args.back() ? ", " : "");
}
}
// value hint

View File

@ -32,10 +32,6 @@ def quick_logits_check(pytorch_file, llamacpp_file):
print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
print(f"Max absolute difference: {max_diff:.4f}")
if max_diff > 1.0:
print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
return False
return True
def main():

View File

@ -99,6 +99,7 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_sme (void);
// other
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);

View File

@ -2548,6 +2548,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
case GGML_OP_GROUP_NORM:
return true;
case GGML_OP_PAD:
// TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
return ggml_get_op_params_i32(op, 8) == 0;

View File

@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
} ggml_arm_arch_features = { 0 };
#endif
#if defined(__riscv)
struct ggml_riscv_arch_features_type {
int rvv_vlen;
} ggml_riscv_arch_features = { 0 };
#endif
#if defined(_WIN32)
@ -703,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
#endif
#endif // __ARM_ARCH
#if defined(__riscv) && defined(__riscv_v_intrinsic)
#include <riscv_vector.h>
static void ggml_init_riscv_arch_features(void) {
ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
}
#else
static void ggml_init_riscv_arch_features(void) {}
#endif
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
GGML_ASSERT(!ggml_get_no_alloc(ctx));
@ -3459,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
#endif
}
int ggml_cpu_get_rvv_vlen(void) {
#if defined(__riscv) && defined(__riscv_v_intrinsic)
return ggml_riscv_arch_features.rvv_vlen;
#else
return 0;
#endif
}
int ggml_cpu_has_f16c(void) {
#if defined(__F16C__)
return 1;
@ -3625,6 +3647,10 @@ void ggml_cpu_init(void) {
ggml_init_arm_arch_features();
#endif
#if defined(__riscv)
ggml_init_riscv_arch_features();
#endif
is_first_call = false;
}

View File

@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_riscv_v()) {
features.push_back({ "RISCV_V", "1" });
}
if (ggml_cpu_get_rvv_vlen() > 0) {
static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
}
if (ggml_cpu_has_vsx()) {
features.push_back({ "VSX", "1" });
}

View File

@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
if (cur->type == GGML_TYPE_Q4_0) {
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|| (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
if (cur->ne[1] % 8 == 0) {
return &q4_0_8x8_q8_0;
}

View File

@ -67,19 +67,22 @@
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
#define GGML_CUDA_CC_RDNA3_5 (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops.
#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5)
#define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_RDNA3(cc) (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc))
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
// Moore Threads
#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons

View File

@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup(
const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
const int iter_j = (ne01 + (ncols1 - 1)) / ncols1;
const int kbc0 = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
const int kbc0 = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
const bool did_not_have_any_data = kbc0 == kbc0_stop;
const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;
@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup(
int bidx = bidx0 - 1;
int kbc_stop = kbc0;
while(true) {
const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
if (kbc == kbc_stop) { // Did not have any data.
bidx--;
kbc_stop = kbc;

View File

@ -1380,8 +1380,8 @@ static __global__ void flash_attn_ext_f16(
const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1;
// kbc == k block continuous, current index in continuous ijk space.
int kbc = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
int kbc = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
// If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
// For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).
@ -1401,7 +1401,7 @@ static __global__ void flash_attn_ext_f16(
const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0);
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
const half * mask_h = ncols2 == 1 && !mask ? nullptr :
(const half *) (mask + nb33*(sequence % ne33));
(const half *) (mask + nb33*(sequence % ne33));
float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2);
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));

View File

@ -189,6 +189,9 @@ namespace ggml_cuda_mma {
return 8 * (threadIdx.x / 16) + l;
#elif defined(RDNA3)
return 2 * l + (threadIdx.x / 16);
#else
NO_DEVICE_CODE;
return -1;
#endif // defined(RDNA4)
} else {
NO_DEVICE_CODE;
@ -290,8 +293,12 @@ namespace ggml_cuda_mma {
}
}
#elif defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA3)
// RDNA3 has duplicated data as input.
static constexpr int ne = I * J / 32 * 2;
#else
static constexpr int ne = I * J / 32;
#endif // defined(RDNA3)
half2 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
@ -310,7 +317,14 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 8) {
#if defined(RDNA4)
return 4 * (threadIdx.x / 16) + l;
#elif defined(RDNA3)
return l;
#else
NO_DEVICE_CODE;
return -1;
#endif // defined(RDNA4)
} else {
NO_DEVICE_CODE;
return -1;
@ -366,11 +380,16 @@ namespace ggml_cuda_mma {
static constexpr int I = I_;
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
static constexpr int ne = I * J / WARP_SIZE;
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
#if defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA3)
// RDNA3 has duplicated data as input.
static constexpr int ne = I * J / 32 * 2;
#else
static constexpr int ne = I * J / 32;
#endif // defined(RDNA3)
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 16 && J == 8) return true;
return false;
@ -387,13 +406,23 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 8) {
#if defined(RDNA4)
return 4 * (threadIdx.x / 16) + l;
#elif defined(RDNA3)
return l;
#else
NO_DEVICE_CODE;
return -1;
#endif // defined(RDNA4)
} else {
NO_DEVICE_CODE;
return -1;
}
}
#else
static constexpr int ne = I * J / WARP_SIZE;
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 8 && J == 8) return true;
if (I == 16 && J == 4) return true;
@ -546,8 +575,14 @@ namespace ggml_cuda_mma {
}
#elif defined(AMD_WMMA_AVAILABLE)
if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
#if defined(RDNA4)
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
#elif defined(RDNA3)
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2));
#else
NO_DEVICE_CODE;
#endif // defined(RDNA4)
} else if constexpr (std::is_same_v<T, int>) {
if constexpr (I == 16 && J == 4) {
int64_t * xi = (int64_t *) t.x;
@ -888,6 +923,16 @@ namespace ggml_cuda_mma {
const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
#elif defined(RDNA3)
using halfx16_t = __attribute__((ext_vector_type(16))) _Float16;
using floatx8_t = __attribute__((ext_vector_type(8))) float;
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
const halfx16_t& a_frag = reinterpret_cast<const halfx16_t&>(A.x[0]);
const halfx16_t& b_frag = reinterpret_cast<const halfx16_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, acc_frag);
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // RDNA4
#else
GGML_UNUSED_VARS(D, A, B);
@ -905,6 +950,16 @@ namespace ggml_cuda_mma {
const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
#elif defined(RDNA3)
using bf16x16_t = __attribute__((ext_vector_type(16))) __bf16;
using floatx8_t = __attribute__((ext_vector_type(8))) float;
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
const bf16x16_t& a_frag = reinterpret_cast<const bf16x16_t&>(A.x[0]);
const bf16x16_t& b_frag = reinterpret_cast<const bf16x16_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, acc_frag);
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // RDNA4
#else
GGML_UNUSED_VARS(D, A, B);

View File

@ -151,7 +151,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
return false;
}
} else {
if (src1_ncols > 16) {
if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) {
return false;
} else if (src1_ncols > 16) {
return false;
}
}
@ -160,9 +162,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
case GGML_TYPE_F32:
return ampere_mma_available(cc);
case GGML_TYPE_F16:
return volta_mma_available(cc) || turing_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
case GGML_TYPE_BF16:
return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
return ampere_mma_available(cc) || amd_wmma_available(cc);
default:
return false;
}

View File

@ -765,7 +765,10 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
return ne11 <= 8;
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
if (fp16_mma_hardware_available(cc)) {
if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
return ne11 <= 3;
}
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
return ne11 <= 5;
}
return ne11 <= 2;

View File

@ -574,7 +574,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
freq_base (cparams.rope_freq_base),
freq_scale (cparams.rope_freq_scale),
ext_factor (cparams.yarn_ext_factor),
attn_factor (cparams.yarn_attn_factor),
attn_factor (llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor)),
beta_fast (cparams.yarn_beta_fast),
beta_slow (cparams.yarn_beta_slow),
norm_eps (hparams.f_norm_eps),

View File

@ -1,7 +1,9 @@
#include "llama-hparams.h"
#include "ggml.h"
#include <cassert>
#include <cmath>
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
if (dense_first) {
@ -229,3 +231,13 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
return false;
}
float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) {
GGML_ASSERT(ext_factor >= 0.0f);
if (ext_factor != 0.0f) {
attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
}
return attn_factor;
}

View File

@ -107,6 +107,7 @@ struct llama_hparams {
float rope_freq_base_train_swa;
float rope_freq_scale_train;
float rope_freq_scale_train_swa;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f;
@ -267,7 +268,13 @@ struct llama_hparams {
// TODO: think of a better place for this function
// TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
// when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
// https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
//
// ref: https://github.com/ggml-org/llama.cpp/discussions/7416
// https://github.com/ggml-org/llama.cpp/pull/17945
static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor);
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

View File

@ -1369,9 +1369,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
float freq_scale) const {
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & yarn_attn_factor = llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor);
const auto & n_rot = hparams.n_rot;
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
@ -1382,12 +1383,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
? LLAMA_ROPE_TYPE_NEOX
: hparams.rope_type;
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
: cparams.yarn_attn_factor;
ggml_tensor * tmp;
if (ggml_is_quantized(cur->type)) {

View File

@ -1635,7 +1635,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// that have no expert_gating_func model parameter set
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
}
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// cancel the factor from the convert script
hparams.rope_yarn_log_mul /= 0.1f;
}
// (optional) temperature tuning - used by mistral-large
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
@ -2267,9 +2272,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
// TODO: maybe add n_attn_temp_floor_scale as a separate KV?
if (hparams.f_attn_temp_scale != 0.0f) {
@ -2279,18 +2284,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
}
// TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
// but may need further verification with other values
if (hparams.rope_yarn_log_mul != 0.0f) {
float factor = 1.0f / hparams.rope_freq_scale_train;
float mscale = 1.0f;
float mscale_all_dims = hparams.rope_yarn_log_mul;
static auto get_mscale = [](float scale, float mscale) {
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
};
hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
}
switch (hparams.n_layer) {
case 26: type = LLM_TYPE_3B; break;
case 34: type = LLM_TYPE_8B; break;
@ -2301,6 +2294,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: throw std::runtime_error("unsupported model architecture");
}
// ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
if (hparams.rope_yarn_log_mul != 0.0f) {
const float factor = 1.0f / hparams.rope_freq_scale_train;
// note: here we assume `mscale == 1.0f`
// TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
float mscale = 1.0f;
const float mscale_all_dims = hparams.rope_yarn_log_mul;
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// special-case DEEPSEEK v2:
// https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
if (arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
mscale = mscale_all_dims;
}
static auto get_mscale = [](float scale, float mscale) {
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
};
hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
__func__, hparams.yarn_attn_factor, mscale, mscale_all_dims);
}
pimpl->n_bytes = ml.n_bytes;
pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
@ -6806,6 +6825,7 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
// MRoPE (Multi-axis Rotary Position Embedding) sections
if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@ -6869,7 +6889,6 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
}
if (arch == LLM_ARCH_QWEN2MOE) {

View File

@ -1,7 +1,5 @@
#include "models.h"
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
// And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
GGML_ASSERT(ext_factor >= 0.0f);
const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
// use the original attn_factor to pre-scale the kq_scale
const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
ggml_tensor * cur;
ggml_tensor * inpL;

View File

@ -20,20 +20,20 @@ int main(void) {
std::unordered_set<std::string> seen_env_vars;
for (const auto & opt : ctx_arg.options) {
// check for args duplications
for (const auto & arg : opt.args) {
for (const auto & arg : opt.get_args()) {
if (seen_args.find(arg) == seen_args.end()) {
seen_args.insert(arg);
} else {
fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg);
fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg.c_str());
exit(1);
}
}
// check for env var duplications
if (opt.env) {
if (seen_env_vars.find(opt.env) == seen_env_vars.end()) {
seen_env_vars.insert(opt.env);
for (const auto & env : opt.get_env()) {
if (seen_env_vars.find(env) == seen_env_vars.end()) {
seen_env_vars.insert(env);
} else {
fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env);
fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", env.c_str());
exit(1);
}
}
@ -115,6 +115,14 @@ int main(void) {
assert(params.model.path == "blah.gguf");
assert(params.cpuparams.n_threads == 1010);
printf("test-arg-parser: test negated environment variables\n\n");
setenv("LLAMA_ARG_MMAP", "0", true);
setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format
argv = {"binary_name"};
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.use_mmap == false);
assert(params.no_perf == true);
printf("test-arg-parser: test environment variables being overwritten\n\n");

View File

@ -6,11 +6,25 @@ add_library(mtmd
mtmd.cpp
mtmd-audio.cpp
mtmd.h
mtmd-helper.cpp
mtmd-helper.h
clip.cpp
clip.h
clip-impl.h
mtmd-helper.cpp
mtmd-helper.h
clip-model.h
clip-graph.h
models/models.h
models/cogvlm.cpp
models/internvl.cpp
models/kimivl.cpp
models/llama4.cpp
models/llava.cpp
models/minicpmv.cpp
models/pixtral.cpp
models/qwen2vl.cpp
models/qwen3vl.cpp
models/siglip.cpp
models/whisper-enc.cpp
)
set_target_properties(mtmd PROPERTIES
@ -53,6 +67,15 @@ if (TARGET BUILD_INFO)
add_dependencies(mtmd-helper BUILD_INFO)
endif()
# if mtmd is linked against common, we throw an error
if (TARGET mtmd)
get_target_property(libs mtmd LINK_LIBRARIES)
if (libs AND "common" IN_LIST libs)
message(FATAL_ERROR "mtmd is designed to be a public library.\n"
"It must not link against common")
endif()
endif()
add_executable(llama-llava-cli deprecation-warning.cpp)
add_executable(llama-gemma3-cli deprecation-warning.cpp)
add_executable(llama-minicpmv-cli deprecation-warning.cpp)

115
tools/mtmd/clip-graph.h Normal file
View File

@ -0,0 +1,115 @@
#pragma once
#include "ggml.h"
#include "ggml-cpp.h"
#include "clip.h"
#include "clip-impl.h"
#include "clip-model.h"
#include <vector>
#include <functional>
struct clip_graph {
const clip_model & model;
const clip_hparams & hparams;
projector_type proj_type;
// we only support single image per batch
const clip_image_f32 & img;
const int patch_size;
const int n_patches_x;
const int n_patches_y;
const int n_patches;
const int n_embd;
const int n_head;
const int d_head;
const int n_layer;
const int n_mmproj_embd;
const float eps;
const float kq_scale;
const clip_flash_attn_type flash_attn_type;
// for debugging
const bool debug_graph;
std::vector<ggml_tensor *> & debug_print_tensors;
ggml_context_ptr ctx0_ptr;
ggml_context * ctx0;
ggml_cgraph * gf;
clip_graph(clip_ctx * ctx, const clip_image_f32 & img);
virtual ~clip_graph() = default;
virtual ggml_cgraph * build() = 0;
//
// utility functions
//
void cb(ggml_tensor * cur0, const char * name, int il) const;
// siglip2 naflex
ggml_tensor * resize_position_embeddings();
// build vision transformer (ViT) cgraph
// this function should cover most of the models
// if your model has specific features, you should probably duplicate this function
ggml_tensor * build_vit(
ggml_tensor * inp,
int64_t n_pos,
norm_type norm_t,
ffn_op_type ffn_t,
ggml_tensor * learned_pos_embd,
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
// build the input after conv2d (inp_raw --> patches)
// returns tensor with shape [n_embd, n_patches]
ggml_tensor * build_inp();
ggml_tensor * build_inp_raw(int channels = 3);
ggml_tensor * build_norm(
ggml_tensor * cur,
ggml_tensor * mw,
ggml_tensor * mb,
norm_type type,
float norm_eps,
int il) const;
ggml_tensor * build_ffn(
ggml_tensor * cur,
ggml_tensor * up,
ggml_tensor * up_b,
ggml_tensor * gate,
ggml_tensor * gate_b,
ggml_tensor * down,
ggml_tensor * down_b,
ffn_op_type type_op,
int il) const;
ggml_tensor * build_attn(
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_mask,
float kq_scale,
int il) const;
// implementation of the 2D RoPE without adding a new op in ggml
// this is not efficient (use double the memory), but works on all backends
// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
ggml_tensor * build_rope_2d(
ggml_context * ctx0,
ggml_tensor * cur,
ggml_tensor * pos_a, // first half
ggml_tensor * pos_b, // second half
const float freq_base,
const bool interleave_freq
);
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
// support dynamic resolution
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
};

View File

@ -1,3 +1,5 @@
#pragma once
#include "ggml.h"
#include "gguf.h"
#include "clip.h"
@ -13,6 +15,8 @@
// Internal header for clip.cpp
#define MTMD_INTERNAL_HEADER
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
@ -132,6 +136,10 @@
// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
// forward declaration
// TODO: improve this later
struct clip_ctx;
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,

279
tools/mtmd/clip-model.h Normal file
View File

@ -0,0 +1,279 @@
#pragma once
#include "ggml.h"
#include "clip.h"
#include "clip-impl.h"
#include <vector>
#include <unordered_set>
#include <cstdint>
#include <cmath>
enum ffn_op_type {
FFN_GELU,
FFN_GELU_ERF,
FFN_SILU,
FFN_GELU_QUICK,
};
enum norm_type {
NORM_TYPE_NORMAL,
NORM_TYPE_RMS,
};
enum patch_merge_type {
PATCH_MERGE_FLAT,
PATCH_MERGE_SPATIAL_UNPAD,
};
struct clip_hparams {
int32_t image_size = 0;
int32_t patch_size = 0;
int32_t n_embd = 0;
int32_t n_ff = 0;
int32_t projection_dim = 0;
int32_t n_head = 0;
int32_t n_layer = 0;
// idefics3
int32_t image_longest_edge = 0;
int32_t image_min_pixels = -1;
int32_t image_max_pixels = -1;
int32_t n_merge = 0; // number of patch merges **per-side**
float image_mean[3];
float image_std[3];
// for models using dynamic image size, we need to have a smaller image size to warmup
// otherwise, user will get OOM everytime they load the model
int32_t warmup_image_size = 0;
int32_t warmup_audio_size = 3000;
ffn_op_type ffn_op = FFN_GELU;
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
float eps = 1e-6;
float rope_theta = 0.0;
std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
int32_t image_crop_resolution;
std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;
// audio
int32_t n_mel_bins = 0; // whisper preprocessor
int32_t proj_stack_factor = 0; // ultravox
// legacy
bool has_llava_projector = false;
int minicpmv_version = 0;
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
// custom value provided by user, can be undefined if not set
int32_t custom_image_min_tokens = -1;
int32_t custom_image_max_tokens = -1;
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
const int cur_merge = n_merge == 0 ? 1 : n_merge;
const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
}
void set_warmup_n_tokens(int n_tokens) {
int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
const int cur_merge = n_merge == 0 ? 1 : n_merge;
warmup_image_size = n_tok_per_side * patch_size * cur_merge;
// TODO: support warmup size for custom token numbers
}
};
struct clip_layer {
// attention
ggml_tensor * k_w = nullptr;
ggml_tensor * k_b = nullptr;
ggml_tensor * q_w = nullptr;
ggml_tensor * q_b = nullptr;
ggml_tensor * v_w = nullptr;
ggml_tensor * v_b = nullptr;
ggml_tensor * qkv_w = nullptr;
ggml_tensor * qkv_b = nullptr;
ggml_tensor * o_w = nullptr;
ggml_tensor * o_b = nullptr;
ggml_tensor * k_norm = nullptr;
ggml_tensor * q_norm = nullptr;
// layernorm 1
ggml_tensor * ln_1_w = nullptr;
ggml_tensor * ln_1_b = nullptr;
ggml_tensor * ff_up_w = nullptr;
ggml_tensor * ff_up_b = nullptr;
ggml_tensor * ff_gate_w = nullptr;
ggml_tensor * ff_gate_b = nullptr;
ggml_tensor * ff_down_w = nullptr;
ggml_tensor * ff_down_b = nullptr;
// layernorm 2
ggml_tensor * ln_2_w = nullptr;
ggml_tensor * ln_2_b = nullptr;
// layer scale (no bias)
ggml_tensor * ls_1_w = nullptr;
ggml_tensor * ls_2_w = nullptr;
// qwen3vl deepstack merger
ggml_tensor * deepstack_norm_w = nullptr;
ggml_tensor * deepstack_norm_b = nullptr;
ggml_tensor * deepstack_fc1_w = nullptr;
ggml_tensor * deepstack_fc1_b = nullptr;
ggml_tensor * deepstack_fc2_w = nullptr;
ggml_tensor * deepstack_fc2_b = nullptr;
bool has_deepstack() const {
return deepstack_fc1_w != nullptr;
}
};
struct clip_model {
clip_modality modality = CLIP_MODALITY_VISION;
projector_type proj_type = PROJECTOR_TYPE_MLP;
clip_hparams hparams;
// embeddings
ggml_tensor * class_embedding = nullptr;
ggml_tensor * patch_embeddings_0 = nullptr;
ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
ggml_tensor * patch_bias = nullptr;
ggml_tensor * position_embeddings = nullptr;
ggml_tensor * pre_ln_w = nullptr;
ggml_tensor * pre_ln_b = nullptr;
std::vector<clip_layer> layers;
int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
ggml_tensor * post_ln_w;
ggml_tensor * post_ln_b;
ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
ggml_tensor * mm_fc_w;
ggml_tensor * mm_fc_b;
// LLaVA projection
ggml_tensor * mm_input_norm_w = nullptr;
ggml_tensor * mm_input_norm_b = nullptr;
ggml_tensor * mm_0_w = nullptr;
ggml_tensor * mm_0_b = nullptr;
ggml_tensor * mm_2_w = nullptr;
ggml_tensor * mm_2_b = nullptr;
ggml_tensor * image_newline = nullptr;
// Yi type models with mlp+normalization projection
ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
ggml_tensor * mm_1_b = nullptr;
ggml_tensor * mm_3_w = nullptr;
ggml_tensor * mm_3_b = nullptr;
ggml_tensor * mm_4_w = nullptr;
ggml_tensor * mm_4_b = nullptr;
// GLMV-Edge projection
ggml_tensor * mm_model_adapter_conv_w = nullptr;
ggml_tensor * mm_model_adapter_conv_b = nullptr;
// MobileVLM projection
ggml_tensor * mm_model_mlp_1_w = nullptr;
ggml_tensor * mm_model_mlp_1_b = nullptr;
ggml_tensor * mm_model_mlp_3_w = nullptr;
ggml_tensor * mm_model_mlp_3_b = nullptr;
ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
// MobileVLM_V2 projection
ggml_tensor * mm_model_mlp_0_w = nullptr;
ggml_tensor * mm_model_mlp_0_b = nullptr;
ggml_tensor * mm_model_mlp_2_w = nullptr;
ggml_tensor * mm_model_mlp_2_b = nullptr;
ggml_tensor * mm_model_peg_0_w = nullptr;
ggml_tensor * mm_model_peg_0_b = nullptr;
// MINICPMV projection
ggml_tensor * mm_model_pos_embed_k = nullptr;
ggml_tensor * mm_model_query = nullptr;
ggml_tensor * mm_model_proj = nullptr;
ggml_tensor * mm_model_kv_proj = nullptr;
ggml_tensor * mm_model_attn_q_w = nullptr;
ggml_tensor * mm_model_attn_q_b = nullptr;
ggml_tensor * mm_model_attn_k_w = nullptr;
ggml_tensor * mm_model_attn_k_b = nullptr;
ggml_tensor * mm_model_attn_v_w = nullptr;
ggml_tensor * mm_model_attn_v_b = nullptr;
ggml_tensor * mm_model_attn_o_w = nullptr;
ggml_tensor * mm_model_attn_o_b = nullptr;
ggml_tensor * mm_model_ln_q_w = nullptr;
ggml_tensor * mm_model_ln_q_b = nullptr;
ggml_tensor * mm_model_ln_kv_w = nullptr;
ggml_tensor * mm_model_ln_kv_b = nullptr;
ggml_tensor * mm_model_ln_post_w = nullptr;
ggml_tensor * mm_model_ln_post_b = nullptr;
// gemma3
ggml_tensor * mm_input_proj_w = nullptr;
ggml_tensor * mm_soft_emb_norm_w = nullptr;
// pixtral
ggml_tensor * token_embd_img_break = nullptr;
ggml_tensor * mm_patch_merger_w = nullptr;
// ultravox / whisper encoder
ggml_tensor * conv1d_1_w = nullptr;
ggml_tensor * conv1d_1_b = nullptr;
ggml_tensor * conv1d_2_w = nullptr;
ggml_tensor * conv1d_2_b = nullptr;
ggml_tensor * mm_norm_pre_w = nullptr;
ggml_tensor * mm_norm_mid_w = nullptr;
// cogvlm
ggml_tensor * mm_post_fc_norm_w = nullptr;
ggml_tensor * mm_post_fc_norm_b = nullptr;
ggml_tensor * mm_h_to_4h_w = nullptr;
ggml_tensor * mm_gate_w = nullptr;
ggml_tensor * mm_4h_to_h_w = nullptr;
ggml_tensor * mm_boi = nullptr;
ggml_tensor * mm_eoi = nullptr;
bool audio_has_avgpool() const {
return proj_type == PROJECTOR_TYPE_QWEN2A
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
}
bool audio_has_stack_frames() const {
return proj_type == PROJECTOR_TYPE_ULTRAVOX
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
}
};

File diff suppressed because it is too large Load Diff

View File

@ -7,6 +7,8 @@
// !!! Internal header, to be used by mtmd only !!!
#define MTMD_INTERNAL_HEADER
struct clip_ctx;
struct clip_image_size {

View File

@ -0,0 +1,98 @@
#include "models.h"
ggml_cgraph * clip_graph_cogvlm::build() {
GGML_ASSERT(model.class_embedding != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
const int n_pos = n_patches + 1; // +1 for [CLS]
// build input and concatenate class embedding
ggml_tensor * inp = build_inp();
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
inp = ggml_add(ctx0, inp, model.position_embeddings);
cb(inp, "inp_pos", -1);
ggml_tensor * inpL = inp;
for (int il = 0; il < n_layer; il++) {
auto & layer = model.layers[il];
ggml_tensor * cur = inpL;
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], 0);
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], n_embd * sizeof(float));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], 2 * n_embd * sizeof(float));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
cb(cur, "attn_post_norm", il);
cur = ggml_add(ctx0, cur, inpL);
inpL = cur;
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
layer.ff_gate_w, layer.ff_gate_b,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
cb(cur, "ffn_out", il);
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
cb(cur, "ffn_post_norm", il);
cur = ggml_add(ctx0, cur, inpL);
cb(cur, "layer_out", il);
inpL = cur;
}
// remove CLS token (like build_llama4 does)
ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
n_embd, n_patches,
ggml_row_size(inpL->type, n_embd), 0);
// Multiply with mm_model_proj
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
// Apply layernorm, weight, bias
cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
// Apply GELU
cur = ggml_gelu_inplace(ctx0, cur);
// Branch 1: multiply with mm_h_to_4h_w
ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
// Branch 2: multiply with mm_gate_w
ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
// Apply silu
gate = ggml_swiglu_split(ctx0, gate, h_to_4h);
// Apply mm_4h_to_h_w
cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
// Concatenate with boi and eoi
cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@ -0,0 +1,69 @@
#include "models.h"
ggml_cgraph * clip_graph_internvl::build() {
GGML_ASSERT(model.class_embedding != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
const int n_pos = n_patches + 1;
ggml_tensor * inp = build_inp();
// add CLS token
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
// The larger models use a different ViT, which uses RMS norm instead of layer norm
// ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
: NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
ggml_tensor * cur = build_vit(
inp, n_pos,
norm_t,
hparams.ffn_op,
model.position_embeddings,
nullptr);
// remove CLS token
cur = ggml_view_2d(ctx0, cur,
n_embd, n_patches,
ggml_row_size(cur->type, n_embd), 0);
// pixel shuffle
{
const int scale_factor = model.hparams.n_merge;
const int bsz = 1; // batch size, always 1 for now since we don't support batching
const int height = n_patches_y;
const int width = n_patches_x;
GGML_ASSERT(scale_factor > 0);
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_cont_4d(ctx0, cur,
n_embd * scale_factor * scale_factor,
height / scale_factor,
width / scale_factor,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// flatten to 2D
cur = ggml_cont_2d(ctx0, cur,
n_embd * scale_factor * scale_factor,
cur->ne[1] * cur->ne[2]);
}
// projector (always using GELU activation)
{
// projector LayerNorm uses pytorch's default eps = 1e-5
// ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_3_w, model.mm_3_b,
FFN_GELU,
-1);
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@ -0,0 +1,63 @@
#include "models.h"
ggml_cgraph * clip_graph_kimivl::build() {
// 2D input positions
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
ggml_tensor * learned_pos_embd = resize_position_embeddings();
// build ViT with 2D position embeddings
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
// first half is X axis and second half is Y axis
return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
};
ggml_tensor * inp = build_inp();
ggml_tensor * cur = build_vit(
inp, n_patches,
NORM_TYPE_NORMAL,
hparams.ffn_op,
learned_pos_embd,
add_pos);
cb(cur, "vit_out", -1);
{
// patch_merger
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
// projection norm
int proj_inp_dim = cur->ne[0];
cur = ggml_view_2d(ctx0, cur,
n_embd, cur->ne[1] * scale_factor * scale_factor,
ggml_row_size(cur->type, n_embd), 0);
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
cur = ggml_view_2d(ctx0, cur,
proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
ggml_row_size(cur->type, proj_inp_dim), 0);
cb(cur, "proj_inp_normed", -1);
// projection mlp
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU,
-1);
cb(cur, "proj_out", -1);
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@ -0,0 +1,96 @@
#include "models.h"
ggml_cgraph * clip_graph_llama4::build() {
GGML_ASSERT(model.class_embedding != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
const int n_pos = n_patches + 1; // +1 for [CLS]
// 2D input positions
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
ggml_tensor * inp = build_inp_raw();
// Llama4UnfoldConvolution
{
ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
patch_size, patch_size, 3, n_embd);
inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
cb(inp, "patch_conv", -1);
}
// add CLS token
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
// build ViT with 2D position embeddings
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
// first half is X axis and second half is Y axis
// ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
// ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
};
ggml_tensor * cur = build_vit(
inp, n_pos,
NORM_TYPE_NORMAL,
hparams.ffn_op,
model.position_embeddings,
add_pos);
// remove CLS token
cur = ggml_view_2d(ctx0, cur,
n_embd, n_patches,
ggml_row_size(cur->type, n_embd), 0);
// pixel shuffle
// based on Llama4VisionPixelShuffleMLP
// https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
{
const int scale_factor = model.hparams.n_merge;
const int bsz = 1; // batch size, always 1 for now since we don't support batching
GGML_ASSERT(scale_factor > 0);
GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
cur = ggml_reshape_4d(ctx0, cur,
n_embd * scale_factor,
n_patches_x / scale_factor,
n_patches_y,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_cont_4d(ctx0, cur,
n_embd * scale_factor * scale_factor,
n_patches_x / scale_factor,
n_patches_y / scale_factor,
bsz);
//cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// flatten to 2D
cur = ggml_cont_2d(ctx0, cur,
n_embd * scale_factor * scale_factor,
n_patches / scale_factor / scale_factor);
cb(cur, "pixel_shuffle", -1);
}
// based on Llama4VisionMLP2 (always uses GELU activation, no bias)
{
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
cur = ggml_gelu(ctx0, cur);
cb(cur, "adapter_mlp", -1);
}
// Llama4MultiModalProjector
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
cb(cur, "projected", -1);
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}

374
tools/mtmd/models/llava.cpp Normal file
View File

@ -0,0 +1,374 @@
#include "models.h"
// this graph is used by llava, granite and glm
// due to having embedding_stack (used by granite), we cannot reuse build_vit
ggml_cgraph * clip_graph_llava::build() {
const int batch_size = 1;
const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
// Calculate the deepest feature layer based on hparams and projector type
int max_feature_layer = n_layer;
{
// Get the index of the second to last layer; this is the default for models that have a llava projector
int il_last = hparams.n_layer - 1;
int deepest_feature_layer = -1;
if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) {
il_last += 1;
}
// If we set explicit vision feature layers, only go up to the deepest one
// NOTE: only used by granite-vision models for now
for (const auto & feature_layer : hparams.vision_feature_layer) {
if (feature_layer > deepest_feature_layer) {
deepest_feature_layer = feature_layer;
}
}
max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
}
ggml_tensor * inp = build_inp();
// concat class_embeddings and patch_embeddings
if (model.class_embedding) {
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
}
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
ggml_tensor * inpL = inp;
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
cb(inpL, "pre_ln", -1);
}
std::vector<ggml_tensor *> embedding_stack;
const auto & vision_feature_layer = hparams.vision_feature_layer;
// loop over layers
for (int il = 0; il < max_feature_layer; il++) {
auto & layer = model.layers[il];
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// If this is an embedding feature layer, save the output.
// NOTE: 0 index here refers to the input to the encoder.
if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
embedding_stack.push_back(cur);
}
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
cb(cur, "layer_inp_normed", il);
// self-attention
{
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
if (layer.q_b) {
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
}
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
if (layer.k_b) {
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
}
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
if (layer.v_b) {
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
cb(cur, "ffn_inp", il);
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
cb(cur, "ffn_inp_normed", il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
layer.ff_gate_w, layer.ff_gate_b,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
cb(cur, "ffn_out", il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
cb(cur, "layer_out", il);
inpL = cur;
}
// post-layernorm
if (model.post_ln_w) {
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
}
ggml_tensor * embeddings = inpL;
// process vision feature layers (used by granite)
{
// final layer is a vision feature layer
if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
embedding_stack.push_back(inpL);
}
// If feature layers are explicitly set, stack them (if we have multiple)
if (!embedding_stack.empty()) {
embeddings = embedding_stack[0];
for (size_t i = 1; i < embedding_stack.size(); i++) {
embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
}
}
}
// llava projector (also used by granite)
if (hparams.has_llava_projector) {
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(patches, "patches");
ggml_set_input(patches);
// shape [1, 576, 1024]
// ne is whcn, ne = [1024, 576, 1, 1]
embeddings = ggml_get_rows(ctx0, embeddings, patches);
// print_tensor_info(embeddings, "embeddings");
// llava projector
if (proj_type == PROJECTOR_TYPE_MLP) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
embeddings = ggml_gelu(ctx0, embeddings);
if (model.mm_2_w) {
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
}
else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
// First LayerNorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
model.mm_1_b);
// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
// Second LayerNorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
model.mm_4_b);
}
else if (proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projector
int n_patch = 24;
ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
mlp_1 = ggml_gelu(ctx0, mlp_1);
ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
// block 1
ggml_tensor * block_1 = nullptr;
{
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
// stride = 1, padding = 1, bias is nullptr
block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
// layer norm
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// hardswish
ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// residual
block_1 = ggml_add(ctx0, mlp_3, block_1);
}
// block_2
{
// stride = 2
block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// layer norm
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// hardswish
ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
// not sure the parameters is right for globalAvgPooling
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
}
embeddings = block_1;
}
else if (proj_type == PROJECTOR_TYPE_LDPV2)
{
int n_patch = 24;
ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
mlp_0 = ggml_gelu(ctx0, mlp_0);
ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
// mlp_2 ne = [2048, 576, 1, 1]
// // AVG Pool Layer 2*2, strides = 2
mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
// mlp_2 ne = [576, 2048, 1, 1]
mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
// mlp_2 ne [24, 24, 2048, 1]
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
// weight ne = [3, 3, 2048, 1]
ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
peg_0 = ggml_add(ctx0, peg_0, mlp_2);
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
embeddings = peg_0;
}
else {
GGML_ABORT("fatal error");
}
}
// glm projector
else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
// GLU
{
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
embeddings = ggml_gelu_inplace(ctx0, embeddings);
ggml_tensor * x = embeddings;
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
embeddings = ggml_swiglu_split(ctx0, embeddings, x);
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
}
// arrangement of BOI/EOI token embeddings
// note: these embeddings are not present in text model, hence we cannot process them as text tokens
// see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
{
embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
}
}
else {
GGML_ABORT("llava: unknown projector type");
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}

View File

@ -0,0 +1,114 @@
#include "models.h"
ggml_cgraph * clip_graph_minicpmv::build() {
GGML_ASSERT(model.class_embedding == nullptr);
const int n_pos = n_patches;
const int n_embd_proj = n_mmproj_embd;
// position embeddings for the projector (not for ViT)
// see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
// base frequency omega
ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
ggml_set_name(omega, "omega");
ggml_set_input(omega);
// 2D input positions (using float for sinusoidal embeddings)
ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
// for selecting learned pos embd, used by ViT
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
ggml_tensor * inp = build_inp();
ggml_tensor * embeddings = build_vit(
inp, n_pos,
NORM_TYPE_NORMAL,
hparams.ffn_op,
learned_pos_embd,
nullptr);
// resampler projector (it is just another transformer)
ggml_tensor * q = model.mm_model_query;
ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
// norm
q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
// calculate sinusoidal pos embd
ggml_tensor * pos_embed = nullptr;
{
// outer product
ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
// sin and cos
ggml_tensor * pos_embd_x = ggml_concat(
ctx0,
ggml_sin(ctx0, theta_x),
ggml_cos(ctx0, theta_x),
0 // concat on first dim
);
ggml_tensor * pos_embd_y = ggml_concat(
ctx0,
ggml_sin(ctx0, theta_y),
ggml_cos(ctx0, theta_y),
0 // concat on first dim
);
pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
}
// k = v + pos_embed
ggml_tensor * k = ggml_add(ctx0, v, pos_embed);
// attention
{
const int d_head = 128;
int n_head = n_embd_proj/d_head;
// Use actual config value if available, otherwise fall back to hardcoded values
int num_query = hparams.minicpmv_query_num;
ggml_tensor * Q = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
model.mm_model_attn_q_b);
ggml_tensor * K = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
model.mm_model_attn_k_b);
ggml_tensor * V = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
model.mm_model_attn_v_b);
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);
cb(Q, "resampler_Q", -1);
cb(K, "resampler_K", -1);
cb(V, "resampler_V", -1);
float resampler_kq_scale = 1.0f/ sqrtf(float(d_head));
embeddings = build_attn(
model.mm_model_attn_o_w,
model.mm_model_attn_o_b,
Q, K, V, nullptr, resampler_kq_scale, -1);
cb(embeddings, "resampler_attn_out", -1);
}
// layernorm
embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
// projection
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}

View File

@ -0,0 +1,58 @@
#pragma once
#include "../clip-graph.h"
struct clip_graph_siglip : clip_graph {
clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_pixtral : clip_graph {
clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_qwen2vl : clip_graph {
clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_qwen3vl : clip_graph {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_minicpmv : clip_graph {
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_internvl : clip_graph {
clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_llama4 : clip_graph {
clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_kimivl : clip_graph {
clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_cogvlm : clip_graph {
clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_llava : clip_graph {
clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_whisper_enc : clip_graph {
clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};

View File

@ -0,0 +1,86 @@
#include "models.h"
ggml_cgraph * clip_graph_pixtral::build() {
const int n_merge = hparams.n_merge;
// 2D input positions
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
};
ggml_tensor * inp = build_inp();
ggml_tensor * cur = build_vit(
inp, n_patches,
NORM_TYPE_RMS,
hparams.ffn_op,
nullptr, // no learned pos embd
add_pos);
// mistral small 3.1 patch merger
// ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
if (model.mm_patch_merger_w) {
GGML_ASSERT(hparams.n_merge > 0);
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
// reshape image tokens to 2D grid
cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
cur = ggml_cont(ctx0, cur);
// torch.nn.functional.unfold is just an im2col under the hood
// we just need a dummy kernel to make it work
ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
// project to n_embd
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
}
// LlavaMultiModalProjector (always using GELU activation)
{
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU,
-1);
}
// arrangement of the [IMG_BREAK] token
if (model.token_embd_img_break) {
// not efficient, but works
// the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
// and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
// after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
const int p_total = p_x * p_y;
const int n_embd_text = cur->ne[0];
const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y);
ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y);
tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor
tok = ggml_add(ctx0, tok, model.token_embd_img_break);
tmp = ggml_concat(ctx0, tmp, tok, 1);
cur = ggml_view_2d(ctx0, tmp,
n_embd_text, n_tokens_output,
ggml_row_size(tmp->type, n_embd_text), 0);
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@ -0,0 +1,183 @@
#include "models.h"
ggml_cgraph * clip_graph_qwen2vl::build() {
GGML_ASSERT(model.patch_bias == nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
const bool use_window_attn = hparams.n_wa_pattern > 0;
const int n_wa_pattern = hparams.n_wa_pattern;
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL
? NORM_TYPE_RMS // qwen 2.5 vl
: NORM_TYPE_NORMAL; // qwen 2 vl
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
n_embd, n_patches_x * n_patches_y, batch_size);
}
ggml_tensor * inpL = inp;
ggml_tensor * window_mask = nullptr;
ggml_tensor * window_idx = nullptr;
ggml_tensor * inv_window_idx = nullptr;
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
}
if (use_window_attn) {
// handle window attention inputs
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
ggml_set_name(inv_window_idx, "inv_window_idx");
ggml_set_input(inv_window_idx);
// mask for window attention
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// if flash attn is used, we need to pad the mask and cast to f16
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
}
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
GGML_ASSERT(batch_size == 1);
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
const auto & layer = model.layers[il];
const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
cb(cur, "ln1", il);
// self-attention
{
ggml_tensor * Qcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
ggml_tensor * Kcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
ggml_tensor * Vcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
// apply M-RoPE
Qcur = ggml_rope_multi(
ctx0, Qcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Kcur = ggml_rope_multi(
ctx0, Kcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
cb(Qcur, "Qcur_rope", il);
cb(Kcur, "Kcur_rope", il);
ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
cb(cur, "attn_out", il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
cb(cur, "ffn_inp", il);
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
cb(cur, "ffn_inp_normed", il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
layer.ff_gate_w, layer.ff_gate_b,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
cb(cur, "ffn_out", il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
cb(cur, "layer_out", il);
inpL = cur;
}
// post-layernorm
if (model.post_ln_w) {
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
}
// multimodal projection
ggml_tensor * embeddings = inpL;
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
FFN_GELU,
-1);
if (use_window_attn) {
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
ggml_set_name(window_idx, "window_idx");
ggml_set_input(window_idx);
// embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size]
GGML_ASSERT(batch_size == 1);
embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}

View File

@ -0,0 +1,191 @@
#include "models.h"
ggml_cgraph * clip_graph_qwen3vl::build() {
GGML_ASSERT(model.patch_bias != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
norm_type norm_t = NORM_TYPE_NORMAL;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
n_embd, n_patches_x * n_patches_y, batch_size);
}
// add patch bias
if (model.patch_bias != nullptr) {
inp = ggml_add(ctx0, inp, model.patch_bias);
cb(inp, "patch_bias", -1);
}
// calculate absolute position embedding and apply
ggml_tensor * learned_pos_embd = resize_position_embeddings();
learned_pos_embd = ggml_cont_4d(
ctx0, learned_pos_embd,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
learned_pos_embd = ggml_reshape_4d(
ctx0, learned_pos_embd,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
learned_pos_embd = ggml_cont_3d(
ctx0, learned_pos_embd,
n_embd, n_patches_x * n_patches_y, batch_size);
inp = ggml_add(ctx0, inp, learned_pos_embd);
cb(inp, "inp_pos_emb", -1);
ggml_tensor * inpL = inp;
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
}
// deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
ggml_tensor * deepstack_features = nullptr;
const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl
// loop over layers
for (int il = 0; il < n_layer; il++) {
auto & layer = model.layers[il];
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
cb(cur, "ln1", il);
// self-attention
{
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ 0);
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, n_embd));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, 2 * n_embd));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
// apply M-RoPE
Qcur = ggml_rope_multi(
ctx0, Qcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Kcur = ggml_rope_multi(
ctx0, Kcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
cb(Qcur, "Qcur_rope", il);
cb(Kcur, "Kcur_rope", il);
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
cb(cur, "ffn_inp", il);
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
cb(cur, "ffn_inp_normed", il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
layer.ff_gate_w, layer.ff_gate_b,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
cb(cur, "ffn_out", il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
cb(cur, "layer_out", il);
if (layer.has_deepstack()) {
ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
feat = build_ffn(feat,
layer.deepstack_fc1_w, layer.deepstack_fc1_b,
nullptr, nullptr,
layer.deepstack_fc2_w, layer.deepstack_fc2_b,
ffn_op_type::FFN_GELU, il);
if(!deepstack_features) {
deepstack_features = feat;
} else {
// concat along the feature dimension
deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0);
}
}
inpL = cur;
}
// post-layernorm
if (model.post_ln_w) {
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
}
// multimodal projection
ggml_tensor * embeddings = inpL;
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
ffn_op_type::FFN_GELU, -1);
embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}

View File

@ -0,0 +1,81 @@
#include "models.h"
ggml_cgraph * clip_graph_siglip::build() {
ggml_tensor * inp = build_inp();
ggml_tensor * learned_pos_embd = model.position_embeddings;
if (proj_type == PROJECTOR_TYPE_LFM2) {
learned_pos_embd = resize_position_embeddings();
}
ggml_tensor * cur = build_vit(
inp, n_patches,
NORM_TYPE_NORMAL,
hparams.ffn_op,
learned_pos_embd,
nullptr);
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
const int batch_size = 1;
GGML_ASSERT(n_patches_x == n_patches_y);
const int patches_per_image = n_patches_x;
const int kernel_size = hparams.n_merge;
cur = ggml_transpose(ctx0, cur);
cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
// doing a pool2d to reduce the number of output tokens
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
// apply norm before projection
cur = ggml_rms_norm(ctx0, cur, eps);
cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
// apply projection
cur = ggml_mul_mat(ctx0,
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
cur);
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
// pixel_shuffle
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
cur = ggml_mul_mat(ctx0, model.projection, cur);
} else if (proj_type == PROJECTOR_TYPE_LFM2) {
// pixel unshuffle block
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
// projection
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU,
-1);
} else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) {
cur = build_ffn(cur,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
hparams.ffn_op,
-1);
} else {
GGML_ABORT("SigLIP: Unsupported projector type");
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@ -0,0 +1,107 @@
#include "models.h"
ggml_cgraph * clip_graph_whisper_enc::build() {
const int n_frames = img.nx;
const int n_pos = n_frames / 2;
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
ggml_tensor * inp = build_inp_raw(1);
// conv1d block
{
// convolution + gelu
ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
cur = ggml_add(ctx0, cur, model.conv1d_1_b);
cur = ggml_gelu_erf(ctx0, cur);
cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
cur = ggml_add(ctx0, cur, model.conv1d_2_b);
cur = ggml_gelu_erf(ctx0, cur);
// transpose
inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
cb(inp, "after_conv1d", -1);
}
// sanity check (only check one layer, but it should be the same for all)
GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
GGML_ASSERT(model.layers[0].q_b);
GGML_ASSERT(model.layers[0].v_b);
GGML_ASSERT(!model.layers[0].k_b); // no bias for k
GGML_ASSERT(model.post_ln_w && model.post_ln_b);
ggml_tensor * pos_embd_selected = ggml_view_2d(
ctx0, model.position_embeddings,
model.position_embeddings->ne[0], n_pos,
model.position_embeddings->nb[1], 0
);
ggml_tensor * cur = build_vit(
inp, n_pos,
NORM_TYPE_NORMAL,
hparams.ffn_op,
pos_embd_selected,
nullptr);
cb(cur, "after_transformer", -1);
if (model.audio_has_stack_frames()) {
// StackAudioFrames
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
int64_t stride = n_embd * hparams.proj_stack_factor;
int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
int64_t pad = padded_len - ggml_nelements(cur);
if (pad > 0) {
cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
}
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
ggml_row_size(cur->type, stride), 0);
cb(cur, "after_stacked", -1);
}
if (proj_type == PROJECTOR_TYPE_ULTRAVOX) {
// UltravoxProjector
// pre-norm
cur = ggml_rms_norm(ctx0, cur, 1e-6);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
// ffn in
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
// swiglu
// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
cur = ggml_swiglu_swapped(ctx0, cur);
// mid-norm
cur = ggml_rms_norm(ctx0, cur, 1e-6);
cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
// ffn out
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
} else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
// projector
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
cur = ggml_add(ctx0, cur, model.mm_fc_b);
} else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
// projector
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU_ERF,
-1);
} else {
GGML_ABORT("%s: unknown projector type", __func__);
}
cb(cur, "projected", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@ -6,6 +6,8 @@
#include <vector>
#include <string>
#define MTMD_INTERNAL_HEADER
#define WHISPER_ASSERT GGML_ASSERT
#define WHISPER_SAMPLE_RATE 16000

View File

@ -32,6 +32,10 @@
#define STB_IMAGE_IMPLEMENTATION
#include "stb/stb_image.h"
#ifdef MTMD_INTERNAL_HEADER
#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
#endif
//
// internal logging functions
//

View File

@ -22,6 +22,11 @@
* Issues related to API usage may receive lower priority support.
*
* For the usage, see an example in mtmd-cli.cpp
*
* For contributors:
* - Make sure the C API is aligned with the libllama C API (as in llama.h)
* - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
* - Keep the API minimal, do not expose internal details unless necessary
*/
#ifdef LLAMA_SHARED

View File

@ -54,9 +54,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--no-escape` | do not process escape sequences |
| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
| `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
@ -66,15 +65,15 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
| `-nr, --no-repack` | disable weight repacking<br/>(env: LLAMA_ARG_NO_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
@ -87,7 +86,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
| `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
| `--no-op-offload` | disable offloading host tensor operations to device (default: false) |
| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
@ -157,19 +156,18 @@ For the ful list of features, please refer to [server's changelog](https://githu
| -------- | ----------- |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
| `--context-shift` | enables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
| `-sp, --special` | special tokens output enabled (default: false) |
| `--no-warmup` | skip warming up the model with an empty run |
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
| `--mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
| `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) |
| `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
@ -180,7 +178,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@ -193,20 +191,19 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
| `--slots` | enable slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--media-path PATH` | directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled) |
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_PRESET) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)<br/>(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) |
| `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) |
| `--jinja` | use jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_JINJA) |
| `--no-jinja` | disable jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_NO_JINJA) |
| `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)<br/>(env: LLAMA_ARG_MODELS_AUTOLOAD) |
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
@ -236,6 +233,11 @@ For the ful list of features, please refer to [server's changelog](https://githu
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
For boolean options like `--mmap` or `--kv-offload`, the environment variable is handled as shown in this example:
- `LLAMA_ARG_MMAP=true` means enabled, other accepted values are: `1`, `on`, `enabled`
- `LLAMA_ARG_MMAP=false` means disabled, other accepted values are: `0`, `off`, `disabled`
- If `LLAMA_ARG_NO_MMAP` is present (no matter the value), it means disabling mmap
Example usage of docker compose with environment variables:
```yml

Binary file not shown.

View File

@ -303,6 +303,27 @@ $$\n\\pi_n(\\mathbb{S}^3) = \\begin{cases}
expect(output).toBe(input); // Code blocks prevent misinterpretation
});
test('preserves backslash parentheses in code blocks (GitHub issue)', () => {
const input = '```python\nfoo = "\\(bar\\)"\n```';
const output = preprocessLaTeX(input);
expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied
});
test('preserves backslash brackets in code blocks', () => {
const input = '```python\nfoo = "\\[bar\\]"\n```';
const output = preprocessLaTeX(input);
expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied
});
test('preserves backslash parentheses in inline code', () => {
const input = 'Use `foo = "\\(bar\\)"` in your code.';
const output = preprocessLaTeX(input);
expect(output).toBe(input);
});
test('escape backslash in mchem ce', () => {
const input = 'mchem ce:\n$\\ce{2H2(g) + O2(g) -> 2H2O(l)}$';
const output = preprocessLaTeX(input);

View File

@ -226,19 +226,16 @@ export function preprocessLaTeX(content: string): string {
return expr;
});
// Step 5: Restore code blocks
content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
return codeBlocks[parseInt(index)];
});
// Step 6: Apply additional escaping functions (brackets and mhchem)
// Step 5: Apply additional escaping functions (brackets and mhchem)
// This must happen BEFORE restoring code blocks to avoid affecting code content
content = escapeBrackets(content);
if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
content = escapeMhchem(content);
}
// Final pass: Convert \(...\) → $...$, \[...\] → $$...$$
// Step 6: Convert remaining \(...\) → $...$, \[...\] → $$...$$
// This must happen BEFORE restoring code blocks to avoid affecting code content
content = content
// Using the lookbehind pattern `(?<!\\)` we skip matches
// that are preceded by a backslash, e.g.
@ -248,12 +245,18 @@ export function preprocessLaTeX(content: string): string {
// Using the lookbehind pattern `(?<!\\)` we skip matches
// that are preceded by a backslash, e.g. `\\[4pt]`.
/(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
(_, prefix: string, content: string) => {
return `${prefix}$$${content}$$`;
(_, content: string) => {
return `$$${content}$$`;
}
);
// Step 7: Restore blockquote markers
// Step 7: Restore code blocks
// This happens AFTER all LaTeX conversions to preserve code content
content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
return codeBlocks[parseInt(index)];
});
// Step 8: Restore blockquote markers
if (blockquoteMarkers.size > 0) {
const finalLines = content.split('\n');
const restoredLines = finalLines.map((line, index) => {

View File

@ -9,6 +9,10 @@ if (NOT MSVC)
endif()
target_link_libraries (${TARGET} PRIVATE Threads::Threads)
if (WIN32 AND NOT MSVC)
target_link_libraries(${TARGET} PUBLIC ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_17)
target_compile_definitions(${TARGET} PRIVATE