diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile
index b37b4f277d..89831ed5c2 100644
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -33,6 +33,7 @@ FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
+ libglvnd0 libgl1 libglx0 libegl1 libgles2 \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 1193779d0b..446a3750d7 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -152,13 +152,13 @@ jobs:
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
- DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.zip"
- echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+ echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
- "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
unzip artifact.zip
- tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.tar.gz -C dawn --strip-components=1
+ tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
@@ -532,13 +532,13 @@ jobs:
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
- DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.zip"
- echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release"
+ echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
- "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+ "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
unzip artifact.zip
- tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.tar.gz -C dawn --strip-components=1
+ tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
@@ -1418,7 +1418,6 @@ jobs:
echo "FIXME: test on devices"
openEuler-latest-cmake-cann:
- if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
defaults:
run:
shell: bash -el {0}
@@ -1705,6 +1704,34 @@ jobs:
run: |
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+ ggml-ci-mac-webgpu:
+ runs-on: [self-hosted, macOS, ARM64]
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: Dawn Dependency
+ id: dawn-depends
+ run: |
+ DAWN_VERSION="v2.0.0"
+ DAWN_OWNER="reeselevine"
+ DAWN_REPO="dawn"
+ DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+ echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+ curl -L -o artifact.zip \
+ "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+ mkdir dawn
+ unzip artifact.zip
+ tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
+
+ - name: Test
+ id: ggml-ci
+ run: |
+ GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
+ bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
ggml-ci-mac-vulkan:
runs-on: [self-hosted, macOS, ARM64]
diff --git a/.gitignore b/.gitignore
index 05eb578a82..bb122d6924 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,6 +130,7 @@ poetry.toml
# Local scripts
/run-vim.sh
/run-chat.sh
+/run-spec.sh
/.ccache/
# IDE
diff --git a/README.md b/README.md
index ed956bb02e..e59612f7ae 100644
--- a/README.md
+++ b/README.md
@@ -482,21 +482,6 @@ To learn more about model quantization, [read this documentation](tools/quantize
-## [`llama-run`](tools/run)
-
-#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
-
-- <details>
-    <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
-
-  ```bash
-  llama-run granite-code
-  ```
-
-  </details>
-
-[^3]: [RamaLama](https://github.com/containers/ramalama)
-
## [`llama-simple`](examples/simple)
#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
@@ -600,7 +585,6 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
-- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
diff --git a/ci/run.sh b/ci/run.sh
index 5c2d325a56..3deebd5dd3 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -105,7 +105,20 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
fi
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
- CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+
+ if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
+ if [ -z "${CMAKE_PREFIX_PATH}" ]; then
+ export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}"
+ else
+ export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}:${CMAKE_PREFIX_PATH}"
+ fi
+ fi
+
+ # For some systems, Dawn_DIR needs to be set explicitly, e.g., the lib64 path
+ if [ ! -z "${GG_BUILD_WEBGPU_DAWN_DIR}" ]; then
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DDawn_DIR=${GG_BUILD_WEBGPU_DAWN_DIR}"
+ fi
fi
if [ ! -z ${GG_BUILD_MUSA} ]; then
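Reviewer note: the two Dawn hooks compose — `GG_BUILD_WEBGPU_DAWN_PREFIX` is prepended to `CMAKE_PREFIX_PATH`, while `GG_BUILD_WEBGPU_DAWN_DIR` pins `Dawn_DIR` directly for layouts CMake does not search by default. A minimal local-repro sketch; the install prefix and `lib64` package-config path are illustrative assumptions:

```bash
# hypothetical system-wide Dawn install whose CMake package config lives under lib64
GG_BUILD_WEBGPU=1 \
GG_BUILD_WEBGPU_DAWN_DIR=/opt/dawn/lib64/cmake/Dawn \
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```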
diff --git a/common/arg.cpp b/common/arg.cpp
index c3610d262b..72750a3cba 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -6,6 +6,7 @@
#include "log.h"
#include "sampling.h"
#include "download.h"
+#include "preset.h"
// fix problem with std::min and std::max
#if defined(_WIN32)
@@ -268,6 +269,46 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
}
}
+static std::string clean_file_name(const std::string & fname) {
+ std::string clean_fname = fname;
+ string_replace_all(clean_fname, "\\", "_");
+ string_replace_all(clean_fname, "/", "_");
+ return clean_fname;
+}
+
+static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+ GGML_ASSERT(!params.model.hf_repo.empty());
+
+ const bool offline = params.offline;
+ std::string model_endpoint = get_model_endpoint();
+ auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";
+
+ // prepare local path for caching
+ auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
+ auto preset_path = fs_get_cache_file(preset_fname);
+ const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+ const bool has_preset = status >= 200 && status < 400;
+
+ // remote preset is optional, so we don't error out if not found
+ if (has_preset) {
+ LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+ common_preset_context ctx(ex, /* only_remote_allowed */ true);
+ common_preset global; // unused for now
+ auto remote_presets = ctx.load_from_ini(preset_path, global);
+ if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
+ common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
+ LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
+ preset.apply_to_params(params);
+ } else {
+ throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
+ }
+ } else {
+ LOG_INF("%s", "no remote preset found, skipping\n");
+ }
+
+ return has_preset;
+}
+
struct handle_model_result {
bool found_mmproj = false;
common_params_model mmproj;
@@ -309,9 +350,7 @@ static handle_model_result common_params_handle_model(
// make sure model path is present (for caching purposes)
if (model.path.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs
- std::string filename = model.hf_repo + "_" + model.hf_file;
- // to make sure we don't have any slashes in the filename
- string_replace_all(filename, "/", "_");
+ std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
model.path = fs_get_cache_file(filename);
}
@@ -425,61 +464,87 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
}
};
- std::set<std::string> seen_args;
+ auto parse_cli_args = [&]() {
+ std::set<std::string> seen_args;
- for (int i = 1; i < argc; i++) {
- const std::string arg_prefix = "--";
+ for (int i = 1; i < argc; i++) {
+ const std::string arg_prefix = "--";
- std::string arg = argv[i];
- if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
- std::replace(arg.begin(), arg.end(), '_', '-');
- }
- if (arg_to_options.find(arg) == arg_to_options.end()) {
- throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
- }
- if (!seen_args.insert(arg).second) {
- LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
- }
- auto & tmp = arg_to_options[arg];
- auto opt = *tmp.first;
- bool is_positive = tmp.second;
- if (opt.has_value_from_env()) {
- fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
- }
- try {
- if (opt.handler_void) {
- opt.handler_void(params);
- continue;
+ std::string arg = argv[i];
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+ std::replace(arg.begin(), arg.end(), '_', '-');
}
- if (opt.handler_bool) {
- opt.handler_bool(params, is_positive);
- continue;
+ if (arg_to_options.find(arg) == arg_to_options.end()) {
+ throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
}
+ if (!seen_args.insert(arg).second) {
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+ }
+ auto & tmp = arg_to_options[arg];
+ auto opt = *tmp.first;
+ bool is_positive = tmp.second;
+ if (opt.has_value_from_env()) {
+ fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
+ }
+ try {
+ if (opt.handler_void) {
+ opt.handler_void(params);
+ continue;
+ }
+ if (opt.handler_bool) {
+ opt.handler_bool(params, is_positive);
+ continue;
+ }
- // arg with single value
- check_arg(i);
- std::string val = argv[++i];
- if (opt.handler_int) {
- opt.handler_int(params, std::stoi(val));
- continue;
- }
- if (opt.handler_string) {
- opt.handler_string(params, val);
- continue;
- }
+ // arg with single value
+ check_arg(i);
+ std::string val = argv[++i];
+ if (opt.handler_int) {
+ opt.handler_int(params, std::stoi(val));
+ continue;
+ }
+ if (opt.handler_string) {
+ opt.handler_string(params, val);
+ continue;
+ }
- // arg with 2 values
- check_arg(i);
- std::string val2 = argv[++i];
- if (opt.handler_str_str) {
- opt.handler_str_str(params, val, val2);
- continue;
+ // arg with 2 values
+ check_arg(i);
+ std::string val2 = argv[++i];
+ if (opt.handler_str_str) {
+ opt.handler_str_str(params, val, val2);
+ continue;
+ }
+ } catch (std::exception & e) {
+ throw std::invalid_argument(string_format(
+ "error while handling argument \"%s\": %s\n\n"
+ "usage:\n%s\n\nto show complete usage, run with -h",
+ arg.c_str(), e.what(), opt.to_string().c_str()));
}
- } catch (std::exception & e) {
- throw std::invalid_argument(string_format(
- "error while handling argument \"%s\": %s\n\n"
- "usage:\n%s\n\nto show complete usage, run with -h",
- arg.c_str(), e.what(), opt.to_string().c_str()));
+ }
+ };
+
+ // parse the first time to get -hf option (used for remote preset)
+ parse_cli_args();
+
+ // maybe handle remote preset
+ if (!params.model.hf_repo.empty()) {
+ std::string cli_hf_repo = params.model.hf_repo;
+ bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+ // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+ // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+ std::string preset_hf_repo = params.model.hf_repo;
+ bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
+ if (has_preset) {
+ // re-parse CLI args to override preset values
+ parse_cli_args();
+ }
+
+ // preserve hf_repo from preset if needed
+ if (preset_has_hf_repo) {
+ params.model.hf_repo = preset_hf_repo;
}
}
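Reviewer note: a hedged sketch of what the two-pass parse above means for a user-visible invocation; the repo name and values are illustrative:

```bash
llama-server -hf user/model --temp 0.2
# 1. CLI args are parsed once, so -hf (hf_repo) is known
# 2. preset.ini is fetched from <model endpoint>/user/model/resolve/main/preset.ini
#    (a missing file is not an error; the remote preset is optional)
# 3. the [default] section is applied to params, possibly redirecting hf-repo
# 4. CLI args are parsed a second time, so --temp 0.2 wins over the preset;
#    only a preset-supplied hf-repo redirection survives the re-parse
```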
@@ -679,7 +744,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-quantize",
"llama-qwen2vl-cli",
"llama-retrieval",
- "llama-run",
"llama-save-load-state",
"llama-server",
"llama-simple",
@@ -1445,7 +1509,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, bool value) {
params.warmup = value;
}
- ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@@ -1761,7 +1825,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
else { throw std::invalid_argument("invalid value"); }
}
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
add_opt(common_arg(
{"--attention"}, "{causal,non-causal}",
"attention type for embeddings, use model default if unspecified",
@@ -2089,11 +2153,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"--mmap"},
{"--no-mmap"},
- string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+ string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_mmap = value;
+ if (value) {
+ params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
+ }
}
).set_env("LLAMA_ARG_MMAP"));
+ add_opt(common_arg(
+ {"-dio", "--direct-io"},
+ {"-ndio", "--no-direct-io"},
+ string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_direct_io = value;
+ }
+ ).set_env("LLAMA_ARG_DIO"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
@@ -2245,7 +2320,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
std::vector<std::string> split_arg{ it, {} };
if (split_arg.size() >= llama_max_devices()) {
throw std::invalid_argument(
- string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+ string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
);
}
for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -2285,10 +2360,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_FIT"));
add_opt(common_arg(
- { "-fitt", "--fit-target" }, "MiB",
- string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
- [](common_params & params, int value) {
- params.fit_params_target = value * size_t(1024*1024);
+ { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
+ string_format("target margin per device for --fit, comma-separated list of values, "
+ "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
+ [](common_params & params, const std::string & value) {
+ std::string arg_next = value;
+
+ // split string by , and /
+ const std::regex regex{ R"([,/]+)" };
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+ std::vector<std::string> split_arg{ it, {} };
+ if (split_arg.size() >= llama_max_devices()) {
+ throw std::invalid_argument(
+ string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+ );
+ }
+ if (split_arg.size() == 1) {
+ std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+ return;
+ }
+ for (size_t i = 0; i < split_arg.size(); i++) {
+ params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+ }
}
).set_env("LLAMA_ARG_FIT_TARGET"));
add_opt(common_arg(
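Reviewer note: the accepted shapes of the extended `-fitt` argument, for reference (values are illustrative):

```bash
llama-server -m model.gguf -fitt 1024        # single value broadcasts: 1 GiB margin on every device
llama-server -m model.gguf -fitt 1024,2048   # per-device margins in MiB, by device index;
                                             # devices beyond the list keep the default margin
```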
@@ -2609,7 +2702,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.embd_normalize = value;
}
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
add_opt(common_arg(
{"--embd-output-format"}, "FORMAT",
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
@@ -2687,7 +2780,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.embedding = true;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
add_opt(common_arg(
{"--rerank", "--reranking"},
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
@@ -3378,6 +3471,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"--save-logits"},
+ string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
+ [](common_params & params) {
+ params.save_logits = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+ add_opt(common_arg(
+ {"--logits-output-dir"}, "PATH",
+ string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.logits_output_dir = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+ add_opt(common_arg(
+ {"--tensor-filter"}, "REGEX",
+ "filter tensor names for debug output (regex pattern, can be specified multiple times)",
+ [](common_params & params, const std::string & value) {
+ params.tensor_filter.push_back(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
// presets
add_opt(common_arg(
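Reviewer note: a possible invocation of the new debug-only options, assuming the LLAMA_EXAMPLE_DEBUG tool accepts the usual -m/-p options; the binary name and filter patterns are illustrative:

```bash
llama-debug -m model.gguf -p "hello" \
    --save-logits --logits-output-dir ./data \
    --tensor-filter 'attn' --tensor-filter 'ffn'
```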
diff --git a/common/arg.h b/common/arg.h
index a1b6a14e67..55782a158d 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -129,11 +129,3 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
// initialize argument parser context - used by test-arg-parser and preset
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-struct common_remote_params {
- std::vector<std::string> headers;
- long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
- long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
-};
-// get remote file content, returns <http_code, raw response body>
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
diff --git a/common/common.cpp b/common/common.cpp
index 41b2b6833e..744f0b4eeb 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
- params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+ params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
}
@@ -1366,6 +1366,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
+ mparams.use_direct_io = params.use_direct_io;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
diff --git a/common/common.h b/common/common.h
index daea6ded5b..7794c0268b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
//
enum llama_example {
+ LLAMA_EXAMPLE_DEBUG,
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_COMPLETION,
@@ -331,12 +332,14 @@ struct common_params {
// offload params
std::vector devices; // devices to use for offloading
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
- bool fit_params = true; // whether to fit unset model/context parameters to free device memory
- size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
- int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+ bool fit_params = true; // whether to fit unset model/context parameters to free device memory
+ int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+
+ // margin per device in bytes for fitting parameters to free memory:
+ std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
@@ -372,6 +375,11 @@ struct common_params {
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
+ // llama-debug specific options
+ std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
+ bool save_logits = false; // whether to save logits to files // NOLINT
+ std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
+
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
@@ -422,7 +430,8 @@ struct common_params {
bool kv_unified = false; // enable unified KV cache
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
- bool use_mmap = true; // use mmap for faster loads
+ bool use_mmap = true; // enable mmap to use filesystem cache
+ bool use_direct_io = true; // read from disk without buffering for faster model loading
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
diff --git a/common/download.cpp b/common/download.cpp
index ef87472560..a1e0e518e9 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -157,6 +157,10 @@ static std::string read_etag(const std::string & path) {
return none;
}
+static bool is_http_status_ok(int status) {
+ return status >= 200 && status < 400;
+}
+
#ifdef LLAMA_USE_CURL
//
@@ -306,11 +310,14 @@ static bool common_download_head(CURL * curl,
}
// download one single file from remote URL to local path
-static bool common_download_file_single_online(const std::string & url,
+// returns status code or -1 on error
+static int common_download_file_single_online(const std::string & url,
const std::string & path,
- const std::string & bearer_token) {
+ const std::string & bearer_token,
+ const common_header_list & custom_headers) {
static const int max_attempts = 3;
static const int retry_delay_seconds = 2;
+
for (int i = 0; i < max_attempts; ++i) {
std::string etag;
@@ -330,6 +337,11 @@ static bool common_download_file_single_online(const std::string & url,
common_load_model_from_url_headers headers;
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
curl_slist_ptr http_headers;
+
+ for (const auto & h : custom_headers) {
+ std::string s = h.first + ": " + h.second;
+ http_headers.ptr = curl_slist_append(http_headers.ptr, s.c_str());
+ }
const bool was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
if (!was_perform_successful) {
head_request_ok = false;
@@ -365,7 +377,7 @@ static bool common_download_file_single_online(const std::string & url,
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
- return false;
+ return -1;
}
}
@@ -374,14 +386,14 @@ static bool common_download_file_single_online(const std::string & url,
if (std::filesystem::exists(path_temporary)) {
if (remove(path_temporary.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
- return false;
+ return -1;
}
}
if (std::filesystem::exists(path)) {
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
- return false;
+ return -1;
}
}
}
@@ -408,23 +420,27 @@ static bool common_download_file_single_online(const std::string & url,
long http_code = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
- if (http_code < 200 || http_code >= 400) {
+
+ int status = static_cast<int>(http_code);
+ if (!is_http_status_ok(http_code)) {
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
- return false;
+ return status; // TODO: maybe only return on certain codes
}
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
- return false;
+ return -1;
}
+
+ return static_cast<int>(http_code);
} else {
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
- }
- break;
+ return 304; // Not Modified - fake cached response
+ }
}
- return true;
+ return -1; // max attempts reached
}
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
@@ -454,8 +470,10 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
}
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+
for (const auto & header : params.headers) {
- http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
+ std::string header_ = header.first + ": " + header.second;
+ http_headers.ptr = curl_slist_append(http_headers.ptr, header_.c_str());
}
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
@@ -617,9 +635,11 @@ static bool common_pull_file(httplib::Client & cli,
}
// download one single file from remote URL to local path
-static bool common_download_file_single_online(const std::string & url,
+// returns status code or -1 on error
+static int common_download_file_single_online(const std::string & url,
const std::string & path,
- const std::string & bearer_token) {
+ const std::string & bearer_token,
+ const common_header_list & custom_headers) {
static const int max_attempts = 3;
static const int retry_delay_seconds = 2;
@@ -629,6 +649,9 @@ static bool common_download_file_single_online(const std::string & url,
if (!bearer_token.empty()) {
default_headers.insert({"Authorization", "Bearer " + bearer_token});
}
+ for (const auto & h : custom_headers) {
+ default_headers.emplace(h.first, h.second);
+ }
cli.set_default_headers(default_headers);
const bool file_exists = std::filesystem::exists(path);
@@ -647,8 +670,10 @@ static bool common_download_file_single_online(const std::string & url,
LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
if (file_exists) {
LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
- return true;
+ return 304; // 304 Not Modified - fake cached response
}
+ return head ? head->status : -1; // cannot use cached file, return raw status code
+ // TODO: maybe retry only on certain codes
}
std::string etag;
@@ -680,12 +705,12 @@ static bool common_download_file_single_online(const std::string & url,
if (file_exists) {
if (!should_download_from_scratch) {
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
- return true;
+ return 304; // 304 Not Modified - fake cached response
}
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
- return false;
+ return -1;
}
}
@@ -697,7 +722,7 @@ static bool common_download_file_single_online(const std::string & url,
existing_size = std::filesystem::file_size(path_temporary);
} else if (remove(path_temporary.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
- return false;
+ return -1;
}
}
@@ -718,15 +743,16 @@ static bool common_download_file_single_online(const std::string & url,
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
- return false;
+ return -1;
}
if (!etag.empty()) {
write_etag(path, etag);
}
- break;
+
+ return head->status; // TODO: use actual GET status?
}
- return true;
+ return -1; // max attempts reached
}
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,
@@ -734,13 +760,9 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
auto [cli, parts] = common_http_client(url);
httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
+
for (const auto & header : params.headers) {
- size_t pos = header.find(':');
- if (pos != std::string::npos) {
- headers.emplace(header.substr(0, pos), header.substr(pos + 1));
- } else {
- headers.emplace(header, "");
- }
+ headers.emplace(header.first, header.second);
}
if (params.timeout > 0) {
@@ -769,32 +791,45 @@ std::pair> common_remote_get_content(const std::string
#if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB)
-static bool common_download_file_single(const std::string & url,
- const std::string & path,
- const std::string & bearer_token,
- bool offline) {
+int common_download_file_single(const std::string & url,
+ const std::string & path,
+ const std::string & bearer_token,
+ bool offline,
+ const common_header_list & headers) {
if (!offline) {
- return common_download_file_single_online(url, path, bearer_token);
+ return common_download_file_single_online(url, path, bearer_token, headers);
}
if (!std::filesystem::exists(path)) {
LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
- return false;
+ return -1;
}
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
- return true;
+ return 304; // Not Modified - fake cached response
}
// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
+ const std::string & bearer_token,
+ bool offline,
+ const common_header_list & headers) {
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
+ futures_download.reserve(urls.size());
+
for (auto const & item : urls) {
- futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
- return common_download_file_single(it.first, it.second, bearer_token, offline);
- }, item));
+ futures_download.push_back(
+ std::async(
+ std::launch::async,
+ [&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
+ const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, headers);
+ return is_http_status_ok(http_status);
+ },
+ item
+ )
+ );
}
// Wait for all downloads to complete
@@ -807,17 +842,18 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::stri
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
@@ -893,10 +932,10 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, cons
std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
// headers
- std::vector<std::string> headers;
- headers.push_back("Accept: application/json");
+ common_header_list headers = custom_headers;
+ headers.push_back({"Accept", "application/json"});
if (!bearer_token.empty()) {
- headers.push_back("Authorization: Bearer " + bearer_token);
+ headers.push_back({"Authorization", "Bearer " + bearer_token});
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
// User-Agent header is already set in common_remote_get_content, no need to set it here
@@ -952,7 +991,7 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, cons
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
- throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+ throw std::runtime_error(string_format("error from HF API (%s), response code: %ld, data: %s", url.c_str(), res_code, res_str.c_str()));
}
// check response
@@ -1031,9 +1070,10 @@ std::string common_docker_resolve_model(const std::string & docker) {
const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
std::string manifest_url = url_prefix + "/manifests/" + tag;
common_remote_params manifest_params;
- manifest_params.headers.push_back("Authorization: Bearer " + token);
- manifest_params.headers.push_back(
- "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
+ manifest_params.headers.push_back({"Authorization", "Bearer " + token});
+ manifest_params.headers.push_back({"Accept",
+ "application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json"
+ });
auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
if (manifest_res.first != 200) {
throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
@@ -1070,7 +1110,8 @@ std::string common_docker_resolve_model(const std::string & docker) {
std::string local_path = fs_get_cache_file(model_filename);
const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
- if (!common_download_file_single(blob_url, local_path, token, false)) {
+ const int http_status = common_download_file_single(blob_url, local_path, token, false, {});
+ if (!is_http_status_ok(http_status)) {
throw std::runtime_error("Failed to download Docker Model");
}
@@ -1084,11 +1125,11 @@ std::string common_docker_resolve_model(const std::string & docker) {
#else
-common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
+common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
-bool common_download_model(const common_params_model &, const std::string &, bool) {
+bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
@@ -1096,6 +1137,14 @@ std::string common_docker_resolve_model(const std::string &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
+int common_download_file_single(const std::string &,
+ const std::string &,
+ const std::string &,
+ bool,
+ const common_header_list &) {
+ throw std::runtime_error("download functionality is not enabled in this build");
+}
+
#endif // LLAMA_USE_CURL || LLAMA_USE_HTTPLIB
std::vector<common_cached_model_info> common_list_cached_models() {
diff --git a/common/download.h b/common/download.h
index d1321e6e90..c79be2f90e 100644
--- a/common/download.h
+++ b/common/download.h
@@ -1,12 +1,21 @@
#pragma once
#include <string>
+#include <vector>
struct common_params_model;
-//
-// download functionalities
-//
+using common_header = std::pair<std::string, std::string>;
+using common_header_list = std::vector<common_header>;
+
+struct common_remote_params {
+ common_header_list headers;
+ long timeout = 0; // in seconds, 0 means no timeout
+ long max_size = 0; // unlimited if 0
+};
+
+// get remote file content, returns <http_code, raw response body>
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
struct common_cached_model_info {
std::string manifest_path;
@@ -41,17 +50,29 @@ struct common_hf_file_res {
common_hf_file_res common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & bearer_token,
- bool offline);
+ bool offline,
+ const common_header_list & headers = {}
+);
// returns true if download succeeded
bool common_download_model(
const common_params_model & model,
const std::string & bearer_token,
- bool offline);
+ bool offline,
+ const common_header_list & headers = {}
+);
// returns list of cached models
std::vector<common_cached_model_info> common_list_cached_models();
+// download single file from url to local path
+// returns status code or -1 on error
+int common_download_file_single(const std::string & url,
+ const std::string & path,
+ const std::string & bearer_token,
+ bool offline,
+ const common_header_list & headers = {});
+
// resolve and download model from Docker registry
// return local path to downloaded model file
std::string common_docker_resolve_model(const std::string & docker);
diff --git a/common/preset.cpp b/common/preset.cpp
index e2fc18c5da..aec14e0769 100644
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -16,6 +16,46 @@ static std::string rm_leading_dashes(const std::string & str) {
return str.substr(pos);
}
+// only allow a subset of args for remote presets for security reasons
+// do not add more args unless absolutely necessary
+// args that output to files are strictly prohibited
+static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
+ static const std::set<std::string> allowed_options = {
+ "model-url",
+ "hf-repo",
+ "hf-repo-draft",
+ "hf-repo-v", // vocoder
+ "hf-file-v", // vocoder
+ "mmproj-url",
+ "pooling",
+ "jinja",
+ "batch-size",
+ "ubatch-size",
+ "cache-reuse",
+ // note: sampling params are automatically allowed by default
+ // negated args will be added automatically
+ };
+
+ std::set<std::string> allowed_keys;
+
+ for (const auto & it : key_to_opt) {
+ const std::string & key = it.first;
+ const common_arg & opt = it.second;
+ if (allowed_options.find(key) != allowed_options.end() || opt.is_sparam) {
+ allowed_keys.insert(key);
+ // also add variant keys (args without leading dashes and env vars)
+ for (const auto & arg : opt.get_args()) {
+ allowed_keys.insert(rm_leading_dashes(arg));
+ }
+ for (const auto & env : opt.get_env()) {
+ allowed_keys.insert(env);
+ }
+ }
+ }
+
+ return allowed_keys;
+}
+
std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
std::vector<std::string> args;
@@ -121,6 +161,29 @@ void common_preset::merge(const common_preset & other) {
}
}
+void common_preset::apply_to_params(common_params & params) const {
+ for (const auto & [opt, val] : options) {
+ // apply each option to params
+ if (opt.handler_string) {
+ opt.handler_string(params, val);
+ } else if (opt.handler_int) {
+ opt.handler_int(params, std::stoi(val));
+ } else if (opt.handler_bool) {
+ opt.handler_bool(params, common_arg_utils::is_truthy(val));
+ } else if (opt.handler_str_str) {
+ // not supported yet
+ throw std::runtime_error(string_format(
+ "%s: option with two values is not supported yet",
+ __func__
+ ));
+ } else if (opt.handler_void) {
+ opt.handler_void(params);
+ } else {
+ GGML_ABORT("unknown handler type");
+ }
+ }
+}
+
static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
std::map<std::string, std::map<std::string, std::string>> parsed;
@@ -230,10 +293,16 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
return value;
}
-common_preset_context::common_preset_context(llama_example ex)
+common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
: ctx_params(common_params_parser_init(default_params, ex)) {
common_params_add_preset_options(ctx_params.options);
key_to_opt = get_map_key_opt(ctx_params);
+
+ // setup allowed keys if only_remote_allowed is true
+ if (only_remote_allowed) {
+ filter_allowed_keys = true;
+ allowed_keys = get_remote_preset_whitelist(key_to_opt);
+ }
}
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
@@ -250,6 +319,12 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
LOG_DBG("loading preset: %s\n", preset.name.c_str());
for (const auto & [key, value] : section.second) {
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
+ if (filter_allowed_keys && allowed_keys.find(key) == allowed_keys.end()) {
+ throw std::runtime_error(string_format(
+ "option '%s' is not allowed in remote presets",
+ key.c_str()
+ ));
+ }
if (key_to_opt.find(key) != key_to_opt.end()) {
const auto & opt = key_to_opt.at(key);
if (is_bool_arg(opt)) {
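Reviewer note: to illustrate the whitelist, a remote preset that would load versus one that would be rejected; the repo name and values are illustrative:

```bash
# accepted: hf-repo is explicitly whitelisted, top-k is a sampling param
cat > preset.ini <<'EOF'
[default]
hf-repo = ggml-org/example-gguf
top-k = 40
EOF

# rejected: a key such as log-file is outside the whitelist, so load_from_ini
# throws "option 'log-file' is not allowed in remote presets"
```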
diff --git a/common/preset.h b/common/preset.h
index 3a84d1be29..11ba6ef812 100644
--- a/common/preset.h
+++ b/common/preset.h
@@ -6,6 +6,7 @@
#include <string>
#include <vector>
#include