Merge branch 'master' into HEAD

commit 4d10b78e23
@@ -1770,7 +1770,7 @@ jobs:
           echo "Fetch llama2c model"
           wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
           ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

   ubuntu-cmake-sanitizer-riscv64-native:
     runs-on: RISCV64
@@ -73,6 +73,8 @@ add_library(${TARGET} STATIC
     ngram-cache.h
     peg-parser.cpp
     peg-parser.h
+    preset.cpp
+    preset.h
     regex-partial.cpp
     regex-partial.h
     sampling.cpp
@@ -47,6 +47,7 @@
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

 using json = nlohmann::ordered_json;
+using namespace common_arg_utils;

 static std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_MTMD,
@@ -64,6 +65,15 @@ static std::string read_file(const std::string & fname) {
     return content;
 }

+static const std::vector<common_arg> & get_common_arg_defs() {
+    static const std::vector<common_arg> options = [] {
+        common_params params;
+        auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
+        return ctx.options;
+    }();
+    return options;
+}
+
 common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
     this->examples = examples;
     return *this;
@@ -134,7 +144,7 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
     return result;
 }

-std::string common_arg::to_string() {
+std::string common_arg::to_string() const {
     // params for printing to console
     const static int n_leading_spaces = 40;
     const static int n_char_per_line_help = 70; // TODO: detect this based on current console
@@ -647,6 +657,53 @@ static void add_rpc_devices(const std::string & servers) {
     }
 }

+bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
+    common_params dummy_params;
+    common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
+
+    std::unordered_map<std::string, common_arg *> arg_to_options;
+    for (auto & opt : ctx_arg.options) {
+        for (const auto & arg : opt.args) {
+            arg_to_options[arg] = &opt;
+        }
+    }
+
+    // TODO @ngxson : find a way to deduplicate this code
+
+    // handle command line arguments
+    auto check_arg = [&](int i) {
+        if (i+1 >= argc) {
+            throw std::invalid_argument("expected value for argument");
+        }
+    };
+
+    for (int i = 1; i < argc; i++) {
+        const std::string arg_prefix = "--";
+
+        std::string arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+        if (arg_to_options.find(arg) == arg_to_options.end()) {
+            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+        }
+        auto opt = *arg_to_options[arg];
+        std::string val;
+        if (opt.value_hint != nullptr) {
+            // arg with single value
+            check_arg(i);
+            val = argv[++i];
+        }
+        if (opt.value_hint_2 != nullptr) {
+            // TODO: support arg with 2 values
+            throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
+        }
+        out_map[opt] = val;
+    }
+
+    return true;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
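For reference, a minimal sketch of how this new map-based overload could be driven. The `common_params_parse(argc, argv, ex, out_map)` signature and the `common_arg` fields it relies on are the ones introduced above; the surrounding driver is purely illustrative and not part of this change.

```cpp
// Illustrative only: exercise the map-based parser added in this diff.
#include <cstdio>
#include <map>
#include <string>

#include "arg.h" // common_arg, common_params_parse(argc, argv, ex, out_map)

int main(int argc, char ** argv) {
    std::map<common_arg, std::string> args;
    try {
        // collects "--flag value" pairs keyed by the matching common_arg definition
        common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, args);
    } catch (const std::exception & e) {
        fprintf(stderr, "%s\n", e.what());
        return 1;
    }
    for (const auto & [opt, value] : args) {
        printf("%s = %s\n", opt.args.front(), value.c_str());
    }
    return 0;
}
```

Unknown arguments raise `std::invalid_argument`, and flag-style options (those without a `value_hint`) end up in the map with an empty string value.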
@@ -692,25 +749,19 @@ static std::string list_builtin_chat_templates() {
     return msg.str();
 }

-static bool is_truthy(const std::string & value) {
+bool common_arg_utils::is_truthy(const std::string & value) {
     return value == "on" || value == "enabled" || value == "1";
 }

-static bool is_falsey(const std::string & value) {
+bool common_arg_utils::is_falsey(const std::string & value) {
     return value == "off" || value == "disabled" || value == "0";
 }

-static bool is_autoy(const std::string & value) {
+bool common_arg_utils::is_autoy(const std::string & value) {
     return value == "auto" || value == "-1";
 }

 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
-    // default values specific to example
-    // note: we place it here instead of inside server.cpp to allow llama-gen-docs to pick it up
-    if (ex == LLAMA_EXAMPLE_SERVER) {
-        params.use_jinja = true;
-    }
-
     params.use_color = tty_can_use_colors();

     // load dynamic backends
@@ -2550,6 +2601,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.models_dir = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
+    add_opt(common_arg(
+        {"--models-preset"}, "PATH",
+        "path to INI file containing model presets for the router server (default: disabled)",
+        [](common_params & params, const std::string & value) {
+            params.models_preset = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
     add_opt(common_arg(
         {"--models-max"}, "N",
         string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
@@ -2566,14 +2624,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
     add_opt(common_arg(
         {"--jinja"},
-        string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
+        string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
         [](common_params & params) {
             params.use_jinja = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--no-jinja"},
-        string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
+        string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
         [](common_params & params) {
             params.use_jinja = false;
         }
common/arg.h
@@ -3,8 +3,10 @@
 #include "common.h"

 #include <set>
+#include <map>
 #include <string>
 #include <vector>
+#include <cstring>

 //
 // CLI argument parsing
@@ -24,6 +26,8 @@ struct common_arg {
     void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
     void (*handler_int) (common_params & params, int) = nullptr;

+    common_arg() = default;
+
     common_arg(
         const std::initializer_list<const char *> & args,
         const char * value_hint,
@@ -61,9 +65,29 @@ struct common_arg {
     bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output) const;
     bool has_value_from_env() const;
-    std::string to_string();
+    std::string to_string() const;
+
+    // for using as key in std::map
+    bool operator<(const common_arg& other) const {
+        if (args.empty() || other.args.empty()) {
+            return false;
+        }
+        return strcmp(args[0], other.args[0]) < 0;
+    }
+    bool operator==(const common_arg& other) const {
+        if (args.empty() || other.args.empty()) {
+            return false;
+        }
+        return strcmp(args[0], other.args[0]) == 0;
+    }
 };

+namespace common_arg_utils {
+bool is_truthy(const std::string & value);
+bool is_falsey(const std::string & value);
+bool is_autoy(const std::string & value);
+}
+
 struct common_params_context {
     enum llama_example ex = LLAMA_EXAMPLE_COMMON;
     common_params & params;
@@ -76,7 +100,11 @@ struct common_params_context {
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

-// function to be used by test-arg-parser
+// parse input arguments from CLI into a map
+// TODO: support repeated args in the future
+bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
+
+// initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

 struct common_remote_params {
@@ -469,7 +469,7 @@ struct common_params {
     std::string public_path = ""; // NOLINT
     std::string api_prefix = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    bool use_jinja = false; // NOLINT
+    bool use_jinja = true; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
@@ -489,9 +489,10 @@ struct common_params {
     bool endpoint_metrics = false;

     // router server configs
     std::string models_dir = ""; // directory containing models for the router server
-    int models_max = 4; // maximum number of models to load simultaneously
-    bool models_autoload = true; // automatically load models when requested via the router server
+    std::string models_preset = ""; // directory containing model presets for the router server
+    int models_max = 4; // maximum number of models to load simultaneously
+    bool models_autoload = true; // automatically load models when requested via the router server

     bool log_json = false;
@@ -0,0 +1,180 @@
+#include "arg.h"
+#include "preset.h"
+#include "peg-parser.h"
+#include "log.h"
+
+#include <fstream>
+#include <sstream>
+#include <filesystem>
+
+static std::string rm_leading_dashes(const std::string & str) {
+    size_t pos = 0;
+    while (pos < str.size() && str[pos] == '-') {
+        ++pos;
+    }
+    return str.substr(pos);
+}
+
+std::vector<std::string> common_preset::to_args() const {
+    std::vector<std::string> args;
+
+    for (const auto & [opt, value] : options) {
+        args.push_back(opt.args.back()); // use the last arg as the main arg
+        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
+            // flag option, no value
+            if (common_arg_utils::is_falsey(value)) {
+                // skip the flag
+                args.pop_back();
+            }
+        }
+        if (opt.value_hint != nullptr) {
+            // single value
+            args.push_back(value);
+        }
+        if (opt.value_hint != nullptr && opt.value_hint_2 != nullptr) {
+            throw std::runtime_error(string_format(
+                "common_preset::to_args(): option '%s' has two values, which is not supported yet",
+                opt.args.back()
+            ));
+        }
+    }
+
+    return args;
+}
+
+std::string common_preset::to_ini() const {
+    std::ostringstream ss;
+
+    ss << "[" << name << "]\n";
+    for (const auto & [opt, value] : options) {
+        auto espaced_value = value;
+        string_replace_all(espaced_value, "\n", "\\\n");
+        ss << rm_leading_dashes(opt.args.back()) << " = ";
+        ss << espaced_value << "\n";
+    }
+    ss << "\n";
+
+    return ss.str();
+}
+
+static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
+    std::map<std::string, std::map<std::string, std::string>> parsed;
+
+    if (!std::filesystem::exists(path)) {
+        throw std::runtime_error("preset file does not exist: " + path);
+    }
+
+    std::ifstream file(path);
+    if (!file.good()) {
+        throw std::runtime_error("failed to open server preset file: " + path);
+    }
+
+    std::string contents((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+
+    static const auto parser = build_peg_parser([](auto & p) {
+        // newline ::= "\r\n" / "\n" / "\r"
+        auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r"));
+
+        // ws ::= [ \t]*
+        auto ws = p.rule("ws", p.chars("[ \t]", 0, -1));
+
+        // comment ::= [;#] (!newline .)*
+        auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any()));
+
+        // eol ::= ws comment? (newline / EOF)
+        auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end()));
+
+        // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]*
+        auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1));
+
+        // value ::= (!eol-start .)*
+        auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end()));
+        auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any()));
+
+        // header-line ::= "[" ws ident ws "]" eol
+        auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol);
+
+        // kv-line ::= ident ws "=" ws value eol
+        auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol);
+
+        // comment-line ::= ws comment (newline / EOF)
+        auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end()));
+
+        // blank-line ::= ws (newline / EOF)
+        auto blank_line = p.rule("blank-line", ws + (newline | p.end()));
+
+        // line ::= header-line / kv-line / comment-line / blank-line
+        auto line = p.rule("line", header_line | kv_line | comment_line | blank_line);
+
+        // ini ::= line* EOF
+        auto ini = p.rule("ini", p.zero_or_more(line) + p.end());
+
+        return ini;
+    });
+
+    common_peg_parse_context ctx(contents);
+    const auto result = parser.parse(ctx);
+    if (!result.success()) {
+        throw std::runtime_error("failed to parse server config file: " + path);
+    }
+
+    std::string current_section = COMMON_PRESET_DEFAULT_NAME;
+    std::string current_key;
+
+    ctx.ast.visit(result, [&](const auto & node) {
+        if (node.tag == "section-name") {
+            const std::string section = std::string(node.text);
+            current_section = section;
+            parsed[current_section] = {};
+        } else if (node.tag == "key") {
+            const std::string key = std::string(node.text);
+            current_key = key;
+        } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) {
+            parsed[current_section][current_key] = std::string(node.text);
+            current_key.clear();
+        }
+    });
+
+    return parsed;
+}
+
+static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
+    std::map<std::string, common_arg> mapping;
+    for (const auto & opt : ctx_params.options) {
+        if (opt.env != nullptr) {
+            mapping[opt.env] = opt;
+        }
+        for (const auto & arg : opt.args) {
+            mapping[rm_leading_dashes(arg)] = opt;
+        }
+    }
+    return mapping;
+}
+
+common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
+    common_presets out;
+    auto key_to_opt = get_map_key_opt(ctx_params);
+    auto ini_data = parse_ini_from_file(path);
+
+    for (auto section : ini_data) {
+        common_preset preset;
+        if (section.first.empty()) {
+            preset.name = COMMON_PRESET_DEFAULT_NAME;
+        } else {
+            preset.name = section.first;
+        }
+        LOG_DBG("loading preset: %s\n", preset.name.c_str());
+        for (const auto & [key, value] : section.second) {
+            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
+            if (key_to_opt.find(key) != key_to_opt.end()) {
+                preset.options[key_to_opt[key]] = value;
+                LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str());
+            } else {
+                // TODO: maybe warn about unknown key?
+            }
+        }
+        out[preset.name] = preset;
+    }
+
+    return out;
+}
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "common.h"
+#include "arg.h"
+
+#include <string>
+#include <vector>
+#include <map>
+
+//
+// INI preset parser and writer
+//
+
+constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
+
+struct common_preset {
+    std::string name;
+    // TODO: support repeated args in the future
+    std::map<common_arg, std::string> options;
+
+    // convert preset to CLI argument list
+    std::vector<std::string> to_args() const;
+
+    // convert preset to INI format string
+    std::string to_ini() const;
+
+    // TODO: maybe implement to_env() if needed
+};
+
+// interface for multiple presets in one file
+using common_presets = std::map<std::string, common_preset>;
+common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
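For context, a sketch of how the preset API declared here might be exercised. `common_params_parser_init`, `common_presets_load`, `common_presets`, and `common_preset::to_args()` come from this change; the file name, the INI contents shown in the comment, and the driver code are assumptions, and the accepted INI syntax follows the PEG grammar in the parser above (sections, `key = value`, `;`/`#` comments, keys being CLI argument names without dashes or env variable names).

```cpp
// Illustrative only: load router-server presets from an INI file and dump
// each preset as the CLI argument list it expands to.
#include <cstdio>

#include "arg.h"
#include "preset.h"

int main() {
    common_params params;
    common_params_context ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);

    // presets.ini (hypothetical) might look like:
    //   [default]        ; section name == preset name
    //   ctx-size = 4096  # key is the arg name without leading dashes
    //   jinja    = on
    common_presets presets = common_presets_load("presets.ini", ctx);

    for (const auto & [name, preset] : presets) {
        printf("[%s]\n", name.c_str());
        for (const auto & arg : preset.to_args()) {
            printf("  %s\n", arg.c_str());
        }
    }
    return 0;
}
```

Note that `to_args()` drops flag options whose value is falsey ("off", "disabled", "0"), as implemented above.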
@@ -56,7 +56,7 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /model
 or with a server image:

 ```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
 ```

 ## Docker With CUDA
@@ -91,7 +91,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
 ```bash
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```

 ## Docker With MUSA
@@ -125,5 +125,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne
 ```bash
 docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```
@@ -2305,13 +2305,11 @@ extern "C" {
             float stop,
             float step);

-#define GGML_KQ_MASK_PAD 1
-
     // q:    [n_embd_k, n_batch, n_head, ne3 ]
     // k:    [n_embd_k, n_kv, n_head_kv, ne3 ]
     // v:    [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
-    // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // mask: [n_kv, n_batch, ne32, ne33]
     // res:  [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
     //
     // broadcast:
     //   n_head % n_head_kv == 0
@@ -187,6 +187,9 @@ typedef void * thread_ret_t;

 typedef pthread_t ggml_thread_t;

+#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
+#define GGML_THREADPOOL_N_THREADS_BITS (16)
+
 #if defined(__APPLE__)
 #include <unistd.h>
 #include <mach/mach.h>
@@ -449,7 +452,7 @@ struct ggml_threadpool {
     struct ggml_cplan * cplan;

     // synchronization primitives
-    atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
+    atomic_int n_graph; // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -457,12 +460,10 @@ struct ggml_threadpool {
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop; // Used for stopping the threadpool altogether
     atomic_bool pause; // Used for pausing the threadpool or individual threads
     atomic_int abort; // Used for aborting processing of a graph

     struct ggml_compute_state * workers; // per thread state
-    int n_threads_max; // number of threads in the pool
-    atomic_int n_threads_cur; // number of threads used in the current graph
+    int n_threads; // Number of threads in the pool

     int32_t prio; // Scheduling priority
     uint32_t poll; // Polling level (0 - no polling)
@@ -539,7 +540,7 @@ struct ggml_state {
 static struct ggml_state g_state = {0};

 void ggml_barrier(struct ggml_threadpool * tp) {
-    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+    int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
     if (n_threads == 1) {
         return;
     }
@@ -556,7 +557,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);

-        // exit barrier (fill seq-cst fence)
+        // exit barrier (full seq-cst fence)
         atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
         return;
     }
@@ -2628,7 +2629,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;

-    const int n_threads = threadpool->n_threads_max;
+    const int n_threads = threadpool->n_threads;

 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
@@ -2704,7 +2705,7 @@ struct ggml_cplan ggml_graph_plan(
         //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
     }
     if (n_threads <= 0) {
-        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+        n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
     }

 #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@@ -2912,12 +2913,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

     struct ggml_compute_params params = {
         /*.ith =*/ state->ith,
-        /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+        /*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
         /*.wsize =*/ cplan->work_size,
         /*.wdata =*/ cplan->work_data,
         /*.threadpool=*/ tp,
     };

+    GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
@@ -2939,6 +2942,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
     }

+    GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     ggml_barrier(state->threadpool);

     return 0;
@@ -2946,27 +2951,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

 #ifndef GGML_USE_OPENMP

-// check if thread is active
-static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    return (state->ith < n_threads);
-}
-
 // check if thread is ready to proceed (exit from polling or sleeping)
+// returns true if loops should exit, sets state->pending to indicate new work
 static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;

     if (state->pending || threadpool->stop || threadpool->pause) { return true; }

     // check for new graph/work
-    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
-    if (new_graph != state->last_graph) {
-        state->pending = ggml_graph_compute_thread_active(state);
-        state->last_graph = new_graph;
+    int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
+    if (n_graph != state->last_graph) {
+        state->pending = (state->ith < n_threads);
+        state->last_graph = n_graph;
+        return true;
     }

-    return state->pending;
+    return false;
 }

 // sync thread state after polling
@@ -2983,11 +2984,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;

-    // Skip polling for unused threads
-    if (!ggml_graph_compute_thread_active(state)) {
-        return state->pending;
-    }
-
     // This seems to make 0 ... 100 a decent range for polling level across modern processors.
     // Perhaps, we can adjust it dynamically based on load and things.
     const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@@ -3049,7 +3045,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
         ggml_graph_compute_check_for_work(state);
         if (state->pending) {
             state->pending = false;
-
             ggml_graph_compute_thread(state);
         }
     }
@@ -3064,14 +3059,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int

     ggml_mutex_lock(&threadpool->mutex);

-    GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+    // Update the number of active threads and the graph count
+    int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
+    n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);

-    // Update the number of active threads
-    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+    GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);

     // Indicate the graph is ready to be processed
     // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
-    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+    atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);

     if (threadpool->pause) {
         // Update main thread prio and affinity to match the threadpool settings
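As an aside on the threadpool changes above: the old `n_threads_cur` counter is folded into `n_graph`, whose low `GGML_THREADPOOL_N_THREADS_BITS` bits now carry the active thread count while the upper bits carry the graph counter. A small standalone sketch of that packing (illustrative only; plain integers instead of the atomics used in the real code):

```cpp
// Illustrative only: the packing scheme used by n_graph after this change.
#include <cassert>
#include <cstdint>

#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
#define GGML_THREADPOOL_N_THREADS_BITS (16)

int main() {
    uint32_t n_graph = 0; // packed value stored in threadpool->n_graph

    // kickoff: bump the graph counter and publish the active thread count
    const int n_threads = 8;
    uint32_t counter = n_graph >> GGML_THREADPOOL_N_THREADS_BITS;
    n_graph = ((counter + 1) << GGML_THREADPOOL_N_THREADS_BITS) |
              (n_threads & GGML_THREADPOOL_N_THREADS_MASK);

    // worker side: a single load recovers both pieces of information
    assert((int)(n_graph & GGML_THREADPOOL_N_THREADS_MASK) == n_threads);
    assert((n_graph >> GGML_THREADPOOL_N_THREADS_BITS) == 1);
    return 0;
}
```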
@@ -3109,8 +3105,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->pause = tpp->paused;
     threadpool->abort = -1;
     threadpool->workers = NULL;
-    threadpool->n_threads_max = tpp->n_threads;
-    threadpool->n_threads_cur = tpp->n_threads;
+    threadpool->n_threads = tpp->n_threads;
     threadpool->poll = tpp->poll;
     threadpool->prio = tpp->prio;
     threadpool->ec = GGML_STATUS_SUCCESS;
@@ -3205,7 +3200,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         {
             // update the number of threads from the actual number of threads that we got from OpenMP
             n_threads = omp_get_num_threads();
-            atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+            atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
         }

         // Apply thread CPU mask and priority
@@ -3218,13 +3213,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                 ggml_graph_compute_thread(&threadpool->workers[ith]);
             }
         } else {
-            atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+            atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
             ggml_graph_compute_thread(&threadpool->workers[0]);
         }
 #else
-    if (n_threads > threadpool->n_threads_max) {
-        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
-        n_threads = threadpool->n_threads_max;
+    if (n_threads > threadpool->n_threads) {
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+        n_threads = threadpool->n_threads;
     }

     // Kick all threads to start the new graph
@@ -4326,6 +4326,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_UNARY_OP_EXPM1:
         case GGML_UNARY_OP_SOFTPLUS:
         case GGML_UNARY_OP_ELU:
+        case GGML_UNARY_OP_XIELU:
         case GGML_UNARY_OP_FLOOR:
         case GGML_UNARY_OP_CEIL:
         case GGML_UNARY_OP_ROUND:
@@ -73,15 +73,15 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
     return (1 - MIN(1, MAX(0, y)));
 }

 static void rope_cache_init(const float theta_base,
-    float freq_scale,
+    const float freq_scale,
     const float * freq_factors,
     float * corr_dims,
-    uint32_t ne0,
-    float ext_factor,
-    float mscale,
+    const uint32_t ne0,
+    const float ext_factor,
+    const float mscale,
     float * cache,
-    float theta_scale) {
+    const float theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
     float theta = theta_base;

@@ -92,18 +92,19 @@ static void rope_cache_init(const float theta_base,

         // Get n-d rotational scaling corrected for extrapolation
         float theta_interp = freq_scale * theta_extrap;
-        float theta2 = theta_interp;
+        float theta_final = theta_interp;
+        float mscale_final = mscale;

         if (ext_factor != 0.0f) {
             float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-            theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+            theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;

             // Get n-d magnitude scaling corrected for interpolation
-            mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+            mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale);
         }

-        cache[i0 + 0] = cosf(theta2) * mscale;
-        cache[i0 + 1] = sinf(theta2) * mscale;
+        cache[i0 + 0] = cosf(theta_final) * mscale_final;
+        cache[i0 + 1] = sinf(theta_final) * mscale_final;

         theta *= theta_scale;
     }
@@ -151,9 +152,9 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
 }

 static void hvx_calc_rope_neox_f32(const float * restrict src0,
     float * restrict dst,
     const int num_elems,
     const float * restrict theta_cache) {
     // for (int i = 0; i < num_elems; i += 2) {
     //const float cos_theta = theta_cache[i + 0];
     //const float sin_theta = theta_cache[i + 1];
@@ -192,7 +193,7 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0,
     HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
     HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);

     *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
     *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);

     src0_curr += VLEN;
@@ -259,7 +260,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
     const uint32_t ir1,
     int nth,
     int ith,
-    int opt_path) {
+    const int opt_path) {
     struct htp_ops_context * octx = rope_ctx->octx;

     const struct htp_tensor * src0 = &octx->src0;
@@ -267,8 +268,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
     const struct htp_tensor * src2 = &octx->src2;
     struct htp_tensor * dst = &octx->dst;

     const int32_t mode = rope_ctx->mode;
     const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;

     htp_rope_preamble;

@@ -281,8 +282,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
         freq_factors = (const float *) src2->data;
     }

-    int ir = 0;
+    const uint32_t i1_end = MIN(ir1, ne1);
+    const int32_t half_dims = rope_ctx->n_dims / 2;
+    const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float);
     for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch
         for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len
             const int32_t p = pos[i2];
@@ -290,14 +292,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
             rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
                 rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);

-            for (uint32_t i1 = 0; i1 < ne1; i1++) { // attn-heads
-                if (ir++ < ir0) {
-                    continue;
-                }
-                if (ir > ir1) {
-                    break;
-                }
-
+            for (uint32_t i1 = ir0; i1 < i1_end; i1++) { // attn-heads
                 const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
                 float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);
@@ -310,6 +305,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                     } else {
                         hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
                     }

+                    src_loc += rope_ctx->n_dims;
+                    dst_data_loc += rope_ctx->n_dims;
+
                 } else {
                     for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
                         const float cos_theta = wp0[i0 + 0];
@@ -317,10 +315,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,

                         if (is_neox) {
                             const float x0 = src_loc[0];
-                            const float x1 = src_loc[rope_ctx->n_dims/2];
+                            const float x1 = src_loc[half_dims];

                             dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
-                            dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;
+                            dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta;

                             src_loc += 1;
                             dst_data_loc += 1;
@@ -335,15 +333,13 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                         dst_data_loc += 2;
                     }
                 }

+                src_loc += (is_neox ? half_dims : 0);
+                dst_data_loc += (is_neox ? half_dims : 0);
                 }

-                for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
-                    dst_data_loc[0] = src_loc[0];
-                    dst_data_loc[1] = src_loc[1];
-
-                    src_loc += 2;
-                    dst_data_loc += 2;
-                }
+                // TODO: use simd to speed up the remaining elements copy
+                memcpy(dst_data_loc, src_loc, remain_bytes);
             }
         }
     }
@@ -5260,8 +5260,6 @@ struct ggml_tensor * ggml_flash_attn_ext(

     if (mask) {
         GGML_ASSERT(ggml_is_contiguous(mask));
-        GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
-                "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
         //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));

         GGML_ASSERT(q->ne[2] % mask->ne[2] == 0);
@@ -714,7 +714,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
     udata->output .resize(n_tokens);

-    udata->seq_id_data.resize(n_tokens);
+    udata->seq_id_data.reserve(n_tokens);

     seq_set_t seq_set_unq;

@@ -737,15 +737,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }

         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
-        udata->seq_id_data[i].reserve(udata->n_seq_id[i]);
-        for (int s = 0; s < udata->n_seq_id[i]; ++s) {
-            udata->seq_id_data[i].push_back(batch.seq_id[idxs[i]][s]);
-        }
-        udata->seq_id[i] = udata->seq_id_data[i].data();
         udata->output[i] = batch.logits[idxs[i]];

         for (int s = 0; s < udata->n_seq_id[i]; ++s) {
-            seq_set_unq.set(udata->seq_id[i][s]);
+            const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
+
+            udata->seq_id_data.push_back(seq_id);
+            seq_set_unq.set(seq_id);
         }

         if (udata->output[i]) {
@@ -753,6 +751,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
     }

+    llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        udata->seq_id[i] = seq_id_ptr;
+        seq_id_ptr += udata->n_seq_id[i];
+    }
+
     for (uint32_t s = 0; s < n_seq_max; ++s) {
         if (seq_set_unq.test(s)) {
             udata->seq_idx[s] = udata->seq_id_unq.size();
@@ -56,15 +56,15 @@ struct llama_ubatch {
         std::vector<float> embd;
         std::vector<llama_pos> pos;
         std::vector<int32_t> n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
         std::vector<llama_seq_id> seq_id_unq;
        std::vector<int32_t> seq_idx;
         std::vector<int8_t> output;

-        std::vector<std::vector<llama_seq_id>> seq_id_data;
+        std::vector<llama_seq_id> seq_id_data;
     };

-    // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
+    // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
     std::shared_ptr<data_t> data;
 };

@@ -112,14 +112,6 @@ llama_context::llama_context(
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;

-    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
-    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
-    }
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

     cparams.op_offload = params.op_offload;
@@ -386,7 +386,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
    //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

    res &= self_kq_mask->ne[0] == mctx->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;

    return res;
}

@@ -417,10 +417,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
    //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

    res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;

    res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
-    res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;

    return res;
}

@@ -453,7 +453,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
            }
        }

-        for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+        for (int i = n_tokens; i < n_tokens; ++i) {
            for (int j = 0; j < n_enc; ++j) {
                data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
            }

@@ -1545,13 +1545,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
    auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);

    // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
    ggml_set_input(inp->self_kq_mask);

    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
        ggml_set_input(inp->self_kq_mask_swa);

        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;

@@ -1633,7 +1633,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
    inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
    inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);

-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
    ggml_set_input(inp->self_kq_mask);

    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

@@ -1776,7 +1776,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {

    const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;

-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
    ggml_set_input(inp->cross_kq_mask);

    inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;

@@ -1842,7 +1842,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
    inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
    inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);

-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
    ggml_set_input(inp->self_kq_mask);
    ggml_set_name(inp->self_kq_mask, "self_kq_mask");

@@ -1858,7 +1858,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
    inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
    inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);

-    inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
    ggml_set_input(inp->self_kq_mask_swa);
    ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
@@ -1232,8 +1232,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
    GGML_ASSERT(n_tokens%n_stream == 0);

    // n_tps == n_tokens_per_stream
    const int64_t n_tps = n_tokens/n_stream;
-    const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);

    std::fill(data, data + ggml_nelements(dst), -INFINITY);

@@ -1266,7 +1265,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
                const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
                const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;

-                const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
+                const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);

                for (uint32_t j = 0; j < n_kv; ++j) {
                    if (cells.is_empty(j)) {
@@ -5875,7 +5875,7 @@ struct test_flash_attn_ext : public test_case {

        ggml_tensor * m = nullptr;
        if (mask) {
-            m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, nr23[1]);
+            m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, nb, 1, nr23[1]);
            ggml_set_name(m, "m");
        }
@@ -11,19 +11,7 @@

#define MAX_NARGS 2

-int main(int argc, char *argv[]) {
-
-    int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
-    int n_rounds = 100;
-
-    if (argc > 1) {
-        n_threads = std::atoi(argv[1]);
-    }
-
-    if (argc > 2) {
-        n_rounds = std::atoi(argv[2]);
-    }
+static void test_barrier(int n_threads, int n_rounds) {

    struct ggml_init_params params = {
        /* .mem_size = */ 1024*1024*1024,
        /* .mem_buffer = */ NULL,

@@ -56,7 +44,7 @@ int main(int argc, char *argv[]) {
        exit(1);
    }

-    // Create compute plan
+    // The test runs with constant number of threads
    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);

    std::vector<uint8_t> work_data(cplan.work_size);

@@ -89,6 +77,160 @@ int main(int argc, char *argv[]) {

    ggml_threadpool_free(threadpool);
    ggml_free(ctx);
+}
+
+static void test_active(int n_threads, int n_rounds) {
+    struct ggml_init_params params = {
+        /* .mem_size = */ 1024*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    // Create graph
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+
+    // Small graph with, parallel ops with barriers
+    struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
+    for (int i = 0; i < 2; i++) {
+        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
+        out = ggml_mul_mat(ctx, a, out);
+
+        struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
+        out = ggml_mul_mat(ctx, d, out);
+    }
+
+    ggml_build_forward_expand(gf, out);
+    int n_nodes = ggml_graph_n_nodes(gf);
+
+    // Create threadpool
+    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
+    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
+        exit(1);
+    }
+
+    std::cerr << "graph-compute with"
+        << "\n n_threads: " << n_threads
+        << "\n n_nodes: " << n_nodes
+        << "\n n_rounds: " << n_rounds
+        << "\n";
+    // ggml_graph_print(gf);
+
+    // In this test we keep changing the number of threads every 4th iteration
+    // to test for race conditions in that path
+
+    for (int i=0; i < n_rounds; i++) {
+        struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
+
+        std::vector<uint8_t> work_data(cplan.work_size);
+        cplan.work_data = work_data.data();
+
+        ggml_graph_compute(gf, &cplan);
+    }
+
+    ggml_threadpool_free(threadpool);
+    ggml_free(ctx);
+}
+
+static void test_multi_graph(int n_threads, int n_rounds) {
+    struct ggml_init_params params = {
+        /* .mem_size = */ 1024*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    // Create graphs
+    struct ggml_cgraph * gf0 = ggml_new_graph(ctx);
+    {
+        // Small graph with parallel ops with barriers
+        struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
+        for (int i = 0; i < 2; i++) {
+            struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
+            out = ggml_mul_mat(ctx, a, out);
+
+            struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
+            out = ggml_mul_mat(ctx, d, out);
+        }
+
+        ggml_build_forward_expand(gf0, out);
+    }
+
+    struct ggml_cgraph * gf1 = ggml_new_graph(ctx);
+    {
+        // Small graph with parallel ops with barriers
+        // Use larger tensors to make sure work_data size is larger than gf0
+        struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 256);
+        for (int i = 0; i < 4; i++) {
+            struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 256, 128);
+            out = ggml_mul_mat(ctx, a, out);
+
+            struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 256);
+            out = ggml_mul_mat(ctx, d, out);
+        }
+
+        ggml_build_forward_expand(gf1, out);
+    }
+
+    // Create threadpool
+    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
+    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
+        exit(1);
+    }
+
+    std::cerr << "graph-compute with"
+        << "\n gf0 n_nodes: " << ggml_graph_n_nodes(gf0)
+        << "\n gf1 n_nodes: " << ggml_graph_n_nodes(gf1)
+        << "\n n_threads: " << n_threads
+        << "\n n_rounds: " << n_rounds
+        << "\n";
+
+    // In this test we keep changing the number of threads every 4th iteration
+    // and we compute two graphs back to back to test graph frequent graph switching
+
+    for (int i=0; i < n_rounds; i++) {
+        struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
+        std::vector<uint8_t> work_data0(cplan0.work_size);
+        cplan0.work_data = work_data0.data();
+
+        struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
+        std::vector<uint8_t> work_data1(cplan1.work_size);
+        cplan1.work_data = work_data1.data();
+
+        ggml_graph_compute(gf0, &cplan0);
+        ggml_graph_compute(gf1, &cplan1);
+    }
+
+    ggml_threadpool_free(threadpool);
+    ggml_free(ctx);
+}
+
+int main(int argc, char *argv[]) {
+
+    int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
+    int n_rounds = 100;
+
+    if (argc > 1) {
+        n_threads = std::atoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        n_rounds = std::atoi(argv[2]);
+    }
+
+    test_barrier(n_threads, n_rounds);
+
+    test_active(n_threads, n_rounds * 100);
+
+    test_multi_graph(n_threads, n_rounds * 10);
+
    return 0;
}
@@ -86,6 +86,10 @@ static void sigint_handler(int signo) {
int main(int argc, char ** argv) {
    common_params params;
    g_params = &params;

+    // disable jinja by default
+    params.use_jinja = false;
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
        return 1;
    }
@@ -595,11 +595,12 @@ struct clip_graph {
            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);

-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);

        } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
            cur = build_ffn(cur,

@@ -667,16 +668,12 @@ struct clip_graph {

        // LlavaMultiModalProjector (always using GELU activation)
        {
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            if (model.mm_1_b) {
-                cur = ggml_add(ctx0, cur, model.mm_1_b);
-            }
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            if (model.mm_2_b) {
-                cur = ggml_add(ctx0, cur, model.mm_2_b);
-            }
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU,
+                -1);
        }

        // arrangement of the [IMG_BREAK] token

@@ -775,10 +772,6 @@ struct clip_graph {

            // if flash attn is used, we need to pad the mask and cast to f16
            if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
-                int n_pad = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD) - window_mask->ne[1];
-                if (n_pad > 0) {
-                    window_mask = ggml_pad(ctx0, window_mask, 0, n_pad, 0, 0);
-                }
                window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
            }

@@ -791,7 +784,7 @@ struct clip_graph {

        // loop over layers
        for (int il = 0; il < n_layer; il++) {
-            auto & layer = model.layers[il];
+            const auto & layer = model.layers[il];
            const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;

            ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

@@ -870,16 +863,12 @@ struct clip_graph {
        // multimodal projection
        ggml_tensor * embeddings = inpL;
        embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
-        // GELU activation
-        embeddings = ggml_gelu(ctx0, embeddings);
-
-        // Second linear layer
-        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+        embeddings = build_ffn(embeddings,
+            model.mm_0_w, model.mm_0_b,
+            nullptr, nullptr,
+            model.mm_1_w, model.mm_1_b,
+            FFN_GELU,
+            -1);

        if (use_window_attn) {
            window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);

@@ -1257,11 +1246,12 @@ struct clip_graph {
            // projector LayerNorm uses pytorch's default eps = 1e-5
            // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
            cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_3_b);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_3_w, model.mm_3_b,
+                FFN_GELU,
+                -1);
        }

        // build the graph

@@ -1412,11 +1402,12 @@ struct clip_graph {
        cb(cur, "proj_inp_normed", -1);

        // projection mlp
-        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_1_b);
-        cur = ggml_gelu(ctx0, cur);
-        cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_2_b);
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
        cb(cur, "proj_out", -1);
    }

@@ -1887,9 +1878,12 @@ struct clip_graph {

        } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
            // projector
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_gelu_erf(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+            cur = build_ffn(cur,
+                model.mm_1_w, model.mm_1_b,
+                nullptr, nullptr,
+                model.mm_2_w, model.mm_2_b,
+                FFN_GELU_ERF,
+                -1);

        } else {
            GGML_ABORT("%s: unknown projector type", __func__);
@@ -2074,34 +2068,66 @@ private:

        // self-attention
        {
-            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
-            if (layer.q_b) {
-                Qcur = ggml_add(ctx0, Qcur, layer.q_b);
-            }
-
-            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
-            if (layer.k_b) {
-                Kcur = ggml_add(ctx0, Kcur, layer.k_b);
-            }
-
-            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
-            if (layer.v_b) {
-                Vcur = ggml_add(ctx0, Vcur, layer.v_b);
-            }
-
-            if (layer.q_norm) {
-                Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
-                cb(Qcur, "Qcur_norm", il);
-            }
-
-            if (layer.k_norm) {
-                Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
-                cb(Kcur, "Kcur_norm", il);
-            }
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+            ggml_tensor * Qcur = nullptr;
+            ggml_tensor * Kcur = nullptr;
+            ggml_tensor * Vcur = nullptr;
+            if (layer.qkv_w != nullptr) {
+                // fused qkv
+                cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+                if (layer.qkv_b != nullptr) {
+                    cur = ggml_add(ctx0, cur, layer.qkv_b);
+                }
+
+                Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1 */ ggml_row_size(cur->type, d_head),
+                    /* nb2 */ cur->nb[1],
+                    /* offset */ 0);
+
+                Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1 */ ggml_row_size(cur->type, d_head),
+                    /* nb2 */ cur->nb[1],
+                    /* offset */ ggml_row_size(cur->type, n_embd));
+
+                Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1 */ ggml_row_size(cur->type, d_head),
+                    /* nb2 */ cur->nb[1],
+                    /* offset */ ggml_row_size(cur->type, 2 * n_embd));
+
+                // TODO: q/k norm requires row size == n_embd, while here it's d_head
+                // we can add support in the future if needed
+                GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
+            } else {
+                // separate q, k, v
+                Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                if (layer.q_b) {
+                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                }
+
+                Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                if (layer.k_b) {
+                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                }
+
+                Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                if (layer.v_b) {
+                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                }
+
+                if (layer.q_norm) {
+                    Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                    cb(Qcur, "Qcur_norm", il);
+                }
+
+                if (layer.k_norm) {
+                    Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                    cb(Kcur, "Kcur_norm", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+            }

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
@@ -270,6 +270,7 @@ int main(int argc, char ** argv) {
    ggml_time_init();

    common_params params;
+    params.use_jinja = false; // disable jinja by default
    params.sampling.temp = 0.2; // lower temp by default for better quality

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {

@@ -317,7 +318,9 @@ int main(int argc, char ** argv) {
        g_is_generating = true;
        if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
            for (size_t i = 0; i < params.image.size(); i++) {
-                params.prompt += mtmd_default_marker();
+                // most models require the marker before each image
+                // ref: https://github.com/ggml-org/llama.cpp/pull/17616
+                params.prompt = mtmd_default_marker() + params.prompt;
            }
        }
        common_chat_msg msg;
@@ -32,23 +32,32 @@ fi

arr_prefix=()
arr_hf=()
-arr_tmpl=() # chat template
+arr_extra_args=()
arr_file=()

add_test_vision() {
    local hf=$1
-    local tmpl=${2:-""} # default to empty string if not provided
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
    arr_prefix+=("[vision]")
    arr_hf+=("$hf")
-    arr_tmpl+=("$tmpl")
+    arr_extra_args+=("$extra_args")
    arr_file+=("test-1.jpeg")
}

add_test_audio() {
    local hf=$1
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
    arr_prefix+=("[audio] ")
    arr_hf+=("$hf")
-    arr_tmpl+=("") # no need for chat tmpl
+    arr_extra_args+=("$extra_args")
    arr_file+=("test-2.mp3")
}

@@ -56,9 +65,9 @@ add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
-add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
-add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
-add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
+add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
+add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
+add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"

@@ -79,7 +88,7 @@ add_test_audio "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
# to test the big models, run: ./tests.sh big
if [ "$RUN_BIG_TESTS" = true ]; then
    add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
-    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
+    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
    add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"

@@ -89,7 +98,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
    add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
    # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
-    add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"
+    # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working

    add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
    add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"

@@ -122,21 +131,25 @@ for i in "${!arr_hf[@]}"; do
    bin="llama-mtmd-cli"
    prefix="${arr_prefix[$i]}"
    hf="${arr_hf[$i]}"
-    tmpl="${arr_tmpl[$i]}"
+    extra_args="${arr_extra_args[$i]}"
    inp_file="${arr_file[$i]}"

    echo "Running test with binary: $bin and HF model: $hf"
    echo ""
    echo ""

-    output=$(\
-        "$PROJ_ROOT/build/bin/$bin" \
-        -hf "$hf" \
-        --image $SCRIPT_DIR/$inp_file \
-        -p "what is the publisher name of the newspaper?" \
+    cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
+        -hf $(printf %q "$hf") \
+        --image $(printf %q "$SCRIPT_DIR/$inp_file") \
        --temp 0 -n 128 \
-        ${tmpl:+--chat-template "$tmpl"} \
-        2>&1 | tee /dev/tty)
+        ${extra_args}"
+
+    # if extra_args does not contain -p, we add a default prompt
+    if ! [[ "$extra_args" =~ "-p" ]]; then
+        cmd+=" -p \"what is the publisher name of the newspaper?\""
+    fi
+
+    output=$(eval "$cmd" 2>&1 | tee /dev/tty)

    echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log

@@ -144,9 +157,9 @@ for i in "${!arr_hf[@]}"; do
    if echo "$output" | grep -iq "new york" \
        || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
    then
-        result="$prefix \033[32mOK\033[0m: $bin $hf"
+        result="$prefix \033[32mOK\033[0m: $hf"
    else
-        result="$prefix \033[31mFAIL\033[0m: $bin $hf"
+        result="$prefix \033[31mFAIL\033[0m: $hf"
    fi
    echo -e "$result"
    arr_res+=("$result")
@@ -38,6 +38,14 @@ set(TARGET_SRCS
    server-http.h
    server-models.cpp
    server-models.h
+    server-task.cpp
+    server-task.h
+    server-queue.cpp
+    server-queue.h
+    server-common.cpp
+    server-common.h
+    server-context.cpp
+    server-context.h
    )
set(PUBLIC_ASSETS
    index.html.gz
@@ -1369,6 +1369,11 @@ llama-server

### Model sources

+There are 3 possible sources for model files:
+1. Cached models (controlled by the `LLAMA_CACHE` environment variable)
+2. Custom model directory (set via the `--models-dir` argument)
+3. Custom preset (set via the `--models-preset` argument)
+
By default, the router looks for models in the cache. You can add Hugging Face models to the cache with:

```sh

@@ -1413,6 +1418,51 @@ llama-server -ctx 8192 -n 1024 -np 2

Note: model instances inherit both command line arguments and environment variables from the router server.

+Alternatively, you can also add a GGUF-based preset (see next section).
+
+### Model presets
+
+Model presets allow advanced users to define custom configurations using an `.ini` file:
+
+```sh
+llama-server --models-preset ./my-models.ini
+```
+
+Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layer 123` is written as `n-gpu-layer = 123`.
+
+Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys.
+
+Example:
+
+```ini
+version = 1
+
+; If the key corresponds to an existing model on the server,
+; this will be used as the default config for that model
+[ggml-org/MY-MODEL-GGUF:Q8_0]
+; string value
+chat-template = chatml
+; numeric value
+n-gpu-layer = 123
+; flag value (for certain flags, you need to use the "no-" prefix for negation)
+jinja = true
+; shorthand argument (for example, context size)
+c = 4096
+; environment variable name
+LLAMA_ARG_CACHE_RAM = 0
+; file paths are relative to server's CWD
+model-draft = ./my-models/draft.gguf
+; but it's RECOMMENDED to use absolute path
+model-draft = /Users/abc/my-models/draft.gguf
+
+; If the key does NOT correspond to an existing model,
+; you need to specify at least the model path
+[custom_model]
+model = /Users/abc/my-awesome-model-Q4_K_M.gguf
+```
+
+Note: some arguments are controlled by the router (e.g., host, port, API key, HF repo, model alias). They will be removed or overwritten upon loading.
+
### Routing requests

Requests are routed according to the requested model name.

Binary file not shown.
|
||||||
#include "server-common.h"
|
#include "server-common.h"
|
||||||
#include "server-models.h"
|
#include "server-models.h"
|
||||||
|
|
||||||
|
#include "preset.h"
|
||||||
#include "download.h"
|
#include "download.h"
|
||||||
|
|
||||||
#include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
|
#include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
|
||||||
|
|
@ -33,6 +34,10 @@
|
||||||
|
|
||||||
#define CMD_EXIT "exit"
|
#define CMD_EXIT "exit"
|
||||||
|
|
||||||
|
// address for child process, this is needed because router may run on 0.0.0.0
|
||||||
|
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
|
||||||
|
#define CHILD_ADDR "127.0.0.1"
|
||||||
|
|
||||||
static std::filesystem::path get_server_exec_path() {
|
static std::filesystem::path get_server_exec_path() {
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
|
wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
|
||||||
|
|
@ -132,6 +137,93 @@ static std::vector<local_model> list_local_models(const std::string & dir) {
|
||||||
return models;
|
return models;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// server_presets
|
||||||
|
//
|
||||||
|
|
||||||
|
|
||||||
|
server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path)
|
||||||
|
: ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) {
|
||||||
|
if (!presets_path.empty()) {
|
||||||
|
presets = common_presets_load(presets_path, ctx_params);
|
||||||
|
SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// populate reserved args (will be appended by the router)
|
||||||
|
for (auto & opt : ctx_params.options) {
|
||||||
|
if (opt.env == nullptr) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
std::string env = opt.env;
|
||||||
|
if (env == "LLAMA_ARG_PORT" ||
|
||||||
|
env == "LLAMA_ARG_HOST" ||
|
||||||
|
env == "LLAMA_ARG_ALIAS" ||
|
||||||
|
env == "LLAMA_ARG_API_KEY" ||
|
||||||
|
env == "LLAMA_ARG_MODELS_DIR" ||
|
||||||
|
env == "LLAMA_ARG_MODELS_MAX" ||
|
||||||
|
env == "LLAMA_ARG_MODELS_PRESET" ||
|
||||||
|
env == "LLAMA_ARG_MODEL" ||
|
||||||
|
env == "LLAMA_ARG_MMPROJ" ||
|
||||||
|
env == "LLAMA_ARG_HF_REPO" ||
|
||||||
|
env == "LLAMA_ARG_NO_MODELS_AUTOLOAD") {
|
||||||
|
control_args[env] = opt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// read base args from router's argv
|
||||||
|
common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
|
||||||
|
|
||||||
|
// remove any router-controlled args from base_args
|
||||||
|
for (const auto & cargs : control_args) {
|
||||||
|
auto it = base_args.find(cargs.second);
|
||||||
|
if (it != base_args.end()) {
|
||||||
|
base_args.erase(it);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
common_preset server_presets::get_preset(const std::string & name) {
|
||||||
|
auto it = presets.find(name);
|
||||||
|
if (it != presets.end()) {
|
||||||
|
return it->second;
|
||||||
|
}
|
||||||
|
return common_preset();
|
||||||
|
}
|
||||||
|
|
||||||
|
void server_presets::render_args(server_model_meta & meta) {
|
||||||
|
common_preset preset = meta.preset; // copy
|
||||||
|
// merging 3 kinds of args:
|
||||||
|
// 1. model-specific args (from preset)
|
||||||
|
// force removing control args if any
|
||||||
|
for (auto & cargs : control_args) {
|
||||||
|
if (preset.options.find(cargs.second) != preset.options.end()) {
|
||||||
|
SRV_WRN("Preset '%s' contains reserved arg '%s', removing it\n", preset.name.c_str(), cargs.second.args[0]);
|
||||||
|
preset.options.erase(cargs.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 2. base args (from router)
|
||||||
|
// inherit from base args
|
||||||
|
for (const auto & [arg, value] : base_args) {
|
||||||
|
preset.options[arg] = value;
|
||||||
|
}
|
||||||
|
// 3. control args (from router)
|
||||||
|
// set control values
|
||||||
|
preset.options[control_args["LLAMA_ARG_HOST"]] = CHILD_ADDR;
|
||||||
|
preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port);
|
||||||
|
preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name;
|
||||||
|
if (meta.in_cache) {
|
||||||
|
preset.options[control_args["LLAMA_ARG_HF_REPO"]] = meta.name;
|
||||||
|
} else {
|
||||||
|
preset.options[control_args["LLAMA_ARG_MODEL"]] = meta.path;
|
||||||
|
if (!meta.path_mmproj.empty()) {
|
||||||
|
preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
meta.args = preset.to_args();
|
||||||
|
// add back the binary path at the front
|
||||||
|
meta.args.insert(meta.args.begin(), get_server_exec_path().string());
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// server_models
|
// server_models
|
||||||
//
|
//
|
||||||
|
|
@ -140,7 +232,7 @@ server_models::server_models(
|
||||||
const common_params & params,
|
const common_params & params,
|
||||||
int argc,
|
int argc,
|
||||||
char ** argv,
|
char ** argv,
|
||||||
char ** envp) : base_params(params) {
|
char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) {
|
||||||
for (int i = 0; i < argc; i++) {
|
for (int i = 0; i < argc; i++) {
|
||||||
base_args.push_back(std::string(argv[i]));
|
base_args.push_back(std::string(argv[i]));
|
||||||
}
|
}
|
||||||
|
|
@ -155,11 +247,58 @@ server_models::server_models(
|
||||||
LOG_WRN("failed to get server executable path: %s\n", e.what());
|
LOG_WRN("failed to get server executable path: %s\n", e.what());
|
||||||
LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
|
LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
|
||||||
}
|
}
|
||||||
// TODO: allow refreshing cached model list
|
load_models();
|
||||||
// add cached models
|
}
|
||||||
|
|
||||||
|
void server_models::add_model(server_model_meta && meta) {
|
||||||
|
if (mapping.find(meta.name) != mapping.end()) {
|
||||||
|
throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str()));
|
||||||
|
}
|
||||||
|
presets.render_args(meta); // populate meta.args
|
||||||
|
std::string name = meta.name;
|
||||||
|
mapping[name] = instance_t{
|
||||||
|
/* subproc */ std::make_shared<subprocess_s>(),
|
||||||
|
/* th */ std::thread(),
|
||||||
|
/* meta */ std::move(meta)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<local_model> list_custom_path_models(server_presets & presets) {
|
||||||
|
// detect any custom-path models in presets
|
||||||
|
std::vector<local_model> custom_models;
|
||||||
|
for (auto & [model_name, preset] : presets.presets) {
|
||||||
|
local_model model;
|
||||||
|
model.name = model_name;
|
||||||
|
std::vector<common_arg> to_erase;
|
||||||
|
for (auto & [arg, value] : preset.options) {
|
||||||
|
std::string env(arg.env ? arg.env : "");
|
||||||
|
if (env == "LLAMA_ARG_MODEL") {
|
||||||
|
model.path = value;
|
||||||
|
to_erase.push_back(arg);
|
||||||
|
}
|
||||||
|
if (env == "LLAMA_ARG_MMPROJ") {
|
||||||
|
model.path_mmproj = value;
|
||||||
|
to_erase.push_back(arg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto & arg : to_erase) {
|
||||||
|
preset.options.erase(arg);
|
||||||
|
}
|
||||||
|
if (!model.name.empty() && !model.path.empty()) {
|
||||||
|
custom_models.push_back(model);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return custom_models;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: allow refreshing cached model list
|
||||||
|
void server_models::load_models() {
|
||||||
|
// loading models from 3 sources:
|
||||||
|
// 1. cached models
|
||||||
auto cached_models = common_list_cached_models();
|
auto cached_models = common_list_cached_models();
|
||||||
for (const auto & model : cached_models) {
|
for (const auto & model : cached_models) {
|
||||||
server_model_meta meta{
|
server_model_meta meta{
|
||||||
|
/* preset */ presets.get_preset(model.to_string()),
|
||||||
/* name */ model.to_string(),
|
/* name */ model.to_string(),
|
||||||
/* path */ model.manifest_path,
|
/* path */ model.manifest_path,
|
||||||
/* path_mmproj */ "", // auto-detected when loading
|
/* path_mmproj */ "", // auto-detected when loading
|
||||||
|
|
@ -170,21 +309,18 @@ server_models::server_models(
|
||||||
/* args */ std::vector<std::string>(),
|
/* args */ std::vector<std::string>(),
|
||||||
/* exit_code */ 0
|
/* exit_code */ 0
|
||||||
};
|
};
|
||||||
mapping[meta.name] = instance_t{
|
add_model(std::move(meta));
|
||||||
/* subproc */ std::make_shared<subprocess_s>(),
|
|
||||||
/* th */ std::thread(),
|
|
||||||
/* meta */ meta
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
// add local models specificed via --models-dir
|
// 2. local models specificed via --models-dir
|
||||||
if (!params.models_dir.empty()) {
|
if (!base_params.models_dir.empty()) {
|
||||||
auto local_models = list_local_models(params.models_dir);
|
auto local_models = list_local_models(base_params.models_dir);
|
||||||
for (const auto & model : local_models) {
|
for (const auto & model : local_models) {
|
||||||
if (mapping.find(model.name) != mapping.end()) {
|
if (mapping.find(model.name) != mapping.end()) {
|
||||||
// already exists in cached models, skip
|
// already exists in cached models, skip
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
server_model_meta meta{
|
server_model_meta meta{
|
||||||
|
/* preset */ presets.get_preset(model.name),
|
||||||
/* name */ model.name,
|
/* name */ model.name,
|
||||||
/* path */ model.path,
|
/* path */ model.path,
|
||||||
/* path_mmproj */ model.path_mmproj,
|
/* path_mmproj */ model.path_mmproj,
|
||||||
|
|
@ -195,13 +331,31 @@ server_models::server_models(
|
||||||
/* args */ std::vector<std::string>(),
|
/* args */ std::vector<std::string>(),
|
||||||
/* exit_code */ 0
|
/* exit_code */ 0
|
||||||
};
|
};
|
||||||
mapping[meta.name] = instance_t{
|
add_model(std::move(meta));
|
||||||
/* subproc */ std::make_shared<subprocess_s>(),
|
|
||||||
/* th */ std::thread(),
|
|
||||||
/* meta */ meta
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// 3. custom-path models specified in presets
|
||||||
|
auto custom_models = list_custom_path_models(presets);
|
||||||
|
for (const auto & model : custom_models) {
|
||||||
|
server_model_meta meta{
|
||||||
|
/* preset */ presets.get_preset(model.name),
|
||||||
|
/* name */ model.name,
|
||||||
|
/* path */ model.path,
|
||||||
|
/* path_mmproj */ model.path_mmproj,
|
||||||
|
/* in_cache */ false,
|
||||||
|
/* port */ 0,
|
||||||
|
/* status */ SERVER_MODEL_STATUS_UNLOADED,
|
||||||
|
/* last_used */ 0,
|
||||||
|
/* args */ std::vector<std::string>(),
|
||||||
|
/* exit_code */ 0
|
||||||
|
};
|
||||||
|
add_model(std::move(meta));
|
||||||
|
}
|
||||||
|
// log available models
|
||||||
|
SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
|
||||||
|
for (const auto & [name, inst] : mapping) {
|
||||||
|
SRV_INF(" %c %s\n", inst.meta.preset.name.empty() ? ' ' : '*', name.c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
|
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
|
||||||
|
|
@ -335,19 +489,7 @@ void server_models::unload_lru() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void add_or_replace_arg(std::vector<std::string> & args, const std::string & key, const std::string & value) {
|
void server_models::load(const std::string & name) {
|
||||||
for (size_t i = 0; i < args.size(); i++) {
|
|
||||||
if (args[i] == key && i + 1 < args.size()) {
|
|
||||||
args[i + 1] = value;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// not found, append
|
|
||||||
args.push_back(key);
|
|
||||||
args.push_back(value);
|
|
||||||
}
|
|
||||||
|
|
||||||
void server_models::load(const std::string & name, bool auto_load) {
|
|
||||||
if (!has_model(name)) {
|
if (!has_model(name)) {
|
||||||
throw std::runtime_error("model name=" + name + " is not found");
|
throw std::runtime_error("model name=" + name + " is not found");
|
||||||
}
|
}
|
||||||
|
|
@ -376,26 +518,10 @@ void server_models::load(const std::string & name, bool auto_load) {
|
||||||
{
|
{
|
||||||
SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);
|
SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);
|
||||||
|
|
||||||
std::vector<std::string> child_args;
|
presets.render_args(inst.meta); // update meta.args
|
||||||
if (auto_load && !meta.args.empty()) {
|
|
||||||
child_args = meta.args; // copy previous args
|
|
||||||
} else {
|
|
||||||
child_args = base_args; // copy
|
|
||||||
if (inst.meta.in_cache) {
|
|
||||||
add_or_replace_arg(child_args, "-hf", inst.meta.name);
|
|
||||||
} else {
|
|
||||||
add_or_replace_arg(child_args, "-m", inst.meta.path);
|
|
||||||
if (!inst.meta.path_mmproj.empty()) {
|
|
||||||
add_or_replace_arg(child_args, "--mmproj", inst.meta.path_mmproj);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// set model args
|
std::vector<std::string> child_args = inst.meta.args; // copy
|
||||||
add_or_replace_arg(child_args, "--port", std::to_string(inst.meta.port));
|
std::vector<std::string> child_env = base_env; // copy
|
||||||
add_or_replace_arg(child_args, "--alias", inst.meta.name);
|
|
||||||
|
|
||||||
std::vector<std::string> child_env = base_env; // copy
|
|
||||||
child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port));
|
child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port));
|
||||||
|
|
||||||
SRV_INF("%s", "spawning server instance with args:\n");
|
SRV_INF("%s", "spawning server instance with args:\n");
|
||||||
|
|
@ -541,7 +667,7 @@ bool server_models::ensure_model_loaded(const std::string & name) {
|
||||||
}
|
}
|
||||||
if (meta->status == SERVER_MODEL_STATUS_UNLOADED) {
|
if (meta->status == SERVER_MODEL_STATUS_UNLOADED) {
|
||||||
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
|
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
|
||||||
load(name, true);
|
load(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
|
SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
|
||||||
|
|
@ -571,7 +697,7 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
|
||||||
SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port);
|
SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port);
|
||||||
auto proxy = std::make_unique<server_http_proxy>(
|
auto proxy = std::make_unique<server_http_proxy>(
|
||||||
method,
|
method,
|
||||||
base_params.hostname,
|
CHILD_ADDR,
|
||||||
meta->port,
|
meta->port,
|
||||||
req.path,
|
req.path,
|
||||||
req.headers,
|
req.headers,
|
||||||
|
|
@ -724,38 +850,6 @@ void server_models_routes::init_routes() {
|
||||||
return models.proxy_request(req, method, name, true); // update last usage for POST request only
|
return models.proxy_request(req, method, name, true); // update last usage for POST request only
|
||||||
};
|
};
|
||||||
|
|
||||||
this->get_router_models = [this](const server_http_req &) {
|
|
||||||
auto res = std::make_unique<server_http_res>();
|
|
||||||
json models_json = json::array();
|
|
||||||
auto all_models = models.get_all_meta();
|
|
||||||
std::time_t t = std::time(0);
|
|
||||||
for (const auto & meta : all_models) {
|
|
||||||
json status {
|
|
||||||
{"value", server_model_status_to_string(meta.status)},
|
|
||||||
{"args", meta.args},
|
|
||||||
};
|
|
||||||
if (meta.is_failed()) {
|
|
||||||
status["exit_code"] = meta.exit_code;
|
|
||||||
status["failed"] = true;
|
|
||||||
}
|
|
||||||
models_json.push_back(json {
|
|
||||||
{"id", meta.name},
|
|
||||||
{"object", "model"}, // for OAI-compat
|
|
||||||
{"owned_by", "llamacpp"}, // for OAI-compat
|
|
||||||
{"created", t}, // for OAI-compat
|
|
||||||
{"in_cache", meta.in_cache},
|
|
||||||
{"path", meta.path},
|
|
||||||
{"status", status},
|
|
||||||
// TODO: add other fields, may require reading GGUF metadata
|
|
||||||
});
|
|
||||||
}
|
|
||||||
res_ok(res, {
|
|
||||||
{"data", models_json},
|
|
||||||
{"object", "list"},
|
|
||||||
});
|
|
||||||
return res;
|
|
||||||
};
|
|
||||||
|
|
||||||
this->post_router_models_load = [this](const server_http_req & req) {
|
this->post_router_models_load = [this](const server_http_req & req) {
|
||||||
auto res = std::make_unique<server_http_res>();
|
auto res = std::make_unique<server_http_res>();
|
||||||
json body = json::parse(req.body);
|
json body = json::parse(req.body);
|
||||||
|
|
@ -769,7 +863,7 @@ void server_models_routes::init_routes() {
|
||||||
res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
|
res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
models.load(name, false);
|
models.load(name);
|
||||||
res_ok(res, {{"success", true}});
|
res_ok(res, {{"success", true}});
|
||||||
return res;
|
return res;
|
||||||
};
|
};
|
||||||
|
|
@@ -793,9 +887,12 @@ void server_models_routes::init_routes() {
         std::time_t t = std::time(0);
         for (const auto & meta : all_models) {
             json status {
                 {"value", server_model_status_to_string(meta.status)},
                 {"args", meta.args},
             };
+            if (!meta.preset.name.empty()) {
+                status["preset"] = meta.preset.to_ini();
+            }
             if (meta.is_failed()) {
                 status["exit_code"] = meta.exit_code;
                 status["failed"] = true;
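For reference, a sketch of what one entry in the router's model listing looks like after this hunk, with the new "preset" field carrying the preset serialized back to INI text. The concrete names and values below are illustrative assumptions, not captured server output:

// Illustrative only (values are assumptions): one entry in the model list,
// using the same field names as the listing code in this diff.
json entry = {
    {"id", "qwen-small"},
    {"object", "model"},                                      // for OAI-compat
    {"status", {
        {"value",  "loaded"},
        {"args",   {"--port", "8081"}},
        {"preset", "[qwen-small]\nmodel = qwen-small.gguf\n"}, // meta.preset.to_ini()
    }},
};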

@@ -1,6 +1,7 @@
 #pragma once
 
 #include "common.h"
+#include "preset.h"
 #include "server-http.h"
 
 #include <mutex>
@@ -47,6 +48,7 @@ static std::string server_model_status_to_string(server_model_status status) {
 }
 
 struct server_model_meta {
+    common_preset preset;
     std::string name;
     std::string path;
     std::string path_mmproj; // only available if in_cache=false
@@ -54,7 +56,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
-    std::vector<std::string> args; // additional args passed to the model instance (used for debugging)
+    std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
 
     bool is_active() const {
@@ -66,6 +68,19 @@ struct server_model_meta {
     }
 };
 
+// the server_presets struct holds the presets read from presets.ini
+// as well as base args from the router server
+struct server_presets {
+    common_presets presets;
+    common_params_context ctx_params;
+    std::map<common_arg, std::string> base_args;
+    std::map<std::string, common_arg> control_args; // args reserved for server control
+
+    server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir);
+    common_preset get_preset(const std::string & name);
+    void render_args(server_model_meta & meta);
+};
+
 struct subprocess_s;
 
 struct server_models {
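A minimal sketch of how these declarations fit together, based only on the members and comments shown above; the helper function itself and its call order are assumptions, not part of the diff:

// Sketch only: wiring a preset into a model's launch args.
// server_presets and server_model_meta are the types declared above.
static void prepare_model(server_presets & presets, server_model_meta & meta) {
    meta.preset = presets.get_preset(meta.name); // preset section from presets.ini, if any
    presets.render_args(meta);                   // populates meta.args for the child process
}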
@@ -85,14 +100,21 @@ private:
     std::vector<std::string> base_args;
     std::vector<std::string> base_env;
 
+    server_presets presets;
+
     void update_meta(const std::string & name, const server_model_meta & meta);
 
     // unload least recently used models if the limit is reached
     void unload_lru();
 
+    // not thread-safe, caller must hold mutex
+    void add_model(server_model_meta && meta);
+
 public:
     server_models(const common_params & params, int argc, char ** argv, char ** envp);
 
+    void load_models();
+
     // check if a model instance exists
     bool has_model(const std::string & name);
 
@@ -102,8 +124,7 @@ public:
     // return a copy of all model metadata
     std::vector<server_model_meta> get_all_meta();
 
-    // if auto_load is true, load the model with previous args if any
-    void load(const std::string & name, bool auto_load);
+    void load(const std::string & name);
     void unload(const std::string & name);
     void unload_all();
 
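With the auto_load flag removed, callers use the single-argument load(). A minimal caller-side sketch of the resulting public API; the restart helper below is not part of the diff:

// Sketch only: restart a model instance using the simplified public API.
static void restart_model(server_models & models, const std::string & name) {
    if (models.has_model(name)) {
        models.unload(name);
    }
    models.load(name); // replaces the old load(name, auto_load) overload
}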