Merge branch 'ggml-org:master' into lovedheart-mul_mat_iq2_xs_improve
commit 05ec473280
|
|
@@ -11,7 +11,7 @@ body:
|
||||||
(i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
|
(i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
|
||||||
If you encountered the issue while using an external UI (e.g. ollama),
|
If you encountered the issue while using an external UI (e.g. ollama),
|
||||||
please reproduce your issue using one of the examples/binaries in this repository.
|
please reproduce your issue using one of the examples/binaries in this repository.
|
||||||
The `llama-cli` binary can be used for simple and reproducible model inference.
|
The `llama-completion` binary can be used for simple and reproducible model inference.
|
||||||
- type: textarea
|
- type: textarea
|
||||||
id: version
|
id: version
|
||||||
attributes:
|
attributes:
|
||||||
|
|
@@ -74,9 +74,12 @@ body:
|
||||||
Please give us a summary of the problem and tell us how to reproduce it.
|
Please give us a summary of the problem and tell us how to reproduce it.
|
||||||
If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
|
If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
|
||||||
that information would be very much appreciated by us.
|
that information would be very much appreciated by us.
|
||||||
|
|
||||||
|
If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
|
||||||
|
If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
|
||||||
placeholder: >
|
placeholder: >
|
||||||
e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
|
e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
|
||||||
When I use -ngl 0 it works correctly.
|
With short prompts or `-fa off` it works correctly.
|
||||||
Here are the exact commands that I used: ...
|
Here are the exact commands that I used: ...
|
||||||
validations:
|
validations:
|
||||||
required: true
|
required: true
|
||||||
|
|
|
||||||
|
|
@@ -398,6 +398,8 @@ function gg_run_qwen3_0_6b {
|
||||||
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
|
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
|
||||||
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
|
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
|
||||||
|
|
||||||
|
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
|
||||||
|
|
||||||
(time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
|
(time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
|
||||||
(time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
|
@@ -523,6 +525,8 @@ function gg_run_embd_bge_small {
|
||||||
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
|
||||||
|
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
|
||||||
|
|
||||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
|
||||||
|
|
@@ -563,6 +567,8 @@ function gg_run_rerank_tiny {
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
|
||||||
|
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
|
||||||
|
|
||||||
# for this model, the SEP token is "</s>"
|
# for this model, the SEP token is "</s>"
|
||||||
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -20,6 +20,7 @@
|
||||||
#include <nlohmann/json.hpp>
|
#include <nlohmann/json.hpp>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cinttypes>
|
||||||
#include <climits>
|
#include <climits>
|
||||||
#include <cstdarg>
|
#include <cstdarg>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
|
@@ -529,7 +530,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||||
params.kv_overrides.back().key[0] = 0;
|
params.kv_overrides.back().key[0] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.tensor_buft_overrides.empty()) {
|
// pad tensor_buft_overrides for llama_params_fit:
|
||||||
|
const size_t ntbo = llama_max_tensor_buft_overrides();
|
||||||
|
while (params.tensor_buft_overrides.size() < ntbo) {
|
||||||
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@@ -2153,6 +2156,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_MAIN_GPU"));
|
).set_env("LLAMA_ARG_MAIN_GPU"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{ "-fit", "--fit" }, "[on|off]",
|
||||||
|
string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
if (is_truthy(value)) {
|
||||||
|
params.fit_params = true;
|
||||||
|
} else if (is_falsey(value)) {
|
||||||
|
params.fit_params = false;
|
||||||
|
} else {
|
||||||
|
throw std::runtime_error(
|
||||||
|
string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
).set_env("LLAMA_ARG_FIT"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{ "-fitt", "--fit-target" }, "MiB",
|
||||||
|
string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.fit_params_target = value * size_t(1024*1024);
|
||||||
|
}
|
||||||
|
).set_env("LLAMA_ARG_FIT_TARGET"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{ "-fitc", "--fit-ctx" }, "N",
|
||||||
|
string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.fit_params_min_ctx = value;
|
||||||
|
}
|
||||||
|
).set_env("LLAMA_ARG_FIT_CTX"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--check-tensors"},
|
{"--check-tensors"},
|
||||||
string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
|
string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
|
||||||
|
|
|
||||||
|
|
@@ -1088,7 +1088,15 @@ struct common_init_result::impl {
|
||||||
|
|
||||||
common_init_result::common_init_result(common_params & params) :
|
common_init_result::common_init_result(common_params & params) :
|
||||||
pimpl(new impl{}) {
|
pimpl(new impl{}) {
|
||||||
const auto mparams = common_model_params_to_llama(params);
|
auto mparams = common_model_params_to_llama(params);
|
||||||
|
auto cparams = common_context_params_to_llama(params);
|
||||||
|
|
||||||
|
if (params.fit_params) {
|
||||||
|
LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
|
||||||
|
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||||
|
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
||||||
|
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
|
|
@@ -1103,8 +1111,6 @@ common_init_result::common_init_result(common_params & params) :
|
||||||
// TODO: fix naming
|
// TODO: fix naming
|
||||||
common_init_sampler_from_model(model, params.sampling);
|
common_init_sampler_from_model(model, params.sampling);
|
||||||
|
|
||||||
auto cparams = common_context_params_to_llama(params);
|
|
||||||
|
|
||||||
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
|
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
|
||||||
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||||
params.sampling.ignore_eos = false;
|
params.sampling.ignore_eos = false;
|
||||||
|
|
@@ -1143,8 +1149,7 @@ common_init_result::common_init_result(common_params & params) :
|
||||||
|
|
||||||
llama_context * lctx = llama_init_from_model(model, cparams);
|
llama_context * lctx = llama_init_from_model(model, cparams);
|
||||||
if (lctx == NULL) {
|
if (lctx == NULL) {
|
||||||
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
|
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
||||||
__func__, params.model.path.c_str());
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@@ -1176,15 +1181,13 @@ common_init_result_ptr common_init_from_params(common_params & params) {
|
||||||
|
|
||||||
llama_model * model = res->model();
|
llama_model * model = res->model();
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
|
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
|
||||||
__func__, params.model.path.c_str());
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_context * lctx = res->context();
|
llama_context * lctx = res->context();
|
||||||
if (lctx == NULL) {
|
if (lctx == NULL) {
|
||||||
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
|
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
||||||
__func__, params.model.path.c_str());
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -99,6 +99,7 @@ enum llama_example {
|
||||||
LLAMA_EXAMPLE_TTS,
|
LLAMA_EXAMPLE_TTS,
|
||||||
LLAMA_EXAMPLE_DIFFUSION,
|
LLAMA_EXAMPLE_DIFFUSION,
|
||||||
LLAMA_EXAMPLE_FINETUNE,
|
LLAMA_EXAMPLE_FINETUNE,
|
||||||
|
LLAMA_EXAMPLE_FIT_PARAMS,
|
||||||
|
|
||||||
LLAMA_EXAMPLE_COUNT,
|
LLAMA_EXAMPLE_COUNT,
|
||||||
};
|
};
|
||||||
|
|
@@ -306,8 +307,8 @@ struct lr_opt {
|
||||||
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
|
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
|
||||||
|
|
||||||
struct common_params {
|
struct common_params {
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
|
||||||
int32_t n_ctx = 4096; // context size
|
int32_t n_ctx = 0; // context size, 0 == context the model was trained with
|
||||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
|
|
@@ -331,6 +332,9 @@ struct common_params {
|
||||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||||
|
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||||
|
size_t fit_params_target = 1024 * 1024 * 1024; // margin per device in bytes for fitting parameters to free memory
|
||||||
|
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||||
|
|
||||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -157,6 +157,21 @@ static std::map<std::string, common_arg> get_map_key_opt(common_params_context &
|
||||||
return mapping;
|
return mapping;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool is_bool_arg(const common_arg & arg) {
|
||||||
|
return !arg.args_neg.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string parse_bool_arg(const common_arg & arg, const std::string & key, const std::string & value) {
|
||||||
|
// if this is a negated arg, we need to reverse the value
|
||||||
|
for (const auto & neg_arg : arg.args_neg) {
|
||||||
|
if (rm_leading_dashes(neg_arg) == key) {
|
||||||
|
return common_arg_utils::is_truthy(value) ? "false" : "true";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// otherwise, not negated
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
|
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
|
||||||
common_presets out;
|
common_presets out;
|
||||||
auto key_to_opt = get_map_key_opt(ctx_params);
|
auto key_to_opt = get_map_key_opt(ctx_params);
|
||||||
|
|
@@ -173,8 +188,13 @@ common_presets common_presets_load(const std::string & path, common_params_conte
|
||||||
for (const auto & [key, value] : section.second) {
|
for (const auto & [key, value] : section.second) {
|
||||||
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
|
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
|
||||||
if (key_to_opt.find(key) != key_to_opt.end()) {
|
if (key_to_opt.find(key) != key_to_opt.end()) {
|
||||||
preset.options[key_to_opt[key]] = value;
|
auto & opt = key_to_opt[key];
|
||||||
LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str());
|
if (is_bool_arg(opt)) {
|
||||||
|
preset.options[opt] = parse_bool_arg(opt, key, value);
|
||||||
|
} else {
|
||||||
|
preset.options[opt] = value;
|
||||||
|
}
|
||||||
|
LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
|
||||||
} else {
|
} else {
|
||||||
// TODO: maybe warn about unknown key?
|
// TODO: maybe warn about unknown key?
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -136,11 +136,19 @@ class ModelBase:
|
||||||
self.remote_hf_model_id = remote_hf_model_id
|
self.remote_hf_model_id = remote_hf_model_id
|
||||||
self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
|
self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
|
||||||
self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
|
self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
|
||||||
|
self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
|
||||||
self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
|
self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
|
||||||
self.metadata_override = metadata_override
|
self.metadata_override = metadata_override
|
||||||
self.model_name = model_name
|
self.model_name = model_name
|
||||||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||||
|
|
||||||
|
# Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
|
||||||
|
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
|
||||||
|
if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None:
|
||||||
|
self.rope_parameters["rope_theta"] = rope_theta
|
||||||
|
if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
|
||||||
|
self.rope_parameters["rope_type"] = rope_type
|
||||||
|
|
||||||
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
||||||
if self.ftype == gguf.LlamaFileType.GUESSED:
|
if self.ftype == gguf.LlamaFileType.GUESSED:
|
||||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||||
|
|
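The block above introduces `self.rope_parameters` as the single place later code looks for RoPE settings. A minimal standalone sketch of that mirroring (illustrative only; the real code also checks the alternative key names `global_rope_theta` and `rotary_emb_base` via `find_hparam`, and the sample hparams dict below is made up):

```python
def build_rope_parameters(hparams: dict) -> dict:
    # start from "rope_parameters" if present, otherwise the legacy "rope_scaling"
    rope_parameters = dict(hparams.get("rope_parameters", hparams.get("rope_scaling")) or {})
    # only mirror the top-level keys when there are no per-attention-type sub-configs
    if "full_attention" not in rope_parameters and "sliding_attention" not in rope_parameters:
        if "rope_theta" not in rope_parameters and "rope_theta" in hparams:
            rope_parameters["rope_theta"] = hparams["rope_theta"]
        if "rope_type" not in rope_parameters and "type" in rope_parameters:
            rope_parameters["rope_type"] = rope_parameters["type"]
    return rope_parameters

# e.g. a legacy Llama-style config:
print(build_rope_parameters({"rope_theta": 500000.0, "rope_scaling": {"type": "linear", "factor": 4.0}}))
# {'type': 'linear', 'factor': 4.0, 'rope_theta': 500000.0, 'rope_type': 'linear'}
```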
@@ -705,6 +713,9 @@ class ModelBase:
|
||||||
if "llm_config" in config:
|
if "llm_config" in config:
|
||||||
# rename for InternVL
|
# rename for InternVL
|
||||||
config["text_config"] = config["llm_config"]
|
config["text_config"] = config["llm_config"]
|
||||||
|
if "lm_config" in config:
|
||||||
|
# rename for GlmASR
|
||||||
|
config["text_config"] = config["lm_config"]
|
||||||
if "thinker_config" in config:
|
if "thinker_config" in config:
|
||||||
# rename for Qwen2.5-Omni
|
# rename for Qwen2.5-Omni
|
||||||
config["text_config"] = config["thinker_config"]["text_config"]
|
config["text_config"] = config["thinker_config"]["text_config"]
|
||||||
|
|
@@ -795,7 +806,7 @@ class TextModel(ModelBase):
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
self.gguf_writer.add_block_count(self.block_count)
|
||||||
|
|
||||||
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
|
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None:
|
||||||
self.gguf_writer.add_context_length(n_ctx)
|
self.gguf_writer.add_context_length(n_ctx)
|
||||||
logger.info(f"gguf: context length = {n_ctx}")
|
logger.info(f"gguf: context length = {n_ctx}")
|
||||||
|
|
||||||
|
|
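The two extra fallback keys (`max_sequence_length`, `model_max_length`) are what allows the hand-rolled context-length lookups in `BaichuanModel` and `XverseModel` further down to be deleted. A simplified sketch of the fallback order (`find_ctx_len` and `CTX_KEYS` are illustrative stand-ins for `find_hparam`, not names in the converter):

```python
CTX_KEYS = ["max_position_embeddings", "n_ctx", "n_positions", "max_length",
            "max_sequence_length", "model_max_length"]

def find_ctx_len(hparams: dict) -> int | None:
    # the first key present in the config wins
    for key in CTX_KEYS:
        if key in hparams:
            return hparams[key]
    return None

print(find_ctx_len({"model_max_length": 4096}))  # 4096
```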
@@ -815,7 +826,42 @@ class TextModel(ModelBase):
|
||||||
self.gguf_writer.add_head_count_kv(n_head_kv)
|
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||||
logger.info(f"gguf: key-value head count = {n_head_kv}")
|
logger.info(f"gguf: key-value head count = {n_head_kv}")
|
||||||
|
|
||||||
if (rope_theta := self.hparams.get("rope_theta")) is not None:
|
rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
|
||||||
|
if (rope_type := rope_params.get("rope_type")) is not None:
|
||||||
|
rope_factor = rope_params.get("factor")
|
||||||
|
rope_gguf_type = gguf.RopeScalingType.NONE
|
||||||
|
if rope_type == "linear" and rope_factor is not None:
|
||||||
|
rope_gguf_type = gguf.RopeScalingType.LINEAR
|
||||||
|
self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
|
||||||
|
self.gguf_writer.add_rope_scaling_factor(rope_factor)
|
||||||
|
elif rope_type == "yarn" and rope_factor is not None:
|
||||||
|
rope_gguf_type = gguf.RopeScalingType.YARN
|
||||||
|
self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
|
||||||
|
self.gguf_writer.add_rope_scaling_factor(rope_factor)
|
||||||
|
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
|
||||||
|
if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None:
|
||||||
|
self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor)
|
||||||
|
if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None:
|
||||||
|
self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor)
|
||||||
|
if (yarn_beta_fast := rope_params.get("beta_fast")) is not None:
|
||||||
|
self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast)
|
||||||
|
if (yarn_beta_slow := rope_params.get("beta_slow")) is not None:
|
||||||
|
self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow)
|
||||||
|
# self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
|
||||||
|
elif rope_type == "su" or rope_type == "longrope":
|
||||||
|
rope_gguf_type = gguf.RopeScalingType.LONGROPE
|
||||||
|
self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
|
||||||
|
elif rope_type == "dynamic":
|
||||||
|
# HunYuan, handled in model class
|
||||||
|
pass
|
||||||
|
elif rope_type.lower() == "llama3":
|
||||||
|
# Handled in generate_extra_tensors
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
logger.warning(f"Unknown RoPE type: {rope_type}")
|
||||||
|
logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}")
|
||||||
|
|
||||||
|
if (rope_theta := rope_params.get("rope_theta")) is not None:
|
||||||
self.gguf_writer.add_rope_freq_base(rope_theta)
|
self.gguf_writer.add_rope_freq_base(rope_theta)
|
||||||
logger.info(f"gguf: rope theta = {rope_theta}")
|
logger.info(f"gguf: rope theta = {rope_theta}")
|
||||||
if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
|
if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
|
||||||
|
|
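To make the new generic RoPE-scaling handling concrete, here is a made-up YARN-style `rope_parameters` dict and the GGUF metadata the branch above would write for it (values are illustrative, not taken from any real model):

```python
rope_params = {
    "rope_type": "yarn",
    "factor": 4.0,
    "original_max_position_embeddings": 32768,
    "beta_fast": 32.0,
    "beta_slow": 1.0,
    "rope_theta": 1000000.0,
}
# the "yarn" branch above would emit:
#   add_rope_scaling_type(gguf.RopeScalingType.YARN)
#   add_rope_scaling_factor(4.0)
#   add_rope_scaling_orig_ctx_len(32768)
#   add_rope_scaling_yarn_beta_fast(32.0)
#   add_rope_scaling_yarn_beta_slow(1.0)
# and, further down, add_rope_freq_base(1000000.0)
```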
@@ -1486,6 +1532,21 @@ class TextModel(ModelBase):
|
||||||
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
|
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
|
||||||
self.gguf_writer.add_pooling_type(pooling_type)
|
self.gguf_writer.add_pooling_type(pooling_type)
|
||||||
|
|
||||||
|
def _set_vocab_glmedge(self):
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
|
||||||
|
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||||
|
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||||
|
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||||
|
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||||
|
self.gguf_writer.add_token_list(tokens)
|
||||||
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
|
||||||
|
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
|
||||||
|
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
|
||||||
|
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
|
||||||
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
def _set_vocab_interns1(self):
|
def _set_vocab_interns1(self):
|
||||||
tokens: list[str] = []
|
tokens: list[str] = []
|
||||||
toktypes: list[int] = []
|
toktypes: list[int] = []
|
||||||
|
|
@@ -1615,7 +1676,7 @@ class MmprojModel(ModelBase):
|
||||||
preprocessor_config: dict[str, Any]
|
preprocessor_config: dict[str, Any]
|
||||||
global_config: dict[str, Any]
|
global_config: dict[str, Any]
|
||||||
|
|
||||||
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
|
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
|
||||||
|
|
||||||
has_vision_encoder: bool = True # by default
|
has_vision_encoder: bool = True # by default
|
||||||
has_audio_encoder: bool = False
|
has_audio_encoder: bool = False
|
||||||
|
|
@@ -1691,7 +1752,8 @@ class MmprojModel(ModelBase):
|
||||||
return self.global_config.get(config_name)
|
return self.global_config.get(config_name)
|
||||||
|
|
||||||
def get_audio_config(self) -> dict[str, Any] | None:
|
def get_audio_config(self) -> dict[str, Any] | None:
|
||||||
return self.global_config.get("audio_config")
|
mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config"
|
||||||
|
return self.global_config.get(mm_config_key)
|
||||||
|
|
||||||
def set_type(self):
|
def set_type(self):
|
||||||
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
|
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
|
||||||
|
|
@@ -1966,34 +2028,10 @@ class BaichuanModel(TextModel):
|
||||||
self._set_vocab_sentencepiece()
|
self._set_vocab_sentencepiece()
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
head_count = self.hparams["num_attention_heads"]
|
super().set_gguf_parameters()
|
||||||
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
|
||||||
|
|
||||||
ctx_length = 0
|
|
||||||
if "max_sequence_length" in self.hparams:
|
|
||||||
ctx_length = self.hparams["max_sequence_length"]
|
|
||||||
elif "max_position_embeddings" in self.hparams:
|
|
||||||
ctx_length = self.hparams["max_position_embeddings"]
|
|
||||||
elif "model_max_length" in self.hparams:
|
|
||||||
ctx_length = self.hparams["model_max_length"]
|
|
||||||
else:
|
|
||||||
raise ValueError("gguf: can not find ctx length parameter.")
|
|
||||||
|
|
||||||
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||||
self.gguf_writer.add_context_length(ctx_length)
|
|
||||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
|
||||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
|
||||||
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||||
self.gguf_writer.add_head_count(head_count)
|
|
||||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
|
||||||
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
head_count = self.hparams["num_attention_heads"]
|
head_count = self.hparams["num_attention_heads"]
|
||||||
|
|
@@ -2089,34 +2127,10 @@ class XverseModel(TextModel):
|
||||||
special_vocab.add_to_gguf(self.gguf_writer)
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
head_count = self.hparams["num_attention_heads"]
|
super().set_gguf_parameters()
|
||||||
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
|
||||||
|
|
||||||
ctx_length = 0
|
|
||||||
if "max_sequence_length" in self.hparams:
|
|
||||||
ctx_length = self.hparams["max_sequence_length"]
|
|
||||||
elif "max_position_embeddings" in self.hparams:
|
|
||||||
ctx_length = self.hparams["max_position_embeddings"]
|
|
||||||
elif "model_max_length" in self.hparams:
|
|
||||||
ctx_length = self.hparams["model_max_length"]
|
|
||||||
else:
|
|
||||||
raise ValueError("gguf: can not find ctx length parameter.")
|
|
||||||
|
|
||||||
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||||
self.gguf_writer.add_context_length(ctx_length)
|
|
||||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
|
||||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
|
||||||
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||||
self.gguf_writer.add_head_count(head_count)
|
|
||||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
|
||||||
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
del bid # unused
|
del bid # unused
|
||||||
|
|
@@ -2377,8 +2391,13 @@ class LlamaModel(TextModel):
|
||||||
# fix for SmolVLM2, missing `num_attention_heads` in config.json
|
# fix for SmolVLM2, missing `num_attention_heads` in config.json
|
||||||
if self.hf_arch == "VLlama3ForCausalLM":
|
if self.hf_arch == "VLlama3ForCausalLM":
|
||||||
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
|
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
|
||||||
|
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
||||||
|
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
|
if self.origin_hf_arch == "GlmasrModel":
|
||||||
|
return self._set_vocab_glmedge()
|
||||||
|
|
||||||
if self.is_mistral_format:
|
if self.is_mistral_format:
|
||||||
return self._set_vocab_mistral()
|
return self._set_vocab_mistral()
|
||||||
|
|
||||||
|
|
@@ -2430,11 +2449,6 @@ class LlamaModel(TextModel):
|
||||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||||
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||||
if n_head_kv is not None and n_head != n_head_kv:
|
if n_head_kv is not None and n_head != n_head_kv:
|
||||||
|
|
@@ -2454,6 +2468,7 @@ class LlamaModel(TextModel):
|
||||||
"vision_language_adapter.",
|
"vision_language_adapter.",
|
||||||
"patch_merger.",
|
"patch_merger.",
|
||||||
"pre_mm_projector_norm",
|
"pre_mm_projector_norm",
|
||||||
|
"audio_encoder.",
|
||||||
]
|
]
|
||||||
|
|
||||||
is_multimodal_tensor = "vision_tower" in name \
|
is_multimodal_tensor = "vision_tower" in name \
|
||||||
|
|
@@ -2518,16 +2533,16 @@ class LlamaModel(TextModel):
|
||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
|
||||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
if rope_params.get("rope_type", '').lower() == "llama3":
|
||||||
base = self.hparams.get("rope_theta", 10000.0)
|
base = rope_params.get("rope_theta", 10000.0)
|
||||||
if (dim := self.hparams.get("head_dim")) is None:
|
if (dim := self.hparams.get("head_dim")) is None:
|
||||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||||
|
|
||||||
factor = rope_scaling.get("factor", 8.0)
|
factor = rope_params.get("factor", 8.0)
|
||||||
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
|
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
|
||||||
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
|
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
|
||||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||||
|
|
||||||
low_freq_wavelen = old_context_len / low_freq_factor
|
low_freq_wavelen = old_context_len / low_freq_factor
|
||||||
|
|
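For context on `generate_extra_tensors`: the remainder of the function (unchanged, so not shown in this hunk) turns the quantities prepared above into per-dimension RoPE scale factors. A self-contained sketch of that llama3-style smoothing, written from the publicly documented Llama 3.1 recipe rather than copied from the file:

```python
import math
import torch

def llama3_rope_factors(freqs: torch.Tensor, factor: float, low_freq_factor: float,
                        high_freq_factor: float, old_context_len: int) -> torch.Tensor:
    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    out = []
    for freq in freqs.tolist():
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            out.append(1.0)     # high-frequency dims are left untouched
        elif wavelen > low_freq_wavelen:
            out.append(factor)  # low-frequency dims are scaled by the full factor
        else:
            # smooth interpolation in the transition band
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            out.append(1.0 / ((1.0 - smooth) / factor + smooth))
    return torch.tensor(out, dtype=torch.float32)
```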
@@ -2564,11 +2579,6 @@ class ArceeModel(LlamaModel):
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
self._try_set_pooling_type()
|
self._try_set_pooling_type()
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
|
|
||||||
|
|
||||||
@ModelBase.register("AfmoeForCausalLM")
|
@ModelBase.register("AfmoeForCausalLM")
|
||||||
|
|
@@ -2851,17 +2861,11 @@ class Mistral3Model(LlamaModel):
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
rope_params = self.hparams.get("rope_parameters")
|
rope_params = self.rope_parameters
|
||||||
if self.hparams.get("model_type") == "ministral3":
|
if self.hparams.get("model_type") == "ministral3":
|
||||||
assert rope_params is not None, "ministral3 must have 'rope_parameters' config"
|
assert rope_params, "ministral3 must have 'rope_parameters' config"
|
||||||
assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
|
assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
|
|
||||||
self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
|
|
||||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
|
self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
|
|
||||||
self.gguf_writer.add_rope_freq_base(rope_params["rope_theta"])
|
|
||||||
self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
|
self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||||
|
|
@@ -2958,7 +2962,7 @@ class DeciModel(TextModel):
|
||||||
assert self.block_count == len(self._num_kv_heads)
|
assert self.block_count == len(self._num_kv_heads)
|
||||||
assert self.block_count == len(self._num_heads)
|
assert self.block_count == len(self._num_heads)
|
||||||
assert self.block_count == len(self._ffn_dims)
|
assert self.block_count == len(self._ffn_dims)
|
||||||
if (rope_theta := self.hparams.get("rope_theta")) is not None:
|
if (rope_theta := self.rope_parameters.get("rope_theta")) is not None:
|
||||||
self.gguf_writer.add_rope_freq_base(rope_theta)
|
self.gguf_writer.add_rope_freq_base(rope_theta)
|
||||||
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
|
self.gguf_writer.add_head_count_kv(self._num_kv_heads)
|
||||||
self.gguf_writer.add_head_count(self._num_heads)
|
self.gguf_writer.add_head_count(self._num_heads)
|
||||||
|
|
@@ -2983,11 +2987,6 @@ class DeciModel(TextModel):
|
||||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||||
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||||
if n_head_kv is not None and n_head != n_head_kv:
|
if n_head_kv is not None and n_head != n_head_kv:
|
||||||
|
|
@@ -3016,16 +3015,16 @@ class DeciModel(TextModel):
|
||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
|
||||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
if rope_params.get("rope_type", '').lower() == "llama3":
|
||||||
base = self.hparams.get("rope_theta", 10000.0)
|
base = rope_params.get("rope_theta", 10000.0)
|
||||||
if (dim := self.hparams.get("head_dim")) is None:
|
if (dim := self.hparams.get("head_dim")) is None:
|
||||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||||
|
|
||||||
factor = rope_scaling.get("factor", 8.0)
|
factor = rope_params.get("factor", 8.0)
|
||||||
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
|
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
|
||||||
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
|
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
|
||||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||||
|
|
||||||
low_freq_wavelen = old_context_len / low_freq_factor
|
low_freq_wavelen = old_context_len / low_freq_factor
|
||||||
|
|
@@ -3279,10 +3278,6 @@ class MiniCPMModel(TextModel):
|
||||||
logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
|
logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
|
||||||
self.gguf_writer.add_logit_scale(logit_scale)
|
self.gguf_writer.add_logit_scale(logit_scale)
|
||||||
logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
|
logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope":
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
|
|
||||||
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
|
|
||||||
|
|
||||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||||
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||||
|
|
@@ -3402,17 +3397,6 @@ class QwenModel(TextModel):
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
self._set_vocab_qwen()
|
self._set_vocab_qwen()
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
|
||||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
|
||||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
|
||||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
|
||||||
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
|
||||||
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
|
||||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
|
||||||
|
|
||||||
|
|
||||||
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
|
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
|
||||||
class Qwen2Model(TextModel):
|
class Qwen2Model(TextModel):
|
||||||
|
|
@@ -3427,11 +3411,6 @@ class Qwen2Model(TextModel):
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
self._try_set_pooling_type()
|
self._try_set_pooling_type()
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
if self.hf_arch == "Qwen2Model":
|
if self.hf_arch == "Qwen2Model":
|
||||||
|
|
@@ -3499,12 +3478,6 @@ class DreamModel(TextModel):
|
||||||
|
|
||||||
# Dream models use non-causal attention for diffusion
|
# Dream models use non-causal attention for diffusion
|
||||||
self.gguf_writer.add_causal_attention(False)
|
self.gguf_writer.add_causal_attention(False)
|
||||||
# Handle RoPE scaling similar to Qwen2
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
|
|
||||||
# Add Dream-specific parameters
|
# Add Dream-specific parameters
|
||||||
mask_token_id = self.hparams.get("mask_token_id")
|
mask_token_id = self.hparams.get("mask_token_id")
|
||||||
|
|
@@ -4048,13 +4021,6 @@ class Qwen2MoeModel(TextModel):
|
||||||
if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
|
if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
|
||||||
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
|
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
|
||||||
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
|
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
|
||||||
# YaRN is not enabled by default
|
|
||||||
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
|
|
||||||
_experts: list[dict[str, Tensor]] | None = None
|
_experts: list[dict[str, Tensor]] | None = None
|
||||||
|
|
||||||
|
|
@@ -4656,7 +4622,7 @@ class Phi3MiniModel(TextModel):
|
||||||
self.gguf_writer.add_head_count_kv(n_head_kv)
|
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
|
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
|
||||||
self.gguf_writer.add_rope_dimension_count(rope_dims)
|
self.gguf_writer.add_rope_dimension_count(rope_dims)
|
||||||
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
|
self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"])
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
sliding_window = self.hparams.get("sliding_window")
|
sliding_window = self.hparams.get("sliding_window")
|
||||||
# use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
|
# use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
|
||||||
|
|
@@ -4932,7 +4898,7 @@ class Plamo2Model(TextModel):
|
||||||
self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
|
self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
self.gguf_writer.add_block_count(self.block_count)
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
|
self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
|
||||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
|
self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000))
|
||||||
|
|
||||||
# Mamba parameters
|
# Mamba parameters
|
||||||
self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
|
self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
|
||||||
|
|
@@ -5130,21 +5096,6 @@ class InternLM2Model(TextModel):
|
||||||
|
|
||||||
special_vocab.add_to_gguf(self.gguf_writer)
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
|
||||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
|
||||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
|
||||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
|
||||||
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
|
|
||||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
|
||||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
num_heads = self.hparams["num_attention_heads"]
|
num_heads = self.hparams["num_attention_heads"]
|
||||||
num_kv_heads = self.hparams["num_key_value_heads"]
|
num_kv_heads = self.hparams["num_key_value_heads"]
|
||||||
|
|
@@ -5221,11 +5172,6 @@ class InternLM3Model(TextModel):
|
||||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||||
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
n_head = self.hparams["num_attention_heads"]
|
n_head = self.hparams["num_attention_heads"]
|
||||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||||
|
|
@@ -5588,7 +5534,6 @@ class NomicBertModel(BertModel):
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
|
||||||
if self.is_moe:
|
if self.is_moe:
|
||||||
self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
|
self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
|
||||||
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
|
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
|
||||||
|
|
@@ -5711,8 +5656,6 @@ class XLMRobertaModel(BertModel):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
|
|
||||||
# jina-embeddings-v3
|
# jina-embeddings-v3
|
||||||
if rotary_emb_base := self.hparams.get("rotary_emb_base"):
|
|
||||||
self.gguf_writer.add_rope_freq_base(rotary_emb_base)
|
|
||||||
lora_alpha = self.hparams.get("lora_alpha")
|
lora_alpha = self.hparams.get("lora_alpha")
|
||||||
if lora_prompt_prefixes := self.hparams.get("task_instructions"):
|
if lora_prompt_prefixes := self.hparams.get("task_instructions"):
|
||||||
assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
|
assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
|
||||||
|
|
@@ -5840,19 +5783,16 @@ class Gemma3Model(TextModel):
|
||||||
self._set_vocab_gpt2()
|
self._set_vocab_gpt2()
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
hparams = self.hparams
|
hparams = self.hparams
|
||||||
|
|
||||||
# some default values are not specified in the hparams
|
# some default values are not specified in the hparams
|
||||||
self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
|
self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
|
||||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
|
||||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
|
||||||
self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
|
self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
|
||||||
self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
|
self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
|
||||||
self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
|
self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers
|
||||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
|
|
||||||
# attn_logit_softcapping is removed in Gemma3
|
# attn_logit_softcapping is removed in Gemma3
|
||||||
assert hparams.get("attn_logit_softcapping") is None
|
assert hparams.get("attn_logit_softcapping") is None
|
||||||
if (final_logit_softcap := hparams.get("final_logit_softcapping")):
|
if (final_logit_softcap := hparams.get("final_logit_softcapping")):
|
||||||
|
|
@@ -5860,19 +5800,6 @@ class Gemma3Model(TextModel):
|
||||||
if hparams.get("sliding_window_pattern") != 1:
|
if hparams.get("sliding_window_pattern") != 1:
|
||||||
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
||||||
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
|
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
|
||||||
if hparams.get("rope_scaling") is not None:
|
|
||||||
rope_scaling = hparams["rope_scaling"]
|
|
||||||
if rope_scaling["rope_type"] == "linear":
|
|
||||||
# important: this rope_scaling is only applied for global layers, and not used by 1B model
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
elif rope_scaling["rope_type"] == "yarn":
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
self.gguf_writer.add_rope_scaling_yarn_ext_factor(rope_scaling["extrapolation_factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_scaling["beta_fast"])
|
|
||||||
self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_scaling["beta_slow"])
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
del bid # unused
|
del bid # unused
|
||||||
|
|
@ -6776,13 +6703,6 @@ class Olmo2Model(TextModel):
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
|
|
||||||
if "sliding_window" in self.hparams:
|
if "sliding_window" in self.hparams:
|
||||||
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
|
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
|
||||||
|
|
||||||
|
|
@ -7281,16 +7201,11 @@ class DeepseekV2Model(TextModel):
|
||||||
|
|
||||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||||
|
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
|
|
||||||
# [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
# [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||||
# note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
|
# note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
|
||||||
# ref https://github.com/ggml-org/llama.cpp/pull/17945
|
# ref https://github.com/ggml-org/llama.cpp/pull/17945
|
||||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
|
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all)
|
||||||
|
|
||||||
_experts: list[dict[str, Tensor]] | None = None
|
_experts: list[dict[str, Tensor]] | None = None
|
||||||
|
|
||||||
|
|
@ -7898,11 +7813,6 @@ class Glm4Model(TextModel):
|
||||||
if (rope_dim := self.hparams.get("head_dim")) is None:
|
if (rope_dim := self.hparams.get("head_dim")) is None:
|
||||||
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
if name.startswith("model.visual."): # ignore visual part of Glm4v
|
if name.startswith("model.visual."): # ignore visual part of Glm4v
|
||||||
|
|
@ -8240,50 +8150,26 @@ class ExaoneModel(TextModel):
|
||||||
model_arch = gguf.MODEL_ARCH.EXAONE
|
model_arch = gguf.MODEL_ARCH.EXAONE
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
hparams = self.hparams
|
hparams = self.hparams
|
||||||
|
|
||||||
assert (hparams["activation_function"] == "silu")
|
assert (hparams["activation_function"] == "silu")
|
||||||
|
|
||||||
max_position_embeddings = hparams["max_position_embeddings"]
|
|
||||||
embed_dim = hparams["hidden_size"]
|
|
||||||
num_heads = hparams["num_attention_heads"]
|
|
||||||
num_kv_heads = hparams.get("num_key_value_heads", num_heads)
|
|
||||||
layer_norm_eps = hparams["layer_norm_epsilon"]
|
|
||||||
intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
|
|
||||||
# ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
|
|
||||||
# attention_dropout_rate = hparams["attention_dropout"]
|
|
||||||
# ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
|
|
||||||
# embed_dropout_rate = hparams["embed_dropout"]
|
|
||||||
self.gguf_writer.add_embedding_length(embed_dim)
|
|
||||||
self.gguf_writer.add_head_count(num_heads)
|
|
||||||
self.gguf_writer.add_head_count_kv(num_kv_heads)
|
|
||||||
self.gguf_writer.add_context_length(max_position_embeddings)
|
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
|
|
||||||
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
|
||||||
|
|
||||||
if (rope_theta := self.hparams.get("rope_theta")) is not None:
|
|
||||||
self.gguf_writer.add_rope_freq_base(rope_theta)
|
|
||||||
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
|
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
|
||||||
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
|
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
|
||||||
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
|
||||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
|
||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
|
||||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
|
||||||
|
|
||||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
|
||||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
if rope_params.get("rope_type", '').lower() == "llama3":
|
||||||
base = self.hparams.get("rope_theta", 10000.0)
|
base = self.rope_parameters.get("rope_theta", 10000.0)
|
||||||
if (dim := self.hparams.get("head_dim")) is None:
|
if (dim := self.hparams.get("head_dim")) is None:
|
||||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||||
|
|
||||||
factor = rope_scaling.get("factor", 8.0)
|
factor = rope_params.get("factor", 8.0)
|
||||||
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
|
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
|
||||||
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
|
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
|
||||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||||
|
|
||||||
low_freq_wavelen = old_context_len / low_freq_factor
|
low_freq_wavelen = old_context_len / low_freq_factor
|
||||||
|
|
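The Exaone `generate_extra_tensors` hunk above is cut off right after `low_freq_wavelen`. For orientation, here is a small standalone C++ sketch of the usual Llama-3-style frequency-factor computation built from the same parameters; the smoothing branch beyond the visible lines is an assumption based on the common implementation, not a quote from this file.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Hedged sketch: Llama-3-style per-frequency scaling factors from the
    // parameters read in generate_extra_tensors(). Defaults mirror the hunk;
    // the interpolation step is assumed.
    int main() {
        const double pi                = 3.14159265358979323846;
        const double base              = 10000.0; // rope_theta
        const int    dim               = 128;     // head_dim
        const double factor            = 8.0;
        const double low_freq_factor   = 1.0;
        const double high_freq_factor  = 4.0;
        const double old_context_len   = 8192.0;  // original_max_position_embeddings

        const double low_freq_wavelen  = old_context_len / low_freq_factor;
        const double high_freq_wavelen = old_context_len / high_freq_factor;

        std::vector<double> factors;
        for (int i = 0; i < dim; i += 2) {
            const double freq    = 1.0 / std::pow(base, (double) i / dim);
            const double wavelen = 2.0 * pi / freq;
            if (wavelen < high_freq_wavelen) {
                factors.push_back(1.0);        // high-frequency bands stay unscaled
            } else if (wavelen > low_freq_wavelen) {
                factors.push_back(factor);     // low-frequency bands are fully rescaled
            } else {
                // assumed smooth interpolation between the two regimes
                const double smooth = (old_context_len / wavelen - low_freq_factor)
                                    / (high_freq_factor - low_freq_factor);
                factors.push_back(1.0 / ((1.0 - smooth) / factor + smooth));
            }
        }
        std::printf("computed %zu rope frequency factors\n", factors.size());
        return 0;
    }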
@@ -8338,22 +8224,17 @@ class Exaone4Model(TextModel):
 if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
 self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)

-rope_scaling = self.hparams.get("rope_scaling") or {}
-if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

 def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-if rope_scaling.get("rope_type", '').lower() == "llama3":
-base = self.hparams.get("rope_theta", 10_000.0)
+if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+if rope_params.get("rope_type", '').lower() == "llama3":
+base = rope_params.get("rope_theta", 10_000.0)
 if (dim := self.hparams.get("head_dim")) is None:
 dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

-factor = rope_scaling.get("factor", 16.0)
-low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+factor = rope_params.get("factor", 16.0)
+low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+high_freq_factor = rope_params.get("high_freq_factor", 4.0)
 old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

 low_freq_wavelen = old_context_len / low_freq_factor

@@ -8664,13 +8545,6 @@ class BailingMoeModel(TextModel):
 rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

 self.gguf_writer.add_rope_dimension_count(rope_dim)
-rope_scaling = self.hparams.get("rope_scaling") or {}
-if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-else:
-self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])

@@ -8777,13 +8651,6 @@ class BailingMoeV2Model(TextModel):
 rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

 self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
-rope_scaling = self.hparams.get("rope_scaling") or {}
-if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-else:
-self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])

@@ -8862,13 +8729,6 @@ class GroveMoeModel(TextModel):
 self.gguf_writer.add_experts_per_group(2)
 # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
 self.gguf_writer.add_expert_group_scale(0.05)
-# YaRN is not enabled by default
-# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-rope_scaling = self.hparams.get("rope_scaling") or {}
-if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

 _experts: list[dict[str, Tensor]] | None = None
 _chunk_experts: list[dict[str, Tensor]] | None = None

@@ -9011,6 +8871,63 @@ class UltravoxModel(TextModel):
 raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")

+@ModelBase.register("GlmasrModel")
+class GlmASRWhisperEncoderModel(MmprojModel):
+has_vision_encoder = False
+has_audio_encoder = True
+
+def __init__(self, *args, **kwargs):
+super().__init__(*args, **kwargs)
+if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
+self.hparams["hidden_size"] = self.hparams["d_model"]
+self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+
+def set_gguf_parameters(self):
+super().set_gguf_parameters()
+self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA)
+self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
+self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"])
+
+def tensor_force_quant(self, name, new_name, bid, n_dims):
+if ".conv" in name and ".weight" in name:
+return gguf.GGMLQuantizationType.F16
+return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+del bid # unused
+
+if name.startswith("model.") or name.startswith("lm_head."):
+# skip language model tensors
+return []
+
+if name.startswith("audio_encoder.whisper."):
+name = name.replace("audio_encoder.whisper.","audio_tower.")
+if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name:
+name = name.replace("audio_encoder.", "audio_encoder.adapting.")
+
+if name.startswith("audio_encoder.audio_bos_eos_token."):
+return [(self.map_tensor_name("model.vision.boi"), data_torch[0]), (self.map_tensor_name("model.vision.eoi"), data_torch[1])]
+
+if name.startswith("audio_encoder.adapting."):
+name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.")
+if ".layer_norm." in name:
+name = name.replace(".layer_norm.", ".ln_pre.")
+if ".0." in name:
+name = name.replace(".0.", ".linear_1.")
+if ".2." in name:
+name = name.replace(".2.", ".linear_2.")
+if ".proj." in name:
+return []
+
+if "conv1.bias" in name or "conv2.bias" in name:
+# transpose conv1 and conv2 bias
+data_torch = data_torch.unsqueeze(-1)
+
+return [(self.map_tensor_name(name), data_torch)]
+
 @ModelBase.register("Qwen2AudioForConditionalGeneration")
 class WhisperEncoderModel(MmprojModel):
 has_vision_encoder = False # no vision encoder

@@ -9178,7 +9095,7 @@ class FalconH1Model(Mamba2Model):
 assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"

 # Add any other Falcon Mamba2 specific configuration
-self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"])

 @ModelBase.register("HunYuanMoEV1ForCausalLM")

@@ -9256,12 +9173,11 @@ class HunYuanMoEModel(TextModel):
 self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])

 # Rope
-rope_scaling = hparams.get("rope_scaling", {})
-if rope_scaling.get("type") == "dynamic":
+if self.rope_parameters.get("rope_type") == "dynamic":
 # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
 # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
-alpha = rope_scaling.get("alpha", 1000)
-base = hparams.get("rope_theta", 10000.0)
+alpha = self.rope_parameters.get("alpha", 1000)
+base = self.rope_parameters.get("rope_theta", 10000.0)
 dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
 scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
 self.gguf_writer.add_rope_freq_base(scaled_base)

@@ -9456,12 +9372,11 @@ class HunYuanModel(TextModel):
 hparams = self.hparams

 # Rope
-rope_scaling = hparams.get("rope_scaling", {})
-if rope_scaling.get("type") == "dynamic":
+if self.rope_parameters.get("rope_type") == "dynamic":
 # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
 # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
-alpha = rope_scaling.get("alpha", 50)
-base = hparams.get("rope_theta", 10000.0)
+alpha = self.rope_parameters.get("alpha", 50)
+base = self.rope_parameters.get("rope_theta", 10000.0)
 dim = hparams["head_dim"]
 scaled_base = base * (alpha ** (dim / (dim - 2)))
 self.gguf_writer.add_rope_freq_base(scaled_base)
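Both HunYuan hunks keep the NTK-aware alpha formula unchanged; only where the parameters are read from changes. A quick standalone C++ check of the scaled base quoted in the comment above, with the values taken directly from the hunk:

    #include <cmath>
    #include <cstdio>

    // Reproduces scaled_base = base * alpha^(dim / (dim - 2)) from the HunYuanMoE hunk.
    int main() {
        const double base  = 10000.0;
        const double alpha = 1000.0;
        const int    dim   = 128;   // hidden_size / num_attention_heads

        const double scaled_base = base * std::pow(alpha, double(dim) / (dim - 2));
        std::printf("scaled_base = %.4f\n", scaled_base); // ~11158839.9251, matching the comment
        return 0;
    }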
@@ -9612,13 +9527,6 @@ class GptOssModel(TextModel):
 self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
 self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"])

-rope_scaling = self.hparams.get("rope_scaling") or {}
-rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
-assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}"
-self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))

 @ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM")
 class LFM2Model(TextModel):

@@ -9791,13 +9699,6 @@ class SmallThinkerModel(TextModel):
 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
 else:
 self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-# YaRN is not enabled by default
-# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-rope_scaling = self.hparams.get("rope_scaling") or {}
-if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

 sliding_window_layout = self.hparams.get("sliding_window_layout")
 if sliding_window_layout:

docs/ops.md (18)

@@ -18,12 +18,12 @@ Legend:
 | ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 | ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
+| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
+| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 | CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
 | CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 | CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
 | CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |

@@ -31,7 +31,7 @@ Legend:
 | CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 | CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 | COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 | CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |

@@ -64,7 +64,7 @@ Legend:
 | IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 | L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
+| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 | MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |

@@ -98,14 +98,14 @@ Legend:
 | SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 | SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 | SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 | SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 | SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 | SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
 | SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 | STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |

@@ -113,7 +113,7 @@ Legend:
 | SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 | SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
 | SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ |
+| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 | TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |

docs/ops/SYCL.csv (1158): file diff suppressed because it is too large
@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// ggml_gallocr_resrve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+ggml_gallocr_t galloc,
+struct ggml_cgraph * graph,
+const int * node_buffer_ids,
+const int * leaf_buffer_ids,
+size_t * sizes);
 GGML_API bool ggml_gallocr_reserve_n(
 ggml_gallocr_t galloc,
 struct ggml_cgraph * graph,

@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i

 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
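The two size-query declarations added here report how large the buffers would be without allocating anything. A minimal C++ sketch of how a caller might use the context-tensor variant to check a budget before committing; the buffer type, tensor shapes and budget value are placeholders, not part of the patch.

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include <cstdio>

    // Hedged sketch: size the weight buffer of a no_alloc context before allocating it.
    int main() {
        ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 8,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true, // tensors are described here, not allocated
        };
        ggml_context * ctx = ggml_init(params);
        ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096); // example tensors
        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type(); // placeholder buft

        // dry run: how big would the backing buffer be?
        const size_t need = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
        std::printf("would need %zu bytes\n", need);

        const size_t budget = 512ull * 1024 * 1024; // placeholder budget
        if (need <= budget) {
            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
            ggml_backend_buffer_free(buf);
        }
        ggml_free(ctx);
        return 0;
    }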
@@ -307,6 +307,7 @@ extern "C" {
 GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

 // Initialize backend buffers from a measure graph
+GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
 GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

 GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
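`ggml_backend_sched_reserve_size` is the query-only counterpart of `ggml_backend_sched_reserve`: for a worst-case graph it fills one size per scheduler buffer without allocating. A hedged sketch of the intended call pattern; the helper name and the budget policy are illustrative only.

    #include "ggml.h"
    #include "ggml-backend.h"
    #include <cstdio>
    #include <vector>

    // Hedged sketch: inspect per-backend compute buffer sizes before reserving.
    static bool reserve_if_it_fits(ggml_backend_sched_t sched,
                                   ggml_cgraph * worst_case_graph,
                                   size_t budget_per_backend) {
        const int n_backends = ggml_backend_sched_get_n_backends(sched);
        std::vector<size_t> sizes(n_backends, 0);

        // fills sizes[i] with the size of scheduler buffer i; nothing is allocated
        ggml_backend_sched_reserve_size(sched, worst_case_graph, sizes.data());

        for (int i = 0; i < n_backends; ++i) {
            std::printf("backend %d: %zu bytes\n", i, sizes[i]);
            if (sizes[i] > budget_per_backend) {
                return false; // caller can shrink the context or change offloading
            }
        }
        return ggml_backend_sched_reserve(sched, worst_case_graph); // real allocation
    }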
@@ -2615,6 +2615,7 @@ extern "C" {

 // Set callback for all future logging events.
 // If this is not called, or NULL is supplied, everything is output on stderr.
+GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
 GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);

 GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -594,7 +594,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 }

 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
+return t->data != NULL // tensor data already set externally
+|| t->buffer // tensor on external buffer (but not yet allocated)
+|| ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
 }

 // free the extra space at the end if the new tensor is smaller

@@ -823,7 +825,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 }
 }

-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+static bool ggml_gallocr_reserve_n_impl(
+ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
 size_t min_hash_size = graph->n_nodes + graph->n_leafs;
 // add 25% margin to avoid hash collisions
 min_hash_size += min_hash_size / 4;

@@ -928,12 +931,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
 if (cur_size > 0) {
 GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
-__func__, ggml_backend_buft_name(galloc->bufts[i]),
-cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+__func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 }
 }
 #endif
 ggml_vbuffer_free(galloc->buffers[i]);
+if (no_alloc) {
+galloc->buffers[i] = NULL;
+} else {
 galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
 if (galloc->buffers[i] == NULL) {
 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);

@@ -941,10 +946,26 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 }
 }
 }
+}

 return true;
 }

+void ggml_gallocr_reserve_n_size(
+ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
+GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
+for (int i = 0; i < galloc->n_buffers; i++) {
+sizes[i] = 0;
+for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
+}
+}
+}
+
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
+}

 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
 return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }

@@ -1147,7 +1168,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
 return true;
 }

-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
+struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
 GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

 size_t alignment = ggml_backend_buft_get_alignment(buft);

@@ -1155,6 +1177,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

 ggml_backend_buffer_t * buffers = NULL;
 size_t n_buffers = 0;
+*nbytes_total = 0;

 size_t cur_buf_size = 0;
 struct ggml_tensor * first = ggml_get_first_tensor(ctx);

@@ -1166,10 +1189,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

 if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
 // allocate tensors in the current buffer
-if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
 return NULL;
 }
 first = t;
+*nbytes_total += cur_buf_size;
 cur_buf_size = this_size;
 } else {
 cur_buf_size += this_size;

@@ -1178,15 +1202,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

 // allocate remaining tensors
 if (cur_buf_size > 0) {
-if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+*nbytes_total += cur_buf_size;
+if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
 return NULL;
 }
 }

+if (no_alloc) {
+return NULL;
+}

 if (n_buffers == 0) {
 #ifndef NDEBUG
 GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
+GGML_ASSERT(!buffers);
 return NULL;
 }

@@ -1196,10 +1226,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 } else {
 buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
 }
-free(buffers);
+if (buffers) {
+free(buffers); // can be NULL if context is empty or no_alloc
+}
 return buffer;
 }

+size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+size_t nbytes_total = 0;
+ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
+GGML_ASSERT(!buf);
+return nbytes_total;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+size_t nbytes_total = 0;
+return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
+}

 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
 return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
 }
@@ -36,12 +36,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }

 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_ASSERT(buft);
 if (size == 0) {
 // return a dummy buffer for zero-sized allocations
 return ggml_backend_buffer_init(buft, {}, NULL, 0);
 }

-GGML_ASSERT(buft);
 return buft->iface.alloc_buffer(buft, size);
 }

@@ -128,6 +127,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
 return NULL;
 }

+// FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
+// I don't know whether the above comment is correct
+if (!buffer->iface.get_base) {
+return NULL;
+}
+
 void * base = buffer->iface.get_base(buffer);

 GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");

@@ -1727,6 +1732,20 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 sched->is_alloc = false;
 }

+void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
+GGML_ASSERT(sched);
+GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+GGML_ASSERT(sizes);
+
+ggml_backend_sched_reset(sched);
+
+ggml_backend_sched_synchronize(sched);
+
+ggml_backend_sched_split_graph(sched, measure_graph);
+
+ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
+}

 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
 GGML_ASSERT(sched);
 GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
@@ -0,0 +1,77 @@
+#include <sycl/sycl.hpp>
+#include "common.hpp"
+#include "add-id.hpp"
+
+static void add_id_kernel(
+const float* src0,
+const float* src1,
+const int32_t* src2,
+float* dst,
+int64_t ne0,
+int64_t ne1,
+size_t nb01,
+size_t nb02,
+size_t nb11,
+size_t nb21,
+sycl::nd_item<3> item_ct1) {
+const int64_t i1 = item_ct1.get_group(2);
+const int64_t i2 = item_ct1.get_group(1);
+
+const int i11 =
+*(const int32_t*)((const char*)src2 + i1 * sizeof(int32_t) + i2 * nb21);
+
+const size_t nb1 = ne0 * sizeof(float);
+const size_t nb2 = ne1 * nb1;
+
+float* dst_row = (float*)((char*)dst + i1 * nb1 + i2 * nb2);
+const float* src0_row =
+(const float*)((const char*)src0 + i1 * nb01 + i2 * nb02);
+const float* src1_row = (const float*)((const char*)src1 + i11 * nb11);
+
+for (int64_t i0 = item_ct1.get_local_id(2); i0 < ne0;
+i0 += item_ct1.get_local_range(2)) {
+dst_row[i0] = src0_row[i0] + src1_row[i0];
+}
+}
+
+void ggml_sycl_add_id(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
+const ggml_tensor* src0 = dst->src[0];
+const ggml_tensor* src1 = dst->src[1];
+const ggml_tensor* src2 = dst->src[2];
+
+GGML_TENSOR_TERNARY_OP_LOCALS
+
+GGML_ASSERT(dst->type == GGML_TYPE_F32);
+GGML_ASSERT(src0->type == GGML_TYPE_F32);
+GGML_ASSERT(src1->type == GGML_TYPE_F32);
+GGML_ASSERT(src2->type == GGML_TYPE_I32);
+
+GGML_ASSERT(nb00 == sizeof(float));
+GGML_ASSERT(nb10 == sizeof(float));
+GGML_ASSERT(nb20 == sizeof(int32_t));
+
+const float* src0_d = (const float*)src0->data;
+const float* src1_d = (const float*)src1->data;
+const int32_t* src2_d = (const int32_t*)src2->data;
+float* dst_d = (float*)dst->data;
+
+int threads = std::min((int)ne00, 768); // cols
+ctx.stream()->parallel_for(
+sycl::nd_range<3>(
+sycl::range<3>(1, ne02, ne01) * sycl::range<3>(1, 1, threads),
+sycl::range<3>(1, 1, threads)),
+[=](sycl::nd_item<3> item_ct1) {
+add_id_kernel(
+src0_d,
+src1_d,
+src2_d,
+dst_d,
+ne0,
+ne1,
+nb01,
+nb02,
+nb11,
+nb21,
+item_ct1);
+});
+}
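For reference, the new SYCL kernel follows the same contract as the other `GGML_OP_ADD_ID` backends: each row `(i1, i2)` of `src0` gets the `src1` row selected by `src2[i1, i2]` added to it. A plain C++ restatement of that loop for contiguous f32 data, with strides simplified relative to the kernel:

    #include <cstdint>

    // Hedged reference loop for ADD_ID on contiguous f32 tensors:
    // dst[i2][i1][:] = src0[i2][i1][:] + src1[src2[i2][i1]][:]
    void add_id_ref(const float * src0, const float * src1, const int32_t * src2,
                    float * dst, int64_t ne0, int64_t ne1, int64_t ne2) {
        for (int64_t i2 = 0; i2 < ne2; ++i2) {
            for (int64_t i1 = 0; i1 < ne1; ++i1) {
                const int32_t i11 = src2[i2 * ne1 + i1];       // which src1 row to add
                const float * a = src0 + (i2 * ne1 + i1) * ne0;
                const float * b = src1 + i11 * ne0;
                float       * d = dst  + (i2 * ne1 + i1) * ne0;
                for (int64_t i0 = 0; i0 < ne0; ++i0) {
                    d[i0] = a[i0] + b[i0];
                }
            }
        }
    }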
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_ADD_ID_HPP
+#define GGML_SYCL_ADD_ID_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_add_id(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_ADD_ID_HPP

@@ -642,5 +642,22 @@ static __dpct_inline__ sycl::uint2 fast_div_modulo(uint32_t n, const sycl::uint3
 return sycl::uint2(div_val, mod_val);
 }

+static __dpct_inline__ int ggml_sycl_dp4a(const int a, const int b, int c) {
+return dpct::dp4a(a, b, c);
+}
+
+static __dpct_inline__ float ggml_sycl_e8m0_to_fp32(uint8_t x) {
+uint32_t bits;
+if (x == 0) {
+bits = 0x00400000;
+} else {
+bits = (uint32_t) x << 23;
+}
+
+float result;
+memcpy(&result, &bits, sizeof(float));
+return result;
+}
+
 #endif // GGML_SYCL_COMMON_HPP
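`ggml_sycl_e8m0_to_fp32` treats the byte as a bare IEEE-754 exponent field, so the result is 2^(x - 127), with x == 0 special-cased to the subnormal 2^-127. A host-side restatement for a quick sanity check:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Host-side restatement of ggml_sycl_e8m0_to_fp32.
    static float e8m0_to_fp32(uint8_t x) {
        uint32_t bits = (x == 0) ? 0x00400000u : (uint32_t) x << 23;
        float result;
        std::memcpy(&result, &bits, sizeof(float));
        return result;
    }

    int main() {
        std::printf("%g\n", e8m0_to_fp32(127)); // 1:  2^(127-127)
        std::printf("%g\n", e8m0_to_fp32(130)); // 8:  2^(130-127)
        std::printf("%g\n", e8m0_to_fp32(0));   // ~5.9e-39: 2^-127, the special case
        // the general case matches ldexp:
        std::printf("%d\n", e8m0_to_fp32(100) == std::ldexp(1.0f, 100 - 127));
        return 0;
    }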
@@ -472,6 +472,16 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
 }
 }

+template <typename dst_t>
+static void dequantize_row_mxfp4_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+const int nb = (k + QK_K - 1) / QK_K;
+stream->parallel_for(
+sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
+[=](sycl::nd_item<3> item_ct1) {
+dequantize_block_mxfp4(vx, y, item_ct1);
+});
+}
+
 template <typename src_t, typename dst_t>
 static void convert_unary_nc(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
 const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03,

@@ -518,6 +528,7 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct
 convert_unary_nc_sycl<src_t>(vx, y, k, 1, 1, 1, k, k, k, queue);
 }

 to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
 switch (type) {
 case GGML_TYPE_Q4_0:

@@ -571,6 +582,8 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
 return dequantize_row_iq4_xs_sycl;
 case GGML_TYPE_IQ4_NL:
 return dequantize_row_iq4_nl_sycl;
+case GGML_TYPE_MXFP4:
+return dequantize_row_mxfp4_sycl;
 case GGML_TYPE_F32:
 return convert_unary_sycl<float>;
 #ifdef GGML_SYCL_HAS_BF16

@@ -636,6 +649,8 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
 return dequantize_row_iq4_xs_sycl;
 case GGML_TYPE_IQ4_NL:
 return dequantize_row_iq4_nl_sycl;
+case GGML_TYPE_MXFP4:
+return dequantize_row_mxfp4_sycl;
 case GGML_TYPE_F16:
 return convert_unary_sycl<sycl::half>;
 #ifdef GGML_SYCL_HAS_BF16
@@ -819,5 +819,23 @@ dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
 }
 }

+template<typename dst_t>
+static void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restrict__ yy,
+const sycl::nd_item<3> &item_ct1) {
+// auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
+const int64_t i = item_ct1.get_group(2);
+const block_mxfp4 * x = (const block_mxfp4 *) vx + i*(QK_K/QK_MXFP4);
+
+const int64_t tid = item_ct1.get_local_id(2);
+const int64_t il = tid/8; // 0...3
+const int64_t ib = tid%8; // 0...7
+dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+const uint8_t * q4 = x[ib].qs + 4*il;
+const float d = ggml_sycl_e8m0_to_fp32(x[ib].e);
+for (int j = 0; j < 4; ++j) {
+y[j+ 0] = d * kvalues_mxfp4[q4[j] & 0xf]*0.5f;
+y[j+16] = d * kvalues_mxfp4[q4[j] >> 4]*0.5f;
+}
+}
+
 #endif // GGML_SYCL_DEQUANTIZE_HPP
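Each MXFP4 block pairs one E8M0 scale byte with 16 bytes of packed 4-bit codes; the kernel expands low nibbles into the first 16 outputs and high nibbles into the next 16, scaling by `d * 0.5f`. A host-side sketch of the same unpacking; the `kvalues_mxfp4` table below is copied from ggml's FP4 value definition and should be treated as an assumption here.

    #include <cstdint>
    #include <cstring>

    // Assumed FP4 (E2M1) value table, stored doubled as in ggml-common.h; hence the 0.5f below.
    static const int8_t kvalues_mxfp4[16] = {0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12};

    static float e8m0_to_fp32(uint8_t x) { // same decoding as the SYCL helper above
        uint32_t bits = (x == 0) ? 0x00400000u : (uint32_t) x << 23;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    // One 32-element MXFP4 block: 1 scale byte + 16 packed nibbles (hypothetical host mirror).
    struct block_mxfp4_host { uint8_t e; uint8_t qs[16]; };

    void dequant_block(const block_mxfp4_host & b, float * y /* 32 floats */) {
        const float d = e8m0_to_fp32(b.e);
        for (int j = 0; j < 16; ++j) {
            y[j]      = d * kvalues_mxfp4[b.qs[j] & 0xf] * 0.5f;  // low nibble  -> first half
            y[j + 16] = d * kvalues_mxfp4[b.qs[j] >> 4]  * 0.5f;  // high nibble -> second half
        }
    }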
|
|
@@ -1860,10 +1860,31 @@ namespace dpct
            : id);
    }

+   template <typename T1, typename T2>
+   using dot_product_acc_t = std::conditional_t<
+       std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
+       uint32_t,
+       int32_t>;
+
+   template <typename T>
+   sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val) {
+       return sycl::vec<T, 1>(val)
+           .template as<sycl::vec<
+               std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>,
+               4>>()
+           .template convert<T>();
+   }
+
    template <typename T1, typename T2, typename T3>
-   inline auto dp4a(T1 a, T2 b, T3 c)
-   {
-       return syclcompat::dp4a(a, b, c);
+   inline auto dp4a(T1 a, T2 b, T3 c) {
+       dot_product_acc_t<T1, T2> res = c;
+       auto va = extract_and_sign_or_zero_extend4(a);
+       auto vb = extract_and_sign_or_zero_extend4(b);
+       res += va[0] * vb[0];
+       res += va[1] * vb[1];
+       res += va[2] * vb[2];
+       res += va[3] * vb[3];
+       return res;
    }

    struct sub_sat
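The replacement computes the same packed dot product that syclcompat::dp4a provided: the four 8-bit lanes of a and b are multiplied pairwise and summed into the accumulator c. A plain C++ reference of that semantic (signed case) for comparison:

    #include <cstdint>
    #include <cstring>

    // reference only: treat a and b as four packed int8 lanes and accumulate into c
    static int32_t dp4a_ref(int32_t a, int32_t b, int32_t c) {
        int8_t a8[4], b8[4];
        std::memcpy(a8, &a, 4);
        std::memcpy(b8, &b, 4);
        for (int i = 0; i < 4; ++i) {
            c += int32_t(a8[i]) * int32_t(b8[i]);
        }
        return c;
    }
    // e.g. lanes {1,2,3,4} and {5,6,7,8} with c = 10 give 1*5 + 2*6 + 3*7 + 4*8 + 10 = 80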
@@ -2972,6 +2993,38 @@ namespace dpct
        atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
    }

+   inline unsigned int byte_level_permute(
+       unsigned int a, unsigned int b, unsigned int s) {
+       unsigned int ret;
+       ret = ((((std::uint64_t)b << 32 | a) >> (s & 0x7) * 8) & 0xff) |
+             (((((std::uint64_t)b << 32 | a) >> ((s >> 4) & 0x7) * 8) & 0xff)
+              << 8) |
+             (((((std::uint64_t)b << 32 | a) >> ((s >> 8) & 0x7) * 8) & 0xff)
+              << 16) |
+             (((((std::uint64_t)b << 32 | a) >> ((s >> 12) & 0x7) * 8) & 0xff)
+              << 24);
+       return ret;
+   }
+
+   inline uint32_t byte_level_permute_custom(
+       uint32_t low32, uint32_t high32, uint32_t sel, int mode = 0) {
+       constexpr uint16_t lookup[6][4] = {
+           {0x3210, 0x4321, 0x5432, 0x6543}, // Forward 4-byte extract
+           {0x5670, 0x6701, 0x7012, 0x0123}, // Backward 4-byte extract
+           {0x0000, 0x1111, 0x2222, 0x3333}, // Replicate 8-bit values
+           {0x3210, 0x3211, 0x3222, 0x3333}, // Edge clamp left
+           {0x0000, 0x1110, 0x2210, 0x3210}, // Edge clamp right
+           {0x1010, 0x3232, 0x1010, 0x3232}  // Replicate 16-bit values
+       };
+
+       if (mode >= 1 && mode <= 6) {
+           return byte_level_permute(low32, high32, lookup[mode - 1][sel & 0x3]);
+       } else if (!mode) {
+           return byte_level_permute(low32, high32, sel);
+       }
+       return 0;
+   }
+
} // COPY from DPCT head files

#endif // GGML_SYCL_DPCT_HELPER_HPP
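byte_level_permute follows the usual 4-bit-selector byte-permute encoding (as in CUDA's __byte_perm): the two 32-bit inputs form an 8-byte pool and each selector nibble, lowest first, picks one byte of the result (only the low 3 bits of a nibble are used). A small worked example under that reading:

    // bytes of the pool {b:a}: indices 0..3 come from a, 4..7 from b
    unsigned int a = 0x33221100u;
    unsigned int b = 0x77665544u;
    unsigned int r = dpct::byte_level_permute(a, b, 0x5140u);
    // selector nibbles 0,4,1,5 pick bytes 0x00, 0x44, 0x11, 0x55 -> r == 0x55114400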
@@ -911,6 +911,98 @@ static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_ten
        });
}

+__dpct_inline__ float ggml_sycl_op_swiglu_oai_single(float x, float g, float alpha = 1.702f, float limit = 7.0f) {
+    x = sycl::fmin(x, limit);
+    g = sycl::fmax(sycl::fmin(g, limit), -limit);
+
+    float out_glu = x / (1.0f + sycl::native::exp(-x * alpha));
+    out_glu = out_glu * (1.0f + g);
+    return out_glu;
+}
+
+template <typename T>
+static void swiglu_oai_kernel(const T * x, const T * g, T * dst, const int64_t k,
+                              const int64_t n, const int64_t o0, const int64_t o1,
+                              float alpha, float limit, sycl::nd_item<3> item_ct1) {
+    const int64_t i = int64_t(item_ct1.get_local_range(2)) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    const int64_t j0 = (i / n) * o0 + (i % n);
+    const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
+
+    float xi = x[j0];
+    float gi = g[j1];
+
+    dst[i] = ggml_sycl_op_swiglu_oai_single(xi, gi, alpha, limit);
+}
+
+template <typename T>
+static void swiglu_oai_sycl(const T * x,
+                            const T * g,
+                            T * dst,
+                            const int64_t k,
+                            const int64_t n,
+                            const int64_t o0,
+                            const int64_t o1,
+                            const float alpha,
+                            const float limit,
+                            dpct::queue_ptr stream) {
+    const int64_t num_blocks = (k + SYCL_GLU_BLOCK_SIZE - 1) / SYCL_GLU_BLOCK_SIZE;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE),
+                                           sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             swiglu_oai_kernel(x, g, dst, k, n, o0, o1, alpha, limit, item_ct1);
+                         });
+}
+
+void ggml_sycl_op_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    void * src0_d = src0->data;
+    void * src1_d = src1 ? src1->data : src0->data;
+    const int64_t src0_o = src0->nb[1];
+    const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+    void * dst_d = dst->data;
+    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    dpct::queue_ptr stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(src0->nb[0] == ggml_element_size(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == ggml_nrows(src0));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
+        GGML_ASSERT(src1->ne[0] == nc);
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    //const int32_t swapped = ((const int32_t *) dst->op_params)[1];
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+    const float alpha = ggml_get_op_params_f32(dst, 2);
+    const float limit = ggml_get_op_params_f32(dst, 3);
+
+    float * src0_p = (float *) src0_d;
+    float * src1_p = (float *) src1_d;
+
+    if (!src1) {
+        src0_p += swapped ? nc : 0;
+        src1_p += swapped ? 0 : nc;
+    }
+
+    swiglu_oai_sycl(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), alpha, limit, stream);
+}
+
static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
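In math form, the per-element operation implemented by ggml_sycl_op_swiglu_oai_single above is, with $\ell$ = limit and $\alpha$ = alpha:

$\mathrm{out}(x, g) = \min(x,\ell)\,\sigma\bigl(\alpha\,\min(x,\ell)\bigr)\,\bigl(1 + \mathrm{clamp}(g,-\ell,\ell)\bigr), \qquad \sigma(z) = \frac{1}{1 + e^{-z}}$

i.e. a sigmoid-gated (SiLU-style) activation on the clamped x, scaled by one plus the clamped gate value.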
@@ -1070,6 +1162,11 @@ void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_op_swiglu(ctx, dst);
}

+void ggml_sycl_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_swiglu_oai(ctx, dst);
+}
+
void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_geglu_erf(ctx, dst);

@@ -5,6 +5,8 @@
#include "ggml.h"
#include <limits> // For std::numeric_limits

+#define SYCL_GLU_BLOCK_SIZE 256
+
template <typename T>
T neg_infinity() {
    return -std::numeric_limits<T>::infinity();

@@ -41,6 +43,8 @@ void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

+void ggml_sycl_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

@@ -39,6 +39,7 @@
#include "ggml-impl.h"
#include "ggml-backend-impl.h"

+#include "ggml-sycl/add-id.hpp"
#include "ggml-sycl/backend.hpp"
#include "ggml-sycl/common.hpp"
#include "ggml-sycl/element_wise.hpp"

@@ -3313,6 +3314,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
    bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+

    // mmvq and mmq need the __dp4a instruction which is available for gen12+
    // Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
    use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);

@@ -3320,7 +3322,6 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
    use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // SYCL_USE_XMX
-

    // mmvq path is faster in the CUDA backend.
    if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
    // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization

@@ -3711,6 +3712,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
        case GGML_OP_ADD1: // TODO: more efficient implementation
            ggml_sycl_add(ctx, dst);
            break;
+       case GGML_OP_ADD_ID:
+           ggml_sycl_add_id(ctx, dst);
+           break;
        case GGML_OP_SUB:
            ggml_sycl_sub(ctx, dst);
            break;

@@ -3803,6 +3807,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
        case GGML_GLU_OP_SWIGLU:
            ggml_sycl_swiglu(ctx, dst);
            break;
+       case GGML_GLU_OP_SWIGLU_OAI:
+           ggml_sycl_swiglu_oai(ctx, dst);
+           break;
        case GGML_GLU_OP_GEGLU_ERF:
            ggml_sycl_geglu_erf(ctx, dst);
            break;

@@ -4397,6 +4404,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                case GGML_GLU_OP_REGLU:
                case GGML_GLU_OP_GEGLU:
                case GGML_GLU_OP_SWIGLU:
+               case GGML_GLU_OP_SWIGLU_OAI:
                case GGML_GLU_OP_GEGLU_ERF:
                case GGML_GLU_OP_GEGLU_QUICK:
                    return ggml_is_contiguous_1(op->src[0]);

@@ -4424,15 +4432,18 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                }
            }
            ggml_type src0_type = op->src[0]->type;
-           if (src0_type == GGML_TYPE_BF16 || src0_type == GGML_TYPE_MXFP4) {
-               // TODO: support MXFP4
+           if (src0_type == GGML_TYPE_BF16 ) {
+               // TODO: support GGML_TYPE_BF16
                // FIXME: keep a list of supported types to avoid breaking the backend when a new type is added
                return false;
            }

            // TODO: The configuration below needs more work to be supported with oneDNN
-           if (ggml_is_permuted(a) && !ggml_is_contiguous(a) && a->ne[2] > 1 && a->ne[3] > 1) {
+           if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
+               a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
                return false;
            }

            // TODO: This specific configuration can fail with oneDNN and needs more debugging
            if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
                a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
@@ -4553,9 +4564,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
-           return true;
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
+       case GGML_OP_ADD_ID:
        case GGML_OP_SUB:
        case GGML_OP_COUNT_EQUAL:
        case GGML_OP_MUL:

@@ -595,6 +595,25 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
    }
}

+static void mul_mat_vec_mxfp4_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
+                                        dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_MXFP4 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+
+    {
+        stream->submit([&](sycl::handler & cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                 mul_mat_vec_q<QK_MXFP4, QI_MXFP4, block_mxfp4, VDR_MXFP4_Q8_1_MMVQ, vec_dot_mxfp4_q8_1>(
+                                     vx, vy, dst, ncols, nrows, item_ct1);
+                             });
+        });
+    }
+}
+
+
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,

@@ -1123,6 +1142,9 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
        case GGML_TYPE_IQ4_XS:
            mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
            break;
+       case GGML_TYPE_MXFP4:
+           mul_mat_vec_mxfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+           break;
        default:
            GGML_ABORT("fatal error");
    }

@@ -16,8 +16,8 @@
static void pad_f32(const float * src, float * dst,
                    const int lp0, const int rp0, const int lp1, const int rp1,
                    const int lp2, const int rp2, const int lp3, const int rp3,
-                   const int ne0, const int ne1, const int ne2, const int ne3) {
-   auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
+                   const int ne0, const int ne1, const int ne2, const int ne3,
+                   sycl::nd_item<3> item_ct1) {
    int i0 = item_ct1.get_local_id(2) +
             item_ct1.get_group(2) * item_ct1.get_local_range(2);
    int i1 = item_ct1.get_group(1);

@@ -63,7 +63,7 @@ static void pad_f32_sycl(const float *src, float *dst, const int lp0,
                                           sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
            [=](sycl::nd_item<3> item_ct1) {
                pad_f32(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1,
-                       ne2, ne3);
+                       ne2, ne3, item_ct1);
            });
    }

@@ -88,7 +88,7 @@ void ggml_sycl_ssm_conv(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(src0->nb[0] == sizeof(float));
    GGML_ASSERT(src1->nb[0] == sizeof(float));

-   GGML_ASSERT(src0->nb[1] == src0->ne[0] * static_cast<int>(sizeof(float)));
+   GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));

    const int src_stride_inner = ncs;
    const int src_stride_seq = ncs * d_inner;

@@ -20,6 +20,18 @@
typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
                                  const int & iqs);

+static __dpct_inline__ int get_int_b1(const void * x, const int & i32) {
+    const uint8_t * x8 = (const uint8_t *) x;
+
+    int x32  = x8[4*i32 + 0] <<  0;
+    x32     |= x8[4*i32 + 1] <<  8;
+    x32     |= x8[4*i32 + 2] << 16;
+    x32     |= x8[4*i32 + 3] << 24;
+
+    return x32;
+}
+
+
static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) {
    const uint16_t* x16 =
        (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte

@@ -75,6 +87,28 @@ static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
    val2 = v1 | (v2 << 16);
}

+static __dpct_inline__ sycl::int2 get_int_from_table_16(
+    const int& q4, const int8_t* table) {
+    const uint32_t* table32 = (const uint32_t*)table;
+    uint32_t tmp[2];
+    const uint32_t low_high_selection_indices =
+        (0x32103210 | ((q4 & 0x88888888) >> 1));
+#pragma unroll
+    for (uint32_t i = 0; i < 2; ++i) {
+        const uint32_t shift = 16 * i;
+
+        const uint32_t low =
+            dpct::byte_level_permute(table32[0], table32[1], q4 >> shift);
+        const uint32_t high =
+            dpct::byte_level_permute(table32[2], table32[3], q4 >> shift);
+        tmp[i] = dpct::byte_level_permute(
+            low, high, low_high_selection_indices >> shift);
+    }
+    return sycl::int2(
+        dpct::byte_level_permute(tmp[0], tmp[1], 0x6420),
+        dpct::byte_level_permute(tmp[0], tmp[1], 0x7531));
+}
+
#define VDR_Q2_K_Q8_1_MMVQ 1

// contiguous v/x values
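The new permute-based get_int_from_table_16 overload packs table lookups for the eight nibbles of q4 into two 32-bit integers: the low nibble of each source byte lands in the first result and the high nibble in the second, in byte order. A scalar sketch of that intended result (reference only, not the permute trick itself):

    #include <cstdint>
    #include <cstring>

    static void get_int_from_table_16_ref(uint32_t q4, const int8_t table[16], int32_t & lo, int32_t & hi) {
        int8_t l[4], h[4];
        for (int i = 0; i < 4; ++i) {
            const uint8_t byte = uint8_t(q4 >> (8*i));
            l[i] = table[byte & 0x0f]; // low-nibble lookup -> byte i of lo
            h[i] = table[byte >>   4]; // high-nibble lookup -> byte i of hi
        }
        std::memcpy(&lo, l, 4);
        std::memcpy(&hi, h, 4);
    }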
@@ -685,6 +719,30 @@ vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
}

+#define VDR_MXFP4_Q8_1_MMVQ 2
+#define VDR_MXFP4_Q8_1_MMQ  4
+
+static __dpct_inline__ float vec_dot_mxfp4_q8_1(const void * __restrict__ vbq,
+                                                const block_q8_1 * __restrict__ bq8_1,
+                                                const int & iqs) {
+    const block_mxfp4 * bq4 = (const block_mxfp4 *) vbq;
+
+    const int * q8 = (const int *) bq8_1->qs + iqs;
+
+    int sumi = 0;
+#pragma unroll
+    for (int l = 0; l < VDR_MXFP4_Q8_1_MMVQ; ++l) {
+        const int aux_q4 = get_int_b1(bq4->qs, iqs + l);
+        const sycl::int2 v = get_int_from_table_16(aux_q4, kvalues_mxfp4);
+        sumi = ggml_sycl_dp4a(v.x(), q8[l + 0], sumi);
+        sumi = ggml_sycl_dp4a(v.y(), q8[l + 4], sumi);
+    }
+
+    const float d = ggml_sycl_e8m0_to_fp32(bq4->e) * 0.5f * (bq8_1->ds)[0];
+    return d * sumi;
+}
+
+
static __dpct_inline__ float
vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

@@ -7566,6 +7566,11 @@ size_t ggml_quantize_chunk(

////////////////////////////////////////////////////////////////////////////////

+void ggml_log_get(ggml_log_callback * log_callback, void ** user_data) {
+    *log_callback = g_logger_state.log_callback;
+    *user_data    = g_logger_state.log_callback_user_data;
+}
+
void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
    g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
    g_logger_state.log_callback_user_data = user_data;

@@ -3320,6 +3320,7 @@ class VisionProjectorType:
    ULTRAVOX = "ultravox"
    INTERNVL = "internvl"
    QWEN2A = "qwen2a" # audio
+   GLMA = "glma" # audio
    QWEN25O = "qwen2.5o" # omni
    VOXTRAL = "voxtral"
    LFM2 = "lfm2"

@@ -313,6 +313,7 @@ extern "C" {
        bool check_tensors;   // validate model tensor data
        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
        bool no_host;         // bypass host buffer allowing extra buffers to be used
+       bool no_alloc;        // only load metadata and simulate memory allocations
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

@@ -466,10 +467,24 @@ extern "C" {
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

+   // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+   // returns true if the parameters could be successfully modified to fit device memory
+   // this function is NOT thread safe because it modifies the global llama logger state
+   LLAMA_API bool llama_params_fit(
+       const char * path_model,
+       struct llama_model_params * mparams,
+       struct llama_context_params * cparams,
+       float * tensor_split,                                            // writable buffer for tensor split, needs at least llama_max_devices elements
+       struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+       size_t margin,                                                   // margin of memory to leave per device in bytes
+       uint32_t n_ctx_min,                                              // minimum context size to set when trying to reduce memory use
+       enum ggml_log_level log_level);                                  // minimum log level to print during fitting, lower levels go to debug log
+
    LLAMA_API int64_t llama_time_us(void);

    LLAMA_API size_t llama_max_devices(void);
    LLAMA_API size_t llama_max_parallel_sequences(void);
+   LLAMA_API size_t llama_max_tensor_buft_overrides(void);

    LLAMA_API bool llama_supports_mmap (void);
    LLAMA_API bool llama_supports_mlock (void);
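A rough sketch of how the new llama_params_fit entry point might be called from application code; the model path, margin and minimum context size below are made-up example values, and the scratch buffers are sized with the llama_max_* helpers declared above:

    #include "llama.h"
    #include <vector>

    int main() {
        llama_model_params   mparams = llama_model_default_params();
        llama_context_params cparams = llama_context_default_params();

        // writable scratch buffers that llama_params_fit may fill in
        std::vector<float> tensor_split(llama_max_devices(), 0.0f);
        std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

        const bool fits = llama_params_fit("model.gguf", &mparams, &cparams,
                                           tensor_split.data(), overrides.data(),
                                           /*margin    =*/ 512ull * 1024 * 1024, // leave ~512 MiB free per device
                                           /*n_ctx_min =*/ 4096,
                                           GGML_LOG_LEVEL_INFO);
        if (!fits) {
            return 1; // could not find parameters that fit into device memory
        }

        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        // ... create a context with cparams, run inference, clean up ...
        llama_model_free(model);
        return 0;
    }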
@@ -1354,6 +1369,8 @@ extern "C" {

    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
+   // The logger state is global so these functions are NOT thread safe.
+   LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);

    //
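Together, llama_log_get and llama_log_set let a caller swap the logger temporarily and restore it afterwards, which is how the parameter-fitting code further down uses them. A minimal sketch (not thread safe, per the comment above):

    ggml_log_callback prev_cb   = nullptr;
    void *            prev_data = nullptr;
    llama_log_get(&prev_cb, &prev_data);

    // silence all llama.cpp output for a noisy operation
    llama_log_set([](ggml_log_level, const char *, void *) { /* drop */ }, nullptr);

    // ... noisy work ...

    llama_log_set(prev_cb, prev_data); // restore the previous logger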
@@ -258,6 +258,7 @@ llama_context::llama_context(

    backend_buft.clear();
    backend_ptrs.clear();
+   backend_buf_exp_size.clear();

    for (auto & backend : backends) {
        auto * buft = ggml_backend_get_default_buffer_type(backend.get());

@@ -274,6 +275,7 @@ llama_context::llama_context(

        backend_buft.push_back(buft);
        backend_ptrs.push_back(backend.get());
+       backend_buf_exp_size.push_back(0);
    }

    LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());

@@ -389,7 +391,8 @@ llama_context::llama_context(

        // reserve pp (prompt processing) graph first so that buffers are only allocated once
        {
-           auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+           auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+               model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
            if (!gf) {
                if (pipeline_parallel) {
                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);

@@ -407,7 +410,7 @@ llama_context::llama_context(

        // reserve with tg (token generation) graph to get the number of splits and nodes
        {
-           auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
+           auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
            if (!gf) {
                throw std::runtime_error("failed to allocate compute tg buffers");
            }

@@ -422,7 +425,7 @@ llama_context::llama_context(
            //
            // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
            //
-           auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+           auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
            if (!gf) {
                throw std::runtime_error("failed to allocate compute pp buffers");
            }

@@ -431,11 +434,13 @@ llama_context::llama_context(
        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
            ggml_backend_t backend = backend_ptrs[i];
            ggml_backend_buffer_type_t buft = backend_buft[i];
-           size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
-           if (size > 1) {
+           if (!model.hparams.no_alloc) {
+               backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+           }
+           if (backend_buf_exp_size[i] > 1) {
                LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                    ggml_backend_buft_name(buft),
-                   size / 1024.0 / 1024.0);
+                   backend_buf_exp_size[i] / 1024.0 / 1024.0);
            }
        }

@@ -454,6 +459,23 @@ llama_context::llama_context(
}

llama_context::~llama_context() {
+   // FIXME this currently results in a use-after-free bug if the model is freed before the context
+   // if (!model.hparams.no_alloc) {
+   //     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+   //         ggml_backend_t backend = backend_ptrs[i];
+   //         ggml_backend_buffer_type_t buft = backend_buft[i];
+
+   //         const size_t size_exp = backend_buf_exp_size[i];
+   //         const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+   //         if (size_exp == size_act) {
+   //             LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+   //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+   //         } else {
+   //             LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+   //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+   //         }
+   //     }
+   // }
    ggml_opt_free(opt_ctx);
}

@@ -1428,7 +1450,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
    return static_cast<llm_graph_result *>(gf_res_reserve.get());
}

-ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
+ggml_cgraph * llama_context::graph_reserve(
+       uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
    LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
    GGML_ASSERT(n_outputs >= 1);

@@ -1465,8 +1488,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u

    // initialize scheduler with the specified graph
    if (split_only) {
+       if (sizes) {
+           ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
+       } else {
            ggml_backend_sched_split_graph(sched.get(), gf);
+       }
    } else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+       GGML_ASSERT(!sizes);
        LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
        return nullptr;
    }

@@ -2088,15 +2116,26 @@ void llama_context::perf_reset() {

std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
-   for (const auto & buft_size : model.memory_breakdown()) {
-       ret[buft_size.first].model += buft_size.second;
+   for (const auto & [buft, size] : model.memory_breakdown()) {
+       ret[buft].model += size;
    }
-   for (const auto & buft_size : memory->memory_breakdown()) {
-       ret[buft_size.first].context += buft_size.second;
+   if (memory) {
+       for (const auto & [buft, size] : memory->memory_breakdown()) {
+           ret[buft].context += size;
+       }
    }
+   if (model.hparams.no_alloc) {
+       for (size_t i = 0; i < backends.size(); ++i) {
+           ggml_backend_t backend = backends[i].get();
+           ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+           ret[buft].compute += backend_buf_exp_size[i];
+       }
+   } else {
        for (const auto & backend_ptr : backends) {
            ggml_backend_t backend = backend_ptr.get();
-           ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+           ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+           ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+       }
    }
    return ret;
}

@@ -26,6 +26,10 @@ struct llama_memory_breakdown_data {
    size_t model   = 0; // memory allocated for the model
    size_t context = 0; // memory allocated for the context
    size_t compute = 0; // memory allocated for temporary compute buffers
+
+   size_t total() const {
+       return model + context + compute;
+   }
};

struct llama_context {

@@ -206,7 +210,8 @@ public:
    ggml_status graph_compute(ggml_cgraph * gf, bool batched);

    // reserve a graph with a dummy ubatch of the specified size
-   ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
+   ggml_cgraph * graph_reserve(
+       uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);

private:
    llm_graph_params graph_params(

@@ -281,9 +286,10 @@ private:

    std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;

-   // buffer types used for the compute buffer of each backend
+   // pointers and buffer types used for the compute buffer of each backend
    std::vector<ggml_backend_t> backend_ptrs;
    std::vector<ggml_backend_buffer_type_t> backend_buft;
+   std::vector<size_t> backend_buf_exp_size; // expected buffer sizes

    llm_graph_result_ptr gf_res_prev;
    llm_graph_result_ptr gf_res_reserve;

@@ -34,6 +34,7 @@ struct llama_hparams_convnext {

struct llama_hparams {
    bool vocab_only;
+   bool no_alloc;
    bool rope_finetuned;
    bool use_par_res;
    bool swin_norm;

@@ -25,6 +25,10 @@ time_meas::~time_meas() {
    }
}

+void llama_log_get(ggml_log_callback * log_callback, void ** user_data) {
+    ggml_log_get(log_callback, user_data);
+}
+
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
    ggml_log_set(log_callback, user_data);
    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;

@@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache(

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    for (auto & [buft, ctx] : ctx_map) {
-       ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+       ggml_backend_buffer_t buf;
+       if (model.hparams.no_alloc) {
+           buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+           for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
+               t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
+           }
+       } else {
+           buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+       }
        if (!buf) {
            throw std::runtime_error("failed to allocate buffer for kv cache");
        }

@@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {

std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> ret;
-   for (const auto & [_, buf] : ctxs_bufs) {
-       ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+   for (const auto & [ctx, buf] : ctxs_bufs) {
+       ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
+
+       if (hparams.no_alloc) {
+           GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
+           ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+       } else {
+           // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+           ret[buft] += ggml_backend_buffer_get_size(buf.get());
+       }
    }

    return ret;
}

@@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
        std::vector<std::string> & splits,
        bool use_mmap,
        bool check_tensors,
+       bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
    int trace = 0;

@@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(

    this->use_mmap = use_mmap;
    this->check_tensors = check_tensors;
+   this->no_alloc = no_alloc;
}

std::string llama_model_loader::get_arch_name() const {

@@ -71,6 +71,7 @@ struct llama_model_loader {

    bool use_mmap = false;
    bool check_tensors;
+   bool no_alloc;

    llama_files files;
    llama_ftype ftype;

@@ -97,6 +98,7 @@ struct llama_model_loader {
        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
        bool use_mmap,
        bool check_tensors,
+       bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);

@@ -6606,9 +6606,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

        std::vector<ggml_backend_buffer_ptr> bufs;
        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+           GGML_ASSERT(!ml.no_alloc);
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                // only the mmap region containing the tensors in the model is mapped to the backend buffer
-               // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+               // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+               // then we could just use metal for all layers
                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                void * addr = nullptr;
                size_t first, last; // NOLINT

@@ -6624,9 +6626,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                bufs.emplace_back(buf);
                buf_map.emplace(idx, buf);
            }
+       } else {
+           ggml_backend_buffer_t buf;
+           if (ml.no_alloc) {
+               buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+               for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+                   t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
+               }
+           } else {
+               buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+           }
        }
-       else {
-           ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
            if (buf == nullptr) {
                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
            }

@@ -6681,6 +6690,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        }
    }

+   if (ml.no_alloc) {
+       return true;
+   }
+
    // load tensor data
    for (auto & [ctx, buf_map] : ctx_buf_maps) {
        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {

@@ -6723,11 +6736,20 @@ size_t llama_model::n_devices() const {

std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> ret;
-   for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
+   for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
+       if (hparams.no_alloc) {
+           GGML_ASSERT(bufs.size() == 1);
+           ggml_backend_buffer_t buf = bufs[0].get();
+           GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
+           ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
+           ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+       } else {
            for (const auto & buf : bufs) {
+               // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
                ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
            }
        }
+   }
    return ret;
}

@@ -6770,6 +6792,7 @@ void llama_model::print_info() const {
    // hparams
    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
    LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+   LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);

    if (!hparams.vocab_only) {
        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);

@@ -7618,6 +7641,7 @@ llama_model_params llama_model_default_params() {
        /*.check_tensors =*/ false,
        /*.use_extra_bufts =*/ true,
        /*.no_host =*/ false,
+       /*.no_alloc =*/ false,
    };

    return result;

@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    }

    std::vector<std::string> splits = {};
-   llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
+   llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());
src/llama.cpp (650 changed lines)
@ -1,6 +1,9 @@
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
#include "llama-impl.h"
|
#include "llama-impl.h"
|
||||||
|
|
||||||
#include "llama-chat.h"
|
#include "llama-chat.h"
|
||||||
|
#include "llama-context.h"
|
||||||
#include "llama-mmap.h"
|
#include "llama-mmap.h"
|
||||||
#include "llama-vocab.h"
|
#include "llama-vocab.h"
|
||||||
#include "llama-model-loader.h"
|
#include "llama-model-loader.h"
|
||||||
|
|
@ -11,11 +14,14 @@
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cassert>
|
||||||
|
#include <cinttypes>
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
|
|
@ -37,6 +43,643 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct llama_device_memory_data {
|
||||||
|
int64_t total;
|
||||||
|
int64_t free;
|
||||||
|
llama_memory_breakdown_data mb;
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
||||||
|
const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
|
||||||
|
std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
|
||||||
|
const ggml_log_level log_level) {
|
||||||
|
struct user_data_t {
|
||||||
|
struct {
|
||||||
|
ggml_log_callback callback;
|
||||||
|
void * user_data;
|
||||||
|
} original_logger;
|
||||||
|
ggml_log_level min_level; // prints below this log level go to debug log
|
||||||
|
};
|
||||||
|
user_data_t ud;
|
||||||
|
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
||||||
|
ud.min_level = log_level;
|
||||||
|
|
||||||
|
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
||||||
|
const user_data_t * ud = (const user_data_t *) user_data;
|
||||||
|
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||||
|
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
||||||
|
}, &ud);
|
||||||
|
|
||||||
|
llama_model_params mparams_copy = *mparams;
|
||||||
|
mparams_copy.no_alloc = true;
|
||||||
|
mparams_copy.use_mmap = false;
|
||||||
|
|
||||||
|
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
||||||
|
if (model == nullptr) {
|
||||||
|
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||||
|
throw std::runtime_error("failed to load model");
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_context * ctx = llama_init_from_model(model, *cparams);
|
||||||
|
if (ctx == nullptr) {
|
||||||
|
llama_model_free(model);
|
||||||
|
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||||
|
throw std::runtime_error("failed to create llama_context from model");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_device_memory_data> ret(model->devices.size());
|
||||||
|
|
||||||
|
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
||||||
|
|
||||||
|
for (const auto & [buft, mb] : memory_breakdown) {
|
||||||
|
if (ggml_backend_buft_is_host(buft)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||||
|
if (!dev) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < ret.size(); i++) {
|
||||||
|
if (model->devices[i] == dev) {
|
||||||
|
ret[i].mb.model += mb.model;
|
||||||
|
ret[i].mb.context += mb.context;
|
||||||
|
ret[i].mb.compute += mb.compute;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < ret.size(); i++) {
|
||||||
|
size_t free, total;
|
||||||
|
ggml_backend_dev_memory(model->devices[i], &free, &total);
|
||||||
|
ret[i].free = free;
|
||||||
|
ret[i].total = total;
|
||||||
|
}
|
||||||
|
|
||||||
|
devs = model->devices;
|
||||||
|
hp_ngl = model->hparams.n_layer;
|
||||||
|
hp_n_ctx_train = model->hparams.n_ctx_train;
|
||||||
|
hp_n_expert = model->hparams.n_expert;
|
||||||
|
|
||||||
|
llama_memory_breakdown_print(ctx); // goes to debug log
|
||||||
|
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_model_free(model);
|
||||||
|
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// enum to identify part of a layer for distributing its tensors:
|
||||||
|
enum layer_fraction_t {
|
||||||
|
LAYER_FRACTION_NONE = 0, // nothing
|
||||||
|
LAYER_FRACTION_ATTN = 1, // attention
|
||||||
|
LAYER_FRACTION_UP = 2, // attention + up
|
||||||
|
LAYER_FRACTION_GATE = 3, // attention + up + gate
|
||||||
|
LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
|
||||||
|
};
|
||||||
|
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue

static void llama_params_fit_impl(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    constexpr int64_t MiB = 1024*1024;
    const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
    typedef std::vector<llama_device_memory_data> dmds_t;
    const llama_model_params default_mparams = llama_model_default_params();

    std::vector<ggml_backend_dev_t> devs;
    uint32_t hp_ngl = 0; // hparams.n_gpu_layers
    uint32_t hp_nct = 0; // hparams.n_ctx_train
    uint32_t hp_nex = 0; // hparams.n_expert

    // step 1: get data for default parameters and check whether any changes are necessary in the first place

    LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
    const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
    const size_t nd = devs.size(); // number of devices
    if (nd == 0) {
        LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
        return;
    }

    std::vector<std::string> dev_names;
    {
        dev_names.reserve(nd);
        size_t max_length = 0;
        for (ggml_backend_dev_t dev : devs) {
            std::string name = ggml_backend_dev_name(dev);
            name += " (";
            name += ggml_backend_dev_description(dev);
            name += ")";
            dev_names.push_back(name);
            max_length = std::max(max_length, name.length());
        }
        for (std::string & dn : dev_names) {
            dn.insert(dn.end(), max_length - dn.length(), ' ');
        }
    }

    int64_t sum_total          = 0;
    int64_t sum_projected_free = 0;
    int64_t min_projected_free = INT64_MAX;
    int64_t sum_projected_used = 0;
    int64_t sum_projected_ctx  = 0;

    if (nd > 1) {
        LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
    }
    for (size_t id = 0; id < nd; id++) {
        const llama_device_memory_data & dmd = dmds_full[id];

        const int64_t projected_used = dmd.mb.total();
        const int64_t projected_free = dmd.free - projected_used;

        sum_total          += dmd.total;
        sum_projected_used += projected_used;
        sum_projected_free += projected_free;
        min_projected_free = std::min(min_projected_free, projected_free);
        sum_projected_ctx  += dmd.mb.context;

        if (nd > 1) {
            LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
                projected_free >= 0 ? "surplus" : "deficit");
        }
    }
    assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
    assert(sum_projected_used >= sum_projected_ctx);
    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
        __func__, sum_projected_used/MiB, sum_total/MiB);
    if (min_projected_free >= margin) {
        if (nd == 1) {
            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
                __func__, min_projected_free/MiB, margin/MiB);
            return;
        }
        LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
            __func__, min_projected_free/MiB, margin/MiB);
        return;
    }

    // step 2: try reducing memory use by reducing the context size

    {
        int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
        if (global_surplus < 0) {
            LLAMA_LOG_INFO(nd == 1 ?
                "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
                "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
                __func__, margin/MiB, -global_surplus/MiB);
            if (cparams->n_ctx == 0) {
                if (hp_nct > n_ctx_min) {
                    const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
                    const uint32_t ctx_reduction = std::min(
                        uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
                    cparams->n_ctx = hp_nct - ctx_reduction;
                    const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
                    global_surplus += memory_reduction;
                    LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                        __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                } else {
                    LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
                        __func__, hp_nct, n_ctx_min);
                }
            } else {
                LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
            }
        }
        if (global_surplus >= 0) {
            LLAMA_LOG_INFO("%s: entire model can be fit across devices by reducing context\n", __func__);
            return;
        }
    }

    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
        throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
    }
    if (nd > 1) {
        if (!tensor_split) {
            throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
        }
        if (mparams->tensor_split) {
            for (size_t id = 0; id < nd; id++) {
                if (mparams->tensor_split[id] != 0.0f) {
                    throw std::runtime_error("model_params::tensor_split already set by user, abort");
                }
            }
        }
        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
            throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
        }
        if (hp_ngl < 2*nd) {
            throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
                + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
        }
    }
    if (!tensor_buft_overrides) {
        throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
    }
    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
        throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
    }

    // step 3: iteratively fill the back to front with "dense" layers
    // - for a dense model simply fill full layers, giving each device a contiguous slice of the model
    // - for a MoE model, same as dense model but with all MoE tensors in system memory

    // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
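    // example: get_overflow_pattern(7, LAYER_FRACTION_GATE) returns the pattern "blk\.7\.ffn_down.*",
    // i.e. only the ffn_down tensors of layer 7 are overflowed to the override buffer type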
    auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
        constexpr size_t n_strings = 1000;
        if (il >= n_strings) {
            throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
        }
        switch (lf) {
            case LAYER_FRACTION_ATTN: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_UP: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_GATE: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
                }
                return patterns[il].c_str();
            }
            case LAYER_FRACTION_MOE: {
                static std::array<std::string, n_strings> patterns;
                if (patterns[il].empty()) {
                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
                }
                return patterns[il].c_str();
            }
            default:
                GGML_ABORT("fatal error");
        }
    };

    struct ngl_t {
        uint32_t n_layer = 0; // number of total layers
        uint32_t n_part  = 0; // number of partial layers, <= n_layer

        // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
        layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
    };

    const size_t ntbo = llama_max_tensor_buft_overrides();

    // utility function to set n_gpu_layers and tensor_split
    auto set_ngl_tensor_split_tbo = [&](
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
            llama_model_params & mparams,
            const bool add_nonrepeating) {
        mparams.n_gpu_layers = 0;
        for (size_t id = 0; id < nd; id++) {
            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
            if (nd > 1) {
                tensor_split[id] = ngl_per_device[id].n_layer;
            }
        }
        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
        uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides

        if (add_nonrepeating) {
            mparams.n_gpu_layers += 1;
            tensor_split[nd - 1] += 1;
        }
        mparams.tensor_split = tensor_split;

        size_t itbo = 0;
        for (size_t id = 0; id < nd; id++) {
            il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
                if (itbo + 1 >= ntbo) {
                    tensor_buft_overrides[itbo].pattern = nullptr;
                    tensor_buft_overrides[itbo].buft = nullptr;
                    itbo++;
                    mparams.tensor_buft_overrides = tensor_buft_overrides;
                    throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
                        + std::to_string(ntbo) + " is insufficient for model\n");
                }
                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
                tensor_buft_overrides[itbo].buft = overflow_bufts[id];
                itbo++;
            }
            il0 += ngl_per_device[id].n_part;
        }
        tensor_buft_overrides[itbo].pattern = nullptr;
        tensor_buft_overrides[itbo].buft = nullptr;
        itbo++;
        mparams.tensor_buft_overrides = tensor_buft_overrides;
    };

    // utility function that returns the memory use per device for given numbers of layers per device
    auto get_memory_for_layers = [&](
            const char * func_name,
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
            const bool add_nonrepeating) -> std::vector<int64_t> {
        llama_model_params mparams_copy = *mparams;
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);

        const dmds_t dmd_nl = llama_get_device_memory_data(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
        for (size_t id = 0; id < nd; id++) {
            const ngl_t & n = ngl_per_device[id];
            LLAMA_LOG_DEBUG(
                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
        }

        std::vector<int64_t> ret;
        ret.reserve(nd);
        for (const llama_device_memory_data & dmd : dmd_nl) {
            ret.push_back(dmd.mb.total());
        }
        return ret;
    };

    int64_t global_surplus_cpu_moe = 0;
    if (hp_nex > 0) {
        const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
        ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
        tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
        tensor_buft_overrides[1] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;

        LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
        const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
            global_surplus_cpu_moe += dmd.free;
            global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
        }

        if (global_surplus_cpu_moe > 0) {
            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
                __func__, global_surplus_cpu_moe/MiB);
        } else {
            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
                __func__, -global_surplus_cpu_moe/MiB);
        }

        // reset
        tensor_buft_overrides[0] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;
    }

    std::vector<int64_t> targets; // maximum acceptable memory use per device
    targets.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
        targets.push_back(dmds_full[id].free - margin);
        LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }

    // whether for the optimal memory use we expect to load at least some MoE tensors:
    const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;

    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
    overflow_bufts.reserve(nd);
    for (size_t id = 0; id < nd - 1; ++id) {
        overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
    }
    overflow_bufts.push_back(ggml_backend_cpu_buffer_type());

    std::vector<ngl_t> ngl_per_device(nd);
    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
    if (hp_nex > 0) {
        for (size_t id = 0; id < nd; id++) {
            ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
        }
    }

    // optimize the number of layers per device using the method of false position:
    // - ngl_per_device has 0 layers for each device, lower bound
    // - try a "high" configuration where a device is given all unassigned layers
    // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
    // - check memory use of our guess, replace either the low or high bound
    // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
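    // worked example: with a lower bound of 0 layers using 1 GiB, an upper bound of 40 layers using 21 GiB,
    // and a target of 11 GiB, the first interpolated guess is 40 * (11 - 1) / (21 - 1) = 20 layers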
    if (hp_nex == 0) {
        LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
    } else {
        LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
    }
    uint32_t n_unassigned = hp_ngl;
    for (int id = nd - 1; id >= 0; id--) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        ngl_per_device_high[id].n_layer = n_unassigned;
        if (hp_nex > 0) {
            ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
        }
        if (ngl_per_device_high[id].n_layer > 0) {
            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
            if (mem_high[id] > targets[id]) {
                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                while (delta > 1) {
                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                    step_size = std::max(step_size, uint32_t(1));
                    step_size = std::min(step_size, delta - 1);

                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                    ngl_per_device_test[id].n_layer += step_size;
                    if (hp_nex) {
                        ngl_per_device_test[id].n_part += step_size;
                    }
                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);

                    if (mem_test[id] <= targets[id]) {
                        ngl_per_device = ngl_per_device_test;
                        mem = mem_test;
                        n_unassigned -= ngl_per_device[id].n_layer;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    } else {
                        ngl_per_device_high = ngl_per_device_test;
                        mem_high = mem_test;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    }
                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                }
            } else {
                ngl_per_device = ngl_per_device_high;
                n_unassigned -= ngl_per_device[id].n_layer;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
    }
    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
        return;
    }

    // step 4: for a MoE model where all dense tensors fit,
    // convert the dense-only layers in the back to full layers in the front until all devices are full
    // essentially the same procedure as for the dense-only layers except front-to-back
    // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM

    size_t id_dense_start = nd;
    for (int id = nd - 1; id >= 0; id--) {
        if (ngl_per_device[id].n_layer > 0) {
            id_dense_start = id;
            continue;
        }
        break;
    }
    assert(id_dense_start < nd);

    LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
    for (size_t id = 0; id <= id_dense_start; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
            const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
            ngl_per_device_high[id].n_layer += n_layer_move;
            ngl_per_device_high[jd].n_layer -= n_layer_move;
            ngl_per_device_high[jd].n_part = 0;
        }
        size_t id_dense_start_high = nd - 1;
        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);

        if (mem_high[id] > targets[id]) {
            assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
            assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
            assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
                >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
                - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            while (delta > 1) {
                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                step_size = std::max(step_size, uint32_t(1));
                step_size = std::min(step_size, delta - 1);

                std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                size_t id_dense_start_test = id_dense_start;
                uint32_t n_converted_test = 0;
                for (;id_dense_start_test < nd; id_dense_start_test++) {
                    const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
                    ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
                    ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
                    ngl_per_device_test[id].n_layer += n_convert_jd;
                    n_converted_test += n_convert_jd;

                    if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
                        break;
                    }
                }
                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);

                if (mem_test[id] <= targets[id]) {
                    ngl_per_device = ngl_per_device_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                } else {
                    ngl_per_device_high = ngl_per_device_test;
                    mem_high = mem_test;
                    id_dense_start_high = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                }
                delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
                    - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            }
        } else {
            ngl_per_device = ngl_per_device_high;
            id_dense_start = id_dense_start_high;
            LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
        }

        // try to fit at least part of one more layer
        if (ngl_per_device[id_dense_start].n_layer > 0) {
            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
            size_t id_dense_start_test = id_dense_start;
            ngl_per_device_test[id_dense_start_test].n_layer--;
            ngl_per_device_test[id_dense_start_test].n_part--;
            ngl_per_device_test[id].n_layer++;
            ngl_per_device_test[id].n_part++;
            if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
                id_dense_start_test++;
            }
            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
            LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
            if (mem_test[id] < targets[id]) {
                ngl_per_device = ngl_per_device_test;
                mem = mem_test;
                id_dense_start = id_dense_start_test;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);

                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
                if (mem_test[id] < targets[id]) {
                    ngl_per_device = ngl_per_device_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
                if (mem_test[id] < targets[id]) {
                    ngl_per_device = ngl_per_device_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
        LLAMA_LOG_INFO(
            "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }

    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
}

bool llama_params_fit(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    const int64_t t0_us = llama_time_us();
    bool ok = true;
    try {
        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
        LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
    } catch (const std::runtime_error & e) {
        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
        ok = false;
    }
    const int64_t t1_us = llama_time_us();
    LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
    return ok;
}

struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,

@@ -49,6 +692,10 @@ size_t llama_max_devices(void) {
    return 16;
}

+size_t llama_max_tensor_buft_overrides() {
+    return 4096;
+}
+
bool llama_supports_mmap(void) {
    return llama_mmap::SUPPORTED;
}

@@ -108,11 +755,12 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
    model.t_start_us = tm.t_start_us;

    try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();

        model.hparams.vocab_only = params.vocab_only;
+        model.hparams.no_alloc = params.no_alloc;

        try {
            model.load_arch(ml);

@@ -37,4 +37,5 @@ else()
    add_subdirectory(cvector-generator)
    add_subdirectory(export-lora)
endif()
+add_subdirectory(fit-params)
endif()

@@ -0,0 +1,8 @@
set(TARGET llama-fit-params)
add_executable(${TARGET} fit-params.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

if(LLAMA_TOOLS_INSTALL)
    install(TARGETS ${TARGET} RUNTIME)
endif()

@@ -0,0 +1,55 @@
# fit-params

llama.cpp binaries can automatically fit the projected memory use of a model to the free device memory available at runtime,
this is controlled using the CLI arguments starting with `-fit`/`--fit`.
Internally the code is calling `llama_params_fit` to adjust the `llama_model_params` and `llama_context_params` structs.
`llama-fit-params` is a simple utility that prints the CLI arguments corresponding to these adjustments to stdout.
Example usage:

``` bash
# First, run llama-fit-params and store the results in a file:
> ./build/bin/llama-fit-params --model /opt/models/qwen_3-30b3a-f16.gguf | tee args.txt
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu
llama_params_fit_impl: projected to use 61807 MiB of device memory vs. 24077 MiB of free device memory
llama_params_fit_impl: cannot fulfill margin of 1024 MiB, need to reduce device memory by 42444 MiB
llama_params_fit_impl: context size reduced from 40960 to 4096 -> need 3456 MiB less memory in total
llama_params_fit_impl: with only dense weights in device memory there is a total surplus of 16164 MiB
llama_params_fit_impl: distributing layers across devices with overflow to next device/system memory:
llama_params_fit_impl: - CUDA0 (NVIDIA GeForce RTX 4090): 48 layers (34 overflowing), 19187 MiB used, 1199 MiB free
llama_params_fit: successfully fit params to free device memory
llama_params_fit: fitting params to free memory took 1.15 seconds
Printing fitted CLI arguments to stdout...
-c 4096 -ngl 48 -ot blk\.14\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.15\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.16\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.17\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.18\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.19\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.20\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.21\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.22\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.23\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.24\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.25\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.26\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.27\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.28\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.29\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.30\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.31\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.32\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.33\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.34\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.35\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.36\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.37\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.38\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.39\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.40\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.41\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.42\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.43\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.44\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.45\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.46\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.47\.ffn_(up|down|gate)_(ch|)exps=CPU

# Next, use those results for a llama.cpp binary:
> cat args.txt | xargs ./build/bin/llama-server --model /opt/models/qwen_3-30b3a-f16.gguf
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu
system info: n_threads = 16, n_threads_batch = 16, total_threads = 32

system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 890 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |

main: binding port with default address family
main: HTTP server is listening, hostname: 127.0.0.1, port: 8080, http threads: 31
main: loading model
srv    load_model: loading model '/opt/models/qwen_3-30b3a-f16.gguf'
llama_params_fit_impl: projected to use 19187 MiB of device memory vs. 24077 MiB of free device memory
llama_params_fit_impl: will leave 1199 >= 1024 MiB of free device memory, no changes needed
llama_params_fit: successfully fit params to free device memory
llama_params_fit: fitting params to free memory took 0.28 seconds
[...]
main: server is listening on http://127.0.0.1:8080 - starting the main loop
srv  update_slots: all slots are idle
^Csrv    operator(): operator(): cleaning up before exit...

llama_memory_breakdown_print: | memory breakdown [MiB] | total   free   self   model   context   compute   unaccounted |
llama_memory_breakdown_print: |   - CUDA0 (RTX 4090)   | 24077 =  945 + (19187 = 17904 +    384 +     898) +       3945 |
llama_memory_breakdown_print: |   - Host               |                 58271 = 58259 +      0 +      12              |
```
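
The `-fit` arguments of the regular binaries use the same mechanism at load time.
A minimal sketch (hypothetical invocation; the model path is a placeholder and `-fit on`/`-fit off` is assumed to toggle the automatic fitting):

``` bash
# let llama-server fit the parameters itself instead of pre-computing them with llama-fit-params:
./build/bin/llama-server --model /opt/models/qwen_3-30b3a-f16.gguf -fit on
```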

@@ -0,0 +1,62 @@
#include "llama.h"

#include "arg.h"
#include "common.h"
#include "log.h"

#include <iostream>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

int main(int argc, char ** argv) {
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    common_init();
    llama_backend_init();
    llama_numa_init(params.numa);
    auto mparams = common_model_params_to_llama(params);
    auto cparams = common_context_params_to_llama(params);
    llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
        params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);

    LOG_INF("Printing fitted CLI arguments to stdout...\n");
    std::cout << "-c " << cparams.n_ctx;
    std::cout << " -ngl " << mparams.n_gpu_layers;

    size_t nd = llama_max_devices();
    while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) {
        nd--;
    }
    if (nd > 1) {
        for (size_t id = 0; id < nd; id++) {
            if (id == 0) {
                std::cout << " -ts ";
            }
            if (id > 0) {
                std::cout << ",";
            }
            std::cout << mparams.tensor_split[id];
        }
    }

    const size_t ntbo = llama_max_tensor_buft_overrides();
    for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) {
        if (itbo == 0) {
            std::cout << " -ot ";
        }
        if (itbo > 0) {
            std::cout << ",";
        }
        std::cout << mparams.tensor_buft_overrides[itbo].pattern << "=" << ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft);
    }
    std::cout << "\n";

    return 0;
}

@@ -112,4 +112,8 @@ struct clip_graph {
    // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
    // support dynamic resolution
    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
+
+    // Generic function to stack frames for audio processing
+    // Abstracts out the StackAudioFrames logic used by ultravox
+    ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed);
};

@@ -157,6 +157,7 @@ enum projector_type {
    PROJECTOR_TYPE_INTERNVL,
    PROJECTOR_TYPE_LLAMA4,
    PROJECTOR_TYPE_QWEN2A,
+    PROJECTOR_TYPE_GLMA,
    PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
    PROJECTOR_TYPE_VOXTRAL,
    PROJECTOR_TYPE_LFM2,

@@ -183,6 +184,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_INTERNVL, "internvl"},
    { PROJECTOR_TYPE_LLAMA4,   "llama4"},
    { PROJECTOR_TYPE_QWEN2A,   "qwen2a"},
+    { PROJECTOR_TYPE_GLMA,     "glma"},
    { PROJECTOR_TYPE_QWEN25O,  "qwen2.5o"},
    { PROJECTOR_TYPE_VOXTRAL,  "voxtral"},
    { PROJECTOR_TYPE_LFM2,     "lfm2"},

@@ -256,6 +256,7 @@ struct clip_model {
    ggml_tensor * conv1d_2_w = nullptr;
    ggml_tensor * conv1d_2_b = nullptr;
    ggml_tensor * mm_norm_pre_w = nullptr;
+    ggml_tensor * mm_norm_pre_b = nullptr;
    ggml_tensor * mm_norm_mid_w = nullptr;

    // cogvlm

@@ -720,6 +720,32 @@ ggml_tensor * clip_graph::build_rope_2d(
    return cur;
}

+// Generic function to stack frames for audio processing
+// Abstracts out the StackAudioFrames logic used by ultravox
+ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
+    if (stack_factor <= 1) {
+        return cur;
+    }
+
+    int64_t total_elements = ggml_nelements(cur);
+    int64_t stride = n_embed * stack_factor;
+
+    // Calculate padded length
+    int64_t padded_len = GGML_PAD(total_elements, stride);
+    int64_t pad = padded_len - total_elements;
+
+    if (pad > 0) {
+        // Pad the tensor to make it divisible by stride
+        cur = ggml_view_1d(ctx0, cur, total_elements, 0);
+        cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+    }
+
+    // Reshape to [stride, padded_len / stride]
+    cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
+        ggml_row_size(cur->type, stride), 0);
+    return cur;
+}
+
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
// support dynamic resolution
ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {

@@ -796,6 +822,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_GLMA:
            {
                builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
            } break;

@@ -1136,10 +1163,12 @@ struct clip_model_loader {
            } break;
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_GLMA:
        case PROJECTOR_TYPE_VOXTRAL:
            {
                bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
-                                    model.proj_type == PROJECTOR_TYPE_VOXTRAL;
+                                    model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
+                                    model.proj_type == PROJECTOR_TYPE_GLMA;
                get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
                if (hparams.n_mel_bins != 128) {
                    throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));

@@ -1510,6 +1539,21 @@ struct clip_model_loader {
                model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
                model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
            } break;
+        case PROJECTOR_TYPE_GLMA:
+            {
+                model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+                model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+                model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
+                model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
+                model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
+                model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
+                model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
+            } break;
        case PROJECTOR_TYPE_LLAMA4:
            {
                model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);

@@ -2230,7 +2274,14 @@ struct llava_uhd {
        clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
        clip_image_size grid_size;    // grid_size.width * grid_size.height = number of slices
        std::vector<slice_coordinates> slices;
+
+        img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
+        bool padding_overview = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
+        std::array<uint8_t, 3> pad_color_overview = {0, 0, 0};
+
+        img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC;
        bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
+        std::array<uint8_t, 3> pad_color_refined = {0, 0, 0};
    };

    static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {

@@ -2261,6 +2312,7 @@ struct llava_uhd {
        res.refined_size = refine_size;
        res.grid_size = clip_image_size{0, 0};
        res.padding_refined = true;
+        res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding

        LOG_DBG("%s: using pinpoints for slicing\n", __func__);
        LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",

@@ -2339,12 +2391,13 @@ struct llava_uhd {

    static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
        std::vector<clip_image_u8_ptr> output;
-        img_tool::resize_algo interpolation = img_tool::RESIZE_ALGO_BILINEAR; // TODO: make it configurable

        // resize to overview size
        clip_image_u8_ptr resized_img(clip_image_u8_init());
-        img_tool::resize(*img, *resized_img, inst.overview_size, interpolation);
+        img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview,
+            inst.padding_overview, inst.pad_color_overview);
        output.push_back(std::move(resized_img));

        if (inst.slices.empty()) {
            // no slices, just return the resized image
            return output;

@@ -2352,13 +2405,8 @@ struct llava_uhd {

        // resize to refined size
        clip_image_u8_ptr refined_img(clip_image_u8_init());
-        if (inst.padding_refined) {
-            img_tool::resize(*img, *refined_img, inst.refined_size, interpolation);
-        } else {
-            // only algo bicubic preserves the ratio; old models rely on this behavior
-            // TODO: do we need to support other algos here?
-            img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, false);
-        }
+        img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined,
+            inst.padding_refined, inst.pad_color_refined);

        // create slices
        for (const auto & slice : inst.slices) {

@@ -2891,6 +2939,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                    n_patches /= 2;
                }
            } break;
+        case PROJECTOR_TYPE_GLMA:
+            {
+                n_patches = img->nx;
+                // whisper downscales input token by half after conv1d
+                n_patches /= 2;
+                // reshape by merge_factor
+                n_patches /= ctx->model.hparams.proj_stack_factor;
+                // for BOI and EOI token embeddings
+                n_patches += 2;
+            } break;
        case PROJECTOR_TYPE_COGVLM:
            {
                n_patches += 2; // for BOI and EOI token embeddings

@@ -3226,6 +3284,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_INTERNVL:
        case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_GLMA:
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:

@@ -3336,6 +3395,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->model.mm_model_proj->ne[1];
        case PROJECTOR_TYPE_QWEN2A:
            return ctx->model.mm_fc_w->ne[1];
+        case PROJECTOR_TYPE_GLMA:
+            return ctx->model.mm_2_w->ne[1];
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_KIMIVL:
            return ctx->model.mm_2_w->ne[1];

@@ -3382,6 +3443,7 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
        || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
+        || ctx->proj_type() == PROJECTOR_TYPE_GLMA
        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
}

@@ -30,7 +30,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
    GGML_ASSERT(model.layers[0].q_b);
    GGML_ASSERT(model.layers[0].v_b);
    GGML_ASSERT(!model.layers[0].k_b); // no bias for k
-    GGML_ASSERT(model.post_ln_w && model.post_ln_b);

    ggml_tensor * pos_embd_selected = ggml_view_2d(
        ctx0, model.position_embeddings,

@@ -49,15 +48,7 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
    if (model.audio_has_stack_frames()) {
        // StackAudioFrames
        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
-        int64_t stride = n_embd * hparams.proj_stack_factor;
-        int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
-        int64_t pad = padded_len - ggml_nelements(cur);
-        if (pad > 0) {
-            cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
-            cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
-        }
-        cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
-            ggml_row_size(cur->type, stride), 0);
+        cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
        cb(cur, "after_stacked", -1);
    }

@@ -95,6 +86,14 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
        FFN_GELU_ERF,
        -1);

+    } else if (proj_type == PROJECTOR_TYPE_GLMA) {
+        cur = ggml_norm(ctx0, cur, hparams.eps);
+        cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+        cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
+        cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
+        cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
+        cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
+        cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
    } else {
        GGML_ABORT("%s: unknown projector type", __func__);
    }