Merge branch 'delta_net' of github.com:ymcki/llama.cpp into delta_net
merge with latest
Commit 4dea644c0a
@@ -132,6 +132,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
+- [x] [RWKV-7](https://huggingface.co/collections/shoumenchougou/rwkv7-gxx-gguf)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
 - [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
@@ -1577,7 +1577,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--temp"}, "N",
-        string_format("temperature (default: %.1f)", (double)params.sampling.temp),
+        string_format("temperature (default: %.2f)", (double)params.sampling.temp),
         [](common_params & params, const std::string & value) {
            params.sampling.temp = std::stof(value);
            params.sampling.temp = std::max(params.sampling.temp, 0.0f);
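Editor's note on the %.1f -> %.2f changes in this and the following option hunks: several sampler defaults carry two significant decimals, and one decimal place rounds them into misleading help text. A minimal standalone sketch (not part of the patch; 0.05 is an assumed example value):

    #include <cstdio>

    int main() {
        const float min_p = 0.05f;         // a two-decimal default, e.g. --min-p
        printf("%.1f\n", (double) min_p);  // prints "0.1"  - misleading
        printf("%.2f\n", (double) min_p);  // prints "0.05" - matches the actual default
        return 0;
    }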
@@ -1594,7 +1594,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
     add_opt(common_arg(
         {"--top-p"}, "N",
-        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
+        string_format("top-p sampling (default: %.2f, 1.0 = disabled)", (double)params.sampling.top_p),
         [](common_params & params, const std::string & value) {
            params.sampling.top_p = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
@@ -1602,7 +1602,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--min-p"}, "N",
-        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
+        string_format("min-p sampling (default: %.2f, 0.0 = disabled)", (double)params.sampling.min_p),
         [](common_params & params, const std::string & value) {
            params.sampling.min_p = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
@@ -1610,14 +1610,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--top-nsigma"}, "N",
-        string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
+        string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma),
         [](common_params & params, const std::string & value) {
            params.sampling.top_n_sigma = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
-        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
+        string_format("xtc probability (default: %.2f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
         [](common_params & params, const std::string & value) {
            params.sampling.xtc_probability = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
@@ -1625,7 +1625,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--xtc-threshold"}, "N",
-        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
+        string_format("xtc threshold (default: %.2f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
         [](common_params & params, const std::string & value) {
            params.sampling.xtc_threshold = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
@@ -1633,7 +1633,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--typical"}, "N",
-        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
+        string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p),
         [](common_params & params, const std::string & value) {
            params.sampling.typ_p = std::stof(value);
         }
@@ -1652,7 +1652,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-penalty"}, "N",
-        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
+        string_format("penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
         [](common_params & params, const std::string & value) {
            params.sampling.penalty_repeat = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
@@ -1660,21 +1660,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--presence-penalty"}, "N",
-        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
+        string_format("repeat alpha presence penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_present),
         [](common_params & params, const std::string & value) {
            params.sampling.penalty_present = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--frequency-penalty"}, "N",
-        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
+        string_format("repeat alpha frequency penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
         [](common_params & params, const std::string & value) {
            params.sampling.penalty_freq = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-multiplier"}, "N",
-        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
+        string_format("set DRY sampling multiplier (default: %.2f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
         [](common_params & params, const std::string & value) {
            params.sampling.dry_multiplier = std::stof(value);
         }
@@ -1755,14 +1755,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-range"}, "N",
-        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
+        string_format("dynamic temperature range (default: %.2f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
         [](common_params & params, const std::string & value) {
            params.sampling.dynatemp_range = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-exp"}, "N",
-        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
+        string_format("dynamic temperature exponent (default: %.2f)", (double)params.sampling.dynatemp_exponent),
         [](common_params & params, const std::string & value) {
            params.sampling.dynatemp_exponent = std::stof(value);
         }
@@ -1778,7 +1778,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-lr"}, "N",
-        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
+        string_format("Mirostat learning rate, parameter eta (default: %.2f)", (double)params.sampling.mirostat_eta),
         [](common_params & params, const std::string & value) {
            params.sampling.mirostat_eta = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
@@ -1786,7 +1786,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-ent"}, "N",
-        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
+        string_format("Mirostat target entropy, parameter tau (default: %.2f)", (double)params.sampling.mirostat_tau),
         [](common_params & params, const std::string & value) {
            params.sampling.mirostat_tau = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
@@ -1920,28 +1920,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
     add_opt(common_arg(
         {"--yarn-ext-factor"}, "N",
-        string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
+        string_format("YaRN: extrapolation mix factor (default: %.2f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
         [](common_params & params, const std::string & value) {
            params.yarn_ext_factor = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
     add_opt(common_arg(
         {"--yarn-attn-factor"}, "N",
-        string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
+        string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.2f)", (double)params.yarn_attn_factor),
         [](common_params & params, const std::string & value) {
            params.yarn_attn_factor = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
     add_opt(common_arg(
         {"--yarn-beta-slow"}, "N",
-        string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
+        string_format("YaRN: high correction dim or alpha (default: %.2f)", (double)params.yarn_beta_slow),
         [](common_params & params, const std::string & value) {
            params.yarn_beta_slow = std::stof(value);
         }
     ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
     add_opt(common_arg(
         {"--yarn-beta-fast"}, "N",
-        string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
+        string_format("YaRN: low correction dim or beta (default: %.2f)", (double)params.yarn_beta_fast),
         [](common_params & params, const std::string & value) {
            params.yarn_beta_fast = std::stof(value);
         }
@@ -3335,14 +3335,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
-        string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
+        string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
            params.speculative.p_split = std::stof(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
-        string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
+        string_format("minimum speculative decoding probability (greedy) (default: %.2f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
            params.speculative.p_min = std::stof(value);
         }
@@ -3803,7 +3803,7 @@ class Ernie4_5MoeModel(Ernie4_5Model):
                 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
                 yield from super().modify_tensors(data_torch, merged_name, bid)
             else:
-                yield from super().modify_tensors(data_torch, name, bid)
+                yield from ModelBase.modify_tensors(self, data_torch, name, bid)
 
     def prepare_tensors(self):
         super().prepare_tensors()
@@ -6394,7 +6394,8 @@ class Gemma3nVisionAudioModel(ConformerAudioModel):
 
         if name.startswith("model.vision_tower.timm_model.blocks."):
             # Double-indexed block tensors through custom logic
-            new_name = self.custom_map(name)
+            yield (self.custom_map(name), data_torch)
+            return
         else:
             # Route non-repeating (conv_stem, msfa, embedding, etc.) and uncaught tensors through tensor_mapping.py
             new_name = self.map_tensor_name(name)
@@ -6402,7 +6403,7 @@ class Gemma3nVisionAudioModel(ConformerAudioModel):
         if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"):
             data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)  # [1, C, 1, 1]
 
-        yield from super().modify_tensors(data_torch, new_name, bid)
+        yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
 
 
 @ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
@@ -6502,7 +6503,7 @@ class Gemma3NModel(Gemma3Model):
 
             # Continue with normal processing
             name = name.replace("language_model.", "")
-            yield from super().modify_tensors(data_torch, name, bid)
+            yield from ModelBase.modify_tensors(self, data_torch, name, bid)
             return
 
         if "altup_unembed_projections" in name:
@@ -6519,7 +6520,7 @@ class Gemma3NModel(Gemma3Model):
                 raise ValueError(f"Unknown name: {name}")
             out = self._stack_matrices(self._altup_unembd)
             if out is not None:
-                yield from super().modify_tensors(out, "model.altup_unembed_projections.weight", bid)
+                yield from ModelBase.modify_tensors(self, out, "model.altup_unembed_projections.weight", bid)
                 return
             else:
                 return
@@ -6536,7 +6537,7 @@ class Gemma3NModel(Gemma3Model):
                 raise ValueError(f"Unknown name: {name}")
             out = self._stack_matrices(self._altup_proj)
             if out is not None:
-                yield from super().modify_tensors(out, "model.altup_projections.weight", bid)
+                yield from ModelBase.modify_tensors(self, out, "model.altup_projections.weight", bid)
                 return
             else:
                 return
@@ -9052,8 +9053,8 @@ class GraniteMoeModel(GraniteModel):
             ffn_dim = self.hparams["intermediate_size"]
             assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
             gate, up = data_torch.split(ffn_dim, dim=-2)
-            yield from super().modify_tensors(gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid)
-            yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid)
+            yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid)
+            yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid)
 
         has_experts = bool(self.hparams.get('num_local_experts'))
 
@@ -9062,15 +9063,15 @@ class GraniteMoeModel(GraniteModel):
             assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
             gate, up = data_torch.split(ffn_dim, dim=-2)
             if has_experts:
-                yield from super().modify_tensors(gate,self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid)
-                yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid)
+                yield from ModelBase.modify_tensors(self, gate,self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid)
+                yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid)
                 return
-            yield from super().modify_tensors(gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid)
-            yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid)
+            yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid)
+            yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid)
             return
 
         if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
-            yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid)
+            yield from ModelBase.modify_tensors(self, data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid)
             return
 
         yield from super().modify_tensors(data_torch, name, bid)
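Editor's note on the repeated super().modify_tensors -> ModelBase.modify_tensors(self, ...) pattern in the conversion-script hunks above: in Python, super() resumes method lookup at the next class in the MRO, so re-yielding an already-split or already-renamed tensor through super() re-enters the intermediate subclass logic; calling ModelBase.modify_tensors directly hands it straight to the base implementation. The same idea in C++ terms (hypothetical classes, illustration only):

    #include <cstdio>

    struct ModelBase {
        virtual ~ModelBase() = default;
        virtual void modify_tensors(const char * name) { printf("base mapping: %s\n", name); }
    };

    struct GraniteModel : ModelBase {
        void modify_tensors(const char * name) override {
            printf("granite-specific rewrite of %s\n", name);
            ModelBase::modify_tensors(name);
        }
    };

    struct GraniteMoeModel : GraniteModel {
        void modify_tensors(const char * name) override {
            // The tensor was already split/renamed here, so route it to the base
            // mapper directly instead of re-running GraniteModel's rewrite.
            // This mirrors ModelBase.modify_tensors(self, ...) in the patch.
            ModelBase::modify_tensors(name);
        }
    };

    int main() {
        GraniteMoeModel m;
        m.modify_tensors("ffn_gate_exps.weight");
        return 0;
    }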
@@ -6,6 +6,9 @@
 #include "ggml-impl.h"
 #include "simd-mappings.h"
 
+#define GGML_FA_TILE_Q 32
+#define GGML_FA_TILE_KV 16
+
 #ifdef __cplusplus
 
 #include <utility>
@@ -84,4 +87,9 @@ static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_pa
     return {ir0, ir1};
 }
 
+struct ggml_fa_tile_config {
+    static constexpr size_t Q = GGML_FA_TILE_Q;
+    static constexpr size_t KV = GGML_FA_TILE_KV;
+};
+
 #endif
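The tile sizes appear twice by design: the macros sit outside the #ifdef __cplusplus guard so the plain-C side (ggml_graph_plan) can size the scratch buffer with them, while ggml_fa_tile_config gives the C++ ops code typed constants. Compressed view of the arrangement (illustration only):

    #include <cstddef>

    #define GGML_FA_TILE_Q 32    // query rows per tile, visible to both C and C++
    #define GGML_FA_TILE_KV 16   // KV rows per tile

    #ifdef __cplusplus
    // only C++ translation units see the typed wrapper
    struct ggml_fa_tile_config {
        static constexpr size_t Q  = GGML_FA_TILE_Q;
        static constexpr size_t KV = GGML_FA_TILE_KV;
    };
    #endif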
@@ -14,6 +14,7 @@
 #include "vec.h"
 #include "ops.h"
 #include "ggml.h"
+#include "common.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -2866,10 +2867,12 @@ struct ggml_cplan ggml_graph_plan(
                 } break;
             case GGML_OP_FLASH_ATTN_EXT:
                 {
-                    const int64_t ne10 = node->src[1]->ne[0]; // DK
-                    const int64_t ne20 = node->src[2]->ne[0]; // DV
+                    const int64_t DK = node->src[1]->ne[0];
+                    const int64_t DV = node->src[2]->ne[0];
 
-                    cur = sizeof(float)*(1*ne10 + 2*ne20)*n_tasks; // 1x head size K + 2x head size V (per thread)
+                    // Tiled flash attention scratch (tile sizes defined in common.h)
+                    // Per-thread: Q_q + KQ + mask + VKQ32 + V32 + padding
+                    cur = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks;
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
                 {
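For scale, here is the per-thread scratch the new formula reserves, worked through for a common head size (DK = DV = 128 are assumed example values, not from the patch; the ops.cpp side additionally pads each thread's slice by a cache line):

    #include <cstdio>

    int main() {
        const long Q_TILE = 32, KV_TILE = 16;  // GGML_FA_TILE_Q / GGML_FA_TILE_KV
        const long DK = 128, DV = 128;         // K/V head sizes (example)

        const long floats = Q_TILE*DK          // Q_q:   converted Q tile
                          + 2*Q_TILE*KV_TILE   // KQ + mask tiles
                          + Q_TILE*DV          // VKQ32: output accumulator
                          + KV_TILE*DV;        // V32:   staging buffer for F16 V

        printf("%ld floats = %ld bytes per thread\n", floats, floats*4);
        // 11264 floats = 45056 bytes (~44 KiB), vs. 3*128 floats (~1.5 KiB)
        // for the old one-row-at-a-time path.
        return 0;
    }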
@@ -8164,6 +8164,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
     // online softmax / attention
+    // loop over n_kv and n_head_kv
     // ref: https://arxiv.org/pdf/2112.05682.pdf
 
     for (int64_t ic = 0; ic < nek1; ++ic) {
         const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f;
         if (mv == -INFINITY) {
@@ -8271,6 +8272,280 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
     }
 }
 
+static void ggml_compute_forward_flash_attn_ext_tiled(
+        const ggml_compute_params * params,
+        ggml_tensor * dst,
+        int ir0, int ir1) {
+    const ggml_tensor * q     = dst->src[0];
+    const ggml_tensor * k     = dst->src[1];
+    const ggml_tensor * v     = dst->src[2];
+    const ggml_tensor * mask  = dst->src[3];
+    const ggml_tensor * sinks = dst->src[4];
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q, nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k, nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v, nb)
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb, dst, nb)
+
+    const int64_t DK = nek0;
+    const int64_t DV = nev0;
+    const int64_t N  = neq1;
+
+    GGML_ASSERT(ne0 == DV);
+    GGML_ASSERT(ne2 == N);
+
+    // input tensor rows must be contiguous
+    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
+    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
+    GGML_ASSERT(nbv0 == ggml_type_size(v->type));
+
+    GGML_ASSERT(neq0 == DK);
+    GGML_ASSERT(nek0 == DK);
+    GGML_ASSERT(nev0 == DV);
+
+    GGML_ASSERT(neq1 == N);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(k->type == v->type);
+    const ggml_type kv_type = k->type;
+
+    const auto * kv_type_traits_cpu = ggml_get_type_traits_cpu(kv_type);
+    const ggml_from_float_t kv_from_float = kv_type_traits_cpu->from_float;
+    const ggml_vec_dot_t    kv_vec_dot    = kv_type_traits_cpu->vec_dot;
+    const size_t kv_type_size = ggml_type_size(kv_type);
+
+    // broadcast factors
+    const int64_t rk2 = neq2/nek2;
+    const int64_t rk3 = neq3/nek3;
+
+    const int64_t rv2 = neq2/nev2;
+    const int64_t rv3 = neq3/nev3;
+
+    float scale         = 1.0f;
+    float max_bias      = 0.0f;
+    float logit_softcap = 0.0f;
+
+    memcpy(&scale,         (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias,      (float *) dst->op_params + 1, sizeof(float));
+    memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float));
+
+    if (logit_softcap != 0) {
+        scale /= logit_softcap;
+    }
+
+    const uint32_t n_head      = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    int ith = params->ith;
+
+    static constexpr int Q_TILE_SZ  = ggml_fa_tile_config::Q;
+    static constexpr int KV_TILE_SZ = ggml_fa_tile_config::KV;
+
+    GGML_ASSERT(nek1 % KV_TILE_SZ == 0 && "KV sequence length must be divisible by KV_TILE_SZ");
+
+    int ir = ir0;
+    while (ir < ir1) {
+        // q indices for the start of this tile
+        const int iq3 = ir/(neq2*neq1);
+        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
+        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
+
+        // Number of valid rows in this tile:
+        // - limited by tile size (Q_TILE_SZ)
+        // - limited by chunk boundary (ir1 - ir)
+        // - limited by head boundary (neq1 - iq1) to avoid crossing into next head
+        const int tile_rows = MIN(Q_TILE_SZ, MIN((int)(ir1 - ir), (int)(neq1 - iq1)));
+        GGML_ASSERT(tile_rows > 0);
+
+        const uint32_t h = iq2; // head index
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
+        float S[Q_TILE_SZ];
+        float M[Q_TILE_SZ];
+
+        for (int i = 0 ; i < Q_TILE_SZ; ++i) {
+            S[i] = 0.;
+            M[i] = -INFINITY;
+        }
+
+        // Per-thread scratch layout:
+        // Q_q:   Q_TILE_SZ * DK         (converted Q tile in KV type)
+        // KQ:    Q_TILE_SZ * KV_TILE_SZ (attention scores in float)
+        // mask:  Q_TILE_SZ * KV_TILE_SZ (mask in float)
+        // VKQ32: Q_TILE_SZ * DV         (FP32 output accumulator)
+        // V32:   KV_TILE_SZ * DV        (F32 buffer for V tile - used for F16 conversion)
+        float * base = (float *) params->wdata + ith*(Q_TILE_SZ*DK + 2*Q_TILE_SZ*KV_TILE_SZ + Q_TILE_SZ*DV + KV_TILE_SZ*DV + CACHE_LINE_SIZE_F32);
+
+        void  * Q_q    = base;
+        float * KQ     = (float *)((char *)base + Q_TILE_SZ * DK * sizeof(float));
+        float * mask32 = KQ + Q_TILE_SZ * KV_TILE_SZ;
+        float * VKQ32  = mask32 + Q_TILE_SZ * KV_TILE_SZ;
+        float * V32    = VKQ32 + Q_TILE_SZ * DV; // F32 buffer for V tile
+
+        memset(VKQ32,  0, Q_TILE_SZ * DV * sizeof(float));
+        memset(mask32, 0, Q_TILE_SZ * KV_TILE_SZ * sizeof(float));
+
+        // k indices
+        const int ik3 = iq3 / rk3;
+        const int ik2 = iq2 / rk2;
+
+        // v indices
+        const int iv3 = iq3 / rv3;
+        const int iv2 = iq2 / rv2;
+
+        for (int tq = 0; tq < tile_rows; tq++) {
+            const float * pq = (const float *) ((char *) q->data + ((iq1 + tq)*nbq1 + iq2*nbq2 + iq3*nbq3));
+            kv_from_float(pq, (char *)Q_q + tq * DK * kv_type_size, DK);
+        }
+        // Zero-pad remaining rows
+        for (int tq = tile_rows; tq < Q_TILE_SZ; tq++) {
+            memset((char *)Q_q + tq * DK * kv_type_size, 0, DK * kv_type_size);
+        }
+
+        for (int64_t ic = 0; ic < nek1; ic += KV_TILE_SZ) {
+
+            // skip the tile entirely if all the masks are -inf
+            if (mask) {
+                bool can_skip = true;
+                for (int tq = 0; tq < tile_rows; tq++) {
+                    const ggml_fp16_t * mp_row = (const ggml_fp16_t *)((const char *) mask->data + (iq1 + tq)*mask->nb[1] + (iq2%mask->ne[2])*mask->nb[2] + (iq3%mask->ne[3])*mask->nb[3]);
+                    for (int tk = 0; tk < KV_TILE_SZ; tk++) {
+                        mask32[tq * KV_TILE_SZ + tk] = slope * GGML_CPU_FP16_TO_FP32(mp_row[ic + tk]);
+                        if (mask32[tq * KV_TILE_SZ + tk] != -INFINITY) {
+                            can_skip = false;
+                        }
+                    }
+                }
+
+                if (can_skip) {
+                    continue;
+                }
+            }
+
+            for (int tq = 0; tq < Q_TILE_SZ; tq++) {
+                const void * q_row = (const char *)Q_q + tq * DK * kv_type_size;
+                for (int tk = 0; tk < KV_TILE_SZ; tk++) {
+                    const void * k_row = (const char *) k->data + ((ic + tk)*nbk1 + ik2*nbk2 + ik3*nbk3);
+                    float s;
+                    kv_vec_dot(DK, &s, 0, k_row, 0, q_row, 0, 1);
+                    KQ[tq * KV_TILE_SZ + tk] = s * scale;
+                }
+            }
+
+            if (logit_softcap != 0.0f) {
+                ggml_vec_tanh_f32(Q_TILE_SZ * KV_TILE_SZ, KQ, KQ);
+                ggml_vec_scale_f32(Q_TILE_SZ * KV_TILE_SZ, KQ, logit_softcap);
+            }
+
+            if (mask) {
+                ggml_vec_add_f32(tile_rows * KV_TILE_SZ, KQ, KQ, mask32);
+            }
+
+            bool skip[Q_TILE_SZ] = {};
+
+            for (int tq = 0; tq < Q_TILE_SZ; tq++) {
+                float * kq_row = KQ + tq * KV_TILE_SZ;
+
+                float tile_max;
+                ggml_vec_max_f32(KV_TILE_SZ, &tile_max, kq_row);
+
+                if (tile_max == -INFINITY) {
+                    skip[tq] = true;
+                    continue;
+                }
+
+                const float Mold = M[tq];
+                const float Mnew = fmaxf(Mold, tile_max);
+
+                if (Mnew > Mold) {
+                    const float ms = expf(Mold - Mnew);
+                    ggml_vec_scale_f32(DV, VKQ32 + tq * DV, ms);
+                    S[tq] *= ms;
+                }
+                M[tq] = Mnew;
+
+                S[tq] += ggml_vec_soft_max_f32(KV_TILE_SZ, kq_row, kq_row, Mnew);
+            }
+
+            // Convert V tile to F32 first (if F16), then do MAD
+            // On x86, ggml_vec_mad_f16 internally converts F16<->F32 on every load/store, so pre-converting is faster.
+            // TODO: on ARM, native f16 should be faster
+            if (kv_type == GGML_TYPE_F16) {
+                for (int tk = 0; tk < KV_TILE_SZ; tk++) {
+                    const ggml_fp16_t * v_row = (const ggml_fp16_t *)((const char *) v->data + ((ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3));
+                    ggml_fp16_to_fp32_row(v_row, V32 + tk * DV, DV);
+                }
+                for (int tq = 0; tq < Q_TILE_SZ; tq++) {
+                    if (skip[tq]) continue;
+                    float * vkq_row = VKQ32 + tq * DV;
+                    for (int tk = 0; tk < KV_TILE_SZ; tk++) {
+                        const float p = KQ[tq * KV_TILE_SZ + tk];
+                        ggml_vec_mad_f32(DV, vkq_row, V32 + tk * DV, p);
+                    }
+                }
+            } else {
+                for (int tq = 0; tq < Q_TILE_SZ; tq++) {
+                    if (skip[tq]) continue;
+                    float * vkq_row = VKQ32 + tq * DV;
+                    for (int tk = 0; tk < KV_TILE_SZ; tk++) {
+                        const float p = KQ[tq * KV_TILE_SZ + tk];
+                        const float * v_row = (const float *)((const char *) v->data + ((ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3));
+                        ggml_vec_mad_f32(DV, vkq_row, v_row, p);
+                    }
+                }
+            }
+        }
+
+        // sinks (apply only to valid rows in the tile)
+        if (sinks) {
+            const float s = ((float *)((char *) sinks->data))[h];
+
+            for (int tq = 0; tq < tile_rows; tq++) {
+                float ms = 1.0f;
+                float vs = 1.0f;
+
+                if (s > M[tq]) {
+                    ms = expf(M[tq] - s);
+                    ggml_vec_scale_f32(DV, VKQ32 + tq * DV, ms);
+                } else {
+                    vs = expf(s - M[tq]);
+                }
+
+                S[tq] = S[tq] * ms + vs;
+            }
+        }
+
+        for (int tq = 0; tq < tile_rows; tq++) {
+            // V /= S
+            const float S_inv = S[tq] == 0.0f ? 0.0f : 1.0f / S[tq];
+            ggml_vec_scale_f32(DV, VKQ32 + tq * DV, S_inv);
+
+            // dst indices
+            const int i1 = iq1 + tq;
+            const int i2 = iq2;
+            const int i3 = iq3;
+
+            // permute(0, 2, 1, 3)
+            memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32 + tq * DV, nb1);
+        }
+
+        ir += tile_rows;
+    }
+}
+
 static void ggml_compute_forward_flash_attn_ext_f16(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
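The core of the tile loop above is the usual online-softmax bookkeeping: when a KV tile raises a row's running maximum M, the already-accumulated numerator (the VKQ32 row) and denominator S are rescaled by exp(M_old - M_new) before the tile's own exp terms are added (ggml_vec_soft_max_f32 returns that partial sum). A stripped-down scalar sketch of the same invariant (illustration only, DV = 1):

    #include <cmath>
    #include <vector>

    // One output row. scores/values arrive one KV tile at a time;
    // out accumulates sum(p_i * v_i), S the softmax denominator, M the running max.
    void accumulate_tile(const std::vector<float> & scores,
                         const std::vector<float> & values,
                         float & out, float & S, float & M) {
        float tile_max = -INFINITY;
        for (float s : scores) tile_max = std::fmax(tile_max, s);

        const float M_new = std::fmax(M, tile_max);
        const float ms    = (M == -INFINITY) ? 0.0f : std::exp(M - M_new);
        out *= ms; // rescale what was accumulated under the old maximum
        S   *= ms;
        M    = M_new;

        for (size_t i = 0; i < scores.size(); ++i) {
            const float p = std::exp(scores[i] - M);
            S   += p;
            out += p * values[i];
        }
        // after the last tile: result = out / S (the VKQ32 * S_inv step above)
    }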
@@ -8343,6 +8618,15 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     // The number of elements in each chunk
     const int64_t dr = (nr + nchunk - 1) / nchunk;
 
+    static constexpr int64_t KV_TILE_SZ = ggml_fa_tile_config::KV;
+    static constexpr int64_t Q_TILE_SZ  = ggml_fa_tile_config::Q;
+    const bool kv_is_f32_or_f16 = (k->type == GGML_TYPE_F32 || k->type == GGML_TYPE_F16);
+    const bool use_tiled = (q->type == GGML_TYPE_F32 &&
+                            kv_is_f32_or_f16 &&
+                            k->type == v->type &&
+                            nek1 % KV_TILE_SZ == 0 &&
+                            neq1 >= Q_TILE_SZ); // Only use tiled for batch >= tile size
+
     // The first chunk comes from our thread_id, the rest will get auto-assigned.
     int current_chunk = ith;
 
@@ -8350,7 +8634,11 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         const int64_t ir0 = dr * current_chunk;
         const int64_t ir1 = MIN(ir0 + dr, nr);
 
-        ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1);
+        if (use_tiled) {
+            ggml_compute_forward_flash_attn_ext_tiled(params, dst, ir0, ir1);
+        } else {
+            ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1);
+        }
 
         current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
     }
@@ -643,9 +643,10 @@ static __global__ void flash_attn_stream_k_fixup(
 
     const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
     const int iter_j = (ne01 + (ncols1 - 1)) / ncols1;
+    const int iter_z = (ne02 + (ncols2 - 1)) / ncols2;
 
-    const int kbc0      = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0      = int64_t(bidx0 + 0)*(iter_k*iter_j*iter_z*ne03) / gridDim.x;
+    const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*iter_z*ne03) / gridDim.x;
 
     const bool did_not_have_any_data   = kbc0 == kbc0_stop;
     const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;
@@ -654,15 +655,15 @@ static __global__ void flash_attn_stream_k_fixup(
         return;
     }
 
-    const int sequence = kbc0 / (iter_k*iter_j*(ne02/ncols2));
-    const int head = (kbc0 - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j);
-    const int jt = (kbc0 - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile.
+    const int sequence = kbc0 / (iter_k*iter_j*iter_z);
+    const int zt = (kbc0 - iter_k*iter_j*iter_z*sequence) / (iter_k*iter_j);
+    const int jt = (kbc0 - iter_k*iter_j*iter_z*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile.
 
-    if (jt*ncols1 + j >= ne01) {
+    if (jt*ncols1 + j >= ne01 || zt*ncols2 + c >= ne02) {
         return;
     }
 
-    dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + head*(ncols2*D) + (j*ne02 + c)*D + tid;
+    dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + zt*(ncols2*D) + (j*ne02 + c)*D + tid;
 
     // Load the partial result that needs a fixup:
     float dst_val = 0.0f;
@@ -681,7 +682,7 @@ static __global__ void flash_attn_stream_k_fixup(
     int bidx = bidx0 - 1;
     int kbc_stop = kbc0;
     while(true) {
-        const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+        const int kbc = int64_t(bidx)*(iter_k*iter_j*iter_z*ne03) / gridDim.x;
         if (kbc == kbc_stop) { // Did not have any data.
             bidx--;
             kbc_stop = kbc;
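The fixup and the main kernel decode the same flattened work index: kbc counts KV blocks contiguously over (sequence, head-tile zt, j-tile jt, k-block), and replacing (ne02/ncols2) with the ceil-divided iter_z is what makes head counts that are not multiples of ncols2 addressable. A host-side round-trip check of the decomposition (illustration only, toy sizes):

    #include <cassert>

    int main() {
        const int iter_k = 4, iter_j = 2, iter_z = 3, ne03 = 2; // k-blocks, j-tiles, head-tiles, sequences

        for (int kbc = 0; kbc < iter_k*iter_j*iter_z*ne03; ++kbc) {
            const int sequence = kbc / (iter_k*iter_j*iter_z);
            const int zt = (kbc - iter_k*iter_j*iter_z*sequence) / (iter_k*iter_j);
            const int jt = (kbc - iter_k*iter_j*iter_z*sequence - iter_k*iter_j*zt) / iter_k;
            const int kb = kbc % iter_k;

            // re-flatten and confirm the round trip
            assert(kbc == ((sequence*iter_z + zt)*iter_j + jt)*iter_k + kb);
        }
        return 0;
    }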
@@ -782,7 +783,7 @@ void launch_fattn(
     const ggml_tensor * K = dst->src[1];
     const ggml_tensor * V = dst->src[2];
 
-    const bool V_is_K_view = V->op == GGML_OP_VIEW && V->src[0] == K && V->data == K->data;
+    const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src);
 
     const ggml_tensor * mask  = dst->src[3];
     const ggml_tensor * sinks = dst->src[4];
@@ -883,7 +884,8 @@ void launch_fattn(
     }
 
     const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1);
-    const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3];
+    const int ntiles_z = ((Q->ne[2] + ncols2 - 1) / ncols2);
+    const int ntiles_total = ntiles_x * ntiles_z * Q->ne[3];
 
     // Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped.
     // Only worth the overhead if there is at least one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
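Same ceil-div fix on the launch side: with floor division, heads beyond the last full ncols2 group were simply never launched. A toy example (numbers are illustrative):

    #include <cstdio>

    int main() {
        const int ne02 = 20, ncols2 = 8; // 20 heads, head-tiles of 8
        printf("floor: %d z-tiles\n", ne02 / ncols2);                // 2 -> 4 heads unreachable
        printf("ceil:  %d z-tiles\n", (ne02 + ncols2 - 1) / ncols2); // 3, last tile partially filled
        // the new zt*ncols2 + c >= ne02 checks in the kernels mask off the padding lanes
        return 0;
    }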
@@ -958,7 +960,7 @@ void launch_fattn(
 
     blocks_num.x = ntiles_x;
     blocks_num.y = parallel_blocks;
-    blocks_num.z = (Q->ne[2]/ncols2)*Q->ne[3];
+    blocks_num.z = ntiles_z*Q->ne[3];
 
     if (parallel_blocks > 1) {
         dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
@@ -940,6 +940,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
         const int stride_V,
         const int stride_mask,
         const int jt,
+        const int zt,
         const int kb0_start,
         const int kb0_stop) {
 #if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
@@ -1022,7 +1023,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
             const int j = jc / ncols2;
             const int c = jc % ncols2;
 
-            if (jt*ncols1 + j < int(ne01.z)) {
+            if ((ncols1 == 1 || jt*ncols1 + j < int(ne01.z)) && (ncols2 == 1 || zt*ncols2 + c < ne02)) {
 #pragma unroll
                 for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
                     const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
@@ -1408,7 +1409,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
         const int j_dst = jc_dst / ncols2;
         const int c_dst = jc_dst % ncols2;
 
-        if (!is_fixup && jt*ncols1 + j_dst >= int(ne01.z)) {
+        if (!is_fixup && ((ncols1 > 1 && jt*ncols1 + j_dst >= int(ne01.z)) || (ncols2 > 1 && zt*ncols2 + c_dst >= ne02))) {
             continue;
         }
 
@@ -1522,10 +1523,11 @@ static __global__ void flash_attn_ext_f16(
 
     const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
     const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1;
+    const int iter_z = (ne02 + (ncols2 - 1)) / ncols2;
 
     // kbc == k block continuous, current index in continuous ijk space.
-    int       kbc      = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    int       kbc      = int64_t(blockIdx.x + 0)*(iter_k*iter_j*iter_z*ne03) / gridDim.x;
+    const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*iter_z*ne03) / gridDim.x;
 
     // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
     // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).
@@ -1536,9 +1538,9 @@ static __global__ void flash_attn_ext_f16(
     int kb0_stop = min(iter_k, kb0_start + kbc_stop - kbc);
 
     while (kbc < kbc_stop && kb0_stop == iter_k) {
-        const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2));
-        const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2
-        const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile.
+        const int sequence = kbc / (iter_k*iter_j*iter_z);
+        const int zt = (kbc - iter_k*iter_j*iter_z*sequence) / (iter_k*iter_j); // head in units of ncols2
+        const int jt = (kbc - iter_k*iter_j*iter_z*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile.
 
         const int head0 = zt * ncols2;
 
@@ -1561,12 +1563,12 @@ static __global__ void flash_attn_ext_f16(
             constexpr bool needs_fixup = false; // CUDA block is working on an entire tile.
             flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, V_is_K_view, needs_fixup, is_fixup>
                 (Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
-                 ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start, kb0_stop);
+                 ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt, kb0_start, kb0_stop);
         } else {
             constexpr bool needs_fixup = true; // CUDA block is missing the beginning of a tile.
             flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, V_is_K_view, needs_fixup, is_fixup>
                 (Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
-                 ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start, kb0_stop);
+                 ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt, kb0_start, kb0_stop);
         }
 
         kbc += iter_k;
@@ -1580,9 +1582,9 @@ static __global__ void flash_attn_ext_f16(
         return;
     }
 
-    const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2));
-    const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2
-    const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile.
+    const int sequence = kbc / (iter_k*iter_j*iter_z);
+    const int zt = (kbc - iter_k*iter_j*iter_z*sequence) / (iter_k*iter_j); // head in units of ncols2
+    const int jt = (kbc - iter_k*iter_j*iter_z*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile.
 
     const int head0 = zt * ncols2;
 
@@ -1605,7 +1607,7 @@ static __global__ void flash_attn_ext_f16(
     constexpr bool needs_fixup = false;
     flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, use_logit_softcap, V_is_K_view, needs_fixup, is_fixup>
         (Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
-         ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start, kb0_stop);
+         ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt, kb0_start, kb0_stop);
 #else
     GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
                      max_bias, m0, m1, n_head_log2, logit_softcap,
@@ -1739,3 +1741,5 @@ extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 32);
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 32);
@@ -18,9 +18,11 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
         }
     }
 
-    if ((turing_mma_available(cc) || amd_wmma_available(cc)) && Q->ne[1] <= 16/ncols2) {
-        ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
-        return;
+    if constexpr (ncols2 <= 16) {
+        if ((turing_mma_available(cc) || amd_wmma_available(cc)) && Q->ne[1] <= 16/ncols2) {
+            ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
+            return;
+        }
     }
 
     if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || amd_wmma_available(cc) || Q->ne[1] <= 32/ncols2) {
@@ -33,6 +35,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
 
 template <int DKQ, int DV>
 static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     const ggml_tensor * KQV = dst;
     const ggml_tensor * Q   = dst->src[0];
     const ggml_tensor * K   = dst->src[1];
@@ -60,17 +63,38 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
     GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
     const int gqa_ratio = Q->ne[2] / K->ne[2];
 
-    if (use_gqa_opt && gqa_ratio % 8 == 0) {
+    // On Volta the GQA optimizations aren't as impactful vs. minimizing wasted compute:
+    if (cc == GGML_CUDA_CC_VOLTA) {
+        if (use_gqa_opt && gqa_ratio % 8 == 0) {
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 8>(ctx, dst);
+            return;
+        }
+
+        if (use_gqa_opt && gqa_ratio % 4 == 0) {
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 4>(ctx, dst);
+            return;
+        }
+
+        if (use_gqa_opt && gqa_ratio % 2 == 0) {
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
+            return;
+        }
+
+        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
+        return;
+    }
+
+    if (use_gqa_opt && gqa_ratio > 4) {
         ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 8>(ctx, dst);
         return;
     }
 
-    if (use_gqa_opt && gqa_ratio % 4 == 0) {
+    if (use_gqa_opt && gqa_ratio > 2) {
         ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 4>(ctx, dst);
         return;
     }
 
-    if (use_gqa_opt && gqa_ratio % 2 == 0) {
+    if (use_gqa_opt && gqa_ratio > 1) {
         ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
         return;
     }
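Relaxing the modulo tests to simple thresholds (together with gqa_opt_applies now only requiring gqa_ratio >= 2) lets odd ratios pick a wide head-tile instead of falling through to a narrow kernel. A small sketch mirroring the non-Volta selection above (illustration only):

    #include <cstdio>

    static int pick_ncols2(int gqa_ratio, bool use_gqa_opt) {
        if (use_gqa_opt && gqa_ratio > 4) return 8;
        if (use_gqa_opt && gqa_ratio > 2) return 4;
        if (use_gqa_opt && gqa_ratio > 1) return 2;
        return 1;
    }

    int main() {
        for (int r : {1, 2, 3, 4, 5, 8, 12, 20}) {
            printf("gqa_ratio %2d -> ncols2 %d\n", r, pick_ncols2(r, true));
        }
        // Previously ratios like 3, 5 or 20 fell through the % 8 / % 4 / % 2
        // tests to a narrower kernel; now 20 -> 8, 5 -> 8, 3 -> 4.
        return 0;
    }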
@@ -79,6 +103,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
 }
 
 static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     const ggml_tensor * KQV = dst;
     const ggml_tensor * Q   = dst->src[0];
     const ggml_tensor * K   = dst->src[1];
@@ -121,8 +146,30 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
 
             GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
             const int gqa_ratio = Q->ne[2] / K->ne[2];
-            GGML_ASSERT(gqa_ratio % 4 == 0);
-            if (gqa_ratio % 16 == 0) {
+            if (gqa_ratio == 20) { // GLM 4.7 Flash
+                if (cc >= GGML_CUDA_CC_BLACKWELL) {
+                    ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
+                    break;
+                }
+                if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
+                    if (Q->ne[1] <= 4) {
+                        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
+                        break;
+                    }
+                    ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
+                    break;
+                }
+                if (cc >= GGML_CUDA_CC_TURING) {
+                    if (Q->ne[1] <= 4) {
+                        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 32>(ctx, dst);
+                        break;
+                    }
+                    ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
+                    break;
+                }
+                // Volta:
+                ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
+            } else if (gqa_ratio % 16 == 0) {
                 ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
             } else {
                 ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
@@ -234,7 +281,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
 
     // The effective batch size for the kernel can be increased by gqa_ratio.
     // The kernel versions without this optimization are also used for ALiBi, if there is no mask, or if the KV cache is not padded,
-    bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
+    bool gqa_opt_applies = gqa_ratio >= 2 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
     for (const ggml_tensor * t : {Q, K, V, mask}) {
         if (t == nullptr || ggml_is_quantized(t->type)) {
             continue;
@@ -247,7 +294,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         }
     }
 
-    const bool V_is_K_view = V->op == GGML_OP_VIEW && V->src[0] == K && V->data == K->data;
+    const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src);
 
     const int cc = ggml_cuda_info().devices[device].cc;
 
@@ -268,7 +315,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
             if (V->ne[0] != 512) {
                 return BEST_FATTN_KERNEL_NONE;
             }
-            if (!gqa_opt_applies || gqa_ratio % 4 != 0) {
+            if (!gqa_opt_applies) {
                 return BEST_FATTN_KERNEL_NONE;
             }
             if (!V_is_K_view) {
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-mma-f16.cuh"
+
+DECL_FATTN_MMA_F16_CASE(576, 512, 1, 32);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-mma-f16.cuh"
+
+DECL_FATTN_MMA_F16_CASE(576, 512, 2, 32);
@@ -71,7 +71,7 @@ for type_k in TYPES_KV:
         f.write(SOURCE_FATTN_VEC.format(type_k=type_k, type_v=type_v))
 
 for ncols in [8, 16, 32, 64]:
-    for ncols2 in [1, 2, 4, 8, 16]:
+    for ncols2 in [1, 2, 4, 8, 16, 32]:
         if ncols2 > ncols:
             continue
         ncols1 = ncols // ncols2
@@ -83,9 +83,9 @@ for ncols in [8, 16, 32, 64]:
             continue
         if head_size_kq == 72:
             continue
-        if head_size_kq != 576 and ncols2 == 16:
+        if head_size_kq != 576 and ncols2 in (16, 32):
             continue
-        if head_size_kq == 576 and ncols2 not in (4, 16):
+        if head_size_kq == 576 and ncols2 not in (4, 16, 32):
             continue
         head_size_v = head_size_kq if head_size_kq != 576 else 512
         f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))
@@ -785,8 +785,12 @@ ggml_metal_device_t ggml_metal_device_init(void) {
         dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
         dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
-        dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
         dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
+        if (@available(macOS 10.12, iOS 16.0, *)) {
+            dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
+        } else {
+            dev->props.max_working_set_size = dev->mtl_device.maxBufferLength;
+        }
 
         strncpy(dev->props.name, [[dev->mtl_device name] UTF8String], sizeof(dev->props.name) - 1);
 
@@ -1476,12 +1476,12 @@ extern "C" {
     /// @details Build a split GGUF final path for this chunk.
     ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
     //           Returns the split_path length.
-    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+    LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count);
 
     /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
     ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
    //           Returns the split_prefix length.
-    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+    LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count);
 
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
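A short usage sketch of the two (now int32_t-typed) helpers, following the examples in their doc comments; per the header, the return value is the written length, and llama_split_prefix returns 0 when split_no/split_count do not match:

    #include <cstdio>
    #include "llama.h"

    int main() {
        char split_path[512];
        llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
        printf("%s\n", split_path); // /models/ggml-model-q4_0-00002-of-00004.gguf

        char split_prefix[512];
        if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
            printf("%s\n", split_prefix); // /models/ggml-model-q4_0
        }
        return 0;
    }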
@@ -793,7 +793,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
         throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
     }
 
-    const uint32_t n_embd_out = model.hparams.get_n_embd_out();
+    const uint32_t n_embd_out = model.hparams.n_embd_out();
     return embd + j*n_embd_out;
 } catch (const std::exception & err) {
     LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
@@ -1279,7 +1279,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
                 {
                     // extract token embeddings
                     GGML_ASSERT(embd != nullptr);
-                    const uint32_t n_embd_out = hparams.get_n_embd_out();
+                    const uint32_t n_embd_out = hparams.n_embd_out();
 
                     GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size);
                     ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float));
@@ -1688,7 +1688,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
             {
                 // extract token embeddings
                 GGML_ASSERT(embd != nullptr);
-                const uint32_t n_embd_out = hparams.get_n_embd_out();
+                const uint32_t n_embd_out = hparams.n_embd_out();
                 float * embd_out = embd + n_outputs_prev*n_embd_out;
 
                 if (n_outputs) {
@@ -1821,7 +1821,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
 
     const auto n_batch    = cparams.n_batch;
     const auto n_vocab    = vocab.n_tokens();
-    const auto n_embd_out = hparams.get_n_embd_out();
+    const auto n_embd_out = hparams.n_embd_out();
 
     bool has_logits = true;
     bool has_embd   = cparams.embeddings;
@@ -2559,6 +2559,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
         }
     }
 
+    // [TAG_CONTEXT_STATE_LOGITS]
     // write logits
     {
         LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
@@ -407,6 +407,27 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
     return res;
 }
 
+void llm_graph_input_attn_k::set_input(const llama_ubatch * ubatch) {
+    mctx->set_input_k_idxs(self_k_idxs, ubatch);
+
+    mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+}
+
+bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);
+
+    this->mctx = mctx;
+
+    bool res = true;
+
+    res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+
+    res &= self_kq_mask->ne[0] == mctx->get_n_kv();
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
+
+    return res;
+}
+
 void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
     mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
     mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@@ -1596,11 +1617,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         v = ggml_transpose(ctx0, v);
     }
 
-    // TODO: update llama_kv_cache to not store V cache in the MLA case and automatically return a view of K
-    if (v_mla) {
-        v = ggml_view_4d(ctx0, k, v->ne[0], v->ne[1], v->ne[2], v->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
-    }
-
     // this can happen when KV cache is not used (e.g. an embedding model with non-causal attn)
     if (k->type == GGML_TYPE_F32) {
         k = ggml_cast(ctx0, k, GGML_TYPE_F16);
@@ -1823,9 +1839,11 @@ ggml_tensor * llm_graph_context::build_attn(
         ggml_tensor * v_cur,
         ggml_tensor * kq_b,
         ggml_tensor * sinks,
-        ggml_tensor * v_mla,
+        ggml_tensor * v_mla, // TODO: remove
         float kq_scale,
         int il) const {
+    GGML_ASSERT(v_mla == nullptr);
+
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
     // expand k later to enable rope fusion which directly writes into k-v cache
@ -1868,6 +1886,93 @@ ggml_tensor * llm_graph_context::build_attn(
|
|||
return cur;
|
||||
}
|
||||
|
||||
static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
|
||||
ggml_context * ctx0,
|
||||
const llama_ubatch & ubatch,
|
||||
const llama_hparams & hparams,
|
||||
const llama_cparams & cparams,
|
||||
const llama_kv_cache_context * mctx_cur) {
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_attn_k>(hparams, cparams, mctx_cur);
|
||||
|
||||
{
|
||||
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
|
||||
|
||||
const auto n_kv = mctx_cur->get_n_kv();
|
||||
const auto n_tokens = ubatch.n_tokens;
|
||||
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
|
||||
|
||||
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
}
|
||||
|
||||
return inp;
|
||||
}
|
||||
|
||||
llm_graph_input_attn_k * llm_graph_context::build_attn_inp_k() const {
|
||||
const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
|
||||
|
||||
auto inp = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
|
||||
|
||||
return (llm_graph_input_attn_k *) res->add_input(std::move(inp));
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_k * inp,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur,
|
||||
ggml_tensor * k_cur,
|
||||
ggml_tensor * v_cur,
|
||||
ggml_tensor * kq_b,
|
||||
ggml_tensor * sinks,
|
||||
ggml_tensor * v_mla,
|
||||
float kq_scale,
|
||||
int il) const {
|
||||
// these nodes are added to the graph together so that they are not reordered
|
||||
// by doing so, the number of splits in the graph is reduced
|
||||
// expand k later to enable rope fusion which directly writes into k-v cache
|
||||
ggml_build_forward_expand(gf, q_cur);
|
||||
ggml_build_forward_expand(gf, v_cur);
|
||||
ggml_build_forward_expand(gf, k_cur);
|
||||
|
||||
const auto * mctx_cur = inp->mctx;
|
||||
|
||||
// store to KV cache
|
||||
{
|
||||
const auto & k_idxs = inp->get_k_idxs();
|
||||
|
||||
ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
|
||||
}
|
||||
|
||||
const auto & kq_mask = inp->get_kq_mask();
|
||||
|
||||
ggml_tensor * q = q_cur;
|
||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
||||
ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
|
||||
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
|
||||
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
cur = ggml_add(ctx0, cur, wo_b);
|
||||
}
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_kv_iswa * inp,
|
||||
ggml_tensor * wo,
|
||||
|
|
|
|||
|
|
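The V-less path above relies on ggml views aliasing their source tensor's storage: with MLA, each cached K row already contains the value entries, so V can be a narrow view into the same buffer instead of a second cache tensor. A minimal standalone sketch (sizes are hypothetical; only plain ggml is assumed):

```cpp
#include "ggml.h"

int main() {
    ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    ggml_context * ctx = ggml_init(ip);

    const int64_t n_embd_k = 576; // hypothetical MLA K entry size (latent + rope part)
    const int64_t n_embd_v = 512; // hypothetical V entry size (latent only)
    const int64_t n_kv     = 8;

    ggml_tensor * k = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd_k, n_kv);
    // same row stride as k, fewer elements per row, offset 0 -> aliases k's storage
    ggml_tensor * v = ggml_view_2d(ctx, k, n_embd_v, n_kv, k->nb[1], 0);

    GGML_ASSERT(v->data == k->data); // zero-copy: no separate V allocation
    ggml_free(ctx);
    return 0;
}
```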
@@ -317,6 +317,39 @@ public:
    const llama_kv_cache_context * mctx;
};

// V-less input for the KV cache
// ref: https://github.com/ggml-org/llama.cpp/pull/19067
class llm_graph_input_attn_k : public llm_graph_input_i {
public:
    llm_graph_input_attn_k(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            const llama_kv_cache_context * mctx) :
        hparams(hparams),
        cparams(cparams),
        mctx(mctx) {
    }
    ~llm_graph_input_attn_k() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    ggml_tensor * get_k_idxs() const { return self_k_idxs; }

    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }

    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]

    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]

    const llama_hparams hparams;
    const llama_cparams cparams;

    const llama_kv_cache_context * mctx;
};

class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
public:
    llm_graph_input_attn_kv_iswa(

@@ -833,6 +866,21 @@ struct llm_graph_context {
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
            ggml_tensor * sinks, // [n_head_q]
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove
            float kq_scale,
            int il) const;

    llm_graph_input_attn_k * build_attn_inp_k() const;

    ggml_tensor * build_attn(
            llm_graph_input_attn_k * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
            ggml_tensor * sinks, // [n_head_q]
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale,
            int il) const;
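Read together, these declarations give the caller-side shape of the V-less path. A sketch assembled from the signatures above (not the actual call site, which appears in the DeepSeek2 builder further down; `Qcur`/`Kcur`/`Vcur`, `wo`, `wv_b`, `kq_scale` and `il` are the usual per-layer values):

```cpp
// sketch: inside a model's graph build function, for an MLA-style layer
auto * inp_attn = build_attn_inp_k();   // V-less input: only K indices + KQ mask

cur = build_attn(inp_attn,
        model.layers[il].wo, /*wo_b=*/nullptr,
        Qcur, Kcur, Vcur,
        /*kq_b=*/nullptr, /*sinks=*/nullptr,
        /*v_mla=*/model.layers[il].wv_b,
        kq_scale, il);
```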
@@ -72,8 +72,8 @@ uint32_t llama_hparams::n_embd_inp() const {
    return n_embd_inp;
}

uint32_t llama_hparams::get_n_embd_out() const {
    return n_embd_out > 0 ? n_embd_out : n_embd;
uint32_t llama_hparams::n_embd_out() const {
    return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}

uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {

@@ -189,6 +189,21 @@ bool llama_hparams::is_swa(uint32_t il) const {
    GGML_ABORT("fatal error");
}

bool llama_hparams::is_mla() const {
    assert((n_embd_head_k_mla_impl == 0 && n_embd_head_v_mla_impl == 0) ||
           (n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0));

    return n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0;
}

uint32_t llama_hparams::n_embd_head_k_mla() const {
    return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
}

uint32_t llama_hparams::n_embd_head_v_mla() const {
    return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
}

bool llama_hparams::has_kv(uint32_t il) const {
    if (n_layer_kv_from_start >= 0) {
        if (il < (uint32_t) n_layer_kv_from_start) {
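The fallback semantics of the accessors above, spelled out as a small sketch (field values are hypothetical):

```cpp
#include <cassert>
#include "llama-hparams.h"

void check_mla_fallback() {
    llama_hparams hp = {};
    hp.n_embd_head_k = 128;
    hp.n_embd_head_v = 128;

    // no MLA metadata loaded: accessors fall back to the regular head sizes
    assert(!hp.is_mla());
    assert(hp.n_embd_head_k_mla() == 128);

    // MLA metadata present (DeepSeek2-style values, hypothetical here)
    hp.n_embd_head_k_mla_impl = 192;
    hp.n_embd_head_v_mla_impl = 128;
    assert(hp.is_mla());
    assert(hp.n_embd_head_k_mla() == 192);
}
```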
@@ -53,8 +53,8 @@ struct llama_hparams {
    uint32_t n_rel_attn_bkts = 0;

    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
    uint32_t n_embd_head_k_mla = 0;
    uint32_t n_embd_head_v_mla = 0;
    uint32_t n_embd_head_k_mla_impl = 0;
    uint32_t n_embd_head_v_mla_impl = 0;

    // for WavTokenizer
    struct llama_hparams_posnet posnet;

@@ -167,7 +167,7 @@ struct llama_hparams {
    uint32_t n_cls_out = 1;

    // output embedding dimension (0 = use n_embd)
    uint32_t n_embd_out = 0;
    uint32_t n_embd_out_impl = 0;

    // llama4 smallthinker
    uint32_t n_moe_layer_step = 0;

@@ -242,7 +242,7 @@ struct llama_hparams {
    uint32_t n_embd_inp() const;

    // dimension of output embeddings
    uint32_t get_n_embd_out() const;
    uint32_t n_embd_out() const;

    // dimension of key embeddings across all k-v heads
    uint32_t n_embd_k_gqa(uint32_t il = 0) const;

@@ -272,6 +272,12 @@ struct llama_hparams {

    bool is_swa(uint32_t il) const;

    // note: currently only supported if either all or none of the layers are MLA
    bool is_mla() const;

    uint32_t n_embd_head_k_mla() const;
    uint32_t n_embd_head_v_mla() const;

    bool has_kv(uint32_t il) const;

    // number of layers for which has_kv() returns true
@@ -97,6 +97,8 @@ llama_kv_cache::llama_kv_cache(
            __func__, hparams.n_embd_v_gqa_max());
    }

    const bool is_mla = hparams.is_mla();

    for (uint32_t il = 0; il < hparams.n_layer; il++) {
        if (!hparams.has_kv(il)) {
            LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);

@@ -130,18 +132,21 @@ llama_kv_cache::llama_kv_cache(
            throw std::runtime_error("failed to create ggml context for kv cache");
        }

        ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
        ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
        const bool has_k = true;
        const bool has_v = !is_mla;

        ggml_format_name(k, "cache_k_l%d", il);
        ggml_format_name(v, "cache_v_l%d", il);
        ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
        ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;

        has_k && ggml_format_name(k, "cache_k_l%d", il);
        has_v && ggml_format_name(v, "cache_v_l%d", il);

        std::vector<ggml_tensor *> k_stream;
        std::vector<ggml_tensor *> v_stream;

        for (uint32_t s = 0; s < n_stream; ++s) {
            k_stream.push_back(ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]));
            v_stream.push_back(ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]));
            k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]) : nullptr);
            v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr);
        }

        map_layer_ids[il] = layers.size();
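From here on, `layer.v` and every `layer.v_stream[s]` stay nullptr for MLA models; the hunks below all enforce the same consumer-side rule, sketched here:

```cpp
// sketch of the rule every V consumer now follows:
// touch the V cache only when the layer actually has one
for (const auto & layer : layers) {
    if (!layer.v) {
        continue; // MLA layer: V is recovered from the K cache instead
    }
    // ... read/write/copy layer.v as before
}
```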
@@ -647,7 +652,10 @@ bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_co
                const auto & layer = layers[il];

                ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]);
                ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);

                if (layer.v_stream[ssrc]) {
                    ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
                }
            }
        }
    }

@@ -1516,7 +1524,7 @@ size_t llama_kv_cache::size_v_bytes() const {
    size_t size_v_bytes = 0;

    for (const auto & layer : layers) {
        size_v_bytes += ggml_nbytes(layer.v);
        size_v_bytes += layer.v ? ggml_nbytes(layer.v) : 0;
    }

    return size_v_bytes;

@@ -1798,6 +1806,9 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

            auto * v = layer.v_stream[cr.strm];
            if (!v) {
                continue;
            }

            // Write value type
            const int32_t v_type_i = (int32_t) v->type;

@@ -1824,6 +1835,9 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

            auto * v = layer.v_stream[cr.strm];
            if (!v) {
                continue;
            }

            // Write value type
            const int32_t v_type_i = (int32_t) v->type;

@@ -2027,6 +2041,9 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

        auto * v = layer.v_stream[strm];
        if (!v) {
            continue;
        }

        // Read type of value
        int32_t v_type_i_ref;

@@ -2068,6 +2085,9 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

        auto * v = layer.v_stream[strm];
        if (!v) {
            continue;
        }

        // Read type of value
        int32_t v_type_i_ref;
@@ -146,8 +146,8 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
    add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
    add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
    if (hparams.n_embd_out > 0) {
        add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out);
    if (hparams.n_embd_out_impl > 0) {
        add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl);
    }
    add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
@@ -513,7 +513,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false);
    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);

@@ -1698,15 +1698,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        case LLM_ARCH_DEEPSEEK2:
            {
                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
                bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
                const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                if (!is_lite) {
                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                }
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false);
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);

@@ -1737,6 +1738,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                switch (hparams.n_layer) {
                    case 27: type = LLM_TYPE_16B; break;
                    case 47: type = LLM_TYPE_30B_A3B; break;
                    case 60: type = LLM_TYPE_236B; break;
                    case 61: type = LLM_TYPE_671B; break;
                    default: type = LLM_TYPE_UNKNOWN;

@@ -4942,14 +4944,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
                const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);

                const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
                const bool is_mla = hparams.is_mla();

                // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
                const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
                const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
                const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
                const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();

                const int64_t n_embd_head_qk_rope = hparams.n_rot;
                const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;

@@ -4974,13 +4973,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    auto & layer = layers[i];

                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    if (!is_lite) {
                    if (q_lora_rank > 0) {
                        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
                    }

                    layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);

                    if (!is_lite) {
                    if (q_lora_rank > 0) {
                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
                    } else {

@@ -6630,7 +6629,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            }

            // for LFM2-ColBert-350M
            dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
            dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
        } break;
        case LLM_ARCH_SMALLTHINKER:
            {

@@ -7487,8 +7486,8 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
        LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
        LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
        LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
        LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
        LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);

@@ -8337,7 +8336,7 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
}

int32_t llama_model_n_embd_out(const llama_model * model) {
    return model->hparams.get_n_embd_out();
    return model->hparams.n_embd_out();
}

int32_t llama_model_n_layer(const llama_model * model) {
@@ -1095,25 +1095,55 @@ int32_t llama_chat_apply_template(
// model split
//

int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
int32_t llama_split_path(
        char * split_path,
        size_t maxlen,
        const char * path_prefix,
        int32_t split_no,
        int32_t split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
        return strlen(split_path);

    const int written = snprintf(
        split_path,
        maxlen,
        SPLIT_PATH_FORMAT,
        path_prefix,
        split_no + 1,
        split_count
    );

    if (written < 0 || (size_t) written >= maxlen) {
        return 0;
    }
    return 0;

    return (int32_t) written;
}

int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
    std::string str_split_path(split_path);
    char postfix[32];
    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
    std::string str_postfix(postfix);
int32_t llama_split_prefix(
        char * split_prefix,
        size_t maxlen,
        const char * split_path,
        int32_t split_no,
        int32_t split_count) {

    // check if split_prefix ends with postfix
    int size_prefix = str_split_path.size() - str_postfix.size();
    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
        snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
        return size_prefix;
    const std::string str_split_path(split_path);

    char postfix[32];
    snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);

    const std::string str_postfix(postfix);
    if (str_split_path.size() <= str_postfix.size()) {
        return 0;
    }

    const size_t size_prefix = str_split_path.size() - str_postfix.size();

    if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) {
        const size_t copy_len = std::min(size_prefix + 1, maxlen);
        snprintf(split_prefix, copy_len, "%s", split_path);

        return (int32_t) size_prefix;
    }

    return 0;
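A round-trip usage sketch for the two helpers (paths and buffer sizes are illustrative), following the `%s-%05d-of-%05d.gguf` format above:

```cpp
char split_path[512];
llama_split_path(split_path, sizeof(split_path), "models/ggml-model-q4_0",
                 /*split_no=*/0, /*split_count=*/4);
// split_path == "models/ggml-model-q4_0-00001-of-00004.gguf"

char split_prefix[512];
const int32_t n = llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 0, 4);
// n > 0 and split_prefix == "models/ggml-model-q4_0" (the original prefix is recovered)
```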
@@ -2,14 +2,11 @@

llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
        llm_graph_context(params) {
    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
    bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);

    const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
    const bool is_mla = hparams.is_mla();

    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
    const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
    const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
    const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
    const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();

    const int64_t n_embd_head_qk_rope = hparams.n_rot;
    const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;

@@ -43,7 +40,8 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();
    auto * inp_attn_kv = !is_mla ? build_attn_inp_kv() : nullptr;
    auto * inp_attn_k = is_mla ? build_attn_inp_k() : nullptr;

    ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -57,6 +55,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
        // self_attention
        {
            ggml_tensor * q = NULL;

            const bool is_lite = model.layers[il].wq;

            if (!is_lite) {
                q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
                cb(q, "q", il);

@@ -145,7 +146,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
            }

            // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
            cur = build_attn(inp_attn,
            cur = build_attn(inp_attn_k,
                    model.layers[il].wo, NULL,
                    Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
        } else {

@@ -182,7 +183,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
            }

            // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
            cur = build_attn(inp_attn,
            cur = build_attn(inp_attn_kv,
                    model.layers[il].wo, NULL,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
        }
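Condensed, the two call sites amount to this dispatch (a sketch mirroring the branches above, not additional code in the commit):

```cpp
if (is_mla) {
    // absorption path: MQA over the compressed latent, V cache elided,
    // values recovered through wv_b inside the attention kernel
    cur = build_attn(inp_attn_k, model.layers[il].wo, NULL,
            Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
} else {
    // decompressed path: regular MHA with separate K and V caches
    cur = build_attn(inp_attn_kv, model.layers[il].wo, NULL,
            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
```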
@@ -45,10 +45,10 @@
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |

@@ -109,30 +109,30 @@
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
| `--temp N` | temperature (default: 0.8) |
| `--temp N` | temperature (default: 0.80) |
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) |
| `--adaptive-decay N` | adaptive-p: EMA decay for adaptation; effective history length ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) |
| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) |
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) |
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers |
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) |
| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.<br/>(valid range 0.0 to 0.99) (default: 0.90) |
| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) |
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) |
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) |
| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |

@@ -173,12 +173,12 @@
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
|||
|
|
@ -128,10 +128,10 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
|||
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
|
||||
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
|
||||
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
|
||||
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
|
||||
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
|
||||
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
|
||||
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
||||
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
|
||||
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
|
||||
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
|
||||
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
||||
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
|
||||
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
|
||||
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
|
||||
|
|
@ -192,28 +192,30 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
|||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
|
||||
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--temp N` | temperature (default: 0.8) |
|
||||
| `--temp N` | temperature (default: 0.80) |
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
|
||||
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
|
||||
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
|
||||
| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
|
||||
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
|
||||
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
|
||||
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
|
||||
| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
|
||||
| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
|
||||
| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
|
||||
| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
|
||||
| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
|
||||
| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
|
||||
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
|
||||
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
|
||||
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
|
||||
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
|
||||
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
|
||||
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
|
||||
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
|
||||
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) |
|
||||
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) |
|
||||
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
|
||||
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
|
||||
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
|
||||
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers |
|
||||
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
|
||||
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
|
||||
| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) |
|
||||
| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.<br/>(valid range 0.0 to 0.99) (default: 0.90) |
|
||||
| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) |
|
||||
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) |
|
||||
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
|
||||
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
|
||||
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
|
||||
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) |
|
||||
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) |
|
||||
| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
|
||||
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
|
||||
| `--grammar-file FNAME` | file to read grammar from |
|
||||
|
|
@ -251,8 +253,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
|||
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
|
||||
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
|
||||
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
|
||||
|
||||
<!-- HELP_END -->
|
||||
|
|
|
|||
|
|
@@ -342,44 +342,51 @@ int main(int argc, char ** argv) {
        return 1;
    }

    // debug message about similarity of saved session, if applicable
    size_t n_matching_session_tokens = 0;
    if (!session_tokens.empty()) {
        for (llama_token id : session_tokens) {
            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
                break;
    bool session_do_save = false;

    {
        size_t n_match = 0;

        if (!session_tokens.empty()) {
            for (llama_token id : session_tokens) {
                if (n_match >= embd_inp.size() || id != embd_inp[n_match]) {
                    break;
                }
                n_match++;
            }
            if (params.prompt.empty() && n_match == embd_inp.size()) {
                LOG_INF("%s: using full prompt from session file\n", __func__);
            } else if (n_match >= embd_inp.size()) {
                LOG_INF("%s: session file has exact match for prompt!\n", __func__);
            } else if (n_match < (embd_inp.size() / 2)) {
                LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
                        __func__, n_match, embd_inp.size());
            } else {
                LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
                        __func__, n_match, embd_inp.size());
            }

            if (session_tokens.size() == n_match) {
                // [TAG_CONTEXT_STATE_LOGITS]
                // in this case, we are going to reuse the logits from the session
                // if we ever decide to remove the logits from the session, we need to handle this somehow
                // ref: https://github.com/ggml-org/llama.cpp/pull/18862#issuecomment-3756330941
            }

            // remove any "future" tokens that we might have inherited from the previous session
            if (session_tokens.size() > n_match) {
                if (!llama_memory_seq_rm(mem, -1, n_match, -1)) {
                    LOG_WRN("%s: unable to reuse common prefix (for example, when the memory is recurrent)\n", __func__);
                    llama_memory_clear(mem, true);
                    session_tokens.clear();
                    n_match = 0;
                } else {
                    session_tokens.resize(n_match);
                }
            }
            n_matching_session_tokens++;
        }
        if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
            LOG_INF("%s: using full prompt from session file\n", __func__);
        } else if (n_matching_session_tokens >= embd_inp.size()) {
            LOG_INF("%s: session file has exact match for prompt!\n", __func__);
        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
            LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
                    __func__, n_matching_session_tokens, embd_inp.size());
        } else {
            LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
                    __func__, n_matching_session_tokens, embd_inp.size());
        }

        // remove any "future" tokens that we might have inherited from the previous session
        if (!llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1)) {
            LOG_INF("%s: unable to reuse common prefix\n", __func__);
            n_matching_session_tokens = 0;
            llama_memory_seq_rm(mem, -1, -1, -1);
        }
    }

    LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
            embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());

    // if we will use the cache for the full prompt without reaching the end of the cache, force
    // reevaluation of the last token to recalculate the cached logits
    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
        LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);

        session_tokens.resize(embd_inp.size() - 1);
        session_do_save = !path_session.empty() && n_match < embd_inp.size() && !params.prompt_cache_ro;
    }

    // number of tokens to keep when resetting context
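The matching logic above is a plain longest-common-prefix count over token ids; isolated as a sketch (`llama_token` is an `int32_t` id in llama.h):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// sketch: how n_match is computed above
static size_t common_prefix(const std::vector<int32_t> & session_tokens,
                            const std::vector<int32_t> & embd_inp) {
    size_t n_match = 0;
    while (n_match < session_tokens.size() &&
           n_match < embd_inp.size() &&
           session_tokens[n_match] == embd_inp[n_match]) {
        n_match++;
    }
    return n_match;
}
```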
@@ -521,10 +528,9 @@ int main(int argc, char ** argv) {
        is_interacting = params.interactive_first;
    }

    bool is_antiprompt = false;
    bool input_echo = true;
    bool display = true;
    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();
    bool is_antiprompt = false;
    bool input_echo = true;
    bool display = true;

    int n_past = 0;
    int n_remain = params.n_predict;

@@ -700,8 +706,8 @@ int main(int argc, char ** argv) {

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            // optionally save the session on first sample (for faster prompt loading next time)
            if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
                need_to_save_session = false;
            if (session_do_save) {
                session_do_save = false;
                llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());

                LOG_DBG("saved session to %s\n", path_session.c_str());
@ -63,10 +63,10 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
|||
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
|
||||
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
|
||||
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
|
||||
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
|
||||
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
|
||||
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
|
||||
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
||||
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
|
||||
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
|
||||
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
|
||||
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
||||
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
|
||||
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
|
||||
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
|
||||
|
|
@ -126,30 +126,30 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
|||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
|
||||
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--temp N` | temperature (default: 0.8) |
|
||||
| `--temp N` | temperature (default: 0.80) |
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
|
||||
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
|
||||
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
|
||||
| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) |
|
||||
| `--adaptive-decay N` | adaptive-p: EMA decay for adaptation; effective history length ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) |
|
||||
| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
|
||||
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
|
||||
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
| `--repeat-last-n N` | last n tokens to consider for penalizing (default: 64, 0 = disabled, -1 = ctx_size) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) |
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) |
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disabled, -1 = context size) |
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers |
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) |
| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.<br/>(valid range 0.0 to 0.99) (default: 0.90) |
| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) |
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) |
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) |
| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of a token appearing in the completion,<br/>e.g. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |
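As a rough illustration of how the probability-based samplers above prune the token distribution, here is a minimal NumPy sketch of top-p, min-p, and top-n-sigma filtering. This is an illustrative sketch only, not the llama.cpp implementation; all function names are made up for this example.

```python
import numpy as np

def top_p_filter(probs: np.ndarray, top_p: float = 0.95) -> np.ndarray:
    """Keep the smallest set of tokens whose cumulative probability reaches top_p."""
    order = np.argsort(probs)[::-1]               # tokens by descending probability
    cum = np.cumsum(probs[order])
    cutoff = int(np.searchsorted(cum, top_p)) + 1  # number of tokens to keep
    keep = np.zeros_like(probs, dtype=bool)
    keep[order[:cutoff]] = True
    out = np.where(keep, probs, 0.0)
    return out / out.sum()

def min_p_filter(probs: np.ndarray, min_p: float = 0.05) -> np.ndarray:
    """Drop tokens whose probability is below min_p times the top token's."""
    out = np.where(probs >= min_p * probs.max(), probs, 0.0)
    return out / out.sum()

def top_n_sigma_filter(logits: np.ndarray, n_sigma: float = -1.0) -> np.ndarray:
    """Mask logits more than n_sigma standard deviations below the maximum."""
    if n_sigma < 0:                                # -1.0 = disabled, as in the table
        return logits
    return np.where(logits >= logits.max() - n_sigma * logits.std(), logits, -np.inf)
```

Applied in sequence to the model's output distribution, filters like these reproduce the pruning the flags describe; the server composes the real samplers into a configurable chain.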
@@ -199,7 +199,8 @@ For the full list of features, please refer to [server's changelog](https://githu
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)<br/>(env: LLAMA_ARG_CACHE_PROMPT) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--metrics` | enable Prometheus-compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
| `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
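A quick way to sanity-check the monitoring endpoints above is to poll them from a script. A minimal sketch, assuming a local llama-server started with `--metrics --props` on the default port 8080; the `default_generation_settings` key is shown as a typical field and may differ between versions.

```python
import json
import urllib.request

BASE = "http://127.0.0.1:8080"  # assumption: default host/port

# Prometheus text exposition, enabled by --metrics
with urllib.request.urlopen(f"{BASE}/metrics") as r:
    print(r.read().decode()[:400])

# Read global properties; changing them via POST /props additionally needs --props
with urllib.request.urlopen(f"{BASE}/props") as r:
    props = json.load(r)
    print(props.get("default_generation_settings"))
```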
@@ -212,8 +213,8 @@ For the full list of features, please refer to [server's changelog](https://githu
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
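The prefill behaviour controlled by `--prefill-assistant` can be exercised through the OpenAI-compatible chat endpoint: when the last message is an assistant message, it is continued rather than answered afresh. A minimal sketch, assuming the default host and port:

```python
import json
import urllib.request

# The trailing assistant message acts as a prefix to continue, not a finished turn
payload = {
    "messages": [
        {"role": "user", "content": "List three prime numbers."},
        {"role": "assistant", "content": "Sure: 2,"},
    ]
}
req = urllib.request.Request(
    "http://127.0.0.1:8080/v1/chat/completions",  # assumption: default port
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as r:
    print(json.load(r)["choices"][0]["message"]["content"])
```

Starting the server with `--no-prefill-assistant` instead makes the same request treat the assistant message as complete.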
@@ -222,7 +223,7 @@ For the full list of features, please refer to [server's changelog](https://githu
| `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
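To see how the draft flags fit together, the sketch below outlines one round of a greedy speculative-decoding loop: the draft model proposes up to `--draft-max` tokens, stops early once its confidence falls below `--draft-p-min`, and the target model then accepts the longest agreeing prefix. `draft_model`, `target_model`, and their methods are hypothetical stand-ins for illustration, not the llama.cpp API.

```python
def speculate_once(target_model, draft_model, ctx, draft_max=16, draft_p_min=0.75):
    """One round of greedy speculative decoding (illustrative sketch only)."""
    # 1. Draft: the small model proposes tokens greedily until it runs out of
    #    budget (--draft-max) or loses confidence (--draft-p-min).
    #    greedy_next() is a hypothetical helper returning (token, probability).
    draft = []
    while len(draft) < draft_max:
        tok, p = draft_model.greedy_next(ctx + draft)
        if p < draft_p_min:
            break
        draft.append(tok)

    # 2. Verify: the target model scores all drafted positions in one batch
    #    pass (greedy_batch(), also hypothetical) and accepts the longest
    #    prefix on which the two models agree.
    accepted = []
    for drafted, target in zip(draft, target_model.greedy_batch(ctx, draft)):
        if drafted != target:
            break
        accepted.append(drafted)
    return accepted
```

Accepted tokens cost one target-model pass for the whole batch, which is where the speedup comes from; a rejected token falls back to the target model's own prediction.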