diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9fe1401df4..af4c60be64 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,8 @@ on: '**/*.swift', '**/*.m', '**/*.metal', - '**/*.comp' + '**/*.comp', + '**/*.glsl' ] pull_request: @@ -40,7 +41,8 @@ on: '**/*.swift', '**/*.m', '**/*.metal', - '**/*.comp' + '**/*.comp', + '**/*.glsl' ] concurrency: diff --git a/.gitignore b/.gitignore index 428f084110..05eb578a82 100644 --- a/.gitignore +++ b/.gitignore @@ -54,6 +54,7 @@ /out/ /tmp/ /autogen-*.md +/common/build-info.cpp # Deprecated diff --git a/common/arg.cpp b/common/arg.cpp index e7bb44f8f5..1e211eab4a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1415,7 +1415,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.top_k = value; params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K; } - ).set_sparam()); + ).set_sparam().set_env("LLAMA_ARG_TOP_K")); add_opt(common_arg( {"--top-p"}, "N", string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), diff --git a/common/common.cpp b/common/common.cpp index 0497f90a28..b76dfa10ea 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1013,31 +1013,40 @@ bool tty_can_use_colors() { // Model utils // -static inline void common_init_sampler_from_model( +// TODO: move to common/sampling +static void common_init_sampler_from_model( const llama_model * model, common_params_sampling & sparams) { const uint64_t config = sparams.user_sampling_config; auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) { - if (config & user_config) return; + if (config & user_config) { + return; + } char buf[64] = {0}; if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) { char * end = nullptr; int32_t v = strtol(buf, &end, 10); - if (end && end != buf) dst = v; + if (end && end != buf) { + dst = v; + } } }; auto get_float = [&](const char * key, float & dst, uint64_t user_config) { - if (config & user_config) return; + if (config & user_config) { + return; + } char buf[128] = {0}; if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) { char * end = nullptr; float v = strtof(buf, &end); - if (end && end != buf) dst = v; + if (end && end != buf) { + dst = v; + } } }; @@ -1065,31 +1074,122 @@ static inline void common_init_sampler_from_model( get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA); } -struct common_init_result common_init_from_params(common_params & params) { - common_init_result iparams; - auto mparams = common_model_params_to_llama(params); +struct common_init_result::impl { + impl() = default; + ~impl() = default; + + llama_model_ptr model; + llama_context_ptr context; + + std::vector lora; + + std::vector samplers; +}; + +common_init_result::common_init_result(common_params & params) : + pimpl(new impl{}) { + const auto mparams = common_model_params_to_llama(params); llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); if (model == NULL) { - LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", - __func__, params.model.path.c_str()); - return iparams; + return; } - common_init_sampler_from_model(model, params.sampling); + pimpl->model.reset(model); const llama_vocab * vocab = llama_model_get_vocab(model); 
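The `--top-k` option now also reads the `LLAMA_ARG_TOP_K` environment variable, and `common_init_sampler_from_model` fills a sampling field from GGUF metadata only when the user has not set it explicitly, which is tracked through the `user_sampling_config` bitmask. Below is a minimal, self-contained sketch of that precedence idea only; the enum values and struct below are simplified stand-ins, not the real `common_params_sampling` definition from this patch.

```cpp
#include <cstdint>
#include <cstdio>

// simplified stand-ins for the bitmask pattern used by common_params_sampling
enum sampling_config : uint64_t {
    CONFIG_TOP_K = 1 << 0,
    CONFIG_TOP_P = 1 << 1,
};

struct sampling_params {
    int32_t  top_k = 40;
    uint64_t user_config = 0; // bits set when the user passed a CLI flag or env var
};

// model metadata may override a field only if the user did not set it explicitly
static void apply_model_default(sampling_params & sp, int32_t model_top_k) {
    if (sp.user_config & CONFIG_TOP_K) {
        return; // --top-k or LLAMA_ARG_TOP_K wins over GGUF metadata
    }
    sp.top_k = model_top_k;
}

int main() {
    sampling_params sp;
    sp.user_config |= CONFIG_TOP_K; // as if --top-k had been given
    apply_model_default(sp, 64);
    std::printf("top_k = %d\n", (int) sp.top_k); // still 40
}
```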
+ // updates params.sampling + // TODO: fix naming + common_init_sampler_from_model(model, params.sampling); + auto cparams = common_context_params_to_llama(params); + if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { + LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__); + params.sampling.ignore_eos = false; + } + + // initialize once + for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) { + if (llama_vocab_is_eog(vocab, i)) { + LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY); + params.sampling.logit_bias_eog.push_back({i, -INFINITY}); + } + } + + if (params.sampling.ignore_eos) { + // add EOG biases to the active set of logit biases + params.sampling.logit_bias.insert( + params.sampling.logit_bias.end(), + params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end()); + } + + //if (params.sampling.penalty_last_n == -1) { + // LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); + // params.sampling.penalty_last_n = llama_n_ctx(lctx); + //} + + //if (params.sampling.dry_penalty_last_n == -1) { + // LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); + // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx); + //} + + pimpl->samplers.resize(cparams.n_seq_max); + + for (int i = 0; i < (int) cparams.n_seq_max; ++i) { + pimpl->samplers[i].reset(common_sampler_init(model, params.sampling)); + } + llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", - __func__, params.model.path.c_str()); - llama_model_free(model); - return iparams; + __func__, params.model.path.c_str()); + return; } + pimpl->context.reset(lctx); +} + +llama_model * common_init_result::model() { + return pimpl->model.get(); +} + +llama_context * common_init_result::context() { + return pimpl->context.get(); +} + +common_sampler * common_init_result::sampler(llama_seq_id seq_id) { + return pimpl->samplers[seq_id].get(); +} + +std::vector & common_init_result::lora() { + return pimpl->lora; +} + +void common_init_result::free_context() { + pimpl->context.reset(); +} + +common_init_result_ptr common_init_from_params(common_params & params) { + common_init_result_ptr res(new common_init_result(params)); + + llama_model * model = res->model(); + if (model == NULL) { + LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", + __func__, params.model.path.c_str()); + return res; + } + + llama_context * lctx = res->context(); + if (lctx == NULL) { + LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", + __func__, params.model.path.c_str()); + return res; + } + + const llama_vocab * vocab = llama_model_get_vocab(model); + if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) { LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__); params.ctx_shift = false; @@ -1101,10 +1201,7 @@ struct common_init_result common_init_from_params(common_params & params) { const auto cvec = common_control_vector_load(params.control_vectors); if (cvec.n_embd == -1) { - llama_free(lctx); - llama_model_free(model); - - return iparams; + return res; } int err = llama_apply_adapter_cvec( @@ -1115,10 +1212,7 @@ struct 
common_init_result common_init_from_params(common_params & params) { params.control_vector_layer_start, params.control_vector_layer_end); if (err) { - llama_free(lctx); - llama_model_free(model); - - return iparams; + return res; } } @@ -1142,10 +1236,7 @@ struct common_init_result common_init_from_params(common_params & params) { } if (!ok) { - llama_free(lctx); - llama_model_free(model); - - return iparams; + return res; } } @@ -1155,9 +1246,7 @@ struct common_init_result common_init_from_params(common_params & params) { lora.reset(llama_adapter_lora_init(model, la.path.c_str())); if (lora == nullptr) { LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); - llama_free(lctx); - llama_model_free(model); - return iparams; + return res; } char buf[1024]; @@ -1166,43 +1255,13 @@ struct common_init_result common_init_from_params(common_params & params) { la.task_name = buf; llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf)); la.prompt_prefix = buf; - iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters + res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters } if (!params.lora_init_without_apply) { common_set_adapter_lora(lctx, params.lora_adapters); } - if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__); - params.sampling.ignore_eos = false; - } - - // initialize once - for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) { - if (llama_vocab_is_eog(vocab, i)) { - LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY); - params.sampling.logit_bias_eog.push_back({i, -INFINITY}); - } - } - - if (params.sampling.ignore_eos) { - // add EOG biases to the active set of logit biases - params.sampling.logit_bias.insert( - params.sampling.logit_bias.end(), - params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end()); - } - - if (params.sampling.penalty_last_n == -1) { - LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); - params.sampling.penalty_last_n = llama_n_ctx(lctx); - } - - if (params.sampling.dry_penalty_last_n == -1) { - LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); - params.sampling.dry_penalty_last_n = llama_n_ctx(lctx); - } - if (params.warmup) { LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); @@ -1241,12 +1300,11 @@ struct common_init_result common_init_from_params(common_params & params) { llama_set_warmup(lctx, false); } - iparams.model.reset(model); - iparams.context.reset(lctx); - - return iparams; + return res; } +common_init_result::~common_init_result() = default; + std::string get_model_endpoint() { const char * model_endpoint_env = getenv("MODEL_ENDPOINT"); // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility. 
@@ -1255,7 +1313,9 @@ std::string get_model_endpoint() { std::string model_endpoint = "https://huggingface.co/"; if (endpoint_env) { model_endpoint = endpoint_env; - if (model_endpoint.back() != '/') model_endpoint += '/'; + if (model_endpoint.back() != '/') { + model_endpoint += '/'; + } } return model_endpoint; } diff --git a/common/common.h b/common/common.h index 7fe62b4111..7231cbc5b8 100644 --- a/common/common.h +++ b/common/common.h @@ -218,6 +218,10 @@ struct common_params_sampling { std::vector logit_bias; // logit biases to apply std::vector logit_bias_eog; // pre-calculated logit biases for EOG tokens + bool has_logit_bias() const { + return !logit_bias.empty(); + } + // print the parameters into a string std::string print() const; }; @@ -671,15 +675,29 @@ bool tty_can_use_colors(); // Model utils // -// note: defines object's lifetime -struct common_init_result { - llama_model_ptr model; - llama_context_ptr context; +struct common_sampler; - std::vector lora; +// note: defines the model, context, samplers, ets. lifetimes +struct common_init_result { + common_init_result(common_params & params); + ~common_init_result(); + + llama_model * model(); + llama_context * context(); + common_sampler * sampler(llama_seq_id seq_id); + + std::vector & lora(); + + void free_context(); + +private: + struct impl; + std::unique_ptr pimpl; }; -struct common_init_result common_init_from_params(common_params & params); +using common_init_result_ptr = std::unique_ptr; + +common_init_result_ptr common_init_from_params(common_params & params); struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params); diff --git a/common/preset.cpp b/common/preset.cpp index 729c27f2cf..60746aad58 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -157,6 +157,21 @@ static std::map get_map_key_opt(common_params_context & return mapping; } +static bool is_bool_arg(const common_arg & arg) { + return !arg.args_neg.empty(); +} + +static std::string parse_bool_arg(const common_arg & arg, const std::string & key, const std::string & value) { + // if this is a negated arg, we need to reverse the value + for (const auto & neg_arg : arg.args_neg) { + if (rm_leading_dashes(neg_arg) == key) { + return common_arg_utils::is_truthy(value) ? "false" : "true"; + } + } + // otherwise, not negated + return value; +} + common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) { common_presets out; auto key_to_opt = get_map_key_opt(ctx_params); @@ -173,8 +188,13 @@ common_presets common_presets_load(const std::string & path, common_params_conte for (const auto & [key, value] : section.second) { LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str()); if (key_to_opt.find(key) != key_to_opt.end()) { - preset.options[key_to_opt[key]] = value; - LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str()); + auto & opt = key_to_opt[key]; + if (is_bool_arg(opt)) { + preset.options[opt] = parse_bool_arg(opt, key, value); + } else { + preset.options[opt] = value; + } + LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str()); } else { // TODO: maybe warn about unknown key? 
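With `common_init_result` now hiding its state behind a pimpl and `common_init_from_params` returning a `common_init_result_ptr`, callers reach the model, context, and per-sequence samplers through accessors instead of public members, and error paths can simply return because the smart pointers release whatever was created. A rough migration sketch follows; `run_generation` and its locals are illustrative caller code, not part of the patch.

```cpp
// Hedged sketch of the new ownership model; not code from this patch.
#include "common.h"
#include "sampling.h"

static int run_generation(common_params & params) {
    common_init_result_ptr init = common_init_from_params(params);

    llama_model   * model = init->model();
    llama_context * lctx  = init->context();
    if (model == nullptr || lctx == nullptr) {
        return 1; // failure was already logged; the unique_ptr cleans up
    }

    // one sampler is created per sequence (cparams.n_seq_max); fetch it by seq id
    common_sampler * smpl = init->sampler(0);
    (void) smpl;

    // ... build a batch, decode, sample ...

    // the context can be dropped early while the model (and the LoRA adapters
    // in init->lora()) stay alive
    init->free_context();
    return 0;
}
```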
} diff --git a/common/sampling.cpp b/common/sampling.cpp index a8494a679d..ee58aa50b3 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -104,9 +104,10 @@ struct ring_buffer { struct common_sampler { common_params_sampling params; - struct llama_sampler * grmr; struct llama_sampler * chain; + bool grammar; + ring_buffer prev; std::vector cur; @@ -116,7 +117,6 @@ struct common_sampler { void reset() { prev.clear(); - llama_sampler_reset(grmr); llama_sampler_reset(chain); } @@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co lparams.no_perf = params.no_perf; - struct llama_sampler * grmr; + llama_sampler * chain = llama_sampler_chain_init(lparams); + + bool grammar = false; + std::vector samplers; + if (params.grammar.compare(0, 11, "%llguidance") == 0) { #ifdef LLAMA_USE_LLGUIDANCE - grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()); + samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str())); + grammar = true; #else GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); #endif // LLAMA_USE_LLGUIDANCE @@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co trigger_patterns_c.push_back(regex.c_str()); } - grmr = params.grammar_lazy - ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root", - trigger_patterns_c.data(), trigger_patterns_c.size(), - trigger_tokens.data(), trigger_tokens.size()) - : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"); - if (!grmr) { - return nullptr; + if (!params.grammar.empty()) { + if (params.grammar_lazy) { + samplers.push_back( + llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root", + trigger_patterns_c.data(), trigger_patterns_c.size(), + trigger_tokens.data(), trigger_tokens.size())); + } else { + samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root")); + } + + grammar = true; } } - auto * result = new common_sampler { - /* .params = */ params, - /* .grmr = */ grmr, - /* .chain = */ llama_sampler_chain_init(lparams), - /* .prev = */ ring_buffer(std::max(32, params.n_prev)), - /* .cur = */ {}, - /* .cur_p = */ {}, - }; - - llama_sampler_chain_add(result->chain, - llama_sampler_init_logit_bias( - llama_vocab_n_tokens(vocab), - params.logit_bias.size(), - params.logit_bias.data())); + if (params.has_logit_bias()) { + samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data())); + } if (params.mirostat == 0) { // if this flag is set, we will not need to add `dist` at the end of the sampler chain @@ -257,34 +255,35 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co } llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } break; case COMMON_SAMPLER_TYPE_TOP_K: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); + samplers.push_back(llama_sampler_init_top_k (params.top_k)); break; case COMMON_SAMPLER_TYPE_TOP_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_p 
(params.top_p, params.min_keep)); + samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma)); + samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma)); break; case COMMON_SAMPLER_TYPE_MIN_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); + samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); break; case COMMON_SAMPLER_TYPE_TYPICAL_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); + samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_TEMPERATURE: - llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); + samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); break; case COMMON_SAMPLER_TYPE_INFILL: - llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab)); + samplers.push_back(llama_sampler_init_infill (vocab)); break; case COMMON_SAMPLER_TYPE_PENALTIES: - llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); + samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; case COMMON_SAMPLER_TYPE_POWER_LAW: llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_decay, params.seed)); @@ -298,23 +297,36 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co if (!has_distribution_sampler) { llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); } + + samplers.push_back(llama_sampler_init_dist(params.seed)); } else if (params.mirostat == 1) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); + samplers.push_back(llama_sampler_init_temp(params.temp)); + samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); } else if (params.mirostat == 2) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); + samplers.push_back(llama_sampler_init_temp(params.temp)); + samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); } else { GGML_ASSERT(false && "unknown mirostat version"); } + for (auto * smpl : samplers) { + llama_sampler_chain_add(chain, smpl); + } + + auto * result = new common_sampler { + /* .params = */ params, + /* .chain = */ chain, + /* .grammar = */ grammar, + /* .prev = */ ring_buffer(std::max(32, params.n_prev)), + 
/* .cur = */ {}, + /* .cur_p = */ {}, + }; + return result; } void common_sampler_free(struct common_sampler * gsmpl) { if (gsmpl) { - llama_sampler_free(gsmpl->grmr); - llama_sampler_free(gsmpl->chain); delete gsmpl; @@ -324,11 +336,24 @@ void common_sampler_free(struct common_sampler * gsmpl) { void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { const auto tm = gsmpl->tm(); - if (accept_grammar) { - llama_sampler_accept(gsmpl->grmr, token); - } + if (gsmpl->grammar) { + const int n_smpl = llama_sampler_chain_n(gsmpl->chain); - llama_sampler_accept(gsmpl->chain, token); + for (int i = 0; i < n_smpl; i++) { + auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); + + // the grammar sampler is always the first one + if (i == 0) { + if (accept_grammar) { + llama_sampler_accept(smpl, token); + } + } else { + llama_sampler_accept(smpl, token); + } + } + } else { + llama_sampler_accept(gsmpl->chain, token); + } gsmpl->prev.push_back(token); } @@ -339,12 +364,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) { struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { return new common_sampler { - /* .params = */ gsmpl->params, - /* .grmr = */ llama_sampler_clone(gsmpl->grmr), - /* .chain = */ llama_sampler_clone(gsmpl->chain), - /* .prev = */ gsmpl->prev, - /* .cur = */ gsmpl->cur, - /* .cur_p = */ gsmpl->cur_p, + /* .params = */ gsmpl->params, + /* .chain = */ llama_sampler_clone(gsmpl->chain), + /* .grammar = */ gsmpl->grammar, + /* .prev = */ gsmpl->prev, + /* .cur = */ gsmpl->cur, + /* .cur_p = */ gsmpl->cur_p, }; } @@ -393,58 +418,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam } } -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { +struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) { + return gsmpl->chain; +} + +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) { llama_synchronize(ctx); // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations const auto tm = gsmpl->tm(); - gsmpl->set_logits(ctx, idx); + llama_token id = LLAMA_TOKEN_NULL; - auto & grmr = gsmpl->grmr; auto & chain = gsmpl->chain; auto & cur_p = gsmpl->cur_p; // initialized by set_logits - if (grammar_first) { - llama_sampler_apply(grmr, &cur_p); - } + gsmpl->set_logits(ctx, idx); llama_sampler_apply(chain, &cur_p); GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); - const llama_token id = cur_p.data[cur_p.selected].id; + id = cur_p.data[cur_p.selected].id; - if (grammar_first) { - return id; - } - - // check if it the sampled token fits the grammar - { - llama_token_data single_token_data = { id, 1.0f, 0.0f }; - llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false }; - - llama_sampler_apply(grmr, &single_token_data_array); - - const bool is_valid = single_token_data_array.data[0].logit != -INFINITY; - if (is_valid) { - return id; - } - } - - // resampling: - // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain - gsmpl->set_logits(ctx, idx); - - llama_sampler_apply(grmr, &cur_p); - llama_sampler_apply(chain, &cur_p); - - GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration"); - - return 
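`common_sampler_sample` no longer takes a `grammar_first` flag: when a grammar is configured it is simply the first element of the chain, `common_sampler_accept` forwards the token to every chain element (skipping the grammar sampler when `accept_grammar` is false), and the raw chain is reachable through `common_sampler_get`. A hedged sketch of the resulting per-token loop, assuming `ctx`, `smpl`, `vocab`, and `n_predict` come from surrounding caller code:

```cpp
// Illustrative fragment only: the setup (ctx, smpl, vocab, batch handling)
// is assumed to exist in the caller.
for (int i = 0; i < n_predict; ++i) {
    // grammar constraints are applied inside the chain; no grammar_first flag
    const llama_token id = common_sampler_sample(smpl, ctx, /*idx =*/ -1);

    // accept into all chain samplers, including the grammar sampler
    common_sampler_accept(smpl, id, /*accept_grammar =*/ true);

    if (llama_vocab_is_eog(vocab, id)) {
        break;
    }

    // ... feed `id` back into the next llama_decode() call ...
}
```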
cur_p.data[cur_p.selected].id; + return id; } -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first) { +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft) { GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1"); std::vector result; @@ -452,7 +452,7 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample size_t i = 0; for (; i < draft.size(); i++) { - const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]); common_sampler_accept(gsmpl, id, true); @@ -464,7 +464,7 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample } if (i == draft.size()) { - const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]); common_sampler_accept(gsmpl, id, true); @@ -474,13 +474,13 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample return result; } -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) { +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) { std::vector idxs(draft.size() + 1); for (size_t i = 0; i < idxs.size(); ++i) { idxs[i] = i; } - return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first); + return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft); } uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { @@ -525,7 +525,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) { for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); - result += std::string("-> ") + llama_sampler_name(smpl) + " "; + result += std::string("-> "); + result += std::string(llama_sampler_name(smpl)) + " "; } return result; diff --git a/common/sampling.h b/common/sampling.h index e198eecda3..ace5d3d020 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl); // arguments can be nullptr to skip printing void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl); +struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl); + // extended sampling implementation: // // - set logits @@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam // - check if the token fits the grammar (if any) // - if not: resample by first applying the grammar constraints and then sampling again (slower path) // -// if grammar_first is true, the grammar is applied before the samplers (slower) -// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar -// -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx); // generalized version of common_sampler_sample // @@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct 
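For speculative decoding, `common_sampler_sample_and_accept_n` likewise drops the `grammar_first` parameter, and the `common_sampler_ptr` typedef introduced further down in `sampling.h` lets callers hold a sampler with RAII instead of calling `common_sampler_free` by hand. A hedged usage sketch, assuming `model`, `params.sampling`, `ctx_tgt`, and a `draft` token vector already exist in the caller:

```cpp
// Illustrative fragment only; the draft tokens come from a separate draft model.
common_sampler_ptr smpl(common_sampler_init(model, params.sampling));

// verify the draft against the target context; per the header comment this
// returns at least 1 token and at most draft.size() + 1
const std::vector<llama_token> accepted =
        common_sampler_sample_and_accept_n(smpl.get(), ctx_tgt, draft);

// smpl is released automatically via common_sampler_deleter at end of scope
```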
common_sampler * gsmpl, struct llama_co // // returns at least 1 token, up to idxs.size() // -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first = false); +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft); // assume idxs == [ 0, 1, 2, ..., draft.size() ] -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false); +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft); uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); @@ -107,3 +106,9 @@ std::vector common_sampler_types_from_chars(const std: llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind, const char * grammar_data); + +struct common_sampler_deleter { + void operator()(common_sampler * s) { common_sampler_free(s); } +}; + +typedef std::unique_ptr common_sampler_ptr; diff --git a/common/speculative.cpp b/common/speculative.cpp index 3e83b0964c..1e12383ae6 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft( for (int i = 0; i < params.n_draft; ++i) { common_batch_clear(batch); - common_sampler_sample(smpl, ctx_dft, 0, true); + common_sampler_sample(smpl, ctx_dft, 0); const auto * cur_p = common_sampler_get_candidates(smpl, true); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 151608d56b..3f861f2a6a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -136,11 +136,19 @@ class ModelBase: self.remote_hf_model_id = remote_hf_model_id self.sentence_transformers_dense_modules = sentence_transformers_dense_modules self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams + self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {} self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) self.metadata_override = metadata_override self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters + if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters: + if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None: + self.rope_parameters["rope_theta"] = rope_theta + if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None: + self.rope_parameters["rope_type"] = rope_type + # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. 
@@ -795,7 +803,7 @@ class TextModel(ModelBase): def set_gguf_parameters(self): self.gguf_writer.add_block_count(self.block_count) - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None: + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") @@ -815,7 +823,42 @@ class TextModel(ModelBase): self.gguf_writer.add_head_count_kv(n_head_kv) logger.info(f"gguf: key-value head count = {n_head_kv}") - if (rope_theta := self.hparams.get("rope_theta")) is not None: + rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) + if (rope_type := rope_params.get("rope_type")) is not None: + rope_factor = rope_params.get("factor") + rope_gguf_type = gguf.RopeScalingType.NONE + if rope_type == "linear" and rope_factor is not None: + rope_gguf_type = gguf.RopeScalingType.LINEAR + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + self.gguf_writer.add_rope_scaling_factor(rope_factor) + elif rope_type == "yarn" and rope_factor is not None: + rope_gguf_type = gguf.RopeScalingType.YARN + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + self.gguf_writer.add_rope_scaling_factor(rope_factor) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"]) + if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None: + self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor) + if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None: + self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor) + if (yarn_beta_fast := rope_params.get("beta_fast")) is not None: + self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast) + if (yarn_beta_slow := rope_params.get("beta_slow")) is not None: + self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow) + # self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) + elif rope_type == "su" or rope_type == "longrope": + rope_gguf_type = gguf.RopeScalingType.LONGROPE + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + elif rope_type == "dynamic": + # HunYuan, handled in model class + pass + elif rope_type.lower() == "llama3": + # Handled in generate_extra_tensors + pass + else: + logger.warning(f"Unknown RoPE type: {rope_type}") + logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}") + + if (rope_theta := rope_params.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) logger.info(f"gguf: rope theta = {rope_theta}") if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None: @@ -1966,34 +2009,10 @@ class BaichuanModel(TextModel): self._set_vocab_sentencepiece() def set_gguf_parameters(self): - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") + super().set_gguf_parameters() 
self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: head_count = self.hparams["num_attention_heads"] @@ -2089,34 +2108,10 @@ class XverseModel(TextModel): special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") + super().set_gguf_parameters() self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -2430,11 +2425,6 @@ class LlamaModel(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: @@ -2518,16 +2508,16 @@ class LlamaModel(TextModel): return [(self.map_tensor_name(name), data_torch)] def generate_extra_tensors(self) -> 
Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10000.0) if (dim := self.hparams.get("head_dim")) is None: dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) old_context_len = self.hparams.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor @@ -2564,11 +2554,6 @@ class ArceeModel(LlamaModel): def set_gguf_parameters(self): super().set_gguf_parameters() self._try_set_pooling_type() - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) @ModelBase.register("AfmoeForCausalLM") @@ -2851,17 +2836,11 @@ class Mistral3Model(LlamaModel): def set_gguf_parameters(self): super().set_gguf_parameters() - rope_params = self.hparams.get("rope_parameters") + rope_params = self.rope_parameters if self.hparams.get("model_type") == "ministral3": - assert rope_params is not None, "ministral3 must have 'rope_parameters' config" + assert rope_params, "ministral3 must have 'rope_parameters' config" assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'" - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_params["factor"]) - self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"]) - self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"]) self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"]) - self.gguf_writer.add_rope_freq_base(rope_params["rope_theta"]) self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): @@ -2958,7 +2937,7 @@ class DeciModel(TextModel): assert self.block_count == len(self._num_kv_heads) assert self.block_count == len(self._num_heads) assert self.block_count == len(self._ffn_dims) - if (rope_theta := self.hparams.get("rope_theta")) is not None: + if (rope_theta := self.rope_parameters.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) self.gguf_writer.add_head_count_kv(self._num_kv_heads) self.gguf_writer.add_head_count(self._num_heads) @@ -2983,11 +2962,6 @@ class DeciModel(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - rope_scaling = self.hparams.get("rope_scaling") or {} - if 
rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: @@ -3016,16 +2990,16 @@ class DeciModel(TextModel): return [(self.map_tensor_name(name), data_torch)] def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10000.0) if (dim := self.hparams.get("head_dim")) is None: dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) old_context_len = self.hparams.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor @@ -3279,10 +3253,6 @@ class MiniCPMModel(TextModel): logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] self.gguf_writer.add_logit_scale(logit_scale) logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) - logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] @@ -3402,17 +3372,6 @@ class QwenModel(TextModel): def set_vocab(self): self._set_vocab_qwen() - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - @ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration") class Qwen2Model(TextModel): @@ -3427,11 +3386,6 @@ class Qwen2Model(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() self._try_set_pooling_type() - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - 
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if self.hf_arch == "Qwen2Model": @@ -3499,12 +3453,6 @@ class DreamModel(TextModel): # Dream models use non-causal attention for diffusion self.gguf_writer.add_causal_attention(False) - # Handle RoPE scaling similar to Qwen2 - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) # Add Dream-specific parameters mask_token_id = self.hparams.get("mask_token_id") @@ -4048,13 +3996,6 @@ class Qwen2MoeModel(TextModel): if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None: self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size) logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") - # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) _experts: list[dict[str, Tensor]] | None = None @@ -4656,7 +4597,7 @@ class Phi3MiniModel(TextModel): self.gguf_writer.add_head_count_kv(n_head_kv) self.gguf_writer.add_layer_norm_rms_eps(rms_eps) self.gguf_writer.add_rope_dimension_count(rope_dims) - self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"]) self.gguf_writer.add_file_type(self.ftype) sliding_window = self.hparams.get("sliding_window") # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models @@ -4932,7 +4873,7 @@ class Plamo2Model(TextModel): self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128)) self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06)) - self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000)) + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000)) # Mamba parameters self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64)) @@ -5130,21 +5071,6 @@ class InternLM2Model(TextModel): special_vocab.add_to_gguf(self.gguf_writer) - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - 
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_file_type(self.ftype) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: num_heads = self.hparams["num_attention_heads"] num_kv_heads = self.hparams["num_key_value_heads"] @@ -5221,11 +5147,6 @@ class InternLM3Model(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -5588,7 +5509,6 @@ class NomicBertModel(BertModel): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) if self.is_moe: self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"]) self.gguf_writer.add_expert_count(self.hparams["num_experts"]) @@ -5711,8 +5631,6 @@ class XLMRobertaModel(BertModel): super().set_gguf_parameters() # jina-embeddings-v3 - if rotary_emb_base := self.hparams.get("rotary_emb_base"): - self.gguf_writer.add_rope_freq_base(rotary_emb_base) lora_alpha = self.hparams.get("lora_alpha") if lora_prompt_prefixes := self.hparams.get("task_instructions"): assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys()) @@ -5840,19 +5758,16 @@ class Gemma3Model(TextModel): self._set_vocab_gpt2() def set_gguf_parameters(self): + super().set_gguf_parameters() hparams = self.hparams # some default values are not specified in the hparams self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072)) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8)) self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) self.gguf_writer.add_key_length(hparams.get("head_dim", 256)) self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers # attn_logit_softcapping is removed in Gemma3 assert hparams.get("attn_logit_softcapping") is None if (final_logit_softcap := hparams.get("final_logit_softcapping")): @@ -5860,19 +5775,6 @@ class Gemma3Model(TextModel): if hparams.get("sliding_window_pattern") != 1: self.gguf_writer.add_sliding_window(hparams["sliding_window"]) 
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) - if hparams.get("rope_scaling") is not None: - rope_scaling = hparams["rope_scaling"] - if rope_scaling["rope_type"] == "linear": - # important: this rope_scaling is only applied for global layers, and not used by 1B model - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - elif rope_scaling["rope_type"] == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_yarn_ext_factor(rope_scaling["extrapolation_factor"]) - self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_scaling["beta_fast"]) - self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_scaling["beta_slow"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -6776,13 +6678,6 @@ class Olmo2Model(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - if "sliding_window" in self.hparams: self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) @@ -7281,16 +7176,11 @@ class DeepseekV2Model(TextModel): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - + if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None: # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul # ref https://github.com/ggml-org/llama.cpp/pull/17945 - self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"]) + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all) _experts: list[dict[str, Tensor]] | None = None @@ -7898,11 +7788,6 @@ class Glm4Model(TextModel): if (rope_dim := self.hparams.get("head_dim")) is None: rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) 
-> Iterable[tuple[str, Tensor]]: if name.startswith("model.visual."): # ignore visual part of Glm4v @@ -8240,50 +8125,26 @@ class ExaoneModel(TextModel): model_arch = gguf.MODEL_ARCH.EXAONE def set_gguf_parameters(self): + super().set_gguf_parameters() hparams = self.hparams assert (hparams["activation_function"] == "silu") - max_position_embeddings = hparams["max_position_embeddings"] - embed_dim = hparams["hidden_size"] - num_heads = hparams["num_attention_heads"] - num_kv_heads = hparams.get("num_key_value_heads", num_heads) - layer_norm_eps = hparams["layer_norm_epsilon"] - intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim - # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0 - # attention_dropout_rate = hparams["attention_dropout"] - # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0 - # embed_dropout_rate = hparams["embed_dropout"] - self.gguf_writer.add_embedding_length(embed_dim) - self.gguf_writer.add_head_count(num_heads) - self.gguf_writer.add_head_count_kv(num_kv_heads) - self.gguf_writer.add_context_length(max_position_embeddings) - self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_file_type(self.ftype) - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) rotary_factor = rotary_factor if rotary_factor is not None else 1.0 self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = self.rope_parameters.get("rope_theta", 10000.0) if (dim := self.hparams.get("head_dim")) is None: dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) old_context_len = self.hparams.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor @@ -8338,22 +8199,17 @@ class Exaone4Model(TextModel): if len(sliding_window_pattern) == hparams["num_hidden_layers"]: self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - 
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10_000.0) + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10_000.0) if (dim := self.hparams.get("head_dim")) is None: dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - factor = rope_scaling.get("factor", 16.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + factor = rope_params.get("factor", 16.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) old_context_len = self.hparams.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor @@ -8664,13 +8520,6 @@ class BailingMoeModel(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - else: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) self.gguf_writer.add_vocab_size(hparams["vocab_size"]) self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) @@ -8777,13 +8626,6 @@ class BailingMoeV2Model(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - else: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) self.gguf_writer.add_vocab_size(hparams["vocab_size"]) self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) @@ -8862,13 +8704,6 @@ class GroveMoeModel(TextModel): self.gguf_writer.add_experts_per_group(2) # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376 self.gguf_writer.add_expert_group_scale(0.05) - # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", 
rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) _experts: list[dict[str, Tensor]] | None = None _chunk_experts: list[dict[str, Tensor]] | None = None @@ -9178,7 +9013,7 @@ class FalconH1Model(Mamba2Model): assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}" # Add any other Falcon Mamba2 specific configuration - self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) + self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"]) @ModelBase.register("HunYuanMoEV1ForCausalLM") @@ -9256,12 +9091,11 @@ class HunYuanMoEModel(TextModel): self.gguf_writer.add_expert_shared_count(moe_shared_expert[0]) # Rope - rope_scaling = hparams.get("rope_scaling", {}) - if rope_scaling.get("type") == "dynamic": + if self.rope_parameters.get("rope_type") == "dynamic": # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) - alpha = rope_scaling.get("alpha", 1000) - base = hparams.get("rope_theta", 10000.0) + alpha = self.rope_parameters.get("alpha", 1000) + base = self.rope_parameters.get("rope_theta", 10000.0) dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128 scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251 self.gguf_writer.add_rope_freq_base(scaled_base) @@ -9456,12 +9290,11 @@ class HunYuanModel(TextModel): hparams = self.hparams # Rope - rope_scaling = hparams.get("rope_scaling", {}) - if rope_scaling.get("type") == "dynamic": + if self.rope_parameters.get("rope_type") == "dynamic": # HunYuan uses NTK Aware Alpha based scaling. 
Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) - alpha = rope_scaling.get("alpha", 50) - base = hparams.get("rope_theta", 10000.0) + alpha = self.rope_parameters.get("alpha", 50) + base = self.rope_parameters.get("rope_theta", 10000.0) dim = hparams["head_dim"] scaled_base = base * (alpha ** (dim / (dim - 2))) self.gguf_writer.add_rope_freq_base(scaled_base) @@ -9612,13 +9445,6 @@ class GptOssModel(TextModel): self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"]) - rope_scaling = self.hparams.get("rope_scaling") or {} - rope_type = rope_scaling.get("rope_type", rope_scaling.get("type")) - assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}" - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096)) - @ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM") class LFM2Model(TextModel): @@ -9791,13 +9617,6 @@ class SmallThinkerModel(TextModel): self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) else: self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) sliding_window_layout = self.hparams.get("sliding_window_layout") if sliding_window_layout: diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 1a5de5928a..36a12d299f 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -2,6 +2,7 @@ #include "common.h" #include "log.h" #include "llama.h" +#include "sampling.h" #include #include @@ -64,17 +65,23 @@ int main(int argc, char ** argv) { ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_predict, n_parallel); - llama_context * ctx = llama_init_from_model(model, ctx_params); - auto sparams = llama_sampler_chain_default_params(); sparams.no_perf = false; - llama_sampler * smpl = llama_sampler_chain_init(sparams); + std::vector samplers; - llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k)); - llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep)); - llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp)); - llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed)); + for (int32_t i = 0; i < n_parallel; ++i) { + llama_sampler * smpl = llama_sampler_chain_init(sparams); + + llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep)); + llama_sampler_chain_add(smpl, 
llama_sampler_init_temp (params.sampling.temp)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed)); + + samplers.push_back(smpl); + } + + llama_context * ctx = llama_init_from_model(model, ctx_params); if (ctx == NULL) { LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); @@ -173,7 +180,7 @@ int main(int argc, char ** argv) { continue; } - const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); + const llama_token new_token_id = llama_sampler_sample(samplers[i], ctx, i_batch[i]); // is it an end of generation? -> mark the stream as finished if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) { @@ -229,14 +236,17 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG("\n"); - llama_perf_sampler_print(smpl); + llama_perf_sampler_print(samplers[0]); llama_perf_context_print(ctx); fprintf(stderr, "\n"); llama_batch_free(batch); - llama_sampler_free(smpl); + for (auto & sampler_config : samplers) { + llama_sampler_free(sampler_config); + } + llama_free(ctx); llama_model_free(model); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index fe91b308cd..81111e81b2 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -131,10 +131,10 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 80c693ce61..408338f1af 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -202,10 +202,10 @@ int main(int argc, char ** argv) { params.warmup = false; // init - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); if (model == nullptr || ctx == nullptr) { LOG_ERR("%s : failed to init\n", __func__); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 1e26d8221b..f54cfdd77f 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -55,10 +55,10 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the target model - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); auto * mem = llama_get_memory(ctx); diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 3da45ed9e0..bb94a8fe06 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -18,16 +18,16 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - common_init_result llama_init = common_init_from_params(params); + auto 
llama_init = common_init_from_params(params); - llama_model_ptr & model = llama_init.model; - llama_context_ptr & ctx = llama_init.context; + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); GGML_ASSERT(model != nullptr); // tokenize the prompt std::vector inp; - inp = common_tokenize(ctx.get(), params.prompt, true, true); + inp = common_tokenize(ctx, params.prompt, true, true); fprintf(stderr, "%s: tokenization done\n", __func__); common_ngram_cache ngram_cache; diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index fcb289abe0..135f6fcab9 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -28,13 +28,13 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_context_ptr & ctx = llama_init.context; + llama_context * ctx = llama_init->context(); // tokenize the prompt std::vector inp; - inp = common_tokenize(ctx.get(), params.prompt, true, true); + inp = common_tokenize(ctx, params.prompt, true, true); common_ngram_cache ngram_cache_context; common_ngram_cache ngram_cache_dynamic; @@ -65,7 +65,7 @@ int main(int argc, char ** argv){ } const int n_input = inp.size(); - const int n_ctx = llama_n_ctx(ctx.get()); + const int n_ctx = llama_n_ctx(ctx); int n_drafted = 0; int n_accept = 0; diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 2bfa26b55f..27f159940a 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -29,10 +29,10 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); const llama_vocab * vocab = llama_model_get_vocab(model); diff --git a/examples/model-conversion/scripts/causal/run-org-model.py b/examples/model-conversion/scripts/causal/run-org-model.py index 7d2b80057c..da1132c003 100755 --- a/examples/model-conversion/scripts/causal/run-org-model.py +++ b/examples/model-conversion/scripts/causal/run-org-model.py @@ -200,7 +200,7 @@ with torch.no_grad(): logits = outputs.logits # Extract logits for the last token (next token prediction) - last_logits = logits[0, -1, :].cpu().numpy() + last_logits = logits[0, -1, :].float().cpu().numpy() print(f"Logits shape: {logits.shape}") print(f"Last token logits shape: {last_logits.shape}") diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index e48f48fc32..c92173ae29 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -192,10 +192,10 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the target model - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); auto * mem = llama_get_memory(ctx); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 042e12c2bf..2c2143ad10 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -149,10 +149,10 @@ int main(int 
argc, char ** argv) { llama_numa_init(params.numa); // load the model - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 4cd3071f76..39d4464663 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -34,10 +34,10 @@ int main(int argc, char ** argv) { std::string result2; // init - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 0d11d0f803..8141052a22 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -40,10 +40,10 @@ int main(int argc, char ** argv) { llama_context * ctx_dft = NULL; // load the target model - common_init_result llama_init_tgt = common_init_from_params(params); + auto llama_init_tgt = common_init_from_params(params); - model_tgt = llama_init_tgt.model.get(); - ctx_tgt = llama_init_tgt.context.get(); + model_tgt = llama_init_tgt->model(); + ctx_tgt = llama_init_tgt->context(); const llama_vocab * vocab = llama_model_get_vocab(model_tgt); @@ -61,10 +61,10 @@ int main(int argc, char ** argv) { params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; params.tensor_buft_overrides = params.speculative.tensor_buft_overrides; - common_init_result llama_init_dft = common_init_from_params(params); + auto llama_init_dft = common_init_from_params(params); - //model_dft = llama_init_dft.model.get(); - ctx_dft = llama_init_dft.context.get(); + //model_dft = llama_init_dft->model(); + ctx_dft = llama_init_dft->context(); if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) { LOG_INF("the draft model '%s' is not compatible with the target model '%s'. 
tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str()); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 5f5ac5eb64..2fb7f6374e 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -71,10 +71,10 @@ int main(int argc, char ** argv) { llama_context * ctx_dft = NULL; // load the target model - common_init_result llama_init_tgt = common_init_from_params(params); + auto llama_init_tgt = common_init_from_params(params); - model_tgt = llama_init_tgt.model.get(); - ctx_tgt = llama_init_tgt.context.get(); + model_tgt = llama_init_tgt->model(); + ctx_tgt = llama_init_tgt->context(); // load the draft model params.devices = params.speculative.devices; @@ -87,10 +87,10 @@ int main(int argc, char ** argv) { params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; params.tensor_buft_overrides = params.speculative.tensor_buft_overrides; - common_init_result llama_init_dft = common_init_from_params(params); + auto llama_init_dft = common_init_from_params(params); - model_dft = llama_init_dft.model.get(); - ctx_dft = llama_init_dft.context.get(); + model_dft = llama_init_dft->model(); + ctx_dft = llama_init_dft->context(); const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); @@ -242,7 +242,7 @@ int main(int argc, char ** argv) { bool accept = false; if (params.sampling.temp > 0) { // stochastic verification - common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); + common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); auto & dist_tgt = *common_sampler_get_candidates(smpl, true); @@ -491,7 +491,7 @@ int main(int argc, char ** argv) { continue; } - common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true); + common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft); const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true); diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index 416d8d8f6c..c82de8d35d 100644 --- a/examples/training/finetune.cpp +++ b/examples/training/finetune.cpp @@ -39,9 +39,10 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); // load the model and apply lora adapter, if any - common_init_result llama_init = common_init_from_params(params); - llama_model_ptr & model = llama_init.model; - llama_context_ptr & ctx = llama_init.context; + auto llama_init = common_init_from_params(params); + + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); @@ -54,8 +55,8 @@ int main(int argc, char ** argv) { LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } - std::vector tokens = common_tokenize(ctx.get(), params.prompt, true); - ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get()) / 2); + std::vector tokens = common_tokenize(ctx, params.prompt, true); + ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx, tokens, llama_n_ctx(ctx) / 2); struct lr_opt & lr = params.lr; LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n", @@ -70,7 +71,7 @@ int main(int argc, char ** argv) { /*get_opt_pars_ud =*/¶ms.lr, /*optimizer_type =*/params.optimizer, }; - llama_opt_init(ctx.get(), model.get(), 
lopt_params); + llama_opt_init(ctx, model, lopt_params); const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split); @@ -78,7 +79,7 @@ int main(int argc, char ** argv) { ggml_opt_result_t result_eval = ggml_opt_result_init(); for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) { - llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split, + llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split, ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar); fprintf(stderr, "\n"); @@ -88,7 +89,7 @@ int main(int argc, char ** argv) { ggml_opt_result_free(result_train); ggml_opt_result_free(result_eval); - llama_model_save_to_file(model.get(), params.out_file.c_str()); + llama_model_save_to_file(model, params.out_file.c_str()); llama_backend_free(); diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index ab5b4760e2..a65dcfbe1e 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) # TODO else() set(GGML_STANDALONE OFF) + + if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + endif() endif() if (EMSCRIPTEN) diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp index 683ed8d2df..fb7f074a85 100644 --- a/ggml/src/ggml-cpu/arch/arm/repack.cpp +++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp @@ -24,6 +24,7 @@ #define UNUSED GGML_UNUSED +#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD)) static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in, int16x8_t * out_mins, int8_t * out_scales) { @@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in, scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4); memcpy(out_scales, scales_u32, 8); } +#endif void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 4bef48b006..0379e5d502 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -256,6 +256,9 @@ void main() { barrier(); } + // prevent race on tmpsh + barrier(); + // reduce across threads [[unroll]] for (uint32_t r = 0; r < Br; ++r) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index cd82e4abfa..c995ab140e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -302,6 +302,9 @@ void main() { barrier(); } + // prevent race on tmpsh + barrier(); + // reduce across threads float rowmaxf[rows_per_thread], eMf[rows_per_thread], Moldf[rows_per_thread]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp index 0b74b33212..c5f5e9cbb2 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp @@ -7,34 +7,50 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; -void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) 
{ - const uint y_idx = i * QUANT_K + 32 * ib32; - - uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i; - [[unroll]] for (uint n = 0; n < num_rows; ++n) { - const float d = float(data_a[ibi].d); - const uint qh = data_a[ibi].qh[ib32]; - const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1); - const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA; - +void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, + const uint num_blocks_per_row, const uint first_row, const uint num_rows) { + const uint y_idx_base = i * QUANT_K + 32 * ib32; + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + const uint base_b_idx = (j * p.batch_stride_b + b_offset + y_idx_base) / 4; [[unroll]] for (uint l = 0; l < 4; ++l) { - const uint qs = data_a[ibi].qs[4 * ib32 + l]; - const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3); - const int16_t grid = int16_t(iq1s_grid[qs | (idxhi << 8)]); + const vec4 b_val_0 = vec4(data_b_v4[base_b_idx + 2 * l]); + const vec4 b_val_1 = vec4(data_b_v4[base_b_idx + 2 * l + 1]); - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]); - vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]); + // index for data_a + uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i; + + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const float d = float(data_a[ibi].d); + const uint qh = data_a[ibi].qh[ib32]; + + const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1); + const uint qs = data_a[ibi].qs[4 * ib32 + l]; + const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3); + const uint16_t grid = uint16_t(iq1s_grid[qs | (idxhi << 8)]); + + const float delta_val = ((qh & 0x8000) != 0) ? 
-IQ1S_DELTA : IQ1S_DELTA; + const vec4 delta_v = vec4(delta_val); + const vec4 fbits0 = vec4( + float(bitfieldExtract(grid, 0, 2)), + float(bitfieldExtract(grid, 2, 2)), + float(bitfieldExtract(grid, 4, 2)), + float(bitfieldExtract(grid, 6, 2)) + ); + const vec4 fbits1 = vec4( + float(bitfieldExtract(grid, 8, 2)), + float(bitfieldExtract(grid, 10, 2)), + float(bitfieldExtract(grid, 12, 2)), + float(bitfieldExtract(grid, 14, 2)) + ); + + vec4 sum_v = fma(b_val_0, fbits0 + delta_v, vec4(0.0)); + sum_v = fma(b_val_1, fbits1 + delta_v, sum_v); + FLOAT_TYPE sum = dot(sum_v, vec4(1.0)); - FLOAT_TYPE sum = FLOAT_TYPE(0.0); - [[unroll]] for (int k = 0; k < 4; ++k) { - sum = fma(FLOAT_TYPE(b0[k]), bitfieldExtract(grid, 2 * k, 2) + delta, - fma(FLOAT_TYPE(b4[k]), bitfieldExtract(grid, 8 + 2 * k, 2) + delta, sum)); - } temp[j][n] = fma(dl, sum, temp[j][n]); + ibi += num_blocks_per_row; } } - ibi += num_blocks_per_row; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl index ee5ded2e8d..58ede04400 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl @@ -244,17 +244,20 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin const uint iqs = idx % 128; // 0..127 const uint n = iqs / 64; // 0,1 - const uint b = (iqs % 64) / 32; // 0,1 + const uint b = ((iqs % 64) / 32) * 4; // 0,4 const uint is_b = (iqs % 16) / 8; // 0,1 const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 const uint is = 8 * n + qhshift + is_b; // 0..15 - const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126 - const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 + const uint qsi = n * 32 + (iqs % 32); // 0..63 + const uint qhi = n * 16 + (iqs % 16); // 0..31 const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]); - buf_a[buf_idx] = FLOAT_TYPE_VEC2(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32), - dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32)); + const uint ql = (uint(data_a_packed16[ib].ql[qsi]) >> b) & 0x0F0F; + const uint qh = (uint(data_a_packed16[ib].qh[qhi]) >> qhshift) & 0x0303; + const vec2 q = (vec2(unpack8(ql | (qh << 4)).xy) - 32) * dscale; + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(q.x, q.y); #elif defined(DATA_A_IQ1_S) const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index a879940eae..5823efac2d 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -55bc9320a4aae82af18e23eefd5de319a755d7b9 +130bc125a88bb57664b88932c48c38a1cb316fac diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9914b3276b..2a17e44ecd 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -9,6 +9,7 @@ #include "llama-model.h" #include +#include #include #include #include @@ -72,6 +73,43 @@ llama_context::llama_context( cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } + if (cparams.yarn_ext_factor != 0) { + static auto get_mscale = [](float scale, float mscale) { + return scale <= 1.0f ? 
1.0f : (0.1f * mscale * logf(scale) + 1.0f); + }; + + const float factor = 1.0f / cparams.rope_freq_scale; + + // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348 + if (hparams.rope_yarn_log_mul != 0.0f) { + // note: here we assume `mscale == 1.0f` + // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f + float mscale = 1.0f; + const float mscale_all_dims = hparams.rope_yarn_log_mul; + + // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + // special-case DEEPSEEK v2: + // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43 + if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) { + mscale = mscale_all_dims; + } + + cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims); + + LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n", + __func__, cparams.yarn_attn_factor, mscale, mscale_all_dims); + } else { + cparams.yarn_attn_factor = get_mscale(factor, 1.0f); + } + + // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor: + // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544 + // + // ref: https://github.com/ggml-org/llama.cpp/discussions/7416 + // https://github.com/ggml-org/llama.cpp/pull/17945 + cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor)); + } + cparams.yarn_attn_factor *= hparams.rope_attn_factor; if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index a1a32494b7..8909bbfb95 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -78,7 +78,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { for (int i = 0; i < n_tokens; ++i) { const float pos = ubatch->pos[i]; attn_scale_data[i] = std::log( - std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0 + std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0 ) * f_attn_temp_scale + 1.0; } @@ -574,7 +574,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : freq_base (cparams.rope_freq_base), freq_scale (cparams.rope_freq_scale), ext_factor (cparams.yarn_ext_factor), - attn_factor (llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor)), + attn_factor (cparams.yarn_attn_factor), beta_fast (cparams.yarn_beta_fast), beta_slow (cparams.yarn_beta_slow), norm_eps (hparams.f_norm_eps), @@ -1203,7 +1203,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { } ggml_tensor * llm_graph_context::build_inp_attn_scale() const { - auto inp = std::make_unique(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); + auto inp = std::make_unique(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset); auto & cur = inp->attn_scale; diff --git a/src/llama-graph.h b/src/llama-graph.h index d0c3934f67..e9d387bd7c 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -132,8 +132,8 @@ public: // temperature tuning, used by llama4 class llm_graph_input_attn_temp : public llm_graph_input_i { public: - llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) - : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} + llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, 
float f_attn_temp_offset) + : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {} virtual ~llm_graph_input_attn_temp() = default; void set_input(const llama_ubatch * ubatch) override; @@ -142,6 +142,7 @@ public: const uint32_t n_attn_temp_floor_scale; const float f_attn_temp_scale; + const float f_attn_temp_offset; }; class llm_graph_input_pos_bucket : public llm_graph_input_i { diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 277d0bcfd3..96c9598c24 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -3,7 +3,6 @@ #include "ggml.h" #include -#include void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) { if (dense_first) { @@ -231,13 +230,3 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama return false; } - -float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) { - GGML_ASSERT(ext_factor >= 0.0f); - - if (ext_factor != 0.0f) { - attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); - } - - return attn_factor; -} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index c9960e9169..a467c64a14 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -165,6 +165,7 @@ struct llama_hparams { uint32_t n_no_rope_layer_step = 4; uint32_t n_attn_temp_floor_scale = 0; float f_attn_temp_scale = 0.0f; + float f_attn_temp_offset = 0.0f; // offset position index // gemma3n altup uint32_t n_altup = 4; // altup_num_inputs @@ -268,13 +269,6 @@ struct llama_hparams { // TODO: think of a better place for this function // TODO: pack the SWA params in a struct? static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1); - - // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor: - // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544 - // - // ref: https://github.com/ggml-org/llama.cpp/discussions/7416 - // https://github.com/ggml-org/llama.cpp/pull/17945 - static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor); }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 8f94c8820c..bf3de2f2ef 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1372,7 +1372,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift( const auto & yarn_ext_factor = cparams.yarn_ext_factor; const auto & yarn_beta_fast = cparams.yarn_beta_fast; const auto & yarn_beta_slow = cparams.yarn_beta_slow; - const auto & yarn_attn_factor = llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor); + const auto & yarn_attn_factor = cparams.yarn_attn_factor; const auto & n_rot = hparams.n_rot; const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e4808b1e1e..28f06b4e61 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -668,6 +668,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.n_swa = 8192; hparams.n_attn_temp_floor_scale = 8192; hparams.f_attn_temp_scale = 0.1f; + hparams.f_attn_temp_offset = 1.0f; hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full } @@ -1646,6 +1647,8 @@ void llama_model::load_hparams(llama_model_loader 
& ml) { ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false); ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); + hparams.f_attn_temp_offset = 0.0f; + switch (hparams.n_layer) { case 27: type = LLM_TYPE_16B; break; case 60: type = LLM_TYPE_236B; break; @@ -2276,6 +2279,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f); + hparams.f_attn_temp_offset = 0.0f; + // TODO: maybe add n_attn_temp_floor_scale as a separate KV? if (hparams.f_attn_temp_scale != 0.0f) { hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn; @@ -2294,32 +2299,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: throw std::runtime_error("unsupported model architecture"); } - // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348 - if (hparams.rope_yarn_log_mul != 0.0f) { - const float factor = 1.0f / hparams.rope_freq_scale_train; - - // note: here we assume `mscale == 1.0f` - // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f - float mscale = 1.0f; - const float mscale_all_dims = hparams.rope_yarn_log_mul; - - // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] - // special-case DEEPSEEK v2: - // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43 - if (arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) { - mscale = mscale_all_dims; - } - - static auto get_mscale = [](float scale, float mscale) { - return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f); - }; - - hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims); - - LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n", - __func__, hparams.yarn_attn_factor, mscale, mscale_all_dims); - } - pimpl->n_bytes = ml.n_bytes; pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name(); diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index cb2641ae0a..85480f3369 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -141,13 +141,15 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - common_init_result llama_init = common_init_from_params(params); - model = llama_init.model.get(); - ctx = llama_init.context.get(); + auto llama_init = common_init_from_params(params); - if (model == NULL) { - LOG_ERR("%s: error: unable to load model\n", __func__); + ctx = llama_init->context(); + model = llama_init->model(); + smpl = llama_init->sampler(0); + + if (ctx == NULL) { + LOG_ERR("%s: error: unable to create context\n", __func__); return 1; } @@ -474,12 +476,6 @@ int main(int argc, char ** argv) { } } - smpl = common_sampler_init(model, sparams); - if (!smpl) { - LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); - return 1; - } - LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); @@ -993,8 +989,6 @@ int main(int argc, char ** argv) { LOG("\n\n"); common_perf_print(ctx, smpl); - common_sampler_free(smpl); - llama_backend_free(); 
ggml_threadpool_free_fn(threadpool); diff --git a/tools/cvector-generator/cvector-generator.cpp b/tools/cvector-generator/cvector-generator.cpp index d2d97e05ce..3ba7c52950 100644 --- a/tools/cvector-generator/cvector-generator.cpp +++ b/tools/cvector-generator/cvector-generator.cpp @@ -419,10 +419,10 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model to get hparams - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); // int n_ctx = llama_n_ctx(ctx); int n_layers = llama_model_n_layer(model); diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index f28a036dee..669de55ddb 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -1265,10 +1265,10 @@ int main(int argc, char ** argv) { params.warmup = false; // init - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); if (model == nullptr || ctx == nullptr) { LOG_ERR("%s : failed to init\n", __func__); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index bb922e30b4..5cc4e4b1fa 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2230,7 +2230,14 @@ struct llava_uhd { clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size) clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices std::vector slices; + + img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR; + bool padding_overview = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6) + std::array pad_color_overview = {0, 0, 0}; + + img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC; bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. 
llava-1.6) + std::array pad_color_refined = {0, 0, 0}; }; static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { @@ -2257,10 +2264,11 @@ struct llava_uhd { auto refine_size = llava_uhd::select_best_resolution( original_size, ctx->model.hparams.image_res_candidates); - res.overview_size = clip_image_size{slice_size, slice_size}; - res.refined_size = refine_size; - res.grid_size = clip_image_size{0, 0}; - res.padding_refined = true; + res.overview_size = clip_image_size{slice_size, slice_size}; + res.refined_size = refine_size; + res.grid_size = clip_image_size{0, 0}; + res.padding_refined = true; + res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding LOG_DBG("%s: using pinpoints for slicing\n", __func__); LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n", @@ -2339,12 +2347,13 @@ struct llava_uhd { static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst) { std::vector output; - img_tool::resize_algo interpolation = img_tool::RESIZE_ALGO_BILINEAR; // TODO: make it configurable // resize to overview size clip_image_u8_ptr resized_img(clip_image_u8_init()); - img_tool::resize(*img, *resized_img, inst.overview_size, interpolation); + img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview, + inst.padding_overview, inst.pad_color_overview); output.push_back(std::move(resized_img)); + if (inst.slices.empty()) { // no slices, just return the resized image return output; @@ -2352,13 +2361,8 @@ struct llava_uhd { // resize to refined size clip_image_u8_ptr refined_img(clip_image_u8_init()); - if (inst.padding_refined) { - img_tool::resize(*img, *refined_img, inst.refined_size, interpolation); - } else { - // only algo bicubic preserves the ratio; old models rely on this behavior - // TODO: do we need to support other algos here? 
- img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, false); - } + img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined, + inst.padding_refined, inst.pad_color_refined); // create slices for (const auto & slice : inst.slices) { diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 25d24603db..332d2049e5 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -65,7 +65,7 @@ static void sigint_handler(int signo) { struct mtmd_cli_context { mtmd::context_ptr ctx_vision; - common_init_result llama_init; + common_init_result_ptr llama_init; llama_model * model; llama_context * lctx; @@ -89,8 +89,8 @@ struct mtmd_cli_context { llama_pos n_past = 0; mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) { - model = llama_init.model.get(); - lctx = llama_init.context.get(); + model = llama_init->model(); + lctx = llama_init->context(); vocab = llama_model_get_vocab(model); smpl = common_sampler_init(model, params.sampling); n_threads = params.cpuparams.n_threads; diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp index caf080e8d1..1ead9c871e 100644 --- a/tools/perplexity/perplexity.cpp +++ b/tools/perplexity/perplexity.cpp @@ -2024,10 +2024,10 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model and apply lora adapter, if any - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 5a67f508df..90898b5ec4 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -153,7 +153,7 @@ struct server_slot { // sampling json json_schema; - struct common_sampler * smpl = nullptr; + common_sampler_ptr smpl; llama_token sampled; // in speculative mode, this is the last accepted token llama_tokens drafted; @@ -510,8 +510,8 @@ struct server_context_impl { common_params params_base; // note: keep these alive - they determine the lifetime of the model, context, etc. 
- common_init_result llama_init; - common_init_result llama_init_dft; + common_init_result_ptr llama_init; + common_init_result_ptr llama_init_dft; llama_model * model = nullptr; llama_context * ctx = nullptr; @@ -557,9 +557,6 @@ struct server_context_impl { // Clear any sampling context for (server_slot & slot : slots) { - common_sampler_free(slot.smpl); - slot.smpl = nullptr; - llama_free(slot.ctx_dft); slot.ctx_dft = nullptr; @@ -580,8 +577,8 @@ struct server_context_impl { llama_init = common_init_from_params(params_base); - model = llama_init.model.get(); - ctx = llama_init.context.get(); + model = llama_init->model(); + ctx = llama_init->context(); if (model == nullptr) { SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str()); @@ -613,25 +610,25 @@ struct server_context_impl { llama_init_dft = common_init_from_params(params_dft); - model_dft = llama_init_dft.model.get(); + model_dft = llama_init_dft->model(); if (model_dft == nullptr) { SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str()); return false; } - vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get()); + vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft->context()); if (!vocab_dft_compatible) { SRV_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); } - const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get()); + const int n_ctx_dft = llama_n_ctx(llama_init_dft->context()); cparams_dft = common_context_params_to_llama(params_dft); cparams_dft.n_batch = n_ctx_dft; // the context is not needed - we will create one for each slot - llama_init_dft.context.reset(); + llama_init_dft->free_context(); } chat_templates = common_chat_templates_init(model, params_base.chat_template); @@ -1051,18 +1048,15 @@ struct server_context_impl { // initialize samplers { - if (slot.smpl != nullptr) { - common_sampler_free(slot.smpl); - } + slot.smpl.reset(common_sampler_init(model, task.params.sampling)); - slot.smpl = common_sampler_init(model, task.params.sampling); if (slot.smpl == nullptr) { // for now, the only error that may happen here is invalid grammar send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); return false; } - SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl).c_str()); + SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str()); } // initialize draft batch @@ -1216,11 +1210,10 @@ struct server_context_impl { } void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const { - size_t n_probs = slot.task->params.sampling.n_probs; - size_t n_vocab = llama_vocab_n_tokens(vocab); + const size_t n_probs = slot.task->params.sampling.n_probs; if (post_sampling) { - const auto * cur_p = common_sampler_get_candidates(slot.smpl, true); + const auto * cur_p = common_sampler_get_candidates(slot.smpl.get(), true); const size_t max_probs = cur_p->size; // set probability for sampled token @@ -1245,7 +1238,7 @@ struct server_context_impl { std::vector cur = get_token_probabilities(ctx, idx); // set probability for sampled token - for (size_t i = 0; i < n_vocab; i++) { + for (size_t i = 0; i < cur.size(); i++) { // set probability for sampled token if (cur[i].id == result.tok) { result.prob = cur[i].p; @@ -1255,7 +1248,7 @@ 
struct server_context_impl { // set probability for top n_probs tokens result.probs.reserve(n_probs); - for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) { + for (size_t i = 0; i < std::min(cur.size(), n_probs); i++) { result.probs.push_back({ cur[i].id, common_token_to_piece(ctx, cur[i].id, special), @@ -2301,13 +2294,13 @@ struct server_context_impl { GGML_ASSERT(batch.n_tokens > 0); - common_sampler_reset(slot.smpl); + common_sampler_reset(slot.smpl.get()); // Process all prompt tokens through sampler system for (int i = 0; i < slot.task->n_tokens(); ++i) { llama_token id = input_tokens[i]; if (id != LLAMA_TOKEN_NULL) { - common_sampler_accept(slot.smpl, id, false); + common_sampler_accept(slot.smpl.get(), id, false); } } @@ -2525,11 +2518,11 @@ struct server_context_impl { const int tok_idx = slot.i_batch - i; - llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); + llama_token id = common_sampler_sample(slot.smpl.get(), ctx, tok_idx); slot.i_batch = -1; - common_sampler_accept(slot.smpl, id, true); + common_sampler_accept(slot.smpl.get(), id, true); slot.n_decoded += 1; @@ -2570,7 +2563,7 @@ struct server_context_impl { size_t n_draft = slot.drafted.size(); // the accepted tokens from the speculation - const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, slot.i_batch_dft, slot.drafted); + const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted); slot.i_batch_dft.clear(); slot.drafted.clear(); diff --git a/tools/server/tests/unit/test_compat_anthropic.py b/tools/server/tests/unit/test_compat_anthropic.py index d55dd1d945..e0a003557e 100644 --- a/tools/server/tests/unit/test_compat_anthropic.py +++ b/tools/server/tests/unit/test_compat_anthropic.py @@ -684,7 +684,7 @@ def test_anthropic_streaming_content_block_indices(): # Request that might produce both text and tool use res = server.make_stream_request("POST", "/v1/messages", data={ "model": "test", - "max_tokens": 200, + "max_tokens": 400, "stream": True, "tools": [{ "name": "test_tool", diff --git a/tools/tts/tts.cpp b/tools/tts/tts.cpp index eaf56591d9..8c39fce8ba 100644 --- a/tools/tts/tts.cpp +++ b/tools/tts/tts.cpp @@ -568,10 +568,10 @@ int main(int argc, char ** argv) { llama_context * ctx_ttc = NULL; llama_context * ctx_cts = NULL; - common_init_result llama_init_ttc = common_init_from_params(params); + auto llama_init_ttc = common_init_from_params(params); - model_ttc = llama_init_ttc.model.get(); - ctx_ttc = llama_init_ttc.context.get(); + model_ttc = llama_init_ttc->model(); + ctx_ttc = llama_init_ttc->context(); if (model_ttc == nullptr || ctx_ttc == nullptr) { return ENOENT; @@ -583,10 +583,10 @@ int main(int argc, char ** argv) { params.embedding = true; params.n_ubatch = params.n_batch; - common_init_result llama_init_cts = common_init_from_params(params); + auto llama_init_cts = common_init_from_params(params); - model_cts = llama_init_cts.model.get(); - ctx_cts = llama_init_cts.context.get(); + model_cts = llama_init_cts->model(); + ctx_cts = llama_init_cts->context(); if (model_cts == nullptr || ctx_cts == nullptr) { return ENOENT;
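The sketch below is an illustrative, standalone reconstruction (not part of the patch itself) of the yarn_attn_factor arithmetic that this change relocates from `llama_model::load_hparams` into the `llama_context` constructor. All numeric inputs are arbitrary placeholders; `get_mscale` mirrors the lambda added in `llama-context.cpp`, and the final multiplication reproduces the cancellation of ggml's internal `1.0f + 0.1f * logf(1.0f / freq_scale)` term that the removed `llama_hparams::yarn_attn_factor_adjust` helper used to perform.

```cpp
// Standalone sketch of the relocated yarn_attn_factor computation.
// Illustrative only: plain C++, arbitrary example values, no llama.cpp types.
#include <cmath>
#include <cstdio>

// mscale helper, as added in llama_context: identity for scale <= 1, log-scaled otherwise
static float get_mscale(float scale, float mscale) {
    return scale <= 1.0f ? 1.0f : (0.1f * mscale * std::log(scale) + 1.0f);
}

int main() {
    // placeholder inputs (assumed values, not taken from any real model config)
    const float rope_freq_scale   = 1.0f / 32.0f; // YaRN scaling factor of 32
    const float rope_yarn_log_mul = 0.8f;         // plays the role of mscale_all_dim

    const float factor          = 1.0f / rope_freq_scale;
    const float mscale          = 1.0f;             // general case; DeepSeek-V2 uses mscale_all_dims here
    const float mscale_all_dims = rope_yarn_log_mul;

    float yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);

    // cancel the factor that ggml applies internally when yarn_ext_factor != 0
    yarn_attn_factor *= 1.0f / (1.0f + 0.1f * std::log(factor));

    std::printf("yarn_attn_factor = %.4f\n", yarn_attn_factor); // ~0.78 for these inputs
    return 0;
}
```

In the patch, the result is further multiplied by `hparams.rope_attn_factor`, and the DeepSeek-V2 special case sets `mscale` equal to `mscale_all_dims`, which collapses the ratio to 1 and leaves only the cancellation term.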