Compare commits

...

9 Commits

Author SHA1 Message Date
htxsrl e290904d7e
Merge c7fa677314 into 05fa625eac 2026-02-16 16:50:49 -07:00
DAN™ 05fa625eac
convert : add JoyAI-LLM-Flash (#19651)
* convert_hf_to_gguf: add JoyAI-LLM-Flash tokenizer hash mapping to deepseek-v3

* llama-vocab: create a new pre-tokenizer name for joyai-llm.

* add missing vocab type section

* Update convert_hf_to_gguf_update.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-02-16 22:49:57 +01:00
AesSedai d612901116
perplexity: add proper batching (#19661) 2026-02-16 18:44:44 +02:00
Ivan Chikish cceb1b4e33
common : inline functions (#18639) 2026-02-16 17:52:24 +02:00
Judd d23a55997d
ggml : make `ggml_is_view` as API (#19539)
* make `ggml_is_view` as API

* introduce `ggml_aux_is_view` as inline version for internal use.

* change `ggml_aux_is_view` to  `ggml_impl_is_view`
2026-02-16 17:43:34 +02:00
Saurabh Dash 5f28c53d11
model: Add support for Tiny Aya Models (#19611)
* changes for tiny aya

* changes to hash

* changes to vocab

* fix some tokenizer regex edge cases

* update comment

* add some comments for regex

* Apply suggestion from @ngxson

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
2026-02-16 16:28:46 +01:00
Adrien Gallouët 4408494144
build : rework llama_option_depr to handle LLAMA_CURL (#19658)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-02-16 16:06:48 +01:00
Mario Limonciello 2ba9adc093
Adjust workaround for ROCWMMA_FATTN/GFX9 to only newer ROCm veresions (#19591)
Avoids issues with ROCm 6.4.4.

Closes: https://github.com/ggml-org/llama.cpp/issues/19580
Fixes: 6845f7f87 ("Add a workaround for compilation with ROCWMMA_FATTN and gfx9 (#19461)")

Signed-off-by: Mario Limonciello (AMD) <superm1@kernel.org>
2026-02-16 14:46:08 +01:00
Francesco Menegoni c7fa677314 chat: add thinking/reasoning support for Kimi K2
Pass enable_thinking to the Kimi K2 chat template via additional_context
and handle <think> tags at the end of the generated prompt:
- When thinking is disabled, immediately close the tag with </think>
- When thinking is enabled, set thinking_forced_open = true

This allows proper reasoning mode support for Kimi 2.5 / K2 models.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-31 19:52:28 +01:00
14 changed files with 182 additions and 96 deletions

View File

@ -115,11 +115,6 @@ option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
# deprecated
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
if (LLAMA_CURL)
message(WARNING "LLAMA_CURL option is deprecated and will be ignored")
endif()
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
@ -147,10 +142,15 @@ if (NOT DEFINED GGML_CUDA_GRAPHS)
endif()
# transition helpers
function (llama_option_depr TYPE OLD NEW)
function (llama_option_depr TYPE OLD)
if (${OLD})
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
set(${NEW} ON PARENT_SCOPE)
set(NEW "${ARGV2}")
if(NEW)
message(${TYPE} "${OLD} is deprecated, use ${NEW} instead")
set(${NEW} ON PARENT_SCOPE)
else()
message(${TYPE} "${OLD} is deprecated and will be ignored")
endif()
endif()
endfunction()
@ -163,6 +163,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
llama_option_depr(WARNING LLAMA_CURL)
include("cmake/license.cmake")
license_add_file("llama.cpp" "LICENSE")

View File

@ -1913,9 +1913,22 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
common_chat_params data;
data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.prompt = apply(tmpl, params);
json additional_context = {
{"thinking", params.enable_thinking},
};
data.prompt = apply(tmpl, params, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
data.format = COMMON_CHAT_FORMAT_KIMI_K2;
// Handle thinking tags based on prompt ending
if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
if (!params.enable_thinking) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
}
}
data.preserved_tokens = {
"<think>",
"</think>",

View File

@ -670,7 +670,7 @@ static std::vector<T> string_split(const std::string & str, char delim) {
}
template<>
std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
inline std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
{
std::vector<std::string> parts;
size_t begin_pos = 0;
@ -685,7 +685,7 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts;
}
static bool string_starts_with(const std::string & str,
inline bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
@ -870,11 +870,11 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
static std::string llm_ffn_exps_block_regex(int idx) {
inline std::string llm_ffn_exps_block_regex(int idx) {
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}

View File

@ -1049,6 +1049,9 @@ class TextModel(ModelBase):
if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
# ref: https://huggingface.co/zai-org/GLM-4.5-Air
res = "glm4"
if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267":
# ref: https://huggingface.co/zai-org/GLM-4.7-Flash
res = "glm4"
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
res = "minerva-7b"
@ -1082,9 +1085,6 @@ class TextModel(ModelBase):
if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
# ref: https://huggingface.co/aari1995/German_Semantic_V3
res = "jina-v2-de"
if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267":
# ref: https://huggingface.co/zai-org/GLM-4.7-Flash
res = "glm4"
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
@ -1124,6 +1124,9 @@ class TextModel(ModelBase):
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
res = "command-r"
if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1":
# ref: https://huggingface.co/CohereLabs/tiny-aya-base
res = "tiny_aya"
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
res = "qwen2"
@ -1265,6 +1268,9 @@ class TextModel(ModelBase):
if chkhsh == "d30d75d9059f1aa2c19359de71047b3ae408c70875e8a3ccf8c5fba56c9d8af4":
# ref: https://huggingface.co/Qwen/Qwen3.5-9B-Instruct
res = "qwen35"
if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d":
# ref: https://huggingface.co/jdopensource/JoyAI-LLM-Flash
res = "joyai-llm"
if res is None:
logger.warning("\n")
@ -7360,6 +7366,17 @@ class Cohere2Model(TextModel):
self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Cohere2 runtime in llama.cpp expects no bias tensors;
# the actual weight only contains 0-value tensors as bias, we can skip them
if name.endswith(".bias"):
if torch.any(data_torch != 0):
raise ValueError(f"Bias tensor {name!r} is not zero.")
logger.debug(f"Skipping bias tensor {name!r} for Cohere2 conversion.")
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("OlmoForCausalLM")
@ModelBase.register("OLMoForCausalLM")

View File

@ -99,6 +99,7 @@ models = [
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "tiny_aya", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
@ -148,7 +149,8 @@ models = [
{"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
{"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
{"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", }
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", },
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@ -158,6 +160,7 @@ pre_computed_hashes = [
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
{"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
@ -171,7 +174,6 @@ pre_computed_hashes = [
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
# jina-v2-de variants
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
]

View File

@ -752,6 +752,7 @@ extern "C" {
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_view (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);

View File

@ -17,11 +17,6 @@
//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
#define AT_PRINTF(...)
static bool ggml_is_view(const struct ggml_tensor * t) {
return t->view_src != NULL;
}
// ops that return true for this function must not use restrict pointers for their backend implementations
bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) {
@ -627,7 +622,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
GGML_ASSERT(buffer_id >= 0);
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_impl_is_view(node)) {
hn->allocated = true;
assert(hn->addr.offset == 0);
@ -658,7 +653,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
if (p_hn->n_children == 1 && p_hn->n_views == 0) {
if (ggml_is_view(parent)) {
if (ggml_impl_is_view(parent)) {
struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
@ -739,7 +734,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
// GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
// control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
// itself is never used and should not be considered a dependency
if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
if (ggml_impl_is_view(node) && node->op != GGML_OP_NONE) {
struct ggml_tensor * view_src = node->view_src;
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
}
@ -806,7 +801,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
if (ggml_is_view(parent)) {
if (ggml_impl_is_view(parent)) {
struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
view_src_hn->n_views -= 1;

View File

@ -63,7 +63,7 @@ static __global__ void flash_attn_ext_f16(
constexpr int frag_m = ncols == 8 ? 32 : 16;
constexpr int frag_n = ncols == 8 ? 8 : 16;
static_assert(D % frag_m == 0, "If ncols == 8 then D % frag_m must be 0.");
#if defined(GGML_USE_HIP)
#if defined(GGML_USE_HIP) && HIP_VERSION >= 60500000
typedef wmma::fragment<wmma::matrix_a, frag_m, frag_n, 16, _Float16, wmma::row_major> frag_a_K;
typedef wmma::fragment<wmma::matrix_a, frag_m, frag_n, 16, _Float16, wmma::col_major> frag_a_V;
typedef wmma::fragment<wmma::matrix_b, frag_m, frag_n, 16, _Float16, wmma::col_major> frag_b;
@ -135,7 +135,7 @@ static __global__ void flash_attn_ext_f16(
__shared__ half VKQ[ncols*D_padded]; // Accumulator for final VKQ slice.
half2 * VKQ2 = (half2 *) VKQ;
#if defined(GGML_USE_HIP)
#if defined(GGML_USE_HIP) && HIP_VERSION >= 60500000
const _Float16 * K_h_f16 = reinterpret_cast<const _Float16 *>(K_h);
const _Float16 * V_h_f16 = reinterpret_cast<const _Float16 *>(V_h);
_Float16 * KQ_f16 = reinterpret_cast<_Float16 *>(KQ);

View File

@ -98,6 +98,10 @@ static bool ggml_op_is_empty(enum ggml_op op) {
}
}
static inline bool ggml_impl_is_view(const struct ggml_tensor * t) {
return t->view_src != NULL;
}
static inline float ggml_compute_softplus_f32(float input) {
return (input > 20.0f) ? input : logf(1 + expf(input));
}

View File

@ -1496,6 +1496,10 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso
(t0->nb[3] == t1->nb[3]);
}
bool ggml_is_view(const struct ggml_tensor * t) {
return ggml_impl_is_view(t);
}
// check if t1 can be represented as a repetition of t0
bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

View File

@ -308,6 +308,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
case LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM:
regex_exprs = {
"\\p{N}{1,3}",
"[一-龥぀-ゟ゠-ヿ]+",
@ -422,6 +423,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_TINY_AYA:
regex_exprs = {
// original regex from tokenizer.json: "\\d{1,3}(?=(?:\\d{3})*\\b)"
"\\d{1,3}(?=(?:\\d{3})*\\b)",
// original regex from tokenizer.json: "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
regex_exprs = {
// K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
@ -2005,10 +2014,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "megrez") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else if (
tokenizer_pre == "gpt-4o" ||
tokenizer_pre == "llama4") {
tokenizer_pre == "gpt-4o" ||
tokenizer_pre == "llama4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
clean_spaces = false;
} else if (
tokenizer_pre == "tiny_aya") {
pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA;
clean_spaces = false;
} else if (
tokenizer_pre == "superbpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
@ -2039,6 +2052,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "hunyuan-dense") {
pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
clean_spaces = false;
} else if (
tokenizer_pre == "joyai-llm") {
pre_type = LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM;
clean_spaces = false;
} else if (
tokenizer_pre == "kimi-k2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;

View File

@ -55,6 +55,8 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46,
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
};
struct LLM_KV;

View File

@ -769,6 +769,12 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
} else if (regex_expr == "\\p{AFMoE_digits}") {
// AFMOE digit pattern - use custom implementation for proper splitting
bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
} else if (regex_expr == "\\d{1,3}(?=(?:\\d{3})*\\b)") {
// tiny_aya digit grouping pattern from tokenizer.json:
// {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
// Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567)
// TODO: Revisit this regex, incase there are any subtle tokenization differences with the original regex.
bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
}
return bpe_offsets;

View File

@ -347,7 +347,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
int count = 0;
double nll = 0.0;
LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
const int n_seq = std::max(1, n_batch / n_ctx);
LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.ppl_stride;
@ -1737,11 +1738,21 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
}
const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int num_batches = (static_cast<int>(n_ctx) + n_batch - 1) / n_batch;
// Calculate n_seq based on the logits file's n_ctx, but cap it at what the context supports
const int n_seq_max = llama_n_seq_max(ctx);
int n_seq = std::max(1, n_batch / static_cast<int>(n_ctx));
if (n_seq > n_seq_max) {
LOG_WRN("%s: calculated n_seq=%d exceeds context's n_seq_max=%d, capping at %d\n",
__func__, n_seq, n_seq_max, n_seq_max);
n_seq = n_seq_max;
}
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_vocab_get_add_bos(vocab);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
llama_batch batch = llama_batch_init(std::min(n_batch, static_cast<int>(n_ctx)*n_seq), 0, 1);
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@ -1750,6 +1761,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
logits.reserve(size_t(n_ctx) * n_vocab);
}
LOG_INF("%s: computing over %d chunks, n_ctx=%u, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) {
@ -1774,107 +1787,122 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
auto kld_ptr = kld_values.data();
auto p_diff_ptr = p_diff_values.data();
for (int i = 0; i < n_chunk; ++i) {
const int first = n_ctx/2;
for (int i = 0; i < n_chunk; i += n_seq) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const auto t_start = std::chrono::high_resolution_clock::now();
const int n_seq_batch = std::min(n_seq, n_chunk - i);
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
return;
}
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_memory_clear(llama_get_memory(ctx), true);
llama_batch batch = llama_batch_init(n_batch, 0, 1);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_vocab_bos(vocab);
}
int n_outputs = 0;
common_batch_clear(batch);
for (int i = 0; i < batch_size; i++) {
common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
for (int seq = 0; seq < n_seq_batch; seq++) {
int seq_start = batch_start + seq*n_ctx;
// save original token and restore it after eval
const auto token_org = tokens[seq_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[seq_start] = llama_vocab_bos(vocab);
}
for (int k = 0; k < batch_size; ++k) {
const int pos = j*n_batch + k;
const bool need_logits = pos >= first;
common_batch_add(batch, tokens[seq_start + k], pos, { seq }, need_logits);
n_outputs += need_logits;
}
// restore the original token in case it was set to BOS
tokens[seq_start] = token_org;
}
if (llama_decode(ctx, batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to decode\n", __func__);
llama_batch_free(batch);
return;
}
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
if (num_batches > 1) {
if (num_batches > 1 && n_outputs > 0) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
}
}
llama_batch_free(batch);
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
llama_synchronize(ctx);
const auto t_end = std::chrono::high_resolution_clock::now();
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
int total_seconds = (int)(t_total * n_chunk / n_seq);
if (total_seconds >= 60*60) {
LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
LOG("%.2f minutes\n", total_seconds / 60.0);
LOG("\n");
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
}
LOG("\n");
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
p_diff_ptr += n_ctx - 1 - first;
kld_ptr += n_ctx - 1 - first;
// Read log probs for each sequence in the batch
for (int seq = 0; seq < n_seq_batch; seq++) {
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i + seq);
llama_batch_free(batch);
return;
}
LOG("%4d", i+1);
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
process_logits(n_vocab, all_logits, tokens.data() + start + seq*n_ctx + first, n_ctx - 1 - first,
workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
p_diff_ptr += n_ctx - 1 - first;
kld_ptr += n_ctx - 1 - first;
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
LOG("%4d", i + seq + 1);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second;
LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
double p_top_val = 1.*kld.n_same_top/kld.count;
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
LOG("\n");
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
double p_top_val = 1.*kld.n_same_top/kld.count;
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
LOG("\n");
}
logits.clear();
}
llama_batch_free(batch);
LOG("\n");
if (kld.count < 100) return; // we do not wish to do statistics on so few values
@ -1996,7 +2024,7 @@ int main(int argc, char ** argv) {
const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
if (ppl) {
if (ppl || params.kl_divergence) {
const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
const int32_t n_kv = n_seq * n_ctx;
@ -2006,12 +2034,8 @@ int main(int argc, char ** argv) {
params.n_batch = std::min(params.n_batch, n_kv);
} else {
params.n_batch = std::min(params.n_batch, params.n_ctx);
if (params.kl_divergence) {
params.n_parallel = 1;
} else {
// ensure there's at least enough seq_ids for HellaSwag
params.n_parallel = std::max(4, params.n_parallel);
}
// ensure there's at least enough seq_ids for HellaSwag
params.n_parallel = std::max(4, params.n_parallel);
}
if (params.ppl_stride > 0) {