revert: remove VAETKI tokenizer implementation

suhyun-hwang 2026-01-14 00:08:17 +09:00
parent 487909ae0e
commit ca85717886
6 changed files with 5 additions and 152 deletions

View File

@@ -1255,9 +1255,6 @@ class TextModel(ModelBase):
         if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
             # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
             res = "exaone-moe"
-        if chkhsh == "f5f8b79793693cfcca1c36aac854ab481ae887cf7dde234b889f8f4bf009891a":
-            # ref: https://huggingface.co/nc-ai-consortium/VAETKI-VL-7B-A1B
-            res = "vaetki"
         if res is None:
             logger.warning("\n")
@@ -7682,84 +7679,6 @@ class VaetkiModel(TextModel):
             "sliding_attention": {"rope_theta": self.hparams.get("rope_theta", 10000.0)}
         }
-    def set_vocab(self):
-        # VAETKI uses Metaspace-based BPE tokenizer, load vocab from tokenizer.json
-        import json
-        import re
-        from transformers import AutoTokenizer
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokenizer_json_path = dir_model / "tokenizer.json"
-        if not tokenizer_json_path.is_file():
-            raise FileNotFoundError(f"VAETKI tokenizer.json not found: {tokenizer_json_path}")
-        with open(tokenizer_json_path, "r", encoding="utf-8") as f:
-            tokenizer_json = json.load(f)
-        # Get vocab from tokenizer.json
-        vocab = tokenizer_json["model"]["vocab"]
-        merges = tokenizer_json["model"].get("merges", [])
-        vocab_size = hparams.get("vocab_size", len(vocab))
-        # Build reverse vocab
-        reverse_vocab = {v: k for k, v in vocab.items()}
-        # Get added tokens from tokenizer.json
-        added_tokens = {}
-        for token_info in tokenizer_json.get("added_tokens", []):
-            added_tokens[token_info["id"]] = {
-                "content": token_info["content"],
-                "special": token_info.get("special", False)
-            }
-        tokens: list[str] = []
-        toktypes: list[int] = []
-        for i in range(vocab_size):
-            if i in added_tokens:
-                token = added_tokens[i]["content"]
-                if added_tokens[i]["special"]:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    # pre-normalize user-defined spaces (Metaspace → space)
-                    token = token.replace("\u2581", " ")
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-                tokens.append(token)
-            elif i in reverse_vocab:
-                token = reverse_vocab[i]
-                # Check for byte tokens (format: <0xXX>)
-                if re.fullmatch(r"<0x[0-9A-Fa-f]{2}>", token):
-                    toktypes.append(gguf.TokenType.BYTE)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-            else:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-        # Get pre-tokenizer type
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        tokpre = self.get_vocab_base_pre(tokenizer)
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-        # Add merges (convert from [['a', 'b'], ...] to ['a b', ...] format)
-        if merges:
-            # tokenizer.json stores merges as list of pairs, GGUF expects space-separated strings
-            if isinstance(merges[0], list):
-                merges = [' '.join(pair) for pair in merges]
-            self.gguf_writer.add_token_merges(merges)
-        # Add special tokens
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.add_to_gguf(self.gguf_writer)
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
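For reference, the removed set_vocab reduces to three small conversions. The following is a standalone Python sketch of them, not the converter itself; the sample tokens and merges are invented for illustration:

# Standalone sketch (not the converter) of the three conversions above;
# the sample tokens and merges below are made up for illustration.
import re

# 1) user-defined added tokens: Metaspace marker U+2581 pre-normalized to a space
print("\u2581user".replace("\u2581", " "))                 # -> ' user'

# 2) byte tokens are recognized by the <0xXX> pattern
for tok in ["<0x41>", "\u2581hello"]:
    print(tok, bool(re.fullmatch(r"<0x[0-9A-Fa-f]{2}>", tok)))

# 3) merges: tokenizer.json pair lists become GGUF's space-separated strings
merges = [["\u2581t", "he"], ["i", "n"]]
print([" ".join(pair) for pair in merges])                 # -> ['▁t he', 'i n']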

View File

@@ -148,7 +148,6 @@ models = [
     {"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
     {"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
     {"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
-    {"name": "vaetki", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/nc-ai-consortium/VAETKI-VL-7B-A1B", },
 ]
 # some models are known to be broken upstream, so we will skip them as exceptions

View File

@@ -468,12 +468,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                 };
                 break;
-            case LLAMA_VOCAB_PRE_TYPE_VAETKI:
-                regex_exprs = {
-                    "[^\r\n]+",
-                    "[\r\n]+",
-                };
-                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -531,23 +525,8 @@ struct llm_tokenizer_bpe_session {
     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         int final_prev_index = -1;
-        const bool skip_byte_encoding = (vocab.get_pre_type() == LLAMA_VOCAB_PRE_TYPE_VAETKI);
-        std::string normalized;
-        const std::string * input = &text;
-        if (skip_byte_encoding) {
-            normalized.reserve(text.size() * 3);
-            for (char c : text) {
-                if (c == ' ') {
-                    normalized += "\xe2\x96\x81";
-                } else {
-                    normalized += c;
-                }
-            }
-            input = &normalized;
-        }
-        const auto word_collection = unicode_regex_split(*input, tokenizer.regex_exprs, skip_byte_encoding);
+        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
         symbols_final.clear();
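The two hunks above undo the encode-side behaviour: VAETKI replaced spaces with the Metaspace marker U+2581 and pre-split only on line boundaries instead of the default word-level BPE regexes. A rough Python approximation, illustrative only:

# Rough Python approximation of the removed pre-processing path (illustrative only).
import re

text = "hello world\nnext line"

# removed step 1: spaces become the Metaspace marker U+2581
normalized = text.replace(" ", "\u2581")

# removed step 2: split on whole lines instead of the default word-level regexes
print(re.findall(r"[^\r\n]+|[\r\n]+", normalized))
# -> ['hello\u2581world', '\n', 'next\u2581line']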
@@ -637,13 +616,8 @@ struct llm_tokenizer_bpe_session {
             if (token == LLAMA_TOKEN_NULL) {
                 for (auto j = str.begin(); j != str.end(); ++j) {
-                    llama_token token_byte;
-                    if (skip_byte_encoding) {
-                        token_byte = vocab.byte_to_token(static_cast<uint8_t>(*j));
-                    } else {
-                        std::string byte_str(1, *j);
-                        token_byte = vocab.text_to_token(byte_str);
-                    }
+                    std::string byte_str(1, *j);
+                    auto token_byte = vocab.text_to_token(byte_str);
                     if (token_byte != LLAMA_TOKEN_NULL) {
                         output.push_back(token_byte);
                     }
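This hunk restores the plain byte fallback: the removed path resolved unknown bytes through their <0xXX> token text, while the restored path looks up the raw one-character string. A sketch with a stand-in dictionary (not the real vocabulary):

# Sketch of the two fallback lookups in the hunk above; token_to_id is a
# stand-in dictionary, not the real vocabulary.
token_to_id = {"<0x0A>": 7, "a": 42}

def vaetki_fallback(byte: int):   # removed path: look up the <0xXX> form of the byte
    return token_to_id.get("<0x%02X>" % byte)

def default_fallback(ch: str):    # restored path: look up the raw one-character string
    return token_to_id.get(ch)

print(vaetki_fallback(0x0A))      # -> 7    ('\n' resolved via its "<0x0A>" token)
print(default_fallback("a"))      # -> 42
print(default_fallback("\n"))     # -> None (no literal newline token in this toy vocab)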
@@ -2068,11 +2042,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "solar-open") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
                 clean_spaces = false;
-            } else if (
-                tokenizer_pre == "vaetki") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_VAETKI;
-                clean_spaces = false;
-                add_space_prefix = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -2707,11 +2676,6 @@ uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
             return strtol(buf.c_str(), NULL, 16);
         }
         case LLAMA_VOCAB_TYPE_BPE: {
-            // VAETKI uses <0xXX> format for byte tokens
-            if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                auto buf = token_data.text.substr(3, 2);
-                return strtol(buf.c_str(), NULL, 16);
-            }
             GGML_ABORT("fatal error");
         }
         case LLAMA_VOCAB_TYPE_WPM: {
@@ -3180,21 +3144,9 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                     return _try_copy(token_text.data(), token_text.size());
                 }
                 if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
-                    if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                        std::string result = token_text;
-                        llama_unescape_whitespace(result);
-                        return _try_copy(result.data(), result.size());
-                    }
                     std::string result = llama_decode_text(token_text);
                     return _try_copy(result.data(), result.size());
                 }
-                if (attr & LLAMA_TOKEN_ATTR_BYTE) {
-                    // VAETKI uses <0xXX> format for byte tokens
-                    if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                        char byte = (char) token_to_byte(token);
-                        return _try_copy(&byte, 1);
-                    }
-                }
                 break;
             }
             case LLAMA_VOCAB_TYPE_RWKV: {
@@ -3467,19 +3419,6 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
         }
         case LLAMA_VOCAB_TYPE_WPM:
         case LLAMA_VOCAB_TYPE_BPE: {
-            if (pimpl->pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-                auto token = pimpl->token_to_id.find(buf);
-                if (token != pimpl->token_to_id.end()) {
-                    return (*token).second;
-                }
-                const char buf2[2] = { (char)ch, 0 };
-                auto token2 = pimpl->token_to_id.find(buf2);
-                if (token2 != pimpl->token_to_id.end()) {
-                    return (*token2).second;
-                }
-                return LLAMA_TOKEN_NULL;
-            }
             return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
         }
         case LLAMA_VOCAB_TYPE_PLAMO2: {
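The three hunks above drop the decode-side counterpart: parsing <0xXX> token text back to a byte via substr(3, 2), rendering byte tokens as raw bytes while unescaping the Metaspace marker for normal tokens, and resolving bytes by their <0xXX> form. A minimal Python sketch of that round trip, using a toy vocabulary:

# Minimal sketch of the removed <0xXX> decode path; the toy vocabulary is illustrative.
token_texts = {7: "<0x41>", 8: "\u2581hi"}

def token_to_byte(tid: int) -> int:
    # "<0x41>"[3:5] == "41", mirroring token_data.text.substr(3, 2)
    return int(token_texts[tid][3:5], 16)

def token_to_piece(tid: int, is_byte: bool) -> str:
    if is_byte:
        return chr(token_to_byte(tid))                  # byte tokens render as the raw byte
    return token_texts[tid].replace("\u2581", " ")      # normal tokens: unescape Metaspace

def byte_to_token(byte: int):
    # look the byte up by its <0xXX> text, as the removed byte_to_token branch did
    target = "<0x%02X>" % byte
    return next((tid for tid, txt in token_texts.items() if txt == target), None)

print(token_to_byte(7))          # -> 65
print(token_to_piece(7, True))   # -> 'A'
print(token_to_piece(8, False))  # -> ' hi'
print(byte_to_token(0x41))       # -> 7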

View File

@@ -54,7 +54,6 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
     LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
     LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
-    LLAMA_VOCAB_PRE_TYPE_VAETKI = 46,
 };
 struct LLM_KV;

View File

@@ -956,7 +956,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
     return false;
 }
-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool skip_byte_encoding) {
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
     // unicode categories
     static const std::map<std::string, int> k_ucat_enum = {
         { "\\p{N}", unicode_cpt_flags::NUMBER },
@@ -1143,8 +1143,5 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         start += offset;
     }
-    if (skip_byte_encoding) {
-        return bpe_words;
-    }
     return unicode_byte_encoding_process(bpe_words);
 }
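With the skip_byte_encoding parameter gone, every split word again passes through unicode_byte_encoding_process, i.e. the GPT-2-style byte-to-unicode remapping. The sketch below reconstructs the standard GPT-2 table purely for illustration, to show what that remapping does and what skipping it avoided:

# Sketch of the step the removed skip_byte_encoding flag bypassed: the
# GPT-2-style byte-to-unicode remapping applied to every split word.
def bytes_to_unicode():
    # reconstruction of the standard GPT-2 table for illustration:
    # printable bytes map to themselves, the rest shift into code points >= 256
    bs = list(range(ord("!"), ord("~") + 1)) + \
         list(range(0xA1, 0xAD)) + list(range(0xAE, 0x100))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return dict(zip(bs, map(chr, cs)))

table = bytes_to_unicode()
word = " hello"
print("".join(table[b] for b in word.encode("utf-8")))  # -> 'Ġhello'
# With skip_byte_encoding the word was returned untouched instead.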

View File

@@ -108,4 +108,4 @@ uint32_t unicode_tolower(uint32_t cpt);
 bool unicode_cpt_is_han(uint32_t cpt);
-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool skip_byte_encoding = false);
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);