revert: remove VAETKI tokenizer implementation
commit ca85717886
parent 487909ae0e

@@ -1255,9 +1255,6 @@ class TextModel(ModelBase):
         if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
             # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
             res = "exaone-moe"
-        if chkhsh == "f5f8b79793693cfcca1c36aac854ab481ae887cf7dde234b889f8f4bf009891a":
-            # ref: https://huggingface.co/nc-ai-consortium/VAETKI-VL-7B-A1B
-            res = "vaetki"

         if res is None:
             logger.warning("\n")
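
For background (not part of the diff): as I understand it, chkhsh is a fingerprint of how a model's tokenizer encodes a fixed probe string, so each supported pre-tokenizer can be recognized by hash. A minimal Python sketch of that idea, with chktxt standing in for the real probe text and a generic tokenizer object assumed:

    from hashlib import sha256

    def tokenizer_fingerprint(tokenizer, chktxt: str) -> str:
        # hash the token ids produced for a fixed probe string; tokenizers that
        # pre-tokenize identically yield the same fingerprint
        ids = tokenizer.encode(chktxt)
        return sha256(str(ids).encode()).hexdigest()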

@@ -7682,84 +7679,6 @@ class VaetkiModel(TextModel):
             "sliding_attention": {"rope_theta": self.hparams.get("rope_theta", 10000.0)}
         }

-    def set_vocab(self):
-        # VAETKI uses Metaspace-based BPE tokenizer, load vocab from tokenizer.json
-        import json
-        import re
-        from transformers import AutoTokenizer
-
-        dir_model = self.dir_model
-        hparams = self.hparams
-
-        tokenizer_json_path = dir_model / "tokenizer.json"
-        if not tokenizer_json_path.is_file():
-            raise FileNotFoundError(f"VAETKI tokenizer.json not found: {tokenizer_json_path}")
-
-        with open(tokenizer_json_path, "r", encoding="utf-8") as f:
-            tokenizer_json = json.load(f)
-
-        # Get vocab from tokenizer.json
-        vocab = tokenizer_json["model"]["vocab"]
-        merges = tokenizer_json["model"].get("merges", [])
-
-        vocab_size = hparams.get("vocab_size", len(vocab))
-
-        # Build reverse vocab
-        reverse_vocab = {v: k for k, v in vocab.items()}
-
-        # Get added tokens from tokenizer.json
-        added_tokens = {}
-        for token_info in tokenizer_json.get("added_tokens", []):
-            added_tokens[token_info["id"]] = {
-                "content": token_info["content"],
-                "special": token_info.get("special", False)
-            }
-
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        for i in range(vocab_size):
-            if i in added_tokens:
-                token = added_tokens[i]["content"]
-                if added_tokens[i]["special"]:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    # pre-normalize user-defined spaces (Metaspace → space)
-                    token = token.replace("\u2581", " ")
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-                tokens.append(token)
-            elif i in reverse_vocab:
-                token = reverse_vocab[i]
-                # Check for byte tokens (format: <0xXX>)
-                if re.fullmatch(r"<0x[0-9A-Fa-f]{2}>", token):
-                    toktypes.append(gguf.TokenType.BYTE)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-            else:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-
-        # Get pre-tokenizer type
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        # Add merges (convert from [['a', 'b'], ...] to ['a b', ...] format)
-        if merges:
-            # tokenizer.json stores merges as list of pairs, GGUF expects space-separated strings
-            if isinstance(merges[0], list):
-                merges = [' '.join(pair) for pair in merges]
-            self.gguf_writer.add_token_merges(merges)
-
-        # Add special tokens
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.add_to_gguf(self.gguf_writer)
-
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
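
Two details in the removed method are worth illustrating: tokenizer.json stores each merge as a pair while GGUF wants one space-separated string per merge, and byte-fallback vocab entries are spelled "<0xXX>". A short standalone Python sketch of both conversions (not the llama.cpp API):

    import re

    merges = [["\u2581t", "he"], ["i", "n"]]       # tokenizer.json style: list of pairs
    merges = [" ".join(pair) for pair in merges]   # GGUF style: ["▁t he", "i n"]

    def is_byte_token(token: str) -> bool:
        # byte-fallback entries look like "<0x0A>"; everything else is a normal piece
        return re.fullmatch(r"<0x[0-9A-Fa-f]{2}>", token) is not None

    assert is_byte_token("<0x0A>") and not is_byte_token("\u2581the")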

@@ -148,7 +148,6 @@ models = [
     {"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
     {"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
     {"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
-    {"name": "vaetki", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/nc-ai-consortium/VAETKI-VL-7B-A1B", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions

@@ -468,12 +468,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
-            case LLAMA_VOCAB_PRE_TYPE_VAETKI:
-                regex_exprs = {
-                    "[^\r\n]+",
-                    "[\r\n]+",
-                };
-                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
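
The two removed expressions simply cut the input into runs of non-newline characters and runs of newlines, leaving all finer splitting to the BPE merges. A rough Python illustration of the same split (my sketch of the intent, not the C++ code path):

    import re

    # stand-in for the two VAETKI pre-tokenizer expressions above
    pattern = re.compile(r"[^\r\n]+|[\r\n]+")

    print(pattern.findall("hello world\nsecond line\r\n"))
    # ['hello world', '\n', 'second line', '\r\n']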

@@ -531,23 +525,8 @@ struct llm_tokenizer_bpe_session {

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        int final_prev_index = -1;
-        const bool skip_byte_encoding = (vocab.get_pre_type() == LLAMA_VOCAB_PRE_TYPE_VAETKI);
-
-        std::string normalized;
-        const std::string * input = &text;
-        if (skip_byte_encoding) {
-            normalized.reserve(text.size() * 3);
-            for (char c : text) {
-                if (c == ' ') {
-                    normalized += "\xe2\x96\x81";
-                } else {
-                    normalized += c;
-                }
-            }
-            input = &normalized;
-        }
-
-        const auto word_collection = unicode_regex_split(*input, tokenizer.regex_exprs, skip_byte_encoding);
+        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);

        symbols_final.clear();
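
The removed block turned every ASCII space into U+2581 ("▁", written as "\xe2\x96\x81" above) before the regex split, which is the usual Metaspace convention for SentencePiece-style BPE vocabularies. A small Python sketch of the same normalization, illustrative only:

    METASPACE = "\u2581"  # '▁'

    def metaspace_normalize(text: str) -> str:
        # spaces become the visible marker so that word boundaries survive BPE merging
        return text.replace(" ", METASPACE)

    assert metaspace_normalize("hello world") == "hello" + METASPACE + "world"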

@@ -637,13 +616,8 @@ struct llm_tokenizer_bpe_session {

            if (token == LLAMA_TOKEN_NULL) {
                for (auto j = str.begin(); j != str.end(); ++j) {
-                    llama_token token_byte;
-                    if (skip_byte_encoding) {
-                        token_byte = vocab.byte_to_token(static_cast<uint8_t>(*j));
-                    } else {
-                        std::string byte_str(1, *j);
-                        token_byte = vocab.text_to_token(byte_str);
-                    }
+                    std::string byte_str(1, *j);
+                    auto token_byte = vocab.text_to_token(byte_str);
                    if (token_byte != LLAMA_TOKEN_NULL) {
                        output.push_back(token_byte);
                    }
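
Both branches implement the same last resort: if a piece has no vocab entry, emit one token per byte of its UTF-8 encoding, skipping bytes the vocab cannot represent. The removed path only differed in asking for SentencePiece-style byte tokens instead of single-character entries. A toy Python sketch, with lookup standing in for either vocab.byte_to_token or vocab.text_to_token:

    from typing import Callable, Optional

    def byte_fallback(piece: str, lookup: Callable[[int], Optional[int]]) -> list[int]:
        # lookup maps a single byte value to a token id, or None if the vocab has no entry
        ids = []
        for b in piece.encode("utf-8"):
            token_id = lookup(b)
            if token_id is not None:
                ids.append(token_id)
        return ids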

@@ -2068,11 +2042,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "solar-open") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
                clean_spaces = false;
-            } else if (
-                tokenizer_pre == "vaetki") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_VAETKI;
-                clean_spaces = false;
-                add_space_prefix = false;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }

@@ -2707,11 +2676,6 @@ uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
            return strtol(buf.c_str(), NULL, 16);
        }
        case LLAMA_VOCAB_TYPE_BPE: {
-            // VAETKI uses <0xXX> format for byte tokens
-            if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                auto buf = token_data.text.substr(3, 2);
-                return strtol(buf.c_str(), NULL, 16);
-            }
            GGML_ABORT("fatal error");
        }
        case LLAMA_VOCAB_TYPE_WPM: {

@@ -3180,21 +3144,9 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                    return _try_copy(token_text.data(), token_text.size());
                }
                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
-                    if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                        std::string result = token_text;
-                        llama_unescape_whitespace(result);
-                        return _try_copy(result.data(), result.size());
-                    }
                    std::string result = llama_decode_text(token_text);
                    return _try_copy(result.data(), result.size());
                }
-                if (attr & LLAMA_TOKEN_ATTR_BYTE) {
-                    // VAETKI uses <0xXX> format for byte tokens
-                    if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                        char byte = (char) token_to_byte(token);
-                        return _try_copy(&byte, 1);
-                    }
-                }
                break;
            }
            case LLAMA_VOCAB_TYPE_RWKV: {
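
For VAETKI the removed code rendered pieces the SentencePiece way: normal tokens get their Metaspace markers unescaped back to spaces, and "<0xXX>" byte tokens become the raw byte. Roughly, in Python (my own sketch, not the library function):

    def piece_to_text(piece: str) -> str:
        # "<0xXX>" byte tokens decode to the raw byte; otherwise '▁' unescapes to a space
        if len(piece) == 6 and piece.startswith("<0x") and piece.endswith(">"):
            return chr(int(piece[3:5], 16))
        return piece.replace("\u2581", " ")

    assert piece_to_text("\u2581world") == " world"
    assert piece_to_text("<0x0A>") == "\n"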

@@ -3467,19 +3419,6 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
        }
        case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_BPE: {
-            if (pimpl->pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-                auto token = pimpl->token_to_id.find(buf);
-                if (token != pimpl->token_to_id.end()) {
-                    return (*token).second;
-                }
-                const char buf2[2] = { (char)ch, 0 };
-                auto token2 = pimpl->token_to_id.find(buf2);
-                if (token2 != pimpl->token_to_id.end()) {
-                    return (*token2).second;
-                }
-                return LLAMA_TOKEN_NULL;
-            }
            return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
        }
        case LLAMA_VOCAB_TYPE_PLAMO2: {
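
The removed lookup went the other way: given a raw byte, first try the "<0xXX>" spelling, then the literal one-character string, and give up with a null token if neither exists. A Python sketch of that two-step lookup over a plain dict:

    from typing import Optional

    def byte_to_token(ch: int, token_to_id: dict[str, int]) -> Optional[int]:
        # try the SentencePiece-style "<0xXX>" spelling first, then the raw single character
        for key in (f"<0x{ch:02X}>", chr(ch)):
            if key in token_to_id:
                return token_to_id[key]
        return None  # stands in for LLAMA_TOKEN_NULL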

@@ -54,7 +54,6 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
    LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
    LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
-    LLAMA_VOCAB_PRE_TYPE_VAETKI = 46,
};

struct LLM_KV;

@@ -956,7 +956,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
    return false;
}

-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool skip_byte_encoding) {
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
    // unicode categories
    static const std::map<std::string, int> k_ucat_enum = {
        { "\\p{N}", unicode_cpt_flags::NUMBER },

@@ -1143,8 +1143,5 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
        start += offset;
    }

-    if (skip_byte_encoding) {
-        return bpe_words;
-    }
    return unicode_byte_encoding_process(bpe_words);
}
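
The deleted early return let VAETKI skip unicode_byte_encoding_process, which, as I understand it, applies the usual GPT-2 trick of remapping every byte of each word onto a printable code point so BPE merges can be stored as plain strings; VAETKI's Metaspace plus "<0xXX>" fallback makes that remapping unnecessary. The classic byte-to-unicode table, sketched in Python on that assumption:

    def bytes_to_unicode() -> dict[int, str]:
        # GPT-2 style table: printable bytes map to themselves, the rest are shifted
        # into the 256+ range so every byte has a visible stand-in character
        bs = (list(range(ord("!"), ord("~") + 1))
              + list(range(ord("\u00a1"), ord("\u00ac") + 1))
              + list(range(ord("\u00ae"), ord("\u00ff") + 1)))
        cs = bs[:]
        n = 0
        for b in range(256):
            if b not in bs:
                bs.append(b)
                cs.append(256 + n)
                n += 1
        return dict(zip(bs, map(chr, cs)))

    assert bytes_to_unicode()[ord(" ")] == "\u0120"  # space becomes 'Ġ'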

@@ -108,4 +108,4 @@ uint32_t unicode_tolower(uint32_t cpt);

bool unicode_cpt_is_han(uint32_t cpt);

-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool skip_byte_encoding = false);
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);