revert: remove VAETKI tokenizer implementation

suhyun-hwang 2026-01-14 00:08:17 +09:00
parent 487909ae0e
commit ca85717886
6 changed files with 5 additions and 152 deletions

View File

@@ -1255,9 +1255,6 @@ class TextModel(ModelBase):
         if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
             # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
             res = "exaone-moe"
-        if chkhsh == "f5f8b79793693cfcca1c36aac854ab481ae887cf7dde234b889f8f4bf009891a":
-            # ref: https://huggingface.co/nc-ai-consortium/VAETKI-VL-7B-A1B
-            res = "vaetki"
         if res is None:
             logger.warning("\n")
@@ -7682,84 +7679,6 @@ class VaetkiModel(TextModel):
             "sliding_attention": {"rope_theta": self.hparams.get("rope_theta", 10000.0)}
         }
-    def set_vocab(self):
-        # VAETKI uses Metaspace-based BPE tokenizer, load vocab from tokenizer.json
-        import json
-        import re
-        from transformers import AutoTokenizer
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokenizer_json_path = dir_model / "tokenizer.json"
-        if not tokenizer_json_path.is_file():
-            raise FileNotFoundError(f"VAETKI tokenizer.json not found: {tokenizer_json_path}")
-        with open(tokenizer_json_path, "r", encoding="utf-8") as f:
-            tokenizer_json = json.load(f)
-        # Get vocab from tokenizer.json
-        vocab = tokenizer_json["model"]["vocab"]
-        merges = tokenizer_json["model"].get("merges", [])
-        vocab_size = hparams.get("vocab_size", len(vocab))
-        # Build reverse vocab
-        reverse_vocab = {v: k for k, v in vocab.items()}
-        # Get added tokens from tokenizer.json
-        added_tokens = {}
-        for token_info in tokenizer_json.get("added_tokens", []):
-            added_tokens[token_info["id"]] = {
-                "content": token_info["content"],
-                "special": token_info.get("special", False)
-            }
-        tokens: list[str] = []
-        toktypes: list[int] = []
-        for i in range(vocab_size):
-            if i in added_tokens:
-                token = added_tokens[i]["content"]
-                if added_tokens[i]["special"]:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    # pre-normalize user-defined spaces (Metaspace → space)
-                    token = token.replace("\u2581", " ")
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-                tokens.append(token)
-            elif i in reverse_vocab:
-                token = reverse_vocab[i]
-                # Check for byte tokens (format: <0xXX>)
-                if re.fullmatch(r"<0x[0-9A-Fa-f]{2}>", token):
-                    toktypes.append(gguf.TokenType.BYTE)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-            else:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-        # Get pre-tokenizer type
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        tokpre = self.get_vocab_base_pre(tokenizer)
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-        # Add merges (convert from [['a', 'b'], ...] to ['a b', ...] format)
-        if merges:
-            # tokenizer.json stores merges as list of pairs, GGUF expects space-separated strings
-            if isinstance(merges[0], list):
-                merges = [' '.join(pair) for pair in merges]
-            self.gguf_writer.add_token_merges(merges)
-        # Add special tokens
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.add_to_gguf(self.gguf_writer)
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
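For reference, the removed set_vocab reduces to three small conversions. The following is a standalone Python sketch of them, not the converter itself; the sample tokens and merges are invented for illustration:

# Standalone sketch (not the converter) of the three conversions above;
# the sample tokens and merges below are made up for illustration.
import re

# 1) user-defined added tokens: Metaspace marker U+2581 pre-normalized to a space
print("\u2581user".replace("\u2581", " "))                 # -> ' user'

# 2) byte tokens are recognized by the <0xXX> pattern
for tok in ["<0x41>", "\u2581hello"]:
    print(tok, bool(re.fullmatch(r"<0x[0-9A-Fa-f]{2}>", tok)))

# 3) merges: tokenizer.json pair lists become GGUF's space-separated strings
merges = [["\u2581t", "he"], ["i", "n"]]
print([" ".join(pair) for pair in merges])                 # -> ['▁t he', 'i n']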

View File

@@ -148,7 +148,6 @@ models = [
     {"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
     {"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
     {"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
-    {"name": "vaetki", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/nc-ai-consortium/VAETKI-VL-7B-A1B", },
 ]
 # some models are known to be broken upstream, so we will skip them as exceptions

View File

@@ -468,12 +468,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                 };
                 break;
-            case LLAMA_VOCAB_PRE_TYPE_VAETKI:
-                regex_exprs = {
-                    "[^\r\n]+",
-                    "[\r\n]+",
-                };
-                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -531,23 +525,8 @@ struct llm_tokenizer_bpe_session {
     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         int final_prev_index = -1;
-        const bool skip_byte_encoding = (vocab.get_pre_type() == LLAMA_VOCAB_PRE_TYPE_VAETKI);
-        std::string normalized;
-        const std::string * input = &text;
-        if (skip_byte_encoding) {
-            normalized.reserve(text.size() * 3);
-            for (char c : text) {
-                if (c == ' ') {
-                    normalized += "\xe2\x96\x81";
-                } else {
-                    normalized += c;
-                }
-            }
-            input = &normalized;
-        }
-        const auto word_collection = unicode_regex_split(*input, tokenizer.regex_exprs, skip_byte_encoding);
+        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
         symbols_final.clear();
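The two hunks above undo the encode-side behaviour: VAETKI replaced spaces with the Metaspace marker U+2581 and pre-split only on line boundaries instead of the default word-level BPE regexes. A rough Python approximation, illustrative only:

# Rough Python approximation of the removed pre-processing path (illustrative only).
import re

text = "hello world\nnext line"

# removed step 1: spaces become the Metaspace marker U+2581
normalized = text.replace(" ", "\u2581")

# removed step 2: split on whole lines instead of the default word-level regexes
print(re.findall(r"[^\r\n]+|[\r\n]+", normalized))
# -> ['hello\u2581world', '\n', 'next\u2581line']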
@@ -637,13 +616,8 @@ struct llm_tokenizer_bpe_session {
             if (token == LLAMA_TOKEN_NULL) {
                 for (auto j = str.begin(); j != str.end(); ++j) {
-                    llama_token token_byte;
-                    if (skip_byte_encoding) {
-                        token_byte = vocab.byte_to_token(static_cast<uint8_t>(*j));
-                    } else {
-                        std::string byte_str(1, *j);
-                        token_byte = vocab.text_to_token(byte_str);
-                    }
+                    std::string byte_str(1, *j);
+                    auto token_byte = vocab.text_to_token(byte_str);
                     if (token_byte != LLAMA_TOKEN_NULL) {
                         output.push_back(token_byte);
                     }
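This hunk restores the plain byte fallback: the removed path resolved unknown bytes through their <0xXX> token text, while the restored path looks up the raw one-character string. A sketch with a stand-in dictionary (not the real vocabulary):

# Sketch of the two fallback lookups in the hunk above; token_to_id is a
# stand-in dictionary, not the real vocabulary.
token_to_id = {"<0x0A>": 7, "a": 42}

def vaetki_fallback(byte: int):   # removed path: look up the <0xXX> form of the byte
    return token_to_id.get("<0x%02X>" % byte)

def default_fallback(ch: str):    # restored path: look up the raw one-character string
    return token_to_id.get(ch)

print(vaetki_fallback(0x0A))      # -> 7    ('\n' resolved via its "<0x0A>" token)
print(default_fallback("a"))      # -> 42
print(default_fallback("\n"))     # -> None (no literal newline token in this toy vocab)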
@@ -2068,11 +2042,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "solar-open") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
                 clean_spaces = false;
-            } else if (
-                tokenizer_pre == "vaetki") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_VAETKI;
-                clean_spaces = false;
-                add_space_prefix = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -2707,11 +2676,6 @@ uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
             return strtol(buf.c_str(), NULL, 16);
         }
         case LLAMA_VOCAB_TYPE_BPE: {
-            // VAETKI uses <0xXX> format for byte tokens
-            if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                auto buf = token_data.text.substr(3, 2);
-                return strtol(buf.c_str(), NULL, 16);
-            }
             GGML_ABORT("fatal error");
         }
         case LLAMA_VOCAB_TYPE_WPM: {
@@ -3180,21 +3144,9 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                     return _try_copy(token_text.data(), token_text.size());
                 }
                 if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
-                    if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                        std::string result = token_text;
-                        llama_unescape_whitespace(result);
-                        return _try_copy(result.data(), result.size());
-                    }
                     std::string result = llama_decode_text(token_text);
                     return _try_copy(result.data(), result.size());
                 }
-                if (attr & LLAMA_TOKEN_ATTR_BYTE) {
-                    // VAETKI uses <0xXX> format for byte tokens
-                    if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                        char byte = (char) token_to_byte(token);
-                        return _try_copy(&byte, 1);
-                    }
-                }
                 break;
             }
             case LLAMA_VOCAB_TYPE_RWKV: {
@@ -3467,19 +3419,6 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
         }
         case LLAMA_VOCAB_TYPE_WPM:
         case LLAMA_VOCAB_TYPE_BPE: {
-            if (pimpl->pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-                auto token = pimpl->token_to_id.find(buf);
-                if (token != pimpl->token_to_id.end()) {
-                    return (*token).second;
-                }
-                const char buf2[2] = { (char)ch, 0 };
-                auto token2 = pimpl->token_to_id.find(buf2);
-                if (token2 != pimpl->token_to_id.end()) {
-                    return (*token2).second;
-                }
-                return LLAMA_TOKEN_NULL;
-            }
             return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
         }
         case LLAMA_VOCAB_TYPE_PLAMO2: {
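The three hunks above drop the decode-side counterpart: parsing <0xXX> token text back to a byte via substr(3, 2), rendering byte tokens as raw bytes while unescaping the Metaspace marker for normal tokens, and resolving bytes by their <0xXX> form. A minimal Python sketch of that round trip, using a toy vocabulary:

# Minimal sketch of the removed <0xXX> decode path; the toy vocabulary is illustrative.
token_texts = {7: "<0x41>", 8: "\u2581hi"}

def token_to_byte(tid: int) -> int:
    # "<0x41>"[3:5] == "41", mirroring token_data.text.substr(3, 2)
    return int(token_texts[tid][3:5], 16)

def token_to_piece(tid: int, is_byte: bool) -> str:
    if is_byte:
        return chr(token_to_byte(tid))                  # byte tokens render as the raw byte
    return token_texts[tid].replace("\u2581", " ")      # normal tokens: unescape Metaspace

def byte_to_token(byte: int):
    # look the byte up by its <0xXX> text, as the removed byte_to_token branch did
    target = "<0x%02X>" % byte
    return next((tid for tid, txt in token_texts.items() if txt == target), None)

print(token_to_byte(7))          # -> 65
print(token_to_piece(7, True))   # -> 'A'
print(token_to_piece(8, False))  # -> ' hi'
print(byte_to_token(0x41))       # -> 7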

View File

@@ -54,7 +54,6 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
     LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
     LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
-    LLAMA_VOCAB_PRE_TYPE_VAETKI = 46,
 };
 struct LLM_KV;

View File

@@ -956,7 +956,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
     return false;
 }
-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool skip_byte_encoding) {
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
     // unicode categories
     static const std::map<std::string, int> k_ucat_enum = {
         { "\\p{N}", unicode_cpt_flags::NUMBER },
@@ -1143,8 +1143,5 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         start += offset;
     }
-    if (skip_byte_encoding) {
-        return bpe_words;
-    }
     return unicode_byte_encoding_process(bpe_words);
 }
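With the skip_byte_encoding parameter gone, every split word again passes through unicode_byte_encoding_process, i.e. the GPT-2-style byte-to-unicode remapping. The sketch below reconstructs the standard GPT-2 table purely for illustration, to show what that remapping does and what skipping it avoided:

# Sketch of the step the removed skip_byte_encoding flag bypassed: the
# GPT-2-style byte-to-unicode remapping applied to every split word.
def bytes_to_unicode():
    # reconstruction of the standard GPT-2 table for illustration:
    # printable bytes map to themselves, the rest shift into code points >= 256
    bs = list(range(ord("!"), ord("~") + 1)) + \
         list(range(0xA1, 0xAD)) + list(range(0xAE, 0x100))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return dict(zip(bs, map(chr, cs)))

table = bytes_to_unicode()
word = " hello"
print("".join(table[b] for b in word.encode("utf-8")))  # -> 'Ġhello'
# With skip_byte_encoding the word was returned untouched instead.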

View File

@@ -108,4 +108,4 @@ uint32_t unicode_tolower(uint32_t cpt);
 bool unicode_cpt_is_han(uint32_t cpt);
-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool skip_byte_encoding = false);
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);