diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c4cb9d4388..6eec4dae30 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1255,9 +1255,6 @@ class TextModel(ModelBase):
         if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
             # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
             res = "exaone-moe"
-        if chkhsh == "f5f8b79793693cfcca1c36aac854ab481ae887cf7dde234b889f8f4bf009891a":
-            # ref: https://huggingface.co/nc-ai-consortium/VAETKI-VL-7B-A1B
-            res = "vaetki"
 
         if res is None:
             logger.warning("\n")
@@ -7682,84 +7679,6 @@ class VaetkiModel(TextModel):
             "sliding_attention": {"rope_theta": self.hparams.get("rope_theta", 10000.0)}
         }
 
-    def set_vocab(self):
-        # VAETKI uses Metaspace-based BPE tokenizer, load vocab from tokenizer.json
-        import json
-        import re
-        from transformers import AutoTokenizer
-
-        dir_model = self.dir_model
-        hparams = self.hparams
-
-        tokenizer_json_path = dir_model / "tokenizer.json"
-        if not tokenizer_json_path.is_file():
-            raise FileNotFoundError(f"VAETKI tokenizer.json not found: {tokenizer_json_path}")
-
-        with open(tokenizer_json_path, "r", encoding="utf-8") as f:
-            tokenizer_json = json.load(f)
-
-        # Get vocab from tokenizer.json
-        vocab = tokenizer_json["model"]["vocab"]
-        merges = tokenizer_json["model"].get("merges", [])
-
-        vocab_size = hparams.get("vocab_size", len(vocab))
-
-        # Build reverse vocab
-        reverse_vocab = {v: k for k, v in vocab.items()}
-
-        # Get added tokens from tokenizer.json
-        added_tokens = {}
-        for token_info in tokenizer_json.get("added_tokens", []):
-            added_tokens[token_info["id"]] = {
-                "content": token_info["content"],
-                "special": token_info.get("special", False)
-            }
-
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        for i in range(vocab_size):
-            if i in added_tokens:
-                token = added_tokens[i]["content"]
-                if added_tokens[i]["special"]:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    # pre-normalize user-defined spaces (Metaspace → space)
-                    token = token.replace("\u2581", " ")
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-                tokens.append(token)
-            elif i in reverse_vocab:
-                token = reverse_vocab[i]
-                # Check for byte tokens (format: <0xXX>)
-                if re.fullmatch(r"<0x[0-9A-Fa-f]{2}>", token):
-                    toktypes.append(gguf.TokenType.BYTE)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-            else:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-
-        # Get pre-tokenizer type
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        # Add merges (convert from [['a', 'b'], ...] to ['a b', ...] format)
-        if merges:
-            # tokenizer.json stores merges as list of pairs, GGUF expects space-separated strings
-            if isinstance(merges[0], list):
-                merges = [' '.join(pair) for pair in merges]
-            self.gguf_writer.add_token_merges(merges)
-
-        # Add special tokens
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.add_to_gguf(self.gguf_writer)
-
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
 
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index fb7a943de5..aa9843ea17 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -148,7 +148,6 @@ models = [
    {"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
    {"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
    {"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
-   {"name": "vaetki", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/nc-ai-consortium/VAETKI-VL-7B-A1B", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 5a7d17ece6..c1be41e452 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -468,12 +468,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                 };
                 break;
-            case LLAMA_VOCAB_PRE_TYPE_VAETKI:
-                regex_exprs = {
-                    "[^\r\n]+",
-                    "[\r\n]+",
-                };
-                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -531,23 +525,8 @@ struct llm_tokenizer_bpe_session {
 
     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         int final_prev_index = -1;
 
-        const bool skip_byte_encoding = (vocab.get_pre_type() == LLAMA_VOCAB_PRE_TYPE_VAETKI);
-        std::string normalized;
-        const std::string * input = &text;
-        if (skip_byte_encoding) {
-            normalized.reserve(text.size() * 3);
-            for (char c : text) {
-                if (c == ' ') {
-                    normalized += "\xe2\x96\x81";
-                } else {
-                    normalized += c;
-                }
-            }
-            input = &normalized;
-        }
-
-        const auto word_collection = unicode_regex_split(*input, tokenizer.regex_exprs, skip_byte_encoding);
+        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
 
         symbols_final.clear();
 
@@ -637,13 +616,8 @@ struct llm_tokenizer_bpe_session {
 
             if (token == LLAMA_TOKEN_NULL) {
                 for (auto j = str.begin(); j != str.end(); ++j) {
-                    llama_token token_byte;
-                    if (skip_byte_encoding) {
-                        token_byte = vocab.byte_to_token(static_cast<uint8_t>(*j));
-                    } else {
-                        std::string byte_str(1, *j);
-                        token_byte = vocab.text_to_token(byte_str);
-                    }
+                    std::string byte_str(1, *j);
+                    auto token_byte = vocab.text_to_token(byte_str);
                     if (token_byte != LLAMA_TOKEN_NULL) {
                         output.push_back(token_byte);
                     }
@@ -2068,11 +2042,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "solar-open") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
                 clean_spaces = false;
-            } else if (
-                tokenizer_pre == "vaetki") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_VAETKI;
-                clean_spaces = false;
-                add_space_prefix = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -2707,11 +2676,6 @@ uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
             return strtol(buf.c_str(), NULL, 16);
         }
         case LLAMA_VOCAB_TYPE_BPE: {
-            // VAETKI uses <0xXX> format for byte tokens
-            if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                auto buf = token_data.text.substr(3, 2);
-                return strtol(buf.c_str(), NULL, 16);
-            }
             GGML_ABORT("fatal error");
         }
         case LLAMA_VOCAB_TYPE_WPM: {
@@ -3180,21 +3144,9 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                 return _try_copy(token_text.data(), token_text.size());
             }
             if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
-                if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                    std::string result = token_text;
-                    llama_unescape_whitespace(result);
-                    return _try_copy(result.data(), result.size());
-                }
                 std::string result = llama_decode_text(token_text);
                 return _try_copy(result.data(), result.size());
             }
-            if (attr & LLAMA_TOKEN_ATTR_BYTE) {
-                // VAETKI uses <0xXX> format for byte tokens
-                if (pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                    char byte = (char) token_to_byte(token);
-                    return _try_copy(&byte, 1);
-                }
-            }
             break;
         }
         case LLAMA_VOCAB_TYPE_RWKV: {
@@ -3467,19 +3419,6 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
         }
         case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_BPE: {
-            if (pimpl->pre_type == LLAMA_VOCAB_PRE_TYPE_VAETKI) {
-                const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-                auto token = pimpl->token_to_id.find(buf);
-                if (token != pimpl->token_to_id.end()) {
-                    return (*token).second;
-                }
-                const char buf2[2] = { (char)ch, 0 };
-                auto token2 = pimpl->token_to_id.find(buf2);
-                if (token2 != pimpl->token_to_id.end()) {
-                    return (*token2).second;
-                }
-                return LLAMA_TOKEN_NULL;
-            }
             return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
         }
         case LLAMA_VOCAB_TYPE_PLAMO2: {
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 8ac8a6036e..28c3a82b91 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -54,7 +54,6 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
     LLAMA_VOCAB_PRE_TYPE_YOUTU      = 44,
     LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
-    LLAMA_VOCAB_PRE_TYPE_VAETKI     = 46,
 };
 
 struct LLM_KV;
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 6a0c970335..b47dcbe619 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -956,7 +956,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
     return false;
 }
 
-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool skip_byte_encoding) {
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
     // unicode categories
     static const std::map<std::string, int> k_ucat_enum = {
         { "\\p{N}", unicode_cpt_flags::NUMBER },
@@ -1143,8 +1143,5 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         start += offset;
     }
 
-    if (skip_byte_encoding) {
-        return bpe_words;
-    }
     return unicode_byte_encoding_process(bpe_words);
 }
diff --git a/src/unicode.h b/src/unicode.h
index 7856dbc98e..5bd1362ff4 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -108,4 +108,4 @@ uint32_t unicode_tolower(uint32_t cpt);
 
 bool unicode_cpt_is_han(uint32_t cpt);
 
-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool skip_byte_encoding = false);
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);