From cf00fe1ea325f8fba3f99b953146827a07434c9e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 2 May 2024 11:00:15 +0300 Subject: [PATCH] starcoder : fix pre-tokenizer --- convert-hf-to-gguf-update.py | 1 + llama.cpp | 5 +++++ models/ggml-vocab-bert-bge.gguf.inp | 2 +- models/ggml-vocab-bert-bge.gguf.out | 2 +- models/ggml-vocab-deepseek-coder.gguf.inp | 2 +- models/ggml-vocab-deepseek-coder.gguf.out | 2 +- models/ggml-vocab-deepseek-llm.gguf.inp | 2 +- models/ggml-vocab-deepseek-llm.gguf.out | 2 +- models/ggml-vocab-falcon.gguf.inp | 2 +- models/ggml-vocab-falcon.gguf.out | 2 +- models/ggml-vocab-gpt-2.gguf.inp | 2 +- models/ggml-vocab-gpt-2.gguf.out | 2 +- models/ggml-vocab-llama-bpe.gguf.inp | 2 +- models/ggml-vocab-llama-bpe.gguf.out | 2 +- models/ggml-vocab-llama-spm.gguf.inp | 2 +- models/ggml-vocab-llama-spm.gguf.out | 2 +- models/ggml-vocab-mpt.gguf.inp | 2 +- models/ggml-vocab-mpt.gguf.out | 2 +- models/ggml-vocab-phi-3.gguf.inp | 2 +- models/ggml-vocab-phi-3.gguf.out | 2 +- models/ggml-vocab-starcoder.gguf.inp | 2 +- models/ggml-vocab-starcoder.gguf.out | 2 +- 22 files changed, 26 insertions(+), 20 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index b019c1e3dc..3843b4c3e8 100644 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -189,6 +189,7 @@ print("\n") # generate tests for each tokenizer model tests = [ + "ied 4 ½ months" "", " ", " ", diff --git a/llama.cpp b/llama.cpp index 18d6297ce1..7ce81d6b95 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12235,6 +12235,11 @@ struct llm_tokenizer_bpe { }); break; case LLAMA_VOCAB_PRE_TYPE_STARCODER: + word_collection = unicode_regex_split(text, { + "\\p{N}", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }); + break; case LLAMA_VOCAB_PRE_TYPE_GPT2: word_collection = unicode_regex_split(text, { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", diff --git a/models/ggml-vocab-bert-bge.gguf.inp b/models/ggml-vocab-bert-bge.gguf.inp index 0389f00c71..5e3062baba 100644 --- a/models/ggml-vocab-bert-bge.gguf.inp +++ b/models/ggml-vocab-bert-bge.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-bert-bge.gguf.out b/models/ggml-vocab-bert-bge.gguf.out index 969552e1c1..8afef45ef2 100644 --- a/models/ggml-vocab-bert-bge.gguf.out +++ b/models/ggml-vocab-bert-bge.gguf.out @@ -1,4 +1,4 @@ - + 29464 2094 1018 1092 2706 diff --git a/models/ggml-vocab-deepseek-coder.gguf.inp b/models/ggml-vocab-deepseek-coder.gguf.inp index 0389f00c71..5e3062baba 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.inp +++ b/models/ggml-vocab-deepseek-coder.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-coder.gguf.out b/models/ggml-vocab-deepseek-coder.gguf.out index 8ef585c787..094c772cd2 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.out +++ b/models/ggml-vocab-deepseek-coder.gguf.out @@ -1,4 +1,4 @@ - + 1050 207 19 207 19192 4217 207 243 315 diff --git a/models/ggml-vocab-deepseek-llm.gguf.inp b/models/ggml-vocab-deepseek-llm.gguf.inp index 0389f00c71..5e3062baba 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.inp +++ b/models/ggml-vocab-deepseek-llm.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-llm.gguf.out b/models/ggml-vocab-deepseek-llm.gguf.out index 0ea9d66e34..0bb8b52308 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.out +++ b/models/ggml-vocab-deepseek-llm.gguf.out @@ -1,4 +1,4 @@ - + 1052 207 19 207 19109 4223 207 243 300 diff --git a/models/ggml-vocab-falcon.gguf.inp b/models/ggml-vocab-falcon.gguf.inp index 0389f00c71..5e3062baba 100644 --- a/models/ggml-vocab-falcon.gguf.inp +++ b/models/ggml-vocab-falcon.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-falcon.gguf.out b/models/ggml-vocab-falcon.gguf.out index cb8da7b1a3..8aff915536 100644 --- a/models/ggml-vocab-falcon.gguf.out +++ b/models/ggml-vocab-falcon.gguf.out @@ -1,4 +1,4 @@ - + 878 204 31 3068 133 2137 204 258 466 diff --git a/models/ggml-vocab-gpt-2.gguf.inp b/models/ggml-vocab-gpt-2.gguf.inp index 0389f00c71..5e3062baba 100644 --- a/models/ggml-vocab-gpt-2.gguf.inp +++ b/models/ggml-vocab-gpt-2.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-gpt-2.gguf.out b/models/ggml-vocab-gpt-2.gguf.out index 9986f38e40..14cfb7b365 100644 --- a/models/ggml-vocab-gpt-2.gguf.out +++ b/models/ggml-vocab-gpt-2.gguf.out @@ -1,4 +1,4 @@ - + 798 604 25208 1933 220 220 220 220 220 220 diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp index 0389f00c71..5e3062baba 100644 --- a/models/ggml-vocab-llama-bpe.gguf.inp +++ b/models/ggml-vocab-llama-bpe.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out index 4d903e1cdf..555ed323d2 100644 --- a/models/ggml-vocab-llama-bpe.gguf.out +++ b/models/ggml-vocab-llama-bpe.gguf.out @@ -1,4 +1,4 @@ - + 1142 220 19 220 27154 4038 220 256 262 diff --git a/models/ggml-vocab-llama-spm.gguf.inp b/models/ggml-vocab-llama-spm.gguf.inp index 0389f00c71..5e3062baba 100644 --- a/models/ggml-vocab-llama-spm.gguf.inp +++ b/models/ggml-vocab-llama-spm.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-spm.gguf.out b/models/ggml-vocab-llama-spm.gguf.out index 15d00b106d..86a7eff91b 100644 --- a/models/ggml-vocab-llama-spm.gguf.out +++ b/models/ggml-vocab-llama-spm.gguf.out @@ -1,4 +1,4 @@ - + 474 287 29871 29946 29871 30226 7378 259 1678 268 diff --git a/models/ggml-vocab-mpt.gguf.inp b/models/ggml-vocab-mpt.gguf.inp index 0389f00c71..5e3062baba 100644 --- a/models/ggml-vocab-mpt.gguf.inp +++ b/models/ggml-vocab-mpt.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-mpt.gguf.out b/models/ggml-vocab-mpt.gguf.out index 1f4b0eb3a8..e7e578022d 100644 --- a/models/ggml-vocab-mpt.gguf.out +++ b/models/ggml-vocab-mpt.gguf.out @@ -1,4 +1,4 @@ - + 728 577 24142 2607 209 50276 50275 diff --git a/models/ggml-vocab-phi-3.gguf.inp b/models/ggml-vocab-phi-3.gguf.inp index 0389f00c71..5e3062baba 100644 --- a/models/ggml-vocab-phi-3.gguf.inp +++ b/models/ggml-vocab-phi-3.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-phi-3.gguf.out b/models/ggml-vocab-phi-3.gguf.out index 15d00b106d..86a7eff91b 100644 --- a/models/ggml-vocab-phi-3.gguf.out +++ b/models/ggml-vocab-phi-3.gguf.out @@ -1,4 +1,4 @@ - + 474 287 29871 29946 29871 30226 7378 259 1678 268 diff --git a/models/ggml-vocab-starcoder.gguf.inp b/models/ggml-vocab-starcoder.gguf.inp index 0389f00c71..5e3062baba 100644 --- a/models/ggml-vocab-starcoder.gguf.inp +++ b/models/ggml-vocab-starcoder.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-starcoder.gguf.out b/models/ggml-vocab-starcoder.gguf.out index cd04254af7..551b3ce6e0 100644 --- a/models/ggml-vocab-starcoder.gguf.out +++ b/models/ggml-vocab-starcoder.gguf.out @@ -1,4 +1,4 @@ - + 4850 244 57 244 162 159 17722 244 280 283