From 69efb59853f8315c7594b3b1c3ca22b94ed9d56d Mon Sep 17 00:00:00 2001
From: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Date: Wed, 8 May 2024 01:17:07 -0400
Subject: [PATCH] fix: Apply proper paths for handling qwen

---
 convert-hf-to-gguf-update.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 52f217ef53..bdbfeea188 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -120,14 +120,21 @@ for model in models:
     # model and repo urls are not the same
     # url = "https://huggingface.co/Qwen/Qwen-tokenizer/raw/main/tokenizer.json"
     if name == "qwen":  # qwen is an outlier and will raise a FileNotFoundError
-        # fetch the qwen tokenizer
+        # override the tokenizer path
+        model_tokenizer_path = f"{model_name_or_path}/qwen.tiktoken"
+        # fetch qwen's BPE tokenizer file (qwen.tiktoken)
         download_file_with_auth(
-            url="https://huggingface.co/Qwen/Qwen-tokenizer/raw/main/tokenizer.json",
+            url="https://huggingface.co/Qwen/Qwen-7B/raw/main/qwen.tiktoken",
             token=token,
             save_path=model_tokenizer_path
         )
-    else:  # Et tu, Brute?
-        # Get the models tokenizer
+        # fetch qwen's tokenizer script (tokenization_qwen.py); this is required.
+        download_file_with_auth(
+            url="https://huggingface.co/Qwen/Qwen-7B/raw/main/tokenization_qwen.py",
+            token=token,
+            save_path=f"{model_name_or_path}/tokenization_qwen.py"
+        )
+    else:  # Get the model's tokenizer
         download_file_with_auth(
             url=f"{url_main}/tokenizer.json",
             token=token,
@@ -177,8 +184,7 @@ for model in models:
     if tokt == TOKENIZER_TYPE.SPM:
         continue
 
-    # create the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", trust_remote_code=True)
 
     chktok = tokenizer.encode(chktxt)
     chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -314,8 +320,7 @@ for model in models:
     name = model["name"]
     tokt = model["tokt"]
 
-    # create the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
 
     with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
         for text in tests:
@@ -336,7 +341,7 @@ shscript = "#!/usr/bin/env bash\n\n"
 
 for model in models:
     name = model["name"]
-    tmpline = f"python3 convert-hf-to-gguf.py {model_name_or_path}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only\n"
+    tmpline = f"python3 convert-hf-to-gguf.py models/tokenizers/{name} --outfile models/ggml-vocab-{name}.gguf --vocab-only\n"
     shscript += tmpline
     logging.info(tmpline.strip())