From 5ef493ea1a01385c02ef4c56d38dfe5e116c47c6 Mon Sep 17 00:00:00 2001
From: Ed Addario <eaddario@hotmail.com>
Date: Thu, 21 Aug 2025 09:48:29 +0100
Subject: [PATCH] Exclude embeddings and output tensor

---
 src/llama-quant.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index f5fa309c44..32013e47ba 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -697,8 +697,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         q &= name.find("time_mix_decay_w2.weight") == std::string::npos;
         q &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
         q &= name.find("attn_rel_b.weight") == std::string::npos;
-        q &= params->quantize_output_tensor || name != "output.weight";
         q &= !params->only_copy;
+        // TODO: revisit whether the token embeddings and output tensor should be excluded here
+        q &= params->quantize_output_tensor || name != "output.weight";
+        q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight");
 
         return q;
     };