diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f5fa309c44..32013e47ba 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -697,8 +697,10 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_decay_w2.weight") == std::string::npos; q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; - q &= params->quantize_output_tensor || name != "output.weight"; q &= !params->only_copy; + // TODO: Exclude embeddings and output tensors? + q &= params->quantize_output_tensor || name != "output.weight"; + q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; };