#!/usr/bin/env python3
"""
Generate reference outputs from the HuggingFace NLLB model.

This creates ground-truth data for numerical verification.
"""

import json
import os

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

print("=" * 80)
print("NLLB Reference Output Generator")
print("=" * 80)

# Create results directory
os.makedirs("results", exist_ok=True)

# Test sentences. The source language code is passed to the tokenizer via
# src_lang below, so the sentences stay plain text; embedding "eng_Latn" in
# the string would add the language token a second time.
test_sentences = [
    "Hello, how are you?",
    "The quick brown fox jumps over the lazy dog.",
    "Machine learning is transforming the world.",
]

# Target language
target_lang = "fra_Latn"

print("\n1. Loading HuggingFace NLLB model...")
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.eval()

print(f" Model: {model_name}")
print(f" Vocab size: {len(tokenizer)}")
print(" Model config:")
print(f" - d_model: {model.config.d_model}")
print(f" - encoder_layers: {model.config.encoder_layers}")
print(f" - decoder_layers: {model.config.decoder_layers}")
print(f" - encoder_attention_heads: {model.config.encoder_attention_heads}")
print(f" - encoder_ffn_dim: {model.config.encoder_ffn_dim}")

# Save model config
config_data = {
    "model_name": model_name,
    "d_model": model.config.d_model,
    "encoder_layers": model.config.encoder_layers,
    "decoder_layers": model.config.decoder_layers,
    "encoder_attention_heads": model.config.encoder_attention_heads,
    "decoder_attention_heads": model.config.decoder_attention_heads,
    "encoder_ffn_dim": model.config.encoder_ffn_dim,
    "decoder_ffn_dim": model.config.decoder_ffn_dim,
    "max_position_embeddings": model.config.max_position_embeddings,
    "vocab_size": len(tokenizer),
    "bos_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
    "decoder_start_token_id": model.config.decoder_start_token_id,
}

with open("results/model_config.json", "w") as f:
    json.dump(config_data, f, indent=2)
print("\n [OK] Saved model config to results/model_config.json")

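# A consumer of these references can validate architecture constants before
# any tensor-level comparison. A minimal sketch, assuming the distilled-600M
# checkpoint (whose d_model is 1024); the helper name is illustrative and is
# not referenced by the test scripts listed at the end of this file.
def _check_config_sketch(path="results/model_config.json"):
    with open(path) as f:
        ref = json.load(f)
    assert ref["d_model"] == 1024, f"unexpected d_model: {ref['d_model']}"
    return ref
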
print("\n2. Testing Tokenizer...")
tokenizer_data = {}

for i, sentence in enumerate(test_sentences):
    print(f"\n Test {i+1}: {sentence}")

    # Tokenize
    inputs = tokenizer(sentence, return_tensors="pt")
    input_ids = inputs["input_ids"][0].tolist()
    tokens = [tokenizer.decode([tid]) for tid in input_ids]

    print(f" Token IDs: {input_ids}")
    print(f" Tokens: {tokens}")

    tokenizer_data[f"test_{i+1}"] = {
        "sentence": sentence,
        "input_ids": input_ids,
        "tokens": tokens,
    }

with open("results/tokenizer_reference.json", "w") as f:
    json.dump(tokenizer_data, f, indent=2)
print("\n [OK] Saved tokenizer reference to results/tokenizer_reference.json")

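# With the current (non-legacy) NllbTokenizer the source layout is
# [src_lang_code] <tokens> </s>: the language code is prepended and EOS closes
# the sequence. A minimal layout check, sketched against the ids saved above
# (helper name illustrative; older transformers releases appended the language
# code at the end instead and would fail the first assert):
def _check_source_layout_sketch(input_ids):
    src_lang_id = tokenizer.convert_tokens_to_ids("eng_Latn")
    assert input_ids[0] == src_lang_id, "language code should be prepended"
    assert input_ids[-1] == tokenizer.eos_token_id, "sequence should end in </s>"
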
print("\n3. Generating Encoder Outputs...")
encoder_data = {}

with torch.no_grad():
    for i, sentence in enumerate(test_sentences[:1]):  # Start with one sentence
        print(f"\n Test {i+1}: {sentence}")

        # Tokenize
        inputs = tokenizer(sentence, return_tensors="pt")
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        print(f" Input shape: {input_ids.shape}")

        # Get encoder outputs with hidden states
        encoder_outputs = model.model.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )

        # Save encoder output (last hidden state)
        encoder_output = encoder_outputs.last_hidden_state[0].cpu().numpy()
        print(f" Encoder output shape: {encoder_output.shape}")
        print(f" Encoder output stats: min={encoder_output.min():.6f}, max={encoder_output.max():.6f}, mean={encoder_output.mean():.6f}")

        # Save layer-by-layer hidden states; hidden_states[0] is the embedding
        # output, so there are encoder_layers + 1 entries
        layer_outputs = []
        for layer_idx, hidden_state in enumerate(encoder_outputs.hidden_states):
            layer_output = hidden_state[0].cpu().numpy()
            layer_outputs.append({
                "layer": layer_idx,
                "shape": list(layer_output.shape),
                "mean": float(layer_output.mean()),
                "std": float(layer_output.std()),
                "min": float(layer_output.min()),
                "max": float(layer_output.max()),
            })
            print(f" Layer {layer_idx}: mean={layer_output.mean():.6f}, std={layer_output.std():.6f}")

        encoder_data[f"test_{i+1}"] = {
            "input_ids": input_ids[0].tolist(),
            "encoder_output_shape": list(encoder_output.shape),
            "encoder_output_stats": {
                "mean": float(encoder_output.mean()),
                "std": float(encoder_output.std()),
                "min": float(encoder_output.min()),
                "max": float(encoder_output.max()),
            },
            "layer_outputs": layer_outputs,
        }

        # Save full encoder output as numpy array
        np.save(f"results/encoder_output_test_{i+1}.npy", encoder_output)

with open("results/encoder_reference.json", "w") as f:
    json.dump(encoder_data, f, indent=2)
print("\n [OK] Saved encoder reference to results/encoder_reference.json")

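# The .npy file is what a reimplementation would diff against. A minimal
# comparison sketch, assuming the candidate encoder output is a NumPy array of
# the same (seq_len, d_model) shape; the tolerances are a float32 suggestion,
# not a measured bound:
def _compare_encoder_sketch(candidate, i=1):
    reference = np.load(f"results/encoder_output_test_{i}.npy")
    np.testing.assert_allclose(candidate, reference, rtol=1e-4, atol=1e-5)
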
print("\n4. Generating Decoder Outputs...")
decoder_data = {}

with torch.no_grad():
    for i, sentence in enumerate(test_sentences[:1]):  # Start with one sentence
        print(f"\n Test {i+1}: {sentence}")

        # Tokenize source
        inputs = tokenizer(sentence, return_tensors="pt")

        # Get encoder outputs
        encoder_outputs = model.model.encoder(**inputs, return_dict=True)

        # Prepare decoder input: decoder_start_token_id (</s> for NLLB)
        # followed by the target language code, matching the prefix that
        # generate() builds with forced_bos_token_id
        decoder_start_token_id = model.config.decoder_start_token_id
        target_lang_id = tokenizer.convert_tokens_to_ids(target_lang)

        decoder_input_ids = torch.tensor([[decoder_start_token_id, target_lang_id]])

        print(f" Decoder start tokens: {decoder_input_ids[0].tolist()}")
        print(f" Decoder tokens: {[tokenizer.decode([tid]) for tid in decoder_input_ids[0].tolist()]}")

        # Get decoder outputs (single unpadded sequence, so no attention
        # masks are needed)
        decoder_outputs = model.model.decoder(
            input_ids=decoder_input_ids,
            encoder_hidden_states=encoder_outputs.last_hidden_state,
            output_hidden_states=True,
            return_dict=True,
        )

        decoder_output = decoder_outputs.last_hidden_state[0].cpu().numpy()
        print(f" Decoder output shape: {decoder_output.shape}")
        print(f" Decoder output stats: min={decoder_output.min():.6f}, max={decoder_output.max():.6f}, mean={decoder_output.mean():.6f}")

        # Get logits
        lm_logits = model.lm_head(decoder_outputs.last_hidden_state)
        logits = lm_logits[0].cpu().numpy()

        print(f" Logits shape: {logits.shape}")
        print(f" Top 5 predictions for last token: {torch.topk(lm_logits[0, -1], 5).indices.tolist()}")

        decoder_data[f"test_{i+1}"] = {
            "decoder_input_ids": decoder_input_ids[0].tolist(),
            "decoder_output_shape": list(decoder_output.shape),
            "decoder_output_stats": {
                "mean": float(decoder_output.mean()),
                "std": float(decoder_output.std()),
                "min": float(decoder_output.min()),
                "max": float(decoder_output.max()),
            },
            "logits_shape": list(logits.shape),
            "top_5_predictions": torch.topk(lm_logits[0, -1], 5).indices.tolist(),
        }

        # Save outputs
        np.save(f"results/decoder_output_test_{i+1}.npy", decoder_output)
        np.save(f"results/decoder_logits_test_{i+1}.npy", logits)

with open("results/decoder_reference.json", "w") as f:
    json.dump(decoder_data, f, indent=2)
print("\n [OK] Saved decoder reference to results/decoder_reference.json")

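# The saved logits allow a step-level check: the argmax at the last position
# is the greedy prediction for the first real target token. A minimal sketch,
# assuming the candidate model produces a (seq_len, vocab_size) logits array
# shaped like the reference (helper name illustrative):
def _compare_greedy_step_sketch(candidate_logits, i=1):
    reference = np.load(f"results/decoder_logits_test_{i}.npy")
    assert int(candidate_logits[-1].argmax()) == int(reference[-1].argmax()), (
        "greedy first-token prediction diverges from the reference"
    )
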
print("\n5. Generating Full Translation...")
translation_data = {}

for i, sentence in enumerate(test_sentences):
    print(f"\n Test {i+1}: {sentence}")

    # Translate
    inputs = tokenizer(sentence, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang),
        max_length=50,
    )

    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    print(f" Translation: {translation}")
    print(f" Output token IDs: {translated_tokens[0].tolist()}")

    translation_data[f"test_{i+1}"] = {
        "source": sentence,
        "target_lang": target_lang,
        "translation": translation,
        "output_token_ids": translated_tokens[0].tolist(),
    }

with open("results/translation_reference.json", "w") as f:
    json.dump(translation_data, f, indent=2)
print("\n [OK] Saved translation reference to results/translation_reference.json")

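# The raw ids keep the framing tokens that skip_special_tokens strips: the
# sequence starts with decoder_start_token_id (</s>) and the forced fra_Latn
# code, and normally ends with </s>. A minimal layout check, sketched under
# the assumption that generation stopped on EOS rather than on max_length:
def _check_translation_layout_sketch(token_ids):
    assert token_ids[0] == model.config.decoder_start_token_id
    assert token_ids[1] == tokenizer.convert_tokens_to_ids(target_lang)
    assert token_ids[-1] == tokenizer.eos_token_id
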
print("\n" + "=" * 80)
print("[SUCCESS] Reference generation complete!")
print("=" * 80)
print("\nGenerated files:")
print(" - results/model_config.json")
print(" - results/tokenizer_reference.json")
print(" - results/encoder_reference.json")
print(" - results/encoder_output_test_1.npy")
print(" - results/decoder_reference.json")
print(" - results/decoder_output_test_1.npy")
print(" - results/decoder_logits_test_1.npy")
print(" - results/translation_reference.json")
print("\nNext steps:")
print(" 1. Run: python test_1_tokenizer.py")
print(" 2. Run: python test_2_encoder.py")
print(" 3. Run: python test_3_decoder.py")
print(" 4. Run: python test_5_translation.py")
print("=" * 80)