#!/usr/bin/env python3
"""Fuse Q/K/V tensors in an existing GGUF file into a single QKV tensor.

This script operates at the binary level to preserve ALL metadata
(including tokenizer) byte-for-byte from the original file.

Usage:
    python scripts/fuse_qkv_gguf.py input.gguf output.gguf
"""
import os
import re
import struct
import sys

import numpy as np

# Make the in-tree gguf-py package importable when run from scripts/.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'gguf-py'))


def align_offset(offset, alignment=32):
    """Round *offset* up to the next multiple of *alignment* (GGUF default 32)."""
    return (offset + alignment - 1) // alignment * alignment


def write_tensor_info(f, name, n_dims, dims, tensor_type, data_offset):
    """Write one tensor info entry in GGUF format.

    Wire layout (little-endian): u64 name length, UTF-8 name bytes,
    u32 n_dims, one u64 per dimension, u32 tensor type id, u64 offset
    of the tensor's data relative to the tensor-data section start.

    NOTE(review): body below the first pack() was reconstructed from the
    GGUF v3 spec after source corruption — verify against the original.
    """
    name_bytes = name.encode('utf-8')
    f.write(struct.pack('<Q', len(name_bytes)))
    f.write(name_bytes)
    f.write(struct.pack('<I', n_dims))
    for d in dims:
        f.write(struct.pack('<Q', int(d)))
    f.write(struct.pack('<I', tensor_type))
    f.write(struct.pack('<Q', data_offset))


def main():
    """Read input.gguf, fuse each layer's attn_q/attn_k/attn_v into one
    attn_qkv tensor, and write output.gguf with the KV-metadata section
    copied verbatim from the input.

    NOTE(review): the front half of this function was lost to source
    corruption and is reconstructed here; the surviving tail (tuple shape
    of output_tensors, data_offsets, alignment padding) constrains it, but
    the fusion naming/regex should be confirmed against the original.
    """
    if len(sys.argv) != 3:
        print(f"Usage: {sys.argv[0]} input.gguf output.gguf")
        sys.exit(1)
    input_path, output_path = sys.argv[1], sys.argv[2]

    # Deferred import: keeps this module importable (e.g. for the helpers
    # above) even when the gguf-py package is absent.
    from gguf import GGUFReader

    print(f"Reading {input_path}...")
    reader = GGUFReader(input_path)

    # Read the fixed header ourselves so the KV section can be copied raw.
    with open(input_path, 'rb') as f:
        magic = f.read(4)
        version, = struct.unpack('<I', f.read(4))
        n_tensors_orig, = struct.unpack('<Q', f.read(8))
        n_kv, = struct.unpack('<Q', f.read(8))
        kv_data_start = f.tell()  # 24 bytes: end of the fixed GGUF header

    # The KV section ends where the first tensor-info entry begins.
    kv_data_end = min(int(t.field.offset) for t in reader.tensors)

    tensors_by_name = {t.name: t for t in reader.tensors}
    q_pattern = re.compile(r'^blk\.(\d+)\.attn_q\.weight$')
    kv_pattern = re.compile(r'^blk\.\d+\.attn_[kv]\.weight$')

    # Each entry: (name, n_dims, dims, tensor_type, raw data bytes).
    output_tensors = []
    for t in reader.tensors:
        m = q_pattern.match(t.name)
        if m:
            layer = m.group(1)
            q = t
            k = tensors_by_name[f'blk.{layer}.attn_k.weight']
            v = tensors_by_name[f'blk.{layer}.attn_v.weight']
            if not (int(k.tensor_type) == int(q.tensor_type) == int(v.tensor_type)):
                raise ValueError(f"blk.{layer}: Q/K/V tensor types differ; cannot fuse")

            # Raw (unreversed) dims come from the tensor-info field parts,
            # matching how the passthrough branch reads them below.
            q_dims = [int(x) for x in q.field.parts[3]]
            k_dims = [int(x) for x in k.field.parts[3]]
            v_dims = [int(x) for x in v.field.parts[3]]
            if not (q_dims[0] == k_dims[0] == v_dims[0]):
                raise ValueError(f"blk.{layer}: Q/K/V input dims differ; cannot fuse")

            # Rows (ne1) are independent quantization blocks, so fusing along
            # ne1 is a plain concatenation of the raw data bytes.
            fused_ne0 = q_dims[0]
            fused_ne1 = q_dims[1] + k_dims[1] + v_dims[1]
            fused_data = np.concatenate([
                np.asarray(q.data).reshape(-1),
                np.asarray(k.data).reshape(-1),
                np.asarray(v.data).reshape(-1),
            ])
            fused_name = f'blk.{layer}.attn_qkv.weight'
            print(f"  {q.name} -> QKV[{fused_ne0},{fused_ne1}] {fused_data.nbytes} bytes")
            output_tensors.append((fused_name, 2, [fused_ne0, fused_ne1],
                                   int(q.tensor_type), fused_data.tobytes()))
        elif kv_pattern.match(t.name):
            continue  # consumed by the fusion of the matching attn_q
        else:
            # Pass the tensor through unchanged, reading n_dims/dims from the
            # raw tensor-info field parts.
            dims = [int(x) for x in t.field.parts[3]]
            n_dims = int(t.field.parts[2][0])
            output_tensors.append((t.name, n_dims, dims,
                                   int(t.tensor_type), bytes(t.data)))

    n_tensors_new = len(output_tensors)
    print(f"\n {n_tensors_orig} -> {n_tensors_new} tensors")

    # Copy the entire KV/metadata section byte-for-byte.
    with open(input_path, 'rb') as f:
        f.seek(kv_data_start)
        kv_data_bytes = f.read(kv_data_end - kv_data_start)

    print(f"\nWriting {output_path}...")
    alignment = 32

    # Pre-compute each tensor's offset (relative to the tensor-data section
    # start), aligning every tensor to the GGUF alignment.
    data_offsets = []
    running = 0
    for _, _, _, _, data in output_tensors:
        running = align_offset(running, alignment)
        data_offsets.append(running)
        running += len(data)

    with open(output_path, 'wb') as f:
        # Fixed header with the new tensor count; everything else unchanged.
        f.write(magic)
        f.write(struct.pack('<I', version))
        f.write(struct.pack('<Q', n_tensors_new))
        f.write(struct.pack('<Q', n_kv))
        f.write(kv_data_bytes)

        # Tensor-info section, then pad up to the aligned data section start.
        for i, (name, n_dims, dims, ttype, data) in enumerate(output_tensors):
            write_tensor_info(f, name, n_dims, dims, ttype, data_offsets[i])
        ti_section_end = f.tell()
        tensor_data_start = align_offset(ti_section_end, alignment)
        if tensor_data_start > ti_section_end:
            f.write(b'\x00' * (tensor_data_start - ti_section_end))

        # Tensor data, padding each tensor out to its pre-computed offset.
        for i, (name, n_dims, dims, ttype, data) in enumerate(output_tensors):
            current_pos = f.tell() - tensor_data_start
            target_pos = data_offsets[i]
            if target_pos > current_pos:
                f.write(b'\x00' * (target_pos - current_pos))
            f.write(data)
        final_size = f.tell()

    print(f" Output size: {final_size / 1e9:.2f} GB")
    print(" Done!")


if __name__ == '__main__':
    main()