h9-tec 2025-12-16 16:38:07 +08:00 committed by GitHub
commit 755aeef41c
14 changed files with 2754 additions and 0 deletions

convert_nougat_to_gguf.py Normal file

@ -0,0 +1,386 @@
#!/usr/bin/env python3
"""
Convert Nougat (Neural Optical Understanding for Academic Documents) model to GGUF format.
This script handles the conversion of Nougat's Swin Transformer encoder and mBART decoder.
"""
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import torch
from transformers import NougatProcessor, VisionEncoderDecoderModel
# Add the bundled gguf-py package (located next to this script) to the import path
sys.path.append(str(Path(__file__).parent / "gguf-py"))
import gguf
# Constants for Nougat
NOUGAT_VISION_PREFIX = "vision_model"
NOUGAT_DECODER_PREFIX = "decoder"
NOUGAT_ENCODER_PREFIX = "encoder"
def parse_args():
parser = argparse.ArgumentParser(description="Convert Nougat model to GGUF format")
parser.add_argument(
"--model-id",
type=str,
default="facebook/nougat-base",
help="HuggingFace model ID or path to local model",
)
parser.add_argument(
"--output-dir",
type=str,
default="./models",
help="Output directory for GGUF files",
)
parser.add_argument(
"--quantization",
type=str,
choices=["f32", "f16", "q8_0", "q4_0", "q4_1"],
default="f16",
help="Quantization type for model weights",
)
parser.add_argument(
"--split-model",
action="store_true",
help="Split into separate vision and text GGUF files",
)
parser.add_argument(
"--vocab-only",
action="store_true",
help="Only export vocabulary/tokenizer",
)
parser.add_argument(
"--verbose",
action="store_true",
help="Verbose output during conversion",
)
return parser.parse_args()
def get_tensor_name(name: str) -> str:
"""Map Nougat tensor names to GGUF tensor names"""
# Vision model (Swin Transformer) mappings
if name.startswith("encoder.model.encoder."):
# Swin encoder layers
name = name.replace("encoder.model.encoder.", "swin.")
# Patch embedding
if "embeddings.patch_embeddings" in name:
if "projection.weight" in name:
return "swin.patch_embed.weight"
elif "projection.bias" in name:
return "swin.patch_embed.bias"
# Position embeddings
if "position_embeddings" in name:
return "swin.pos_embed"
# Layer mappings
if "layers." in name:
# Extract stage and layer indices
parts = name.split(".")
for i, part in enumerate(parts):
if part == "layers":
stage_idx = int(parts[i + 1])
if "blocks." in name:
block_idx = int(parts[parts.index("blocks") + 1])
# Attention components
if "attn.qkv" in name:
return f"swin.stage.{stage_idx}.layer.{block_idx}.attn.qkv.{'weight' if 'weight' in name else 'bias'}"
elif "attn.proj" in name:
return f"swin.stage.{stage_idx}.layer.{block_idx}.attn.proj.{'weight' if 'weight' in name else 'bias'}"
elif "norm1" in name:
return f"swin.stage.{stage_idx}.layer.{block_idx}.norm1.{'weight' if 'weight' in name else 'bias'}"
elif "norm2" in name:
return f"swin.stage.{stage_idx}.layer.{block_idx}.norm2.{'weight' if 'weight' in name else 'bias'}"
elif "mlp.fc1" in name:
return f"swin.stage.{stage_idx}.layer.{block_idx}.mlp.fc1.{'weight' if 'weight' in name else 'bias'}"
elif "mlp.fc2" in name:
return f"swin.stage.{stage_idx}.layer.{block_idx}.mlp.fc2.{'weight' if 'weight' in name else 'bias'}"
# Downsample layers
elif "downsample" in name:
if "norm" in name:
return f"swin.stage.{stage_idx}.downsample.norm.{'weight' if 'weight' in name else 'bias'}"
elif "reduction" in name:
return f"swin.stage.{stage_idx}.downsample.reduction.weight"
# Decoder model (mBART) mappings
elif name.startswith("decoder.model."):
name = name.replace("decoder.model.", "")
# Token and position embeddings
if name == "shared.weight":
return "token_embd.weight"
elif name == "decoder.embed_positions.weight":
return "position_embd.weight"
# Decoder layers
if "decoder.layers." in name:
layer_idx = int(name.split(".")[2])
# Self-attention
if "self_attn.q_proj" in name:
return f"blk.{layer_idx}.attn_q.weight"
elif "self_attn.k_proj" in name:
return f"blk.{layer_idx}.attn_k.weight"
elif "self_attn.v_proj" in name:
return f"blk.{layer_idx}.attn_v.weight"
elif "self_attn.out_proj" in name:
return f"blk.{layer_idx}.attn_o.weight"
elif "self_attn_layer_norm" in name:
return f"blk.{layer_idx}.attn_norm.{'weight' if 'weight' in name else 'bias'}"
# Cross-attention
elif "encoder_attn.q_proj" in name:
return f"blk.{layer_idx}.attn_q_cross.weight"
elif "encoder_attn.k_proj" in name:
return f"blk.{layer_idx}.attn_k_cross.weight"
elif "encoder_attn.v_proj" in name:
return f"blk.{layer_idx}.attn_v_cross.weight"
elif "encoder_attn.out_proj" in name:
return f"blk.{layer_idx}.attn_o_cross.weight"
elif "encoder_attn_layer_norm" in name:
return f"blk.{layer_idx}.attn_norm_cross.{'weight' if 'weight' in name else 'bias'}"
# FFN
elif "fc1" in name:
return f"blk.{layer_idx}.ffn_up.weight"
elif "fc2" in name:
return f"blk.{layer_idx}.ffn_down.weight"
elif "final_layer_norm" in name:
return f"blk.{layer_idx}.ffn_norm.{'weight' if 'weight' in name else 'bias'}"
# Output layers
elif "decoder.layer_norm" in name:
return f"output_norm.{'weight' if 'weight' in name else 'bias'}"
elif "lm_head" in name:
return "output.weight"
# Encoder layers (for encoder-only export)
elif name.startswith("encoder."):
name = name.replace("encoder.", "enc.")
# Similar mappings but with enc. prefix
return f"enc.{name}"
# Default: return original name
return name
def convert_swin_encoder(model_dict: Dict[str, torch.Tensor], gguf_writer: gguf.GGUFWriter, args):
"""Convert Swin Transformer encoder weights to GGUF format"""
print("Converting Swin Transformer encoder...")
# Write Swin hyperparameters
swin_config = {
"window_size": 7,
"patch_size": 4,
"image_size": 384, # Default for Nougat
"hidden_dim": 96,
"depths": [2, 2, 6, 2],
"num_heads": [3, 6, 12, 24],
"mlp_ratio": 4.0,
"norm_eps": 1e-5,
}
gguf_writer.add_string("swin.type", "swin_transformer")
gguf_writer.add_int32("swin.window_size", swin_config["window_size"])
gguf_writer.add_int32("swin.patch_size", swin_config["patch_size"])
gguf_writer.add_int32("swin.image_size", swin_config["image_size"])
gguf_writer.add_int32("swin.hidden_dim", swin_config["hidden_dim"])
gguf_writer.add_float32("swin.mlp_ratio", swin_config["mlp_ratio"])
gguf_writer.add_float32("swin.norm_eps", swin_config["norm_eps"])
# Convert encoder weights
encoder_tensors = {k: v for k, v in model_dict.items() if k.startswith("encoder.")}
for name, tensor in encoder_tensors.items():
gguf_name = get_tensor_name(name)
if args.verbose:
print(f" {name} -> {gguf_name} {list(tensor.shape)}")
# Convert to appropriate dtype
if args.quantization == "f32":
data = tensor.float().cpu().numpy()
elif args.quantization == "f16":
data = tensor.half().cpu().numpy()
else:
# Quantization would be applied here
data = tensor.float().cpu().numpy()
gguf_writer.add_tensor(gguf_name, data)
print(f" Converted {len(encoder_tensors)} encoder tensors")
def convert_mbart_decoder(model_dict: Dict[str, torch.Tensor], gguf_writer: gguf.GGUFWriter, args):
"""Convert mBART decoder weights to GGUF format"""
print("Converting mBART decoder...")
# Write mBART architecture info
gguf_writer.add_string("general.architecture", "mbart")
# Convert decoder weights
decoder_tensors = {k: v for k, v in model_dict.items() if k.startswith("decoder.")}
for name, tensor in decoder_tensors.items():
gguf_name = get_tensor_name(name)
if args.verbose:
print(f" {name} -> {gguf_name} {list(tensor.shape)}")
# Convert to appropriate dtype
if args.quantization == "f32":
data = tensor.float().cpu().numpy()
elif args.quantization == "f16":
data = tensor.half().cpu().numpy()
else:
# Quantization would be applied here
data = tensor.float().cpu().numpy()
gguf_writer.add_tensor(gguf_name, data)
print(f" Converted {len(decoder_tensors)} decoder tensors")
def convert_tokenizer(processor, gguf_writer: gguf.GGUFWriter, args):
"""Convert Nougat tokenizer/processor to GGUF format"""
print("Converting tokenizer...")
tokenizer = processor.tokenizer
vocab = tokenizer.get_vocab()
# Write tokenizer metadata
gguf_writer.add_string("tokenizer.model", "mbart")
gguf_writer.add_int32("tokenizer.vocab_size", len(vocab))
# Add special tokens
special_tokens = {
"bos": tokenizer.bos_token,
"eos": tokenizer.eos_token,
"unk": tokenizer.unk_token,
"pad": tokenizer.pad_token,
}
for key, token in special_tokens.items():
if token:
gguf_writer.add_string(f"tokenizer.{key}_token", token)
gguf_writer.add_int32(f"tokenizer.{key}_token_id", tokenizer.convert_tokens_to_ids(token))
# Add vocabulary
tokens = []
scores = []
token_types = []
for token, token_id in sorted(vocab.items(), key=lambda x: x[1]):
tokens.append(token.encode("utf-8"))
scores.append(0.0) # Dummy scores for now
token_types.append(1 if token in tokenizer.all_special_tokens else 0)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(token_types)
print(f" Vocabulary size: {len(vocab)}")
def main():
args = parse_args()
# Create output directory
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Loading Nougat model from {args.model_id}...")
# Load model and processor
processor = NougatProcessor.from_pretrained(args.model_id)
model = VisionEncoderDecoderModel.from_pretrained(args.model_id)
# Get model state dict
state_dict = model.state_dict()
if args.split_model:
# Create separate files for vision and text models
# Vision model (Swin encoder)
vision_output = output_dir / "nougat-vision.gguf"
print(f"\nCreating vision model: {vision_output}")
vision_writer = gguf.GGUFWriter(str(vision_output), "nougat-vision")
vision_writer.add_string("general.name", "Nougat Vision Model (Swin)")
vision_writer.add_string("general.description", "Swin Transformer encoder for Nougat OCR")
vision_writer.add_string("general.architecture", "swin")
convert_swin_encoder(state_dict, vision_writer, args)
vision_writer.write_header_to_file()
vision_writer.write_kv_data_to_file()
vision_writer.write_tensors_to_file()
vision_writer.close()
# Text model (mBART decoder)
text_output = output_dir / "nougat-text.gguf"
print(f"\nCreating text model: {text_output}")
text_writer = gguf.GGUFWriter(str(text_output), "nougat-text")
text_writer.add_string("general.name", "Nougat Text Model (mBART)")
text_writer.add_string("general.description", "mBART decoder for Nougat OCR")
convert_mbart_decoder(state_dict, text_writer, args)
convert_tokenizer(processor, text_writer, args)
text_writer.write_header_to_file()
text_writer.write_kv_data_to_file()
text_writer.write_tensors_to_file()
text_writer.close()
else:
# Create single combined model file
output_file = output_dir / "nougat-combined.gguf"
print(f"\nCreating combined model: {output_file}")
writer = gguf.GGUFWriter(str(output_file), "nougat")
writer.add_string("general.name", "Nougat OCR Model")
writer.add_string("general.description", "Neural Optical Understanding for Academic Documents")
writer.add_string("general.architecture", "nougat")
# Add encoder and decoder weights (skipped when only the vocabulary/tokenizer is requested)
if not args.vocab_only:
    convert_swin_encoder(state_dict, writer, args)
    convert_mbart_decoder(state_dict, writer, args)
convert_tokenizer(processor, writer, args)
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
print("\nConversion complete!")
# Print model statistics
total_params = sum(p.numel() for p in model.parameters())
print(f"\nModel statistics:")
print(f" Total parameters: {total_params:,}")
print(f" Encoder parameters: {sum(p.numel() for n, p in model.named_parameters() if 'encoder' in n):,}")
print(f" Decoder parameters: {sum(p.numel() for n, p in model.named_parameters() if 'decoder' in n):,}")
if args.quantization != "f32":
print(f" Quantization: {args.quantization}")
if __name__ == "__main__":
main()
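As a quick sanity check of the name-mapping rules above, here is a small hypothetical snippet (it assumes the script is importable as a module named convert_nougat_to_gguf, and that its top-level imports such as torch and transformers are installed); the expected GGUF names follow directly from the mapping rules defined above:

# Hypothetical sanity check for get_tensor_name().
from convert_nougat_to_gguf import get_tensor_name

expected = {
    "encoder.model.encoder.embeddings.patch_embeddings.projection.weight": "swin.patch_embed.weight",
    "decoder.model.decoder.layers.0.self_attn.q_proj.weight": "blk.0.attn_q.weight",
    "decoder.model.decoder.layers.3.encoder_attn.k_proj.weight": "blk.3.attn_k_cross.weight",
    "decoder.model.decoder.layers.3.fc1.weight": "blk.3.ffn_up.weight",
}
for hf_name, gguf_name in expected.items():
    assert get_tensor_name(hf_name) == gguf_name, (hf_name, get_tensor_name(hf_name))
print("tensor-name mapping looks consistent")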


@ -410,6 +410,8 @@ class MODEL_ARCH(IntEnum):
BITNET = auto()
T5 = auto()
T5ENCODER = auto()
MBART = auto()
MBARTENCODER = auto()
JAIS = auto()
NEMOTRON = auto()
NEMOTRON_H = auto()
@ -784,6 +786,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.BITNET: "bitnet",
MODEL_ARCH.T5: "t5",
MODEL_ARCH.T5ENCODER: "t5encoder",
MODEL_ARCH.MBART: "mbart",
MODEL_ARCH.MBARTENCODER: "mbartencoder",
MODEL_ARCH.JAIS: "jais",
MODEL_ARCH.NEMOTRON: "nemotron",
MODEL_ARCH.NEMOTRON_H: "nemotron_h",
@ -2485,6 +2489,48 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ENC_FFN_UP,
MODEL_TENSOR.ENC_OUTPUT_NORM,
],
MODEL_ARCH.MBART: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_NORM_CROSS,
MODEL_TENSOR.ATTN_Q_CROSS,
MODEL_TENSOR.ATTN_K_CROSS,
MODEL_TENSOR.ATTN_V_CROSS,
MODEL_TENSOR.ATTN_OUT_CROSS,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ENC_ATTN_NORM,
MODEL_TENSOR.ENC_ATTN_Q,
MODEL_TENSOR.ENC_ATTN_K,
MODEL_TENSOR.ENC_ATTN_V,
MODEL_TENSOR.ENC_ATTN_OUT,
MODEL_TENSOR.ENC_FFN_NORM,
MODEL_TENSOR.ENC_FFN_DOWN,
MODEL_TENSOR.ENC_FFN_UP,
MODEL_TENSOR.ENC_OUTPUT_NORM,
],
MODEL_ARCH.MBARTENCODER: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ENC_ATTN_NORM,
MODEL_TENSOR.ENC_ATTN_Q,
MODEL_TENSOR.ENC_ATTN_K,
MODEL_TENSOR.ENC_ATTN_V,
MODEL_TENSOR.ENC_ATTN_OUT,
MODEL_TENSOR.ENC_FFN_NORM,
MODEL_TENSOR.ENC_FFN_DOWN,
MODEL_TENSOR.ENC_FFN_UP,
MODEL_TENSOR.ENC_OUTPUT_NORM,
],
MODEL_ARCH.JAIS: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,


@ -72,6 +72,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" },
{ LLM_ARCH_T5ENCODER, "t5encoder" },
{ LLM_ARCH_MBART, "mbart" },
{ LLM_ARCH_MBARTENCODER, "mbartencoder" },
{ LLM_ARCH_JAIS, "jais" },
{ LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_NEMOTRON_H, "nemotron_h" },
@ -1706,6 +1708,54 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
},
},
{
LLM_ARCH_MBART,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_POS_EMBD, "position_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_o" },
{ LLM_TENSOR_ATTN_NORM_CROSS, "blk.%d.attn_norm_cross" },
{ LLM_TENSOR_ATTN_Q_CROSS, "blk.%d.attn_q_cross" },
{ LLM_TENSOR_ATTN_K_CROSS, "blk.%d.attn_k_cross" },
{ LLM_TENSOR_ATTN_V_CROSS, "blk.%d.attn_v_cross" },
{ LLM_TENSOR_ATTN_OUT_CROSS, "blk.%d.attn_o_cross" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
{ LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
{ LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
{ LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
{ LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
{ LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
{ LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
{ LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
},
},
{
LLM_ARCH_MBARTENCODER,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_POS_EMBD, "position_embd" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
{ LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
{ LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
{ LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
{ LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
{ LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
{ LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
{ LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
{ LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
},
},
{
LLM_ARCH_JAIS,
{


@ -76,6 +76,8 @@ enum llm_arch {
LLM_ARCH_BITNET,
LLM_ARCH_T5,
LLM_ARCH_T5ENCODER,
LLM_ARCH_MBART,
LLM_ARCH_MBARTENCODER,
LLM_ARCH_JAIS,
LLM_ARCH_NEMOTRON,
LLM_ARCH_NEMOTRON_H,
@ -326,6 +328,11 @@ enum llm_tensor {
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_NORM_CROSS,
LLM_TENSOR_ATTN_Q_CROSS,
LLM_TENSOR_ATTN_K_CROSS,
LLM_TENSOR_ATTN_V_CROSS,
LLM_TENSOR_ATTN_OUT_CROSS,
LLM_TENSOR_ATTN_OUT_NORM,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,

src/models/mbart-dec.cpp Normal file

@ -0,0 +1,162 @@
#include "models.h"
llm_build_mbart_dec::llm_build_mbart_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
// mBART uses learned positional embeddings
inpL = build_inp_embd(model.tok_embd);
// Add positional embeddings
ggml_tensor * pos_embd = build_inp_pos_embd();
if (pos_embd) {
inpL = ggml_add(ctx0, inpL, pos_embd);
cb(inpL, "pos_embd", -1);
}
// Get encoder embeddings for cross-attention
ggml_tensor * embd_enc = build_inp_cross_embd();
const int64_t n_outputs_enc = embd_enc->ne[1];
// Layer normalization before the first layer (mBART characteristic)
cur = build_norm(inpL,
model.output_norm, NULL,
LLM_NORM, -1);
cb(cur, "input_norm", -1);
inpL = cur;
auto * inp_attn_self = build_attn_inp_kv();
auto * inp_attn_cross = build_attn_inp_cross();
ggml_tensor * inp_out_ids = build_inp_out_ids();
const int64_t dec_n_layer = hparams.dec_n_layer;
for (int il = 0; il < dec_n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// self-attention
{
// norm before attention
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM, il);
cb(cur, "attn_norm", il);
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
// mBART uses standard scaled dot-product attention
cur = build_attn(inp_attn_self,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf((float)n_embd_head), il);
cb(cur, "kqv_out", il);
}
// residual connection
cur = ggml_add(ctx0, cur, inpSA);
cb(cur, "self_attn_out", il);
ggml_tensor * inpCA = cur;
// cross-attention
{
// norm before cross-attention
cur = build_norm(cur,
model.layers[il].attn_norm_cross, NULL,
LLM_NORM, il);
cb(cur, "attn_norm_cross", il);
// Q from decoder, K and V from encoder
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur);
cb(Qcur, "Qcur_cross", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc);
cb(Kcur, "Kcur_cross", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc);
cb(Vcur, "Vcur_cross", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
cur = build_attn(inp_attn_cross,
model.layers[il].wo_cross, model.layers[il].bo_cross,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf((float)n_embd_head), il);
cb(cur, "kqv_cross_out", il);
}
if (il == dec_n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
}
// residual connection
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
cb(ffn_inp, "cross_attn_out", il);
// feed-forward network
{
// norm before FFN
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM, il);
cb(cur, "ffn_norm", il);
// mBART uses GELU activation
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GELU,
LLM_FFN_SEQ,
il);
cb(cur, "ffn_out", il);
}
// residual connection
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "layer_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cb(cur, "result_embd", -1);
// Final layer normalization
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head for generation
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}

src/models/mbart-enc.cpp Normal file

@ -0,0 +1,114 @@
#include "models.h"
llm_build_mbart_enc::llm_build_mbart_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
// mBART uses learned positional embeddings
inpL = build_inp_embd(model.tok_embd);
// Add positional embeddings for mBART
ggml_tensor * pos_embd = build_inp_pos_embd();
if (pos_embd) {
inpL = ggml_add(ctx0, inpL, pos_embd);
cb(inpL, "pos_embd", -1);
}
// Layer normalization before the first layer (mBART characteristic)
cur = build_norm(inpL,
model.output_norm_enc, NULL,
LLM_NORM, -1);
cb(cur, "input_norm", -1);
inpL = cur;
auto * inp_attn = build_attn_inp_no_cache();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// self-attention (mBART uses pre-norm)
{
// norm before attention
cur = build_norm(inpL,
model.layers[il].attn_norm_enc, NULL,
LLM_NORM, il);
cb(cur, "attn_norm", il);
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
// mBART uses standard scaled dot-product attention without relative position bias
cur = build_attn(inp_attn,
model.layers[il].wo_enc, model.layers[il].bo_enc,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf((float)n_embd_head), il);
cb(cur, "kqv_out", il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// residual connection
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "attn_out", il);
// feed-forward network
{
// norm before FFN
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm_enc, NULL,
LLM_NORM, il);
cb(cur, "ffn_norm", il);
// mBART uses GELU activation
cur = build_ffn(cur,
model.layers[il].ffn_up_enc, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down_enc, NULL, NULL,
NULL,
LLM_FFN_GELU,
LLM_FFN_SEQ,
il);
cb(cur, "ffn_out", il);
}
// residual connection
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cb(cur, "result_embd", -1);
// Final layer normalization
cur = build_norm(cur,
model.output_norm_enc, NULL,
LLM_NORM, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}


@ -535,6 +535,14 @@ struct llm_build_t5_enc : public llm_graph_context {
llm_build_t5_enc(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mbart_enc : public llm_graph_context {
llm_build_mbart_enc(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mbart_dec : public llm_graph_context {
llm_build_mbart_dec(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params);
};


@ -29,6 +29,7 @@ else()
add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(mtmd)
add_subdirectory(nougat)
if (GGML_RPC)
add_subdirectory(rpc)
endif()


@ -0,0 +1,386 @@
#include "clip.h"
#include "swin.h"
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <vector>
#include <string>
#include <memory>
#include <algorithm>
// External image loading library integration (stb_image)
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/stat.h>
#endif
// Document-specific preprocessing parameters for Nougat
struct nougat_preprocess_params {
int target_width = 896; // Nougat uses different resolution than standard vision models
int target_height = 1344; // Optimized for document aspect ratio
float mean[3] = {0.485f, 0.456f, 0.406f}; // ImageNet normalization
float std[3] = {0.229f, 0.224f, 0.225f};
bool center_crop = false; // Documents should not be center-cropped
bool maintain_aspect = true; // Important for documents
int patch_size = 4; // Swin Transformer patch size
};
// Structure to hold document metadata
struct document_metadata {
int original_width;
int original_height;
int num_pages;
std::string format; // PDF, PNG, JPG, etc.
float dpi;
};
// Preprocess a single document image for Nougat
static bool preprocess_document_image(
const uint8_t* img_data,
int width,
int height,
int channels,
const nougat_preprocess_params& params,
std::vector<float>& output) {
// Calculate scaling to fit target dimensions while maintaining aspect ratio
float scale_w = static_cast<float>(params.target_width) / width;
float scale_h = static_cast<float>(params.target_height) / height;
float scale = params.maintain_aspect ? std::min(scale_w, scale_h) : 1.0f;
int new_width = static_cast<int>(width * scale);
int new_height = static_cast<int>(height * scale);
// Ensure dimensions are divisible by patch size
new_width = (new_width / params.patch_size) * params.patch_size;
new_height = (new_height / params.patch_size) * params.patch_size;
// Resize image using bilinear interpolation
std::vector<uint8_t> resized_img(new_width * new_height * 3);
for (int y = 0; y < new_height; y++) {
for (int x = 0; x < new_width; x++) {
float src_x = x / scale;
float src_y = y / scale;
int x0 = static_cast<int>(src_x);
int y0 = static_cast<int>(src_y);
int x1 = std::min(x0 + 1, width - 1);
int y1 = std::min(y0 + 1, height - 1);
float fx = src_x - x0;
float fy = src_y - y0;
for (int c = 0; c < 3; c++) {
float v00 = img_data[(y0 * width + x0) * channels + c];
float v10 = img_data[(y0 * width + x1) * channels + c];
float v01 = img_data[(y1 * width + x0) * channels + c];
float v11 = img_data[(y1 * width + x1) * channels + c];
float v0 = v00 * (1 - fx) + v10 * fx;
float v1 = v01 * (1 - fx) + v11 * fx;
float v = v0 * (1 - fy) + v1 * fy;
resized_img[(y * new_width + x) * 3 + c] = static_cast<uint8_t>(v);
}
}
}
// Pad to target size if needed
int pad_left = (params.target_width - new_width) / 2;
int pad_top = (params.target_height - new_height) / 2;
output.resize(params.target_width * params.target_height * 3);
// Initialize with padding (white background for documents)
std::fill(output.begin(), output.end(), 1.0f);
// Copy resized image to output with normalization
for (int y = 0; y < new_height; y++) {
for (int x = 0; x < new_width; x++) {
int out_x = x + pad_left;
int out_y = y + pad_top;
if (out_x >= 0 && out_x < params.target_width &&
out_y >= 0 && out_y < params.target_height) {
for (int c = 0; c < 3; c++) {
float pixel = resized_img[(y * new_width + x) * 3 + c] / 255.0f;
pixel = (pixel - params.mean[c]) / params.std[c];
output[(out_y * params.target_width + out_x) * 3 + c] = pixel;
}
}
}
}
return true;
}
// Load and preprocess a document file (supports various formats)
bool nougat_preprocess_document_file(
const std::string& filename,
nougat_preprocess_params& params,
std::vector<std::vector<float>>& page_outputs,
document_metadata& metadata) {
// Check file extension
std::string ext = filename.substr(filename.find_last_of(".") + 1);
std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
metadata.format = ext;
if (ext == "pdf") {
// PDF processing would require a PDF library like poppler or mupdf
// For now, we'll return an error for PDF files
fprintf(stderr, "PDF processing not yet implemented. Please convert to image format.\n");
return false;
}
// Load image using stb_image
int width, height, channels;
unsigned char* img_data = stbi_load(filename.c_str(), &width, &height, &channels, 3);
if (!img_data) {
fprintf(stderr, "Failed to load image: %s\n", filename.c_str());
return false;
}
metadata.original_width = width;
metadata.original_height = height;
metadata.num_pages = 1; // Single image
metadata.dpi = 300.0f; // Assume standard document DPI
// Preprocess the image
std::vector<float> output;
bool success = preprocess_document_image(
img_data, width, height, 3, params, output);
if (success) {
page_outputs.push_back(output);
}
stbi_image_free(img_data);
return success;
}
// Batch preprocessing for multiple document pages
bool nougat_preprocess_document_batch(
const std::vector<std::string>& filenames,
nougat_preprocess_params& params,
std::vector<std::vector<float>>& outputs) {
outputs.clear();
outputs.reserve(filenames.size());
for (const auto& filename : filenames) {
document_metadata metadata;
std::vector<std::vector<float>> page_outputs;
if (!nougat_preprocess_document_file(filename, params, page_outputs, metadata)) {
fprintf(stderr, "Failed to preprocess: %s\n", filename.c_str());
continue;
}
// Add all pages from this document
for (auto& page : page_outputs) {
outputs.push_back(std::move(page));
}
}
return !outputs.empty();
}
// Apply document-specific augmentations
void nougat_augment_document(
std::vector<float>& image_data,
int width,
int height,
bool random_rotation = false,
bool deskew = true,
bool denoise = true) {
// Document deskewing (straighten tilted scans)
if (deskew) {
// Simplified deskew - would need proper implementation
// using Hough transform or similar technique
}
// Denoising for scanned documents
if (denoise) {
// Apply median filter or similar denoising
// Simplified implementation
std::vector<float> temp = image_data;
for (int y = 1; y < height - 1; y++) {
for (int x = 1; x < width - 1; x++) {
for (int c = 0; c < 3; c++) {
std::vector<float> neighborhood;
// Collect 3x3 neighborhood
for (int dy = -1; dy <= 1; dy++) {
for (int dx = -1; dx <= 1; dx++) {
int idx = ((y + dy) * width + (x + dx)) * 3 + c;
neighborhood.push_back(temp[idx]);
}
}
// Median filter
std::sort(neighborhood.begin(), neighborhood.end());
image_data[(y * width + x) * 3 + c] = neighborhood[4];
}
}
}
}
// Random rotation for augmentation during training
if (random_rotation) {
// Apply small random rotation (-5 to +5 degrees)
// Would need proper rotation implementation
}
}
// Extract text regions from document for focused processing
struct text_region {
int x, y, width, height;
float confidence;
};
std::vector<text_region> nougat_detect_text_regions(
const std::vector<float>& image_data,
int width,
int height) {
std::vector<text_region> regions;
// Simple text detection based on connected components
// This would need a proper implementation using:
// - Edge detection
// - Connected component analysis
// - Text/non-text classification
// For now, return the whole image as a single region
text_region full_page;
full_page.x = 0;
full_page.y = 0;
full_page.width = width;
full_page.height = height;
full_page.confidence = 1.0f;
regions.push_back(full_page);
return regions;
}
// Enhanced preprocessing for mathematical formulas
void nougat_preprocess_math_regions(
std::vector<float>& image_data,
int width,
int height,
const std::vector<text_region>& math_regions) {
// Apply special preprocessing for mathematical content
for (const auto& region : math_regions) {
// Enhance contrast for mathematical symbols
for (int y = region.y; y < region.y + region.height; y++) {
for (int x = region.x; x < region.x + region.width; x++) {
for (int c = 0; c < 3; c++) {
int idx = (y * width + x) * 3 + c;
float& pixel = image_data[idx];
// Increase contrast
pixel = (pixel - 0.5f) * 1.2f + 0.5f;
pixel = std::max(0.0f, std::min(1.0f, pixel));
}
}
}
}
}
// Table detection and preprocessing
struct table_region {
text_region bounds;
int rows, cols;
std::vector<text_region> cells;
};
std::vector<table_region> nougat_detect_tables(
const std::vector<float>& image_data,
int width,
int height) {
std::vector<table_region> tables;
// Table detection would require:
// - Line detection (horizontal and vertical)
// - Grid structure analysis
// - Cell boundary detection
// Placeholder implementation
return tables;
}
// Main preprocessing pipeline for Nougat OCR
extern "C" bool nougat_preprocess_pipeline(
const char* input_path,
float** output_data,
int* output_width,
int* output_height,
int* num_pages) {
nougat_preprocess_params params;
std::vector<std::vector<float>> page_outputs;
document_metadata metadata;
// Load and preprocess document
if (!nougat_preprocess_document_file(
input_path, params, page_outputs, metadata)) {
return false;
}
// Apply document-specific processing
for (auto& page : page_outputs) {
// Detect text regions
auto text_regions = nougat_detect_text_regions(
page, params.target_width, params.target_height);
// Apply augmentations
nougat_augment_document(
page, params.target_width, params.target_height,
false, true, true);
// Detect and process mathematical regions
// (would need actual math detection)
// nougat_preprocess_math_regions(page, width, height, math_regions);
}
// Prepare output
if (!page_outputs.empty()) {
*output_width = params.target_width;
*output_height = params.target_height;
*num_pages = page_outputs.size();
// Allocate and copy data
size_t total_size = params.target_width * params.target_height * 3 * page_outputs.size();
*output_data = new float[total_size];
size_t offset = 0;
for (const auto& page : page_outputs) {
std::copy(page.begin(), page.end(), *output_data + offset);
offset += page.size();
}
return true;
}
return false;
}
// Cleanup function
extern "C" void nougat_preprocess_cleanup(float* data) {
delete[] data;
}
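For callers outside C++, a minimal sketch of driving the extern "C" entry points above from Python via ctypes. This assumes the preprocessing code is compiled into a shared library; the library name (libnougat_preprocess.so) and the input file (page.png) are placeholders, while the two symbols are exactly the functions defined above:

# Hypothetical ctypes binding for nougat_preprocess_pipeline / nougat_preprocess_cleanup.
import ctypes

lib = ctypes.CDLL("./libnougat_preprocess.so")  # assumed library name

lib.nougat_preprocess_pipeline.restype = ctypes.c_bool
lib.nougat_preprocess_pipeline.argtypes = [
    ctypes.c_char_p,                                  # input_path
    ctypes.POINTER(ctypes.POINTER(ctypes.c_float)),   # output_data
    ctypes.POINTER(ctypes.c_int),                     # output_width
    ctypes.POINTER(ctypes.c_int),                     # output_height
    ctypes.POINTER(ctypes.c_int),                     # num_pages
]
lib.nougat_preprocess_cleanup.restype = None
lib.nougat_preprocess_cleanup.argtypes = [ctypes.POINTER(ctypes.c_float)]

data = ctypes.POINTER(ctypes.c_float)()
w, h, n = ctypes.c_int(), ctypes.c_int(), ctypes.c_int()
ok = lib.nougat_preprocess_pipeline(b"page.png", ctypes.byref(data),
                                    ctypes.byref(w), ctypes.byref(h), ctypes.byref(n))
if ok:
    print(f"preprocessed {n.value} page(s) at {w.value}x{h.value}")
    lib.nougat_preprocess_cleanup(data)  # frees the buffer allocated with new[] above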


@ -0,0 +1,400 @@
#!/usr/bin/env python3
"""
Nougat Model Surgery Script
Splits the Nougat model into separate vision encoder (Swin) and text decoder (mBART) components.
Also creates the multimodal projector that connects them.
"""
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
from transformers import VisionEncoderDecoderModel, NougatProcessor
# Add parent directory to import gguf
sys.path.append(str(Path(__file__).parent.parent / "gguf-py"))
import gguf
class NougatModelSurgeon:
"""Handles splitting and converting Nougat model components"""
def __init__(self, model_id: str, output_dir: str, verbose: bool = False):
self.model_id = model_id
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
self.verbose = verbose
# Load the model
print(f"Loading Nougat model from {model_id}...")
self.model = VisionEncoderDecoderModel.from_pretrained(model_id)
self.processor = NougatProcessor.from_pretrained(model_id)
def extract_vision_encoder(self) -> Dict[str, torch.Tensor]:
"""Extract Swin Transformer vision encoder weights"""
print("Extracting vision encoder (Swin Transformer)...")
vision_dict = {}
encoder = self.model.encoder
# Get all encoder parameters
for name, param in encoder.named_parameters():
# Map to our Swin naming convention
mapped_name = self._map_swin_tensor_name(name)
vision_dict[mapped_name] = param.detach().cpu()
if self.verbose:
print(f" {name} -> {mapped_name} {list(param.shape)}")
print(f" Extracted {len(vision_dict)} vision encoder tensors")
return vision_dict
def extract_text_decoder(self) -> Dict[str, torch.Tensor]:
"""Extract mBART text decoder weights"""
print("Extracting text decoder (mBART)...")
decoder_dict = {}
decoder = self.model.decoder
# Get all decoder parameters
for name, param in decoder.named_parameters():
# Map to our mBART naming convention
mapped_name = self._map_mbart_tensor_name(name)
decoder_dict[mapped_name] = param.detach().cpu()
if self.verbose:
print(f" {name} -> {mapped_name} {list(param.shape)}")
print(f" Extracted {len(decoder_dict)} text decoder tensors")
return decoder_dict
def extract_projector(self) -> Dict[str, torch.Tensor]:
"""Extract multimodal projector that connects vision and text models"""
print("Extracting multimodal projector...")
projector_dict = {}
# In Nougat, the projection happens through the decoder's cross-attention
# We need to extract the projection matrices that connect encoder outputs to decoder
# Look for cross-attention weights in decoder
for name, param in self.model.decoder.named_parameters():
if "encoder_attn" in name:
# These are the cross-attention weights that project from vision to text
mapped_name = self._map_projector_tensor_name(name)
projector_dict[mapped_name] = param.detach().cpu()
if self.verbose:
print(f" {name} -> {mapped_name} {list(param.shape)}")
# If there's a specific projection layer between encoder and decoder
if hasattr(self.model, "enc_to_dec_proj"):
projector_dict["mm.projector.weight"] = self.model.enc_to_dec_proj.weight.detach().cpu()
if hasattr(self.model.enc_to_dec_proj, "bias"):
projector_dict["mm.projector.bias"] = self.model.enc_to_dec_proj.bias.detach().cpu()
print(f" Extracted {len(projector_dict)} projector tensors")
return projector_dict
def _map_swin_tensor_name(self, name: str) -> str:
"""Map HuggingFace Swin tensor names to our convention"""
# Remove model prefix
if name.startswith("model.encoder."):
name = name[len("model.encoder."):]
elif name.startswith("encoder."):
name = name[len("encoder."):]
# Patch embeddings
if "embeddings.patch_embeddings" in name:
if "projection.weight" in name:
return "swin.patch_embed.weight"
elif "projection.bias" in name:
return "swin.patch_embed.bias"
elif "norm" in name:
return f"swin.patch_embed.norm.{'weight' if 'weight' in name else 'bias'}"
# Position embeddings
if "position_embeddings" in name:
return "swin.pos_embed"
# Parse layer structure
if "layers." in name:
parts = name.split(".")
stage_idx = None
layer_idx = None
# Find stage and layer indices
for i, part in enumerate(parts):
if part == "layers" and i + 1 < len(parts):
stage_idx = int(parts[i + 1])
if part == "blocks" and i + 1 < len(parts):
layer_idx = int(parts[i + 1])
if stage_idx is not None:
# Layer-specific components
if layer_idx is not None:
# Attention
if "attn.qkv" in name:
suffix = "weight" if "weight" in name else "bias"
return f"swin.stage.{stage_idx}.layer.{layer_idx}.attn.qkv.{suffix}"
elif "attn.proj" in name:
suffix = "weight" if "weight" in name else "bias"
return f"swin.stage.{stage_idx}.layer.{layer_idx}.attn.proj.{suffix}"
# Norms
elif "norm1" in name:
suffix = "weight" if "weight" in name else "bias"
return f"swin.stage.{stage_idx}.layer.{layer_idx}.norm1.{suffix}"
elif "norm2" in name:
suffix = "weight" if "weight" in name else "bias"
return f"swin.stage.{stage_idx}.layer.{layer_idx}.norm2.{suffix}"
# MLP/FFN
elif "mlp.fc1" in name:
suffix = "weight" if "weight" in name else "bias"
return f"swin.stage.{stage_idx}.layer.{layer_idx}.mlp.fc1.{suffix}"
elif "mlp.fc2" in name:
suffix = "weight" if "weight" in name else "bias"
return f"swin.stage.{stage_idx}.layer.{layer_idx}.mlp.fc2.{suffix}"
# Relative position bias
elif "relative_position_bias_table" in name:
return f"swin.stage.{stage_idx}.layer.{layer_idx}.attn.relative_position_bias_table"
# Downsample layers between stages
elif "downsample" in name:
if "norm" in name:
suffix = "weight" if "weight" in name else "bias"
return f"swin.stage.{stage_idx}.downsample.norm.{suffix}"
elif "reduction" in name:
return f"swin.stage.{stage_idx}.downsample.reduction.weight"
# Output normalization
if "layernorm" in name or "layer_norm" in name:
if "final" in name or "output" in name:
suffix = "weight" if "weight" in name else "bias"
return f"swin.norm.{suffix}"
# Default mapping
return f"swin.{name}"
def _map_mbart_tensor_name(self, name: str) -> str:
"""Map HuggingFace mBART tensor names to our convention"""
# Remove model prefix
if name.startswith("model.decoder."):
name = name[len("model.decoder."):]
elif name.startswith("decoder."):
name = name[len("decoder."):]
# Embeddings
if name == "embed_tokens.weight" or name == "shared.weight":
return "token_embd.weight"
elif "embed_positions" in name:
return "position_embd.weight"
# Parse decoder layers
if "layers." in name:
parts = name.split(".")
layer_idx = int(parts[1])
# Self-attention
if "self_attn.q_proj" in name:
return f"blk.{layer_idx}.attn_q.weight"
elif "self_attn.k_proj" in name:
return f"blk.{layer_idx}.attn_k.weight"
elif "self_attn.v_proj" in name:
return f"blk.{layer_idx}.attn_v.weight"
elif "self_attn.out_proj" in name:
return f"blk.{layer_idx}.attn_o.weight"
elif "self_attn_layer_norm" in name:
suffix = "weight" if "weight" in name else "bias"
return f"blk.{layer_idx}.attn_norm.{suffix}"
# Cross-attention (encoder-decoder attention)
elif "encoder_attn.q_proj" in name:
return f"blk.{layer_idx}.attn_q_cross.weight"
elif "encoder_attn.k_proj" in name:
return f"blk.{layer_idx}.attn_k_cross.weight"
elif "encoder_attn.v_proj" in name:
return f"blk.{layer_idx}.attn_v_cross.weight"
elif "encoder_attn.out_proj" in name:
return f"blk.{layer_idx}.attn_o_cross.weight"
elif "encoder_attn_layer_norm" in name:
suffix = "weight" if "weight" in name else "bias"
return f"blk.{layer_idx}.attn_norm_cross.{suffix}"
# FFN
elif "fc1" in name:
return f"blk.{layer_idx}.ffn_up.weight"
elif "fc2" in name:
return f"blk.{layer_idx}.ffn_down.weight"
elif "final_layer_norm" in name:
suffix = "weight" if "weight" in name else "bias"
return f"blk.{layer_idx}.ffn_norm.{suffix}"
# Output layers
elif "layernorm" in name or "layer_norm" in name:
suffix = "weight" if "weight" in name else "bias"
return f"output_norm.{suffix}"
elif "lm_head" in name or "output_projection" in name:
return "output.weight"
# Default mapping
return name
def _map_projector_tensor_name(self, name: str) -> str:
"""Map cross-attention tensors to projector names"""
# Extract layer index from name
if "layers." in name:
parts = name.split(".")
layer_idx = int(parts[1])
if "encoder_attn.q_proj" in name:
return f"mm.layer.{layer_idx}.q_proj.weight"
elif "encoder_attn.k_proj" in name:
return f"mm.layer.{layer_idx}.k_proj.weight"
elif "encoder_attn.v_proj" in name:
return f"mm.layer.{layer_idx}.v_proj.weight"
elif "encoder_attn.out_proj" in name:
return f"mm.layer.{layer_idx}.out_proj.weight"
return f"mm.{name}"
def save_component(self, tensors: Dict[str, torch.Tensor], filename: str, arch_name: str, description: str):
"""Save component tensors to GGUF file"""
output_path = self.output_dir / filename
print(f"Saving {arch_name} to {output_path}...")
writer = gguf.GGUFWriter(str(output_path), arch_name)
writer.add_string("general.name", arch_name)
writer.add_string("general.description", description)
writer.add_string("general.architecture", arch_name.lower())
# Add tensors
for name, tensor in tensors.items():
data = tensor.float().cpu().numpy()
writer.add_tensor(name, data)
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
print(f" Saved {len(tensors)} tensors")
def perform_surgery(self):
"""Main surgery operation - split model into components"""
print("\n" + "=" * 60)
print("Starting Nougat Model Surgery")
print("=" * 60)
# Extract components
vision_tensors = self.extract_vision_encoder()
text_tensors = self.extract_text_decoder()
projector_tensors = self.extract_projector()
# Save components
print("\nSaving components...")
self.save_component(
vision_tensors,
"nougat-vision-swin.gguf",
"Nougat-Vision-Swin",
"Swin Transformer vision encoder from Nougat OCR model"
)
self.save_component(
text_tensors,
"nougat-text-mbart.gguf",
"Nougat-Text-mBART",
"mBART text decoder from Nougat OCR model"
)
if projector_tensors:
self.save_component(
projector_tensors,
"nougat-projector.gguf",
"Nougat-Projector",
"Multimodal projector connecting vision and text models"
)
# Save configuration
self.save_config()
print("\n" + "=" * 60)
print("Surgery Complete!")
print(f"Output files saved to: {self.output_dir}")
print("=" * 60)
def save_config(self):
"""Save model configuration for reconstruction"""
config = {
"model_id": self.model_id,
"vision_config": {
"architecture": "swin",
"image_size": 384,
"patch_size": 4,
"window_size": 7,
"num_channels": 3,
"depths": [2, 2, 6, 2],
"num_heads": [3, 6, 12, 24],
},
"text_config": {
"architecture": "mbart",
"vocab_size": self.processor.tokenizer.vocab_size,
"max_position_embeddings": 1024,
"hidden_size": self.model.config.decoder.hidden_size,
"num_layers": self.model.config.decoder.num_hidden_layers,
"num_attention_heads": self.model.config.decoder.num_attention_heads,
},
"components": {
"vision": "nougat-vision-swin.gguf",
"text": "nougat-text-mbart.gguf",
"projector": "nougat-projector.gguf" if self.extract_projector() else None,
}
}
config_path = self.output_dir / "nougat-config.json"
with open(config_path, "w") as f:
json.dump(config, f, indent=2)
print(f"\nConfiguration saved to {config_path}")
def main():
parser = argparse.ArgumentParser(description="Nougat Model Surgery - Split model into components")
parser.add_argument(
"--model-id",
type=str,
default="facebook/nougat-base",
help="HuggingFace model ID or path to local model"
)
parser.add_argument(
"--output-dir",
type=str,
default="./models/nougat-surgery",
help="Output directory for split components"
)
parser.add_argument(
"--verbose",
action="store_true",
help="Verbose output showing tensor mappings"
)
args = parser.parse_args()
surgeon = NougatModelSurgeon(args.model_id, args.output_dir, args.verbose)
surgeon.perform_surgery()
if __name__ == "__main__":
main()
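The surgery script is normally run through its argparse entry point; below is a hypothetical programmatic equivalent, assuming the file above is saved as nougat_model_surgery.py and is importable, simply reusing the argparse defaults shown above:

# Hypothetical programmatic use of the surgery class defined above.
from nougat_model_surgery import NougatModelSurgeon

surgeon = NougatModelSurgeon(
    model_id="facebook/nougat-base",       # default --model-id
    output_dir="./models/nougat-surgery",  # default --output-dir
    verbose=True,                          # print each tensor mapping
)
# Writes nougat-vision-swin.gguf, nougat-text-mbart.gguf, nougat-projector.gguf
# (when cross-attention tensors are found) and nougat-config.json into output_dir.
surgeon.perform_surgery()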

tools/mtmd/swin.cpp Normal file

@ -0,0 +1,475 @@
#include "swin.h"
#include "clip.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <vector>
#include <array>
// Window partition operation - splits input into non-overlapping windows
struct ggml_tensor * swin_window_partition(struct ggml_context * ctx, struct ggml_tensor * x, int window_size) {
// x shape: [batch_size, height, width, channels]
// output shape: [batch_size * num_windows, window_size, window_size, channels]
int batch_size = x->ne[3];
int H = x->ne[2];
int W = x->ne[1];
int C = x->ne[0];
int nH = H / window_size;
int nW = W / window_size;
// Reshape to [batch_size, nH, window_size, nW, window_size, C]
struct ggml_tensor * reshaped = ggml_reshape_4d(ctx, x,
C * window_size,
window_size * nW,
nH,
batch_size);
// Permute to [batch_size, nH, nW, window_size, window_size, C]
struct ggml_tensor * permuted = ggml_permute(ctx, reshaped, 0, 2, 1, 3);
// Reshape to [batch_size * nH * nW, window_size, window_size, C]
struct ggml_tensor * output = ggml_reshape_4d(ctx, permuted,
C,
window_size,
window_size,
batch_size * nH * nW);
return output;
}
// Window reverse operation - merges windows back to original spatial dimensions
struct ggml_tensor * swin_window_reverse(struct ggml_context * ctx, struct ggml_tensor * windows, int window_size, int H, int W) {
// windows shape: [batch_size * num_windows, window_size, window_size, channels]
// output shape: [batch_size, height, width, channels]
int C = windows->ne[0];
int nH = H / window_size;
int nW = W / window_size;
int batch_size = windows->ne[3] / (nH * nW);
// Reshape to [batch_size, nH, nW, window_size, window_size, C]
struct ggml_tensor * reshaped = ggml_reshape_4d(ctx, windows,
C * window_size * window_size,
nW,
nH,
batch_size);
// Permute to [batch_size, nH, window_size, nW, window_size, C]
struct ggml_tensor * permuted = ggml_permute(ctx, reshaped, 0, 2, 1, 3);
// Reshape to [batch_size, H, W, C]
struct ggml_tensor * output = ggml_reshape_4d(ctx, permuted, C, W, H, batch_size);
return output;
}
// Create attention mask for shifted window attention
struct ggml_tensor * swin_create_window_mask(struct ggml_context * ctx, int window_size, int shift_size, int H, int W) {
if (shift_size == 0) {
return nullptr; // No mask needed for non-shifted windows
}
// Create a mask tensor
struct ggml_tensor * mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, H, W);
// Initialize mask with region indices
float * mask_data = (float *)mask->data;
int h_slices[] = {0, H - window_size, H - shift_size, H};
int w_slices[] = {0, W - window_size, W - shift_size, W};
int cnt = 0;
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
for (int h = h_slices[i]; h < h_slices[i + 1]; h++) {
for (int w = w_slices[j]; w < w_slices[j + 1]; w++) {
mask_data[h * W + w] = cnt;
}
}
cnt++;
}
}
return mask;
}
// Build window attention layer
static struct ggml_tensor * swin_window_attention(
struct ggml_context * ctx,
struct ggml_tensor * x,
const swin_layer & layer,
int num_heads,
int window_size,
bool shifted) {
int batch_size = x->ne[3];
int seq_len = x->ne[2] * x->ne[1]; // window_size * window_size
int hidden_dim = x->ne[0];
int head_dim = hidden_dim / num_heads;
// Reshape input for attention: [batch_size, seq_len, hidden_dim]
x = ggml_reshape_3d(ctx, x, hidden_dim, seq_len, batch_size);
// Layer norm
x = ggml_norm(ctx, x, 1e-5f); // LayerNorm epsilon (assumed default; ggml_norm expects an eps, not a dimension)
x = ggml_add(ctx, ggml_mul(ctx, x, layer.ln1_w), layer.ln1_b);
// QKV projection
struct ggml_tensor * qkv = ggml_mul_mat(ctx, layer.qkv_w, x);
qkv = ggml_add(ctx, qkv, layer.qkv_b);
// Split into Q, K, V
int qkv_dim = qkv->ne[0] / 3;
struct ggml_tensor * q = ggml_view_3d(ctx, qkv, qkv_dim, seq_len, batch_size, qkv->nb[1], qkv->nb[2], 0);
struct ggml_tensor * k = ggml_view_3d(ctx, qkv, qkv_dim, seq_len, batch_size, qkv->nb[1], qkv->nb[2], qkv_dim * ggml_element_size(qkv));
struct ggml_tensor * v = ggml_view_3d(ctx, qkv, qkv_dim, seq_len, batch_size, qkv->nb[1], qkv->nb[2], 2 * qkv_dim * ggml_element_size(qkv));
// Reshape for multi-head attention
q = ggml_reshape_4d(ctx, q, head_dim, num_heads, seq_len, batch_size);
k = ggml_reshape_4d(ctx, k, head_dim, num_heads, seq_len, batch_size);
v = ggml_reshape_4d(ctx, v, head_dim, num_heads, seq_len, batch_size);
// Transpose for attention: [batch_size, num_heads, seq_len, head_dim]
q = ggml_permute(ctx, q, 0, 2, 1, 3);
k = ggml_permute(ctx, k, 0, 2, 1, 3);
v = ggml_permute(ctx, v, 0, 2, 1, 3);
// Scaled dot-product attention
float scale = 1.0f / sqrtf(head_dim);
struct ggml_tensor * attn = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, k)), q);
attn = ggml_scale(ctx, attn, scale);
// Add relative position bias if available
if (layer.relative_position_bias_table != nullptr) {
// This would need proper indexing based on relative positions
// For now, simplified version
attn = ggml_add(ctx, attn, layer.relative_position_bias_table);
}
// Apply mask for shifted window attention
if (shifted) {
// Create and apply attention mask
struct ggml_tensor * mask = swin_create_window_mask(ctx, window_size, window_size / 2,
window_size, window_size);
if (mask != nullptr) {
// Convert mask to attention mask
attn = ggml_add(ctx, attn, mask);
}
}
// Softmax
attn = ggml_soft_max(ctx, attn);
// Apply attention to values
struct ggml_tensor * out = ggml_mul_mat(ctx, v, attn);
// Transpose back: [batch_size, seq_len, num_heads, head_dim]
out = ggml_permute(ctx, out, 0, 2, 1, 3);
// Reshape to merge heads: [batch_size, seq_len, hidden_dim]
out = ggml_reshape_3d(ctx, out, hidden_dim, seq_len, batch_size);
// Output projection
out = ggml_mul_mat(ctx, layer.proj_w, out);
out = ggml_add(ctx, out, layer.proj_b);
return out;
}
// Build FFN layer
static struct ggml_tensor * swin_ffn(
struct ggml_context * ctx,
struct ggml_tensor * x,
const swin_layer & layer,
float mlp_ratio) {
// Layer norm
x = ggml_norm(ctx, x, 1e-5f); // LayerNorm epsilon (assumed default)
x = ggml_add(ctx, ggml_mul(ctx, x, layer.ln2_w), layer.ln2_b);
// FFN: Linear -> GELU -> Linear
x = ggml_mul_mat(ctx, layer.fc1_w, x);
x = ggml_add(ctx, x, layer.fc1_b);
x = ggml_gelu(ctx, x);
x = ggml_mul_mat(ctx, layer.fc2_w, x);
x = ggml_add(ctx, x, layer.fc2_b);
return x;
}
// Build Swin Transformer block
static struct ggml_tensor * swin_block(
struct ggml_context * ctx,
struct ggml_tensor * x,
const swin_layer & layer,
int num_heads,
int window_size,
bool shifted,
float mlp_ratio) {
int H = x->ne[2];
int W = x->ne[1];
struct ggml_tensor * shortcut = x;
// Shifted window partitioning if needed
if (shifted && (H > window_size || W > window_size)) {
// Cyclic shift
int shift_size = window_size / 2;
x = ggml_roll(ctx, x, -shift_size, 2); // Roll along H dimension
x = ggml_roll(ctx, x, -shift_size, 1); // Roll along W dimension
}
// Partition into windows
if (H > window_size || W > window_size) {
x = swin_window_partition(ctx, x, window_size);
}
// Window attention
x = swin_window_attention(ctx, x, layer, num_heads, window_size, shifted);
// Reverse window partition
if (H > window_size || W > window_size) {
x = swin_window_reverse(ctx, x, window_size, H, W);
}
// Reverse cyclic shift if needed
if (shifted && (H > window_size || W > window_size)) {
int shift_size = window_size / 2;
x = ggml_roll(ctx, x, shift_size, 2); // Roll back along H dimension
x = ggml_roll(ctx, x, shift_size, 1); // Roll back along W dimension
}
// Residual connection
x = ggml_add(ctx, x, shortcut);
// FFN with residual
shortcut = x;
x = swin_ffn(ctx, x, layer, mlp_ratio);
x = ggml_add(ctx, x, shortcut);
return x;
}
// Patch merging layer (downsampling)
static struct ggml_tensor * swin_patch_merging(
struct ggml_context * ctx,
struct ggml_tensor * x,
struct ggml_tensor * norm_w,
struct ggml_tensor * norm_b,
struct ggml_tensor * reduction) {
int batch_size = x->ne[3];
int H = x->ne[2];
int W = x->ne[1];
int C = x->ne[0];
// Reshape to merge 2x2 patches
x = ggml_reshape_4d(ctx, x, C, W/2, 2, H/2 * 2 * batch_size);
x = ggml_permute(ctx, x, 0, 2, 1, 3);
x = ggml_reshape_4d(ctx, x, C * 4, W/2, H/2, batch_size);
// Layer norm
x = ggml_norm(ctx, x, 1e-5f); // LayerNorm epsilon (assumed default)
x = ggml_add(ctx, ggml_mul(ctx, x, norm_w), norm_b);
// Linear reduction
x = ggml_mul_mat(ctx, reduction, x);
return x;
}
// Build complete Swin Transformer graph
struct ggml_cgraph * swin_build_graph(
struct swin_ctx * ctx,
const swin_image_batch * imgs,
std::pair<int, int> load_image_size,
bool is_inf) {
if (!ctx->has_vision_encoder) {
return nullptr;
}
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph * cgraph = ggml_new_graph(ctx0);
const int batch_size = imgs->size;
const int image_size = hparams.image_size;
const int patch_size = hparams.patch_size;
const int num_patches_side = image_size / patch_size;
const int num_patches = num_patches_side * num_patches_side;
const int hidden_dim = hparams.hidden_dim;
// Input image tensor
struct ggml_tensor * inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32,
3, image_size, image_size, batch_size);
ggml_set_name(inp, "inp");
// Patch embedding: Conv2D with stride=patch_size
struct ggml_tensor * x = ggml_conv_2d(ctx0, model.patch_embed, inp, patch_size, patch_size, 0, 0, 1, 1);
// Reshape to [batch_size, num_patches, hidden_dim]
x = ggml_reshape_3d(ctx0, x, hidden_dim, num_patches, batch_size);
// Add positional embeddings if available
if (model.pos_embed != nullptr) {
x = ggml_add(ctx0, x, model.pos_embed);
}
// Layer norm after patch embedding
if (model.patch_norm_w != nullptr) {
x = ggml_norm(ctx0, x, hparams.norm_eps);
x = ggml_add(ctx0, ggml_mul(ctx0, x, model.patch_norm_w), model.patch_norm_b);
}
// Reshape for spatial processing
x = ggml_reshape_4d(ctx0, x, hidden_dim, num_patches_side, num_patches_side, batch_size);
// Process through Swin stages
int H = num_patches_side;
int W = num_patches_side;
int C = hidden_dim;
for (size_t stage_idx = 0; stage_idx < model.stages.size(); stage_idx++) {
const auto & stage = model.stages[stage_idx];
// Process layers in this stage
for (size_t layer_idx = 0; layer_idx < stage.layers.size(); layer_idx++) {
const auto & layer = stage.layers[layer_idx];
bool shifted = (layer_idx % 2 == 1); // Alternate between regular and shifted windows
x = swin_block(ctx0, x, layer,
hparams.num_heads[stage_idx],
hparams.window_size,
shifted,
hparams.mlp_ratio);
}
// Patch merging (downsampling) between stages, except for the last stage
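// Each merge halves H and W and doubles C, so after the three merges of a
// 4-stage model the grid is (H/8) x (W/8) patches with 8x the initial embedding dim.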
if (stage_idx < model.stages.size() - 1 && stage.downsample_reduction != nullptr) {
x = swin_patch_merging(ctx0, x,
stage.downsample_norm_w,
stage.downsample_norm_b,
stage.downsample_reduction);
H /= 2;
W /= 2;
C *= 2; // Channel dimension doubles after patch merging
}
}
// Global average pooling over the spatial positions
// (ggml_mean reduces along ne[0], so move H*W to the innermost axis first)
x = ggml_reshape_3d(ctx0, x, C, H * W, batch_size);
x = ggml_cont(ctx0, ggml_transpose(ctx0, x)); // [H*W, C, batch]
x = ggml_mean(ctx0, x);                       // [1, C, batch]
x = ggml_reshape_2d(ctx0, x, C, batch_size);  // [C, batch]
// Final layer norm
if (model.output_norm_w != nullptr) {
x = ggml_norm(ctx0, x, hparams.norm_eps);
x = ggml_add(ctx0, ggml_mul(ctx0, x, model.output_norm_w), model.output_norm_b);
}
ggml_set_name(x, "output");
ggml_build_forward_expand(cgraph, x);
return cgraph;
}
// Model loading function
struct swin_ctx * swin_model_load(const std::string & fname, int verbosity) {
struct swin_ctx * ctx = new swin_ctx();
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx->ctx,
};
struct gguf_context * gguf_ctx = gguf_init_from_file(fname.c_str(), params);
if (!gguf_ctx) {
fprintf(stderr, "%s: failed to load model from %s\n", __func__, fname.c_str());
swin_free(ctx);
return nullptr;
}
// Load hyperparameters
auto & hparams = ctx->vision_model.hparams;
// Read Swin-specific parameters from GGUF
const int n_kv = gguf_get_n_kv(gguf_ctx);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(gguf_ctx, i);
if (strcmp(key, KEY_SWIN_WINDOW_SIZE) == 0) {
hparams.window_size = gguf_get_val_i32(gguf_ctx, i);
} else if (strcmp(key, KEY_SWIN_PATCH_SIZE) == 0) {
hparams.patch_size = gguf_get_val_i32(gguf_ctx, i);
} else if (strcmp(key, KEY_SWIN_IMAGE_SIZE) == 0) {
hparams.image_size = gguf_get_val_i32(gguf_ctx, i);
} else if (strcmp(key, KEY_SWIN_HIDDEN_DIM) == 0) {
hparams.hidden_dim = gguf_get_val_i32(gguf_ctx, i);
} else if (strcmp(key, KEY_SWIN_MLP_RATIO) == 0) {
hparams.mlp_ratio = gguf_get_val_f32(gguf_ctx, i);
} else if (strcmp(key, KEY_SWIN_NORM_EPS) == 0) {
hparams.norm_eps = gguf_get_val_f32(gguf_ctx, i);
}
// TODO: Load depths and num_heads arrays
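// A possible sketch, assuming the converter stores them as GGUF INT32 arrays:
//   } else if (strcmp(key, KEY_SWIN_DEPTHS) == 0) {
//       const size_t n = gguf_get_arr_n(gguf_ctx, i);
//       const int32_t * v = (const int32_t *) gguf_get_arr_data(gguf_ctx, i);
//       hparams.depths.assign(v, v + n);
//   }   // and likewise for KEY_SWIN_NUM_HEADS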
}
ctx->has_vision_encoder = true;
if (verbosity >= 1) {
printf("Swin Transformer model loaded:\n");
printf(" image_size: %d\n", hparams.image_size);
printf(" patch_size: %d\n", hparams.patch_size);
printf(" window_size: %d\n", hparams.window_size);
printf(" hidden_dim: %d\n", hparams.hidden_dim);
printf(" num_stages: %d\n", hparams.num_stages());
}
// TODO: Load actual tensor weights from GGUF file
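// Typical pattern (sketch): iterate gguf_get_n_tensors(gguf_ctx), look each tensor up by
// name in ctx->ctx, allocate ctx->params_buffer on the backend, and read the tensor data
// from the file at gguf_get_data_offset() + gguf_get_tensor_offset(i), as clip.cpp does
// for its vision encoder.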
gguf_free(gguf_ctx);
return ctx;
}
// Free context
void swin_free(struct swin_ctx * ctx) {
if (ctx == nullptr) {
return;
}
if (ctx->backend) {
ggml_backend_free(ctx->backend);
}
if (ctx->params_buffer) {
ggml_backend_buffer_free(ctx->params_buffer);
}
if (ctx->compute_buffer) {
ggml_backend_buffer_free(ctx->compute_buffer);
}
if (ctx->ctx) {
ggml_free(ctx->ctx);
}
delete ctx;
}

153
tools/mtmd/swin.h Normal file
View File

@ -0,0 +1,153 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include "clip-impl.h"
#include <vector>
#include <string>
// Swin Transformer constants
#define KEY_SWIN_WINDOW_SIZE "swin.window_size"
#define KEY_SWIN_PATCH_SIZE "swin.patch_size"
#define KEY_SWIN_IMAGE_SIZE "swin.image_size"
#define KEY_SWIN_DEPTHS "swin.depths"
#define KEY_SWIN_NUM_HEADS "swin.num_heads"
#define KEY_SWIN_HIDDEN_DIM "swin.hidden_dim"
#define KEY_SWIN_NUM_CHANNELS "swin.num_channels"
#define KEY_SWIN_MLP_RATIO "swin.mlp_ratio"
#define KEY_SWIN_DROP_PATH_RATE "swin.drop_path_rate"
#define KEY_SWIN_NORM_EPS "swin.norm_eps"
// Tensor names for Swin Transformer
#define TN_SWIN_PATCH_EMBED "swin.patch_embed.weight"
#define TN_SWIN_PATCH_NORM "swin.patch_embed.norm.%s"
#define TN_SWIN_POS_EMBED "swin.pos_embed"
#define TN_SWIN_DOWNSAMPLE_NORM "swin.stage.%d.downsample.norm.%s"
#define TN_SWIN_DOWNSAMPLE_PROJ "swin.stage.%d.downsample.reduction.weight"
#define TN_SWIN_ATTN_NORM "swin.stage.%d.layer.%d.norm1.%s"
#define TN_SWIN_ATTN_QKV "swin.stage.%d.layer.%d.attn.qkv.%s"
#define TN_SWIN_ATTN_PROJ "swin.stage.%d.layer.%d.attn.proj.%s"
#define TN_SWIN_ATTN_REL_POS "swin.stage.%d.layer.%d.attn.relative_position_bias_table"
#define TN_SWIN_FFN_NORM "swin.stage.%d.layer.%d.norm2.%s"
#define TN_SWIN_FFN_FC1 "swin.stage.%d.layer.%d.mlp.fc1.%s"
#define TN_SWIN_FFN_FC2 "swin.stage.%d.layer.%d.mlp.fc2.%s"
#define TN_SWIN_OUTPUT_NORM "swin.norm.%s"
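// In the format strings above, %d placeholders take the stage / layer index and
// %s takes "weight" or "bias" when the names are rendered at load time.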
// Forward declarations
struct swin_ctx;
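// Image containers used by the Swin API below. These definitions are a sketch inferred
// from how tools/nougat-cli.cpp initializes them ({width, height, channels, data} and
// {size, data}); adjust them if the implementation defines these types elsewhere.
struct swin_image_u8 {
int nx;
int ny;
int nc;
std::vector<uint8_t> data;
};
struct swin_image_f32 {
int nx;
int ny;
int nc;
std::vector<float> data;
};
struct swin_image_f32_batch {
size_t size;
swin_image_f32 * data;
};
// Mirrors swin_image_f32_batch because nougat-cli.cpp fills it with f32 images.
struct swin_image_batch {
size_t size;
swin_image_f32 * data;
};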
// Swin Transformer hyperparameters
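// Defaults below follow a Swin-Tiny-style configuration; the values actually used are
// read from GGUF metadata in swin_model_load() (Nougat's encoder uses a larger config).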
struct swin_hparams {
int32_t image_size = 384;
int32_t patch_size = 4;
int32_t num_channels = 3;
int32_t window_size = 7;
int32_t hidden_dim = 96;
std::vector<int32_t> depths = {2, 2, 6, 2}; // depths for each stage
std::vector<int32_t> num_heads = {3, 6, 12, 24}; // number of heads for each stage
float mlp_ratio = 4.0f;
float drop_path_rate = 0.1f;
float norm_eps = 1e-5f;
bool use_checkpoint = false;
// Computed values
int32_t num_stages() const { return depths.size(); }
int32_t num_patches() const { return (image_size / patch_size) * (image_size / patch_size); }
};
// Swin Transformer layer
struct swin_layer {
// Window attention
struct ggml_tensor * ln1_w;
struct ggml_tensor * ln1_b;
struct ggml_tensor * qkv_w;
struct ggml_tensor * qkv_b;
struct ggml_tensor * proj_w;
struct ggml_tensor * proj_b;
struct ggml_tensor * relative_position_bias_table;
// FFN
struct ggml_tensor * ln2_w;
struct ggml_tensor * ln2_b;
struct ggml_tensor * fc1_w;
struct ggml_tensor * fc1_b;
struct ggml_tensor * fc2_w;
struct ggml_tensor * fc2_b;
};
// Swin Transformer stage
struct swin_stage {
std::vector<swin_layer> layers;
// Patch merging (downsample) layer
struct ggml_tensor * downsample_norm_w = nullptr;
struct ggml_tensor * downsample_norm_b = nullptr;
struct ggml_tensor * downsample_reduction = nullptr;
};
// Swin Transformer vision model
struct swin_vision_model {
swin_hparams hparams;
// Patch embedding
struct ggml_tensor * patch_embed;
struct ggml_tensor * patch_norm_w;
struct ggml_tensor * patch_norm_b;
struct ggml_tensor * pos_embed;
// Stages
std::vector<swin_stage> stages;
// Output norm
struct ggml_tensor * output_norm_w;
struct ggml_tensor * output_norm_b;
};
// Main Swin context
struct swin_ctx {
bool has_vision_encoder = false;
bool has_projector = false;
swin_vision_model vision_model;
// Backend and compute
struct ggml_backend * backend = nullptr;
ggml_backend_buffer_t params_buffer = nullptr;
struct ggml_context * ctx = nullptr;
std::vector<uint8_t> buf_compute_meta;
// GGML compute resources
struct ggml_backend_buffer * compute_buffer = nullptr;
struct ggml_context * ctx_compute = nullptr;
struct ggml_alloc * compute_alloc = nullptr;
};
// Public API functions
struct swin_ctx * swin_model_load(const std::string & fname, int verbosity = 1);
void swin_free(struct swin_ctx * ctx);
// Build Swin Transformer graph for inference
struct ggml_cgraph * swin_build_graph(
struct swin_ctx * ctx,
const swin_image_batch * imgs,
std::pair<int, int> load_image_size = {0, 0},
bool is_inf = false);
// Encode image batch
bool swin_image_batch_encode(
struct swin_ctx * ctx,
int n_threads,
const swin_image_batch * imgs,
float * vec);
// Utility functions
int swin_patch_size(const struct swin_ctx * ctx);
bool swin_image_preprocess(struct swin_ctx * ctx, const swin_image_u8 * img, swin_image_f32 * res);
bool swin_image_batch_preprocess(struct swin_ctx * ctx, int n_threads, const swin_image_batch * imgs, swin_image_f32_batch * res_batch);
// Window operations for Swin Transformer
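// swin_window_partition splits the [C, W, H, B] feature map into non-overlapping
// window_size x window_size tiles so attention runs per window; swin_window_reverse
// undoes the split. The mask helpers build the attention mask that blocks
// cross-window attention in the shifted (SW-MSA) blocks.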
struct ggml_tensor * swin_window_partition(struct ggml_context * ctx, struct ggml_tensor * x, int window_size);
struct ggml_tensor * swin_window_reverse(struct ggml_context * ctx, struct ggml_tensor * windows, int window_size, int H, int W);
struct ggml_tensor * swin_create_window_mask(struct ggml_context * ctx, int window_size, int shift_size, int H, int W);
struct ggml_tensor * swin_compute_mask(struct ggml_context * ctx, int window_size, int shift_size, int H, int W);

539
tools/nougat-cli.cpp Normal file
View File

@ -0,0 +1,539 @@
#include "common.h"
#include "ggml.h"
#include "llama.h"
#include "mtmd/swin.h"
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <thread>
#include <chrono>
// External preprocessing function
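// Implemented separately; it is expected to rasterize the input document and return all
// pages as one packed float RGB buffer of num_pages * height * width * 3 values, which is
// how the per-page pointer arithmetic in main() consumes it.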
extern "C" bool nougat_preprocess_pipeline(
const char* input_path,
float** output_data,
int* output_width,
int* output_height,
int* num_pages);
extern "C" void nougat_preprocess_cleanup(float* data);
// CLI arguments structure
struct nougat_params {
std::string input_path = "";
std::string output_path = "";
std::string vision_model = "models/nougat-vision-swin.gguf";
std::string text_model = "models/nougat-text-mbart.gguf";
std::string projector_model = "models/nougat-projector.gguf";
// Processing options
bool batch_mode = false;
int batch_size = 1;
int n_threads = 4;
int n_gpu_layers = 0;
// Output options
std::string output_format = "markdown"; // markdown, latex, plain
bool verbose = false;
bool save_intermediate = false;
// Performance options
bool use_mmap = true;
bool use_flash_attn = false;
int context_size = 2048;
// Document-specific options
bool deskew = true;
bool denoise = true;
bool detect_tables = true;
bool detect_math = true;
int max_pages = -1; // -1 for all pages
};
static void print_usage(const char* prog_name) {
fprintf(stdout, "\n");
fprintf(stdout, "Nougat OCR - Neural Optical Understanding for Academic Documents\n");
fprintf(stdout, "\n");
fprintf(stdout, "Usage: %s [options] -i input_file -o output_file\n", prog_name);
fprintf(stdout, "\n");
fprintf(stdout, "Options:\n");
fprintf(stdout, " -i, --input FILE Input document (PDF, PNG, JPG)\n");
fprintf(stdout, " -o, --output FILE Output file path\n");
fprintf(stdout, " --vision-model FILE Path to vision model GGUF (default: models/nougat-vision-swin.gguf)\n");
fprintf(stdout, " --text-model FILE Path to text model GGUF (default: models/nougat-text-mbart.gguf)\n");
fprintf(stdout, " --projector FILE Path to projector model GGUF (default: models/nougat-projector.gguf)\n");
fprintf(stdout, "\n");
fprintf(stdout, " Processing Options:\n");
fprintf(stdout, " -t, --threads N Number of threads (default: 4)\n");
fprintf(stdout, " -ngl, --n-gpu-layers N Number of layers to offload to GPU (default: 0)\n");
fprintf(stdout, " -b, --batch-size N Batch size for processing (default: 1)\n");
fprintf(stdout, " -c, --context-size N Context size (default: 2048)\n");
fprintf(stdout, " --max-pages N Maximum pages to process (default: all)\n");
fprintf(stdout, "\n");
fprintf(stdout, " Output Options:\n");
fprintf(stdout, " -f, --format FORMAT Output format: markdown, latex, plain (default: markdown)\n");
fprintf(stdout, " -v, --verbose Verbose output\n");
fprintf(stdout, " --save-intermediate Save intermediate processing results\n");
fprintf(stdout, "\n");
fprintf(stdout, " Document Processing:\n");
fprintf(stdout, " --no-deskew Disable automatic deskewing\n");
fprintf(stdout, " --no-denoise Disable denoising\n");
fprintf(stdout, " --no-tables Disable table detection\n");
fprintf(stdout, " --no-math Disable math formula detection\n");
fprintf(stdout, "\n");
fprintf(stdout, " Performance Options:\n");
fprintf(stdout, " --no-mmap Disable memory mapping\n");
fprintf(stdout, " --flash-attn Use flash attention\n");
fprintf(stdout, "\n");
fprintf(stdout, "Examples:\n");
fprintf(stdout, " # Basic OCR of a PDF document\n");
fprintf(stdout, " %s -i paper.pdf -o paper.md\n", prog_name);
fprintf(stdout, "\n");
fprintf(stdout, " # Process with GPU acceleration\n");
fprintf(stdout, " %s -i scan.png -o text.md -ngl 32 -t 8\n", prog_name);
fprintf(stdout, "\n");
fprintf(stdout, " # LaTeX output with math detection\n");
fprintf(stdout, " %s -i math_paper.pdf -o paper.tex -f latex\n", prog_name);
fprintf(stdout, "\n");
}
static bool parse_args(int argc, char** argv, nougat_params& params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-i" || arg == "--input") {
if (++i >= argc) {
fprintf(stderr, "Error: Missing argument for %s\n", arg.c_str());
return false;
}
params.input_path = argv[i];
}
else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
fprintf(stderr, "Error: Missing argument for %s\n", arg.c_str());
return false;
}
params.output_path = argv[i];
}
else if (arg == "--vision-model") {
if (++i >= argc) {
fprintf(stderr, "Error: Missing argument for %s\n", arg.c_str());
return false;
}
params.vision_model = argv[i];
}
else if (arg == "--text-model") {
if (++i >= argc) {
fprintf(stderr, "Error: Missing argument for %s\n", arg.c_str());
return false;
}
params.text_model = argv[i];
}
else if (arg == "--projector") {
if (++i >= argc) {
fprintf(stderr, "Error: Missing argument for %s\n", arg.c_str());
return false;
}
params.projector_model = argv[i];
}
else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
fprintf(stderr, "Error: Missing argument for %s\n", arg.c_str());
return false;
}
params.n_threads = std::stoi(argv[i]);
}
else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
fprintf(stderr, "Error: Missing argument for %s\n", arg.c_str());
return false;
}
params.n_gpu_layers = std::stoi(argv[i]);
}
else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
fprintf(stderr, "Error: Missing argument for %s\n", arg.c_str());
return false;
}
params.batch_size = std::stoi(argv[i]);
}
else if (arg == "-c" || arg == "--context-size") {
if (++i >= argc) {
fprintf(stderr, "Error: Missing argument for %s\n", arg.c_str());
return false;
}
params.context_size = std::stoi(argv[i]);
}
else if (arg == "--max-pages") {
if (++i >= argc) {
fprintf(stderr, "Error: Missing argument for %s\n", arg.c_str());
return false;
}
params.max_pages = std::stoi(argv[i]);
}
else if (arg == "-f" || arg == "--format") {
if (++i >= argc) {
fprintf(stderr, "Error: Missing argument for %s\n", arg.c_str());
return false;
}
params.output_format = argv[i];
}
else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
}
else if (arg == "--save-intermediate") {
params.save_intermediate = true;
}
else if (arg == "--no-deskew") {
params.deskew = false;
}
else if (arg == "--no-denoise") {
params.denoise = false;
}
else if (arg == "--no-tables") {
params.detect_tables = false;
}
else if (arg == "--no-math") {
params.detect_math = false;
}
else if (arg == "--no-mmap") {
params.use_mmap = false;
}
else if (arg == "--flash-attn") {
params.use_flash_attn = true;
}
else if (arg == "-h" || arg == "--help") {
print_usage(argv[0]);
exit(0);
}
else {
fprintf(stderr, "Error: Unknown argument '%s'\n", arg.c_str());
return false;
}
}
// Validate required arguments
if (params.input_path.empty()) {
fprintf(stderr, "Error: Input file is required\n");
return false;
}
if (params.output_path.empty()) {
// Generate default output path
size_t dot_pos = params.input_path.find_last_of(".");
params.output_path = params.input_path.substr(0, dot_pos);
if (params.output_format == "markdown") {
params.output_path += ".md";
} else if (params.output_format == "latex") {
params.output_path += ".tex";
} else {
params.output_path += ".txt";
}
}
return true;
}
// Process a single page through the Nougat pipeline
static std::string process_page(
struct swin_ctx* vision_ctx,
struct llama_model* text_model,
struct llama_context* text_ctx,
const float* image_data,
int width,
int height,
const nougat_params& params) {
// Step 1: Encode image with Swin Transformer
if (params.verbose) {
printf("Encoding image with Swin Transformer...\n");
}
// Create image batch
swin_image_f32 img = {
width,
height,
3,
std::vector<float>(image_data, image_data + width * height * 3)
};
swin_image_batch imgs = {1, &img};
// Encode image
std::vector<float> vision_embeddings(2048); // Adjust size based on model
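// NOTE: 2048 is a placeholder; the real size is (number of encoder output positions) x
// (embedding dim) and should be derived from the loaded Swin hyperparameters.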
if (!swin_image_batch_encode(vision_ctx, params.n_threads, &imgs, vision_embeddings.data())) {
fprintf(stderr, "Failed to encode image\n");
return "";
}
// Step 2: Pass embeddings through projector
// This would map vision embeddings to text embedding space
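// Not implemented yet: without the projector and the cross-attention wiring below,
// the decoder generates unconditioned text, so this path is currently a scaffold
// rather than a working OCR pipeline.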
// Step 3: Generate text with mBART decoder
if (params.verbose) {
printf("Generating text with mBART decoder...\n");
}
// Create batch for text generation
llama_batch batch = llama_batch_init(params.context_size, 0, 1);
// Set up cross-attention with vision embeddings
// This requires the decoder to attend to encoder outputs
// Start with BOS token
llama_token bos_token = llama_token_bos(text_model);
batch.token[0] = bos_token;
batch.pos[0] = 0;
batch.n_seq_id[0] = 1;
batch.seq_id[0][0] = 0;
batch.logits[0] = true; // request logits at this position so we can sample from it
batch.n_tokens = 1;
// Decode initial token
if (llama_decode(text_ctx, batch) != 0) {
fprintf(stderr, "Failed to decode\n");
llama_batch_free(batch);
return "";
}
// Generate text autoregressively
std::vector<llama_token> generated_tokens;
generated_tokens.push_back(bos_token);
llama_token eos_token = llama_token_eos(text_model);
int max_tokens = params.context_size;
for (int i = 1; i < max_tokens; i++) {
// Get logits from last position
float* logits = llama_get_logits_ith(text_ctx, batch.n_tokens - 1);
// Sample next token
int n_vocab = llama_n_vocab(text_model);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
// Sample with top-k and top-p
int top_k = 40;
float top_p = 0.9f;
float temp = 0.8f;
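// These sampling settings are arbitrary defaults; the reference Nougat decoder is
// (near-)greedy, so a lower temperature or plain argmax may be more faithful for OCR.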
llama_sample_top_k(text_ctx, &candidates_p, top_k, 1);
llama_sample_top_p(text_ctx, &candidates_p, top_p, 1);
llama_sample_temp(text_ctx, &candidates_p, temp);
llama_token new_token = llama_sample_token(text_ctx, &candidates_p);
// Check for EOS
if (new_token == eos_token) {
break;
}
generated_tokens.push_back(new_token);
// Add to batch for next iteration
batch.token[0] = new_token;
batch.pos[0] = i;
batch.logits[0] = true;
batch.n_tokens = 1;
if (llama_decode(text_ctx, batch) != 0) {
fprintf(stderr, "Failed to continue decoding\n");
break;
}
}
llama_batch_free(batch);
// Convert tokens to text
std::string result;
for (auto token : generated_tokens) {
std::string piece = llama_token_to_piece(text_ctx, token, true);
result += piece;
}
return result;
}
int main(int argc, char** argv) {
nougat_params params;
// Parse command line arguments
if (!parse_args(argc, argv, params)) {
print_usage(argv[0]);
return 1;
}
// Print banner
printf("\n");
printf("╔═══════════════════════════════════════════════════════╗\n");
printf("║ Nougat OCR - Document Understanding ║\n");
printf("║ Powered by Swin Transformer + mBART ║\n");
printf("╚═══════════════════════════════════════════════════════╝\n");
printf("\n");
printf("Input: %s\n", params.input_path.c_str());
printf("Output: %s\n", params.output_path.c_str());
printf("Format: %s\n", params.output_format.c_str());
printf("\n");
// Initialize backend
llama_backend_init();
// Load vision model (Swin Transformer)
printf("Loading vision model from %s...\n", params.vision_model.c_str());
struct swin_ctx* vision_ctx = swin_model_load(params.vision_model, params.verbose ? 2 : 1);
if (!vision_ctx) {
fprintf(stderr, "Failed to load vision model\n");
return 1;
}
// Load text model (mBART)
printf("Loading text model from %s...\n", params.text_model.c_str());
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = params.n_gpu_layers;
model_params.use_mmap = params.use_mmap;
struct llama_model* text_model = llama_load_model_from_file(
params.text_model.c_str(), model_params);
if (!text_model) {
fprintf(stderr, "Failed to load text model\n");
swin_free(vision_ctx);
return 1;
}
// Create text generation context
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = params.context_size;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads;
ctx_params.flash_attn = params.use_flash_attn;
struct llama_context* text_ctx = llama_new_context_with_model(text_model, ctx_params);
if (!text_ctx) {
fprintf(stderr, "Failed to create text context\n");
llama_free_model(text_model);
swin_free(vision_ctx);
return 1;
}
// Preprocess document
printf("Preprocessing document...\n");
float* preprocessed_data = nullptr;
int width, height, num_pages;
if (!nougat_preprocess_pipeline(
params.input_path.c_str(),
&preprocessed_data,
&width, &height, &num_pages)) {
fprintf(stderr, "Failed to preprocess document\n");
llama_free(text_ctx);
llama_free_model(text_model);
swin_free(vision_ctx);
return 1;
}
printf("Document info: %d pages, %dx%d pixels\n", num_pages, width, height);
// Limit pages if requested
if (params.max_pages > 0 && num_pages > params.max_pages) {
num_pages = params.max_pages;
printf("Processing first %d pages only\n", num_pages);
}
// Process each page
std::string full_output;
auto start_time = std::chrono::high_resolution_clock::now();
for (int page = 0; page < num_pages; page++) {
printf("\nProcessing page %d/%d...\n", page + 1, num_pages);
float* page_data = preprocessed_data + (size_t) page * width * height * 3;
std::string page_text = process_page(
vision_ctx, text_model, text_ctx,
page_data, width, height, params);
if (page_text.empty()) {
fprintf(stderr, "Warning: Failed to process page %d\n", page + 1);
continue;
}
// Add page separator for multi-page documents
if (page > 0) {
if (params.output_format == "markdown") {
full_output += "\n\n---\n\n";
} else if (params.output_format == "latex") {
full_output += "\n\\newpage\n\n";
} else {
full_output += "\n\n[Page " + std::to_string(page + 1) + "]\n\n";
}
}
full_output += page_text;
// Save intermediate results if requested
if (params.save_intermediate) {
std::string intermediate_file = params.output_path + ".page" +
std::to_string(page + 1) + ".tmp";
std::ofstream tmp_out(intermediate_file);
tmp_out << page_text;
tmp_out.close();
}
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::seconds>(end_time - start_time);
// Save final output
printf("\nSaving output to %s...\n", params.output_path.c_str());
std::ofstream output_file(params.output_path);
if (!output_file) {
fprintf(stderr, "Failed to open output file\n");
} else {
// Add format-specific headers/footers
if (params.output_format == "latex") {
output_file << "\\documentclass{article}\n";
output_file << "\\usepackage{amsmath}\n";
output_file << "\\usepackage{graphicx}\n";
output_file << "\\begin{document}\n\n";
}
output_file << full_output;
if (params.output_format == "latex") {
output_file << "\n\n\\end{document}\n";
}
output_file.close();
}
// Print statistics
printf("\n");
printf("╔════════════════════════════════════╗\n");
printf("║ OCR Complete! ║\n");
printf("╠════════════════════════════════════╣\n");
printf("║ Pages processed: %-17d ║\n", num_pages);
printf("║ Time taken: %-17lds║\n", duration.count());
printf("║ Output size: %-17zd ║\n", full_output.size());
printf("╚════════════════════════════════════╝\n");
// Cleanup
nougat_preprocess_cleanup(preprocessed_data);
llama_free(text_ctx);
llama_free_model(text_model);
swin_free(vision_ctx);
llama_backend_free();
return 0;
}

View File

@ -0,0 +1,27 @@
set(TARGET nougat-cli)
# Add executable
add_executable(${TARGET} ../nougat-cli.cpp)
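# NOTE: the Swin encoder sources (e.g. ../mtmd/swin.cpp) must also be compiled into this
# target or provided by a linked library, otherwise the swin_* symbols used by
# nougat-cli.cpp will not resolve.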
# Link with llama library
target_link_libraries(${TARGET} PRIVATE
llama
${CMAKE_THREAD_LIBS_INIT}
)
# Include directories
target_include_directories(${TARGET} PRIVATE
${CMAKE_SOURCE_DIR}/include
${CMAKE_SOURCE_DIR}/tools
)
# Compile flags
llama_add_compile_flags()
# Set output name
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "nougat-cli")
# Install target
if(LLAMA_INSTALL)
install(TARGETS ${TARGET} RUNTIME DESTINATION bin)
endif()