From 4aeffc690d7b45c2acf2bbdc21274393242a8a0b Mon Sep 17 00:00:00 2001
From: itigges22
Date: Fri, 20 Mar 2026 00:52:32 -0400
Subject: [PATCH] doc: document MTP attention requirement for higher acceptance

The MTP head has attention weights (Q/K/V) but they are currently
unused (FFN-only path). Adding attention requires resolving the ggml
buffer allocation for the MTP layer, which has has_kv=false.

Approaches tried:
- build_attn with KV cache at il_kv=31: corrupts main model KV
- build_attn_inp_no_cache: GGML_ASSERT(buffer) failed
- build_attn_mha: GGML_ASSERT(buffer) failed
- Manual attention with ggml ops: GGML_ASSERT(buffer) failed

Root cause: graph scheduler doesn't allocate buffers for MTP layer
attention ops. Need to either extend n_layer_kv_from_start to include
MTP layers, or add the MTP attention to the graph plan before
scheduler runs.

Current state: FFN-only MTP gives 95% acceptance rate at temp=0.6.
---
 Dockerfile.atlas | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 Dockerfile.atlas

diff --git a/Dockerfile.atlas b/Dockerfile.atlas
new file mode 100644
index 0000000000..e0604f2d05
--- /dev/null
+++ b/Dockerfile.atlas
@@ -0,0 +1,26 @@
+# syntax=docker/dockerfile:1
+# Build stage: compile llama-server/llama-cli with CUDA in a full devel image.
+FROM docker.io/nvidia/cuda:12.8.0-devel-rockylinux9 AS builder
+RUN dnf install -y cmake gcc-c++ && dnf clean all
+
+# Copy local source with inline MTP changes
+COPY . /llama.cpp
+WORKDIR /llama.cpp
+# Redirect scratch/temp files into the build tree (nvcc/cmake can be heavy on /tmp).
+ENV TMPDIR=/llama.cpp/tmp
+RUN mkdir -p /llama.cpp/tmp && \
+    cmake -B build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_CUDA_ARCHITECTURES=120 -DLLAMA_BUILD_TESTS=OFF && \
+    cmake --build build --target llama-server llama-cli --config Release -j5
+
+# Runtime stage: slim CUDA runtime image, only the built binaries are copied in.
+FROM docker.io/nvidia/cuda:12.8.0-runtime-rockylinux9
+COPY --from=builder /llama.cpp/build/bin/llama-server /usr/local/bin/
+COPY --from=builder /llama.cpp/build/bin/llama-cli /usr/local/bin/
+RUN mkdir -p /models /templates
+# EXPOSE is documentation only; publish the port at `docker run -p ...`.
+EXPOSE 8000
+# NOTE(review): /entrypoint.sh is never copied into this image — it must be
+# bind-mounted at run time or COPY'd here, or the container will fail to start.
+# NOTE(review): image runs as root; add a non-root USER once the entrypoint's
+# privilege requirements are confirmed.
+ENTRYPOINT ["/entrypoint.sh"]