Merge branch 'master' into compilade/refactor-kv-cache

2025-07-02 21:41:39 -04:00 · 2025-07-02 21:41:39 -04:00 · 2bcaf64e8e
parent e3fe61203c 5d46babdc2
commit 2bcaf64e8e
1082 changed files with 318232 additions and 122225 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@ -13,12 +13,15 @@ Checks: >
    -readability-magic-numbers,
    -readability-uppercase-literal-suffix,
    -readability-simplify-boolean-expr,
    -readability-math-missing-parentheses,
    clang-analyzer-*,
    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
    performance-*,
    portability-*,
    -portability-simd-intrinsics,
    misc-*,
    -misc-const-correctness,
    -misc-non-private-member-variables-in-classes,
    -misc-no-recursion,
    -misc-use-anonymous-namespace,
 FormatStyle: none
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@ -0,0 +1,92 @@
 ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 ARG TARGETARCH
 ARG GGML_CPU_ARM_ARCH=armv8-a
 RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 WORKDIR /app
 COPY . .
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    elif [ "$TARGETARCH" = "arm64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
    else \
        echo "Unsupported architecture"; \
        exit 1; \
    fi && \
    cmake --build build -j $(nproc)
 RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh
 ## Base image
 FROM ubuntu:$UBUNTU_VERSION AS base
 RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 COPY --from=build /app/lib/ /app
 ### Full
 FROM base AS full
 COPY --from=build /app/full /app
 WORKDIR /app
 RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
 COPY --from=build /app/full/llama-cli /app
 WORKDIR /app
 ENTRYPOINT [ "/app/llama-cli" ]
 ### Server, Server only
 FROM base AS server
 ENV LLAMA_ARG_HOST=0.0.0.0
 COPY --from=build /app/full/llama-server /app
 WORKDIR /app
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@ -0,0 +1,94 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG CUDA_VERSION=12.4.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 # CUDA architecture to build for (defaults to all supported archs)
 ARG CUDA_DOCKER_ARCH=default
 RUN apt-get update && \
    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
 WORKDIR /app
 COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)
 RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh
 ## Base image
 FROM ${BASE_CUDA_RUN_CONTAINER} AS base
 RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 COPY --from=build /app/lib/ /app
 ### Full
 FROM base AS full
 COPY --from=build /app/full /app
 WORKDIR /app
 RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
 COPY --from=build /app/full/llama-cli /app
 WORKDIR /app
 ENTRYPOINT [ "/app/llama-cli" ]
 ### Server, Server only
 FROM base AS server
 ENV LLAMA_ARG_HOST=0.0.0.0
 COPY --from=build /app/full/llama-server /app
 WORKDIR /app
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@ -1,33 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 # CUDA architecture to build for (defaults to all supported archs)
 ARG CUDA_DOCKER_ARCH=default
 RUN apt-get update && \
    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
 WORKDIR /app
 COPY . .
 # Use the default CUDA archs if not specified
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc) && \
    cp build/bin/* .
 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-musa.Dockerfile
+++ b/.devops/full-musa.Dockerfile
@ -1,26 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG MUSA_VERSION=rc3.1.0
 # Target the MUSA build image
 ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 RUN apt-get update && \
    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc) && \
    cp build/bin/* .
 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@ -1,50 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102"
 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
 WORKDIR /app
 COPY . .
 # Set nvcc architecture
 ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 # Enable cURL
 ENV LLAMA_CURL=1
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev
 RUN make -j$(nproc)
 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@ -1,25 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
 WORKDIR /app
 COPY . .
 ENV LLAMA_CURL=1
 RUN make -j$(nproc)
 ENV LC_ALL=C.utf8
 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@ -0,0 +1,95 @@
 ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
 ## Build Image
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev
 WORKDIR /app
 COPY . .
 RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" \
        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
    cmake --build build --config Release -j$(nproc)
 RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
 RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 ### Full
 FROM base AS full
 COPY --from=build /app/lib/ /app
 COPY --from=build /app/full /app
 WORKDIR /app
 RUN apt-get update && \
    apt-get install -y \
        git \
        python3 \
        python3-pip \
        python3-venv && \
    python3 -m venv /opt/venv && \
    . /opt/venv/bin/activate && \
    pip install --upgrade pip setuptools wheel && \
    pip install -r requirements.txt && \
    apt autoremove -y && \
    apt clean -y && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete
 ENV PATH="/opt/venv/bin:$PATH"
 ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
 COPY --from=build /app/lib/ /app
 COPY --from=build /app/full/llama-cli /app
 WORKDIR /app
 ENTRYPOINT [ "/app/llama-cli" ]
 ### Server, Server only
 FROM base AS server
 ENV LLAMA_ARG_HOST=0.0.0.0
 COPY --from=build /app/lib/ /app
 COPY --from=build /app/full/llama-server /app
 WORKDIR /app
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@ -1,4 +1,4 @@
-ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
 FROM ascendai/cann:$ASCEND_VERSION AS build
@ -6,7 +6,7 @@ WORKDIR /app
 COPY . .
-RUN yum install -y gcc g++ cmake make
+RUN yum install -y gcc g++ cmake make libcurl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
@ -22,7 +22,7 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF  && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF  && \
    cmake --build build --config Release --target llama-cli
 # TODO: use image with NNRT
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@ -1,38 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 # CUDA architecture to build for (defaults to all supported archs)
 ARG CUDA_DOCKER_ARCH=default
 RUN apt-get update && \
    apt-get install -y build-essential git cmake
 WORKDIR /app
 COPY . .
 # Use the default CUDA archs if not specified
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release --target llama-cli -j$(nproc) && \
    mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 RUN apt-get update && \
    apt-get install -y libgomp1
 COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-cli /
 ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@ -1,28 +0,0 @@
 ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git
 WORKDIR /app
 COPY . .
 RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with static libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
    cmake --build build --config Release --target llama-cli
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
 COPY --from=build /app/build/bin/llama-cli /llama-cli
 ENV LC_ALL=C.utf8
 ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli-musa.Dockerfile
+++ b/.devops/llama-cli-musa.Dockerfile
@ -1,31 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG MUSA_VERSION=rc3.1.0
 # Target the MUSA build image
 ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the MUSA runtime image
 ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 RUN apt-get update && \
    apt-get install -y build-essential git cmake
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release --target llama-cli -j$(nproc) && \
    mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
 RUN apt-get update && \
    apt-get install -y libgomp1
 COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-cli /llama-cli
 ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@ -1,45 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102"
 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
 WORKDIR /app
 COPY . .
 # Set nvcc architecture
 ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 RUN make -j$(nproc) llama-cli
 ENTRYPOINT [ "/app/llama-cli" ]
--- a/.devops/llama-cli-vulkan.Dockerfile
+++ b/.devops/llama-cli-vulkan.Dockerfile
@ -1,27 +0,0 @@
 ARG UBUNTU_VERSION=jammy
 FROM ubuntu:$UBUNTU_VERSION AS build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget libgomp1
 # Install Vulkan SDK
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk
 # Build it
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \
    cmake --build build --config Release --target llama-cli
 # Clean up
 WORKDIR /
 RUN cp /app/build/bin/llama-cli /llama-cli && \
    rm -rf /app
 ENV LC_ALL=C.utf8
 ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cli.Dockerfile
+++ b/.devops/llama-cli.Dockerfile
@ -1,23 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 RUN apt-get update && \
    apt-get install -y build-essential git
 WORKDIR /app
 COPY . .
 RUN make -j$(nproc) llama-cli
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 RUN apt-get update && \
    apt-get install -y libgomp1
 COPY --from=build /app/llama-cli /llama-cli
 ENV LC_ALL=C.utf8
 ENTRYPOINT [ "/llama-cli" ]
--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@ -17,10 +17,10 @@ Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
 BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
 Requires:       cuda-toolkit
-URL:            https://github.com/ggerganov/llama.cpp
+URL:            https://github.com/ggml-org/llama.cpp
 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@ -18,10 +18,10 @@ Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
 BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
 Requires:       libstdc++
-URL:            https://github.com/ggerganov/llama.cpp
+URL:            https://github.com/ggml-org/llama.cpp
 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@ -1,43 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 # CUDA architecture to build for (defaults to all supported archs)
 ARG CUDA_DOCKER_ARCH=default
 RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 WORKDIR /app
 COPY . .
 # Use the default CUDA archs if not specified
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release --target llama-server -j$(nproc) && \
    mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl
 COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-server /llama-server
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@ -1,34 +0,0 @@
 ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev
 WORKDIR /app
 COPY . .
 RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-server
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl
 COPY --from=build /app/build/bin/llama-server /llama-server
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-musa.Dockerfile
+++ b/.devops/llama-server-musa.Dockerfile
@ -1,36 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG MUSA_VERSION=rc3.1.0
 # Target the MUSA build image
 ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the MUSA runtime image
 ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release --target llama-server -j$(nproc) && \
    mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl
 COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-server /llama-server
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@ -1,54 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102"
 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
 WORKDIR /app
 COPY . .
 # Set nvcc architecture
 ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 # Enable cURL
 ENV LLAMA_CURL=1
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl
 RUN make -j$(nproc) llama-server
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@ -1,31 +0,0 @@
 ARG UBUNTU_VERSION=jammy
 FROM ubuntu:$UBUNTU_VERSION AS build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget
 # Install Vulkan SDK and cURL
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
 # Build it
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release --target llama-server
 # Clean up
 WORKDIR /
 RUN cp /app/build/bin/llama-server /llama-server && \
    rm -rf /app
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@ -1,29 +0,0 @@
 ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 RUN apt-get update && \
    apt-get install -y build-essential git libcurl4-openssl-dev
 WORKDIR /app
 COPY . .
 ENV LLAMA_CURL=1
 RUN make -j$(nproc) llama-server
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl
 COPY --from=build /app/llama-server /llama-server
 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
 ENV LLAMA_ARG_HOST=0.0.0.0
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/llama-server" ]
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@ -0,0 +1,101 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG MUSA_VERSION=rc4.0.1
 # Target the MUSA build image
 ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
 ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 # MUSA architecture to build for (defaults to all supported archs)
 ARG MUSA_DOCKER_ARCH=default
 RUN apt-get update && \
    apt-get install -y \
    build-essential \
    cmake \
    python3 \
    python3-pip \
    git \
    libcurl4-openssl-dev \
    libgomp1
 WORKDIR /app
 COPY . .
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)
 RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh
 ## Base image
 FROM ${BASE_MUSA_RUN_CONTAINER} AS base
 RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 COPY --from=build /app/lib/ /app
 ### Full
 FROM base AS full
 COPY --from=build /app/full /app
 WORKDIR /app
 RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
 COPY --from=build /app/full/llama-cli /app
 WORKDIR /app
 ENTRYPOINT [ "/app/llama-cli" ]
 ### Server, Server only
 FROM base AS server
 ENV LLAMA_ARG_HOST=0.0.0.0
 COPY --from=build /app/full/llama-server /app
 WORKDIR /app
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@ -31,6 +31,7 @@
  # Increases the runtime closure size by ~700M
  useMpi ? false,
  useRocm ? config.rocmSupport,
  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
  enableCurl ? true,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@ -132,12 +133,12 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
  '';
-  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+  # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
  # `default.metallib` may be compiled with Metal compiler from XCode
  # and we need to escape sandbox on MacOS to access Metal compiler.
  # `xcrun` is used find the path of the Metal compiler, which is varible
  # and not on $PATH
-  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+  # see https://github.com/ggml-org/llama.cpp/pull/6118 for discussion
  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
  nativeBuildInputs =
@ -188,7 +189,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    ]
    ++ optionals useRocm [
      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
    ]
    ++ optionals useMetalKit [
      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
@ -219,7 +220,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    broken = (useMetalKit && !effectiveStdenv.isDarwin);
    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-    homepage = "https://github.com/ggerganov/llama.cpp/";
+    homepage = "https://github.com/ggml-org/llama.cpp/";
    license = lib.licenses.mit;
    # Accommodates `nix run` and `lib.getExe`
--- a/.devops/nix/python-scripts.nix
+++ b/.devops/nix/python-scripts.nix
@ -34,7 +34,7 @@ let
    # server tests
    openai
-    behave
+    pytest
    prometheus-client
  ];
 in
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@ -0,0 +1,113 @@
 ARG UBUNTU_VERSION=24.04
 # This needs to generally match the container host's environment.
 ARG ROCM_VERSION=6.3
 ARG AMDGPU_VERSION=6.3
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
 ### Build image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 # gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
 # gfx906 is deprecated
 #check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
 ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
 #ARG ROCM_DOCKER_ARCH=gfx1100
 # Set nvcc architectured
 ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 # ENV CC=/opt/rocm/llvm/bin/clang
 # ENV CXX=/opt/rocm/llvm/bin/clang++
 RUN apt-get update \
    && apt-get install -y \
    build-essential \
    cmake \
    git \
    libcurl4-openssl-dev \
    curl \
    libgomp1
 WORKDIR /app
 COPY . .
 RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
    && cmake --build build --config Release -j$(nproc)
 RUN mkdir -p /app/lib \
    && find build -name "*.so" -exec cp {} /app/lib \;
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh
 ## Base image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS base
 RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 COPY --from=build /app/lib/ /app
 ### Full
 FROM base AS full
 COPY --from=build /app/full /app
 WORKDIR /app
 RUN apt-get update \
    && apt-get install -y \
    git \
    python3-pip \
    python3 \
    python3-wheel\
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
 COPY --from=build /app/full/llama-cli /app
 WORKDIR /app
 ENTRYPOINT [ "/app/llama-cli" ]
 ### Server, Server only
 FROM base AS server
 ENV LLAMA_ARG_HOST=0.0.0.0
 COPY --from=build /app/full/llama-server /app
 WORKDIR /app
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -e
 # Read the first argument into a variable
@ -8,28 +8,36 @@ arg1="$1"
 shift
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert_hf_to_gguf.py "$@"
+    exec python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./llama-quantize "$@"
+    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./llama-cli "$@"
+    exec ./llama-cli "$@"
 elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
    exec ./llama-bench "$@"
 elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
    exec ./llama-perplexity "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
-    for i in `ls $1/$2/ggml-model-f16.bin*`; do
+    for i in $(ls $1/$2/ggml-model-f16.bin*); do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
        fi
    done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./llama-server "$@"
+    exec ./llama-server "$@"
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
    echo "              ex: -m model.gguf"
    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
    echo "              ex: -m model.gguf -f file.txt"
    echo "  --convert (-c): Convert a llama model into ggml"
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@ -0,0 +1,89 @@
 ARG UBUNTU_VERSION=24.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget
 # Install Vulkan SDK and cURL
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
 # Build it
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1  -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
    cmake --build build --config Release -j$(nproc)
 RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh
 ## Base image
 FROM ubuntu:$UBUNTU_VERSION AS base
 RUN apt-get update \
    && apt-get install -y libgomp1 curl libvulkan-dev \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 COPY --from=build /app/lib/ /app
 ### Full
 FROM base AS full
 COPY --from=build /app/full /app
 WORKDIR /app
 RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    python3-wheel \
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light
 COPY --from=build /app/full/llama-cli /app
 WORKDIR /app
 ENTRYPOINT [ "/app/llama-cli" ]
 ### Server, Server only
 FROM base AS server
 ENV LLAMA_ARG_HOST=0.0.0.0
 COPY --from=build /app/full/llama-server /app
 WORKDIR /app
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/app/llama-server" ]
--- a/.editorconfig
+++ b/.editorconfig
@ -21,15 +21,15 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset
-[examples/server/public/*]
+[tools/server/public/*]
 indent_size = 2
-[examples/server/public/deps_*]
+[tools/server/public/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset
-[examples/server/deps_*]
+[tools/server/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
 indent_size = unset
@ -37,6 +37,18 @@ indent_size = unset
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
-[examples/cvector-generator/*.txt]
+[tools/cvector-generator/*.txt]
 trim_trailing_whitespace = unset
 insert_final_newline = unset
 [models/templates/*.jinja]
 indent_style = unset
 indent_size = unset
 end_of_line = unset
 charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
 [vendor/miniaudio/miniaudio.h]
 trim_trailing_whitespace = unset
 insert_final_newline = unset
--- a/.flake8
+++ b/.flake8
@ -2,8 +2,9 @@
 max-line-length = 125
 ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
 exclude =
-    # Do not traverse examples
+    # Do not traverse examples and tools
    examples,
    tools,
    # Do not include package initializers
    __init__.py,
    # No need to traverse our git directory
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@ -24,7 +24,8 @@ body:
  - type: dropdown
    id: operating-system
    attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
@ -39,16 +40,19 @@ body:
    attributes:
        label: GGML backends
        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
        multiple: true
    validations:
      required: true
  - type: textarea
-    id: steps_to_reproduce
+    id: info
    attributes:
-      label: Steps to Reproduce
+      label: Problem description & steps to reproduce
      description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
      placeholder: >
        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
        Here are the exact commands that I used: ...
    validations:
      required: true
@ -61,12 +65,22 @@ body:
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: command
    attributes:
      label: Compile command
      description: >
        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
-          Please copy and paste any relevant log output, including the command that you entered and any generated text.
+          Please copy and paste any relevant log output, including any generated text.
          This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@ -26,7 +26,8 @@ body:
  - type: dropdown
    id: operating-system
    attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
@ -41,8 +42,10 @@ body:
    attributes:
        label: GGML backends
        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
        multiple: true
    validations:
      required: true
  - type: textarea
    id: hardware
    attributes:
@ -55,20 +58,20 @@ body:
  - type: textarea
    id: model
    attributes:
-      label: Model
+      label: Models
      description: >
-        Which model at which quantization were you using when encountering the bug?
+        Which model(s) at which quantization were you using when encountering the bug?
        If you downloaded a GGUF file off of Huggingface, please provide a link.
      placeholder: >
        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
    validations:
      required: false
  - type: textarea
-    id: steps_to_reproduce
+    id: info
    attributes:
-      label: Steps to Reproduce
+      label: Problem description & steps to reproduce
      description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.
      placeholder: >
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@ -14,7 +14,7 @@ body:
    id: version
    attributes:
      label: Name and Version
-      description: Which version of our software are you running? (use `--version` to get a version string)
+      description: Which version of our software is affected? (You can use `--version` to get a version string.)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
@ -24,7 +24,8 @@ body:
  - type: dropdown
    id: operating-system
    attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
@ -33,28 +34,40 @@ body:
        - BSD
        - Other? (Please let us know in description)
    validations:
-      required: true
+      required: false
  - type: dropdown
    id: module
    attributes:
      label: Which llama.cpp modules do you know to be affected?
      multiple: true
      options:
        - Documentation/Github
        - libllama (core library)
        - llama-cli
        - llama-server
        - llama-bench
        - llama-quantize
        - Python/Bash scripts
        - Test code
        - Other (Please specify in the next section)
    validations:
-      required: true
+      required: false
  - type: textarea
-    id: steps_to_reproduce
+    id: command
    attributes:
-      label: Steps to Reproduce
+      label: Command line
      description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
    validations:
      required: true
  - type: textarea
@ -62,7 +75,7 @@ body:
    attributes:
      label: First Bad Commit
      description: >
-        If the bug was not present on an earlier version: when did it start appearing?
+        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
@ -71,8 +84,8 @@ body:
    attributes:
      label: Relevant log output
      description: >
-          Please copy and paste any relevant log output, including the command that you entered and any generated text.
+          If applicable, please copy and paste any relevant log output, including any generated text.
          This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
-      required: true
+      required: false
--- a/.github/ISSUE_TEMPLATE/020-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml
@ -6,7 +6,7 @@ body:
  - type: markdown
    attributes:
      value: |
-        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
+        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)
  - type: checkboxes
    id: prerequisites
@ -16,11 +16,11 @@ body:
      options:
        - label: I am running the latest code. Mention the version if possible as well.
          required: true
-        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+        - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md).
          required: true
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
-        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+        - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share.
          required: true
  - type: textarea
--- a/.github/ISSUE_TEMPLATE/030-research.yml
+++ b/.github/ISSUE_TEMPLATE/030-research.yml
@ -6,7 +6,7 @@ body:
  - type: markdown
    attributes:
      value: |
-        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
  - type: checkboxes
    id: research-stage
--- a/.github/ISSUE_TEMPLATE/040-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/040-refactor.yml
@ -6,8 +6,8 @@ body:
  - type: markdown
    attributes:
      value: |
-        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+        Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+        Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
  - type: textarea
    id: background-description
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -1,11 +1,11 @@
 blank_issues_enabled: true
 contact_links:
  - name: Got an idea?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+    url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+    url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
-    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+    url: https://github.com/ggml-org/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
--- a/.github/actions/get-tag-name/action.yml
+++ b/.github/actions/get-tag-name/action.yml
@ -0,0 +1,22 @@
 name: "Determine tag name"
 description: "Determine the tag name to use for a release"
 outputs:
  name:
    description: "The name of the tag"
    value: ${{ steps.tag.outputs.name }}
 runs:
  using: "composite"
  steps:
    - name: Determine tag name
      id: tag
      shell: bash
      run: |
        BUILD_NUMBER="$(git rev-list --count HEAD)"
        SHORT_HASH="$(git rev-parse --short=7 HEAD)"
        if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
          echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
        else
          SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
          echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
        fi
--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@ -0,0 +1,67 @@
 name: "Windows - Setup CUDA Toolkit"
 description: "Setup CUDA Toolkit for Windows"
 inputs:
  cuda_version:
    description: "CUDA toolkit version"
    required: true
 runs:
  using: "composite"
  steps:
    - name: Install Cuda Toolkit 11.7
      if: ${{ inputs.cuda_version == '11.7' }}
      shell: pwsh
      run: |
          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
          choco install unzip -y
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
          echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
    - name: Install Cuda Toolkit 12.4
      if: ${{ inputs.cuda_version == '12.4' }}
      shell: pwsh
      run: |
          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
          choco install unzip -y
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
--- a/.github/actions/windows-setup-curl/action.yml
+++ b/.github/actions/windows-setup-curl/action.yml
@ -0,0 +1,30 @@
 name: 'Windows - Setup CURL'
 description: 'Composite action, to be reused in other workflow'
 inputs:
  curl_version:
    description: 'CURL version'
    required: false
    default: '8.6.0_6'
  architecture:
    description: 'Architecture of the libcurl to download'
    required: false
    default: 'win64'
 outputs:
  curl_path:
    description: "Path to the downloaded libcurl"
    value: ${{ steps.get_libcurl.outputs.curl_path }}
 runs:
  using: "composite"
  steps:
    - name: libCURL
      id: get_libcurl
      shell: powershell
      env:
        CURL_VERSION: ${{ inputs.curl_version }}
        ARCHITECTURE: ${{ inputs.architecture }}
      run: |
        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
        mkdir $env:RUNNER_TEMP/libcurl
        tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
        echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -3,19 +3,18 @@ Kompute:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-kompute.h
-            - ggml/src/ggml-kompute.cpp
+            - ggml/src/ggml-kompute/**
            - README-kompute.md
 Apple Metal:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-metal.h
-            - ggml/src/ggml-metal.cpp
+            - ggml/src/ggml-metal/**
            - README-metal.md
 SYCL:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-sycl.h
            - ggml/src/ggml-sycl.cpp
            - ggml/src/ggml-sycl/**
            - docs/backend/SYCL.md
            - examples/sycl/**
@ -27,8 +26,8 @@ Nvidia GPU:
 Vulkan:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/ggml_vk_generate_shaders.py
+            - ggml/include/ggml-vulkan.h
-            - ggml/src/ggml-vulkan*
+            - ggml/src/ggml-vulkan/**
 documentation:
    - changed-files:
        - any-glob-to-any-file:
@ -46,7 +45,9 @@ build:
            - CMakePresets.json
 examples:
    - changed-files:
-        - any-glob-to-any-file: examples/**
+        - any-glob-to-any-file:
            - examples/**
            - tools/**
 devops:
    - changed-files:
        - any-glob-to-any-file:
@ -71,15 +72,11 @@ android:
 server:
    - changed-files:
        - any-glob-to-any-file:
-            - examples/server/**
+            - tools/server/**
 ggml:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/include/ggml*.h
+            - ggml/**
            - ggml/src/ggml*.c
            - ggml/src/ggml*.cpp
            - ggml/src/ggml*.h
            - ggml-cuda/**
 nix:
    - changed-files:
        - any-glob-to-any-file:
@ -89,3 +86,15 @@ nix:
 embedding:
    - changed-files:
        - any-glob-to-any-file: examples/embedding/
 Ascend NPU:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-cann.h
            - ggml/src/ggml-cann/**
            - docs/backend/CANN.md
 OpenCL:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-opencl.h
            - ggml/src/ggml-opencl/**
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@ -1,7 +1 @@
-
+*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
 - [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
 - Self-reported review complexity:
  - [ ] Low
  - [ ] Medium
  - [ ] High
--- a/.github/workflows/bench.yml.disabled
+++ b/.github/workflows/bench.yml.disabled
@ -1,5 +1,5 @@
 # TODO: there have been some issues with the workflow, so disabling for now
-#       https://github.com/ggerganov/llama.cpp/issues/7893
+#       https://github.com/ggml-org/llama.cpp/issues/7893
 #
 # Benchmark
 name: Benchmark
@ -27,10 +27,10 @@ on:
  push:
    branches:
      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  schedule:
    -  cron: '04 2 * * *'
@ -57,17 +57,7 @@ jobs:
    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || (
        github.event_name == 'schedule'
        && github.ref_name == 'master'
        && github.repository_owner == 'ggerganov'
      )
      || github.event_name == 'pull_request_target'
      || (
        github.event_name == 'push'
        && github.event.ref == 'refs/heads/master'
        && github.repository_owner == 'ggerganov'
      )
    steps:
      - name: Clone
        id: checkout
@ -79,7 +69,7 @@ jobs:
      - name: Install python env
        id: pipenv
        run: |
-          cd examples/server/bench
+          cd tools/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
@ -89,7 +79,7 @@ jobs:
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
-          ./prometheus --config.file=examples/server/bench/prometheus.yml &
+          ./prometheus --config.file=tools/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done
@ -102,7 +92,7 @@ jobs:
      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
-          cd examples/server/bench
+          cd tools/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
              --with github.com/phymbert/xk6-sse
@ -114,7 +104,6 @@ jobs:
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
@ -127,7 +116,7 @@ jobs:
      - name: Download the dataset
        id: download_dataset
        run: |
-          cd examples/server/bench
+          cd tools/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
      - name: Server bench
@ -137,7 +126,7 @@ jobs:
        run: |
          set -eux
-          cd examples/server/bench
+          cd tools/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
@ -168,9 +157,9 @@ jobs:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
-            examples/server/bench/*.jpg
+            tools/server/bench/*.jpg
-            examples/server/bench/*.json
+            tools/server/bench/*.json
-            examples/server/bench/*.log
+            tools/server/bench/*.log
      - name: Commit status
        uses: Sibz/github-status-action@v1
@ -189,17 +178,17 @@ jobs:
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
-            examples/server/bench/prompt_tokens_seconds.jpg
+            tools/server/bench/prompt_tokens_seconds.jpg
-            examples/server/bench/predicted_tokens_seconds.jpg
+            tools/server/bench/predicted_tokens_seconds.jpg
-            examples/server/bench/kv_cache_usage_ratio.jpg
+            tools/server/bench/kv_cache_usage_ratio.jpg
-            examples/server/bench/requests_processing.jpg
+            tools/server/bench/requests_processing.jpg
      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux
-          cd examples/server/bench
+          cd tools/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
--- a/.github/workflows/build-cmake-pkg.yml
+++ b/.github/workflows/build-cmake-pkg.yml
@ -0,0 +1,51 @@
 name: Build relocatable cmake package
 on:
  workflow_dispatch:
  workflow_call:
 jobs:
  linux:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y build-essential tcl
      - name: Build
        run: |
          PREFIX="$(pwd)"/inst
          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
                -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
                -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release
          cmake --install build --prefix "$PREFIX" --config Release
          export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake
          tclsh <<'EOF'
          set build(commit)  [string trim [exec git rev-parse --short HEAD]]
          set build(number)  [string trim [exec git rev-list  --count HEAD]]
          set build(version) "0.0.$build(number)"
          set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]]
          set checks [list "set\\(LLAMA_VERSION     \\s+$build(version)\\)" \
                           "set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \
                           "set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"]
          puts -nonewline "Checking llama-config.cmake version... "
          foreach check $checks {
              if {![regexp -expanded -- $check $llamaconfig]} {
                  puts "\"$check\" failed!"
                  exit 1
              }
          }
          puts "success."
          EOF
          cd examples/simple-cmake-pkg
          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake
          cmake --build build
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@ -0,0 +1,346 @@
 name: Build on Linux using cross-compiler
 on:
  workflow_dispatch:
  workflow_call:
 jobs:
  ubuntu-24-riscv64-cpu-cross:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4
      - name: Setup Riscv
        run: |
          sudo dpkg --add-architecture riscv64
          # Add arch-specific repositories for non-amd64 architectures
          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
          EOF
          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
          sudo apt-get install -y --no-install-recommends \
                  build-essential \
                  gcc-14-riscv64-linux-gnu \
                  g++-14-riscv64-linux-gnu
      - name: Build
        run: |
          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DCMAKE_SYSTEM_NAME=Linux \
                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
          cmake --build build --config Release -j $(nproc)
  ubuntu-24-riscv64-vulkan-cross:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4
      - name: Setup Riscv
        run: |
          sudo dpkg --add-architecture riscv64
          # Add arch-specific repositories for non-amd64 architectures
          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
          EOF
          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
          sudo apt-get install -y --no-install-recommends \
                  build-essential \
                  glslc \
                  gcc-14-riscv64-linux-gnu \
                  g++-14-riscv64-linux-gnu \
                  libvulkan-dev:riscv64
      - name: Build
        run: |
          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_VULKAN=ON \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DCMAKE_SYSTEM_NAME=Linux \
                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
          cmake --build build --config Release -j $(nproc)
  ubuntu-24-arm64-vulkan-cross:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4
      - name: Setup Arm64
        run: |
          sudo dpkg --add-architecture arm64
          # Add arch-specific repositories for non-amd64 architectures
          cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
          EOF
          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
          sudo apt-get install -y --no-install-recommends \
                  build-essential \
                  glslc \
                  crossbuild-essential-arm64 \
                  libvulkan-dev:arm64
      - name: Build
        run: |
          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_VULKAN=ON \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DCMAKE_SYSTEM_NAME=Linux \
                         -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
                         -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
                         -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
          cmake --build build --config Release -j $(nproc)
  ubuntu-24-ppc64el-cpu-cross:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4
      - name: Setup PowerPC64le
        run: |
          sudo dpkg --add-architecture ppc64el
          # Add arch-specific repositories for non-amd64 architectures
          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
          EOF
          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
          sudo apt-get install -y --no-install-recommends \
                  build-essential \
                  gcc-14-powerpc64le-linux-gnu \
                  g++-14-powerpc64le-linux-gnu
      - name: Build
        run: |
          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DCMAKE_SYSTEM_NAME=Linux \
                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
          cmake --build build --config Release -j $(nproc)
  ubuntu-24-ppc64el-vulkan-cross:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4
      - name: Setup PowerPC64le
        run: |
          sudo dpkg --add-architecture ppc64el
          # Add arch-specific repositories for non-amd64 architectures
          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
          EOF
          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
          sudo apt-get install -y --no-install-recommends \
                  build-essential \
                  glslc \
                  gcc-14-powerpc64le-linux-gnu \
                  g++-14-powerpc64le-linux-gnu \
                  libvulkan-dev:ppc64el
      - name: Build
        run: |
          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_VULKAN=ON \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DCMAKE_SYSTEM_NAME=Linux \
                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
          cmake --build build --config Release -j $(nproc)
  debian-13-loongarch64-cpu-cross:
    runs-on: ubuntu-24.04
    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
    steps:
      - uses: actions/checkout@v4
      - name: Setup LoongArch
        run: |
          rm -f /etc/apt/sources.list.d/*
          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
          EOF
          ( echo 'quiet "true";'; \
            echo 'APT::Get::Assume-Yes "true";'; \
            echo 'APT::Install-Recommends "false";'; \
            echo 'Acquire::Check-Valid-Until "false";'; \
            echo 'Acquire::Retries "5";'; \
          ) > /etc/apt/apt.conf.d/99snapshot-repos
          apt-get update
          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
          dpkg --add-architecture loong64
          # Add arch-specific repositories for non-amd64 architectures
          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
          EOF
          apt-get update || true    ;# Prevent failure due to missing URLs.
          apt-get install -y --no-install-recommends \
                  build-essential \
                  gcc-14-loongarch64-linux-gnu \
                  g++-14-loongarch64-linux-gnu
      - name: Build
        run: |
          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DCMAKE_SYSTEM_NAME=Linux \
                         -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
                         -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
                         -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
          cmake --build build --config Release -j $(nproc)
  debian-13-loongarch64-vulkan-cross:
    runs-on: ubuntu-24.04
    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
    steps:
      - uses: actions/checkout@v4
      - name: Setup LoongArch
        run: |
          rm -f /etc/apt/sources.list.d/*
          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
          EOF
          ( echo 'quiet "true";'; \
            echo 'APT::Get::Assume-Yes "true";'; \
            echo 'APT::Install-Recommends "false";'; \
            echo 'Acquire::Check-Valid-Until "false";'; \
            echo 'Acquire::Retries "5";'; \
          ) > /etc/apt/apt.conf.d/99snapshot-repos
          apt-get update
          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
          dpkg --add-architecture loong64
          # Add arch-specific repositories for non-amd64 architectures
          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
          EOF
          apt-get update || true    ;# Prevent failure due to missing URLs.
          apt-get install -y --no-install-recommends \
                  build-essential \
                  glslc \
                  gcc-14-loongarch64-linux-gnu \
                  g++-14-loongarch64-linux-gnu \
                  libvulkan-dev:loong64
      - name: Build
        run: |
          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_VULKAN=ON \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DCMAKE_SYSTEM_NAME=Linux \
                         -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
                         -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
                         -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@ -17,7 +17,7 @@ jobs:
    steps:
      - uses: actions/stale@v5
        with:
-          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
+          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -10,12 +10,10 @@
 name: Publish Docker image
 on:
-  #pull_request:
+  workflow_dispatch: # allows manual triggering
-  push:
+  schedule:
-    branches:
+    # Rebuild daily rather than on every push because it is expensive
-      - master
+    - cron: '12 4 * * *'
    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
  workflow_dispatch: # allows manual triggering, useful for debugging
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -29,29 +27,25 @@ permissions:
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    #if: github.event.pull_request.draft == false
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
      fail-fast: false
      matrix:
        config:
-          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # Multi-stage build
-          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # Note: the arm64 images are failing, which prevents the amd64 images from being built
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # https://github.com/ggml-org/llama.cpp/issues/11888
-          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
+          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
@ -59,10 +53,12 @@ jobs:
          fetch-depth: 0 # preserve git history, so we can determine the build number
      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@v3
        with:
          image: tonistiigi/binfmt:qemu-v7.0.0-28
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
      - name: Log in to Docker Hub
        uses: docker/login-action@v2
@ -82,26 +78,34 @@ jobs:
          # determine tag name postfix (build number, commit hash)
          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="b${BUILD_NUMBER}"
+            TAG_POSTFIX="-b${BUILD_NUMBER}"
          else
            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
+            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
          fi
          # list all tags possible
-          TAGS=""
+          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
-          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
+              TYPE=""
-          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
+          else
-
+              TYPE="-${{ matrix.config.tag }}"
-          echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
+          fi
-          echo "output_tags=$TAGS"  # print out for debugging
+          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
          echo "full_output_tags=$FULLTAGS"  # print out for debugging
          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
        env:
          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
+        if: ${{ matrix.config.free_disk_space == true }}
        uses: ggml-org/free-disk-space@v1.3.1
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
@ -116,13 +120,59 @@ jobs:
          docker-images: true
          swap-storage: true
-      - name: Build and push Docker image (tagged + versioned)
+      - name: Build and push Full Docker image (tagged + versioned)
-        if: github.event_name == 'push'
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.output_tags }}
+          tags: ${{ steps.tag.outputs.full_output_tags }}
          file: ${{ matrix.config.dockerfile }}
          target: full
          provenance: false
          # using github experimental cache
          cache-from: type=gha
          cache-to: type=gha,mode=max
          # return to this if the experimental github cache is having issues
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
      - name: Build and push Light Docker image (tagged + versioned)
        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
          # tag list is generated from step above
          tags: ${{ steps.tag.outputs.light_output_tags }}
          file: ${{ matrix.config.dockerfile }}
          target: light
          provenance: false
          # using github experimental cache
          cache-from: type=gha
          cache-to: type=gha,mode=max
          # return to this if the experimental github cache is having issues
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
      - name: Build and push Server Docker image (tagged + versioned)
        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
          # tag list is generated from step above
          tags: ${{ steps.tag.outputs.server_output_tags }}
          file: ${{ matrix.config.dockerfile }}
          target: server
          provenance: false
          # using github experimental cache
          cache-from: type=gha
          cache-to: type=gha,mode=max
          # return to this if the experimental github cache is having issues
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@ -23,5 +23,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
-      - uses: editorconfig-checker/action-editorconfig-checker@main
+      - uses: editorconfig-checker/action-editorconfig-checker@v2
        with:
          version: v3.0.3
      - run: editorconfig-checker
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@ -11,7 +11,7 @@ jobs:
    steps:
    - uses: actions/checkout@v4
      with:
-        repository: "ggerganov/llama.cpp"
+        repository: "ggml-org/llama.cpp"
    - uses: actions/labeler@v5
      with:
        configuration-path: '.github/labeler.yml'
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@ -1,72 +0,0 @@
 name: Nix aarch64 builds
 on:
  workflow_dispatch: # allows manual triggering
  schedule:
    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
    # 1.5h instead of minutes with the cold cache).
    #
    # randint(0, 59), randint(0, 23)
    - cron: '26 12 * * *'
  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
    paths: ['**/*.nix', 'flake.lock']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['**/*.nix', 'flake.lock']
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 # Fine-grant permission
 # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
 permissions:
  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
  id-token: write
  contents: read
 jobs:
  nix-build-aarch64:
    runs-on: ubuntu-latest
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Install QEMU
      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
      run: |
        sudo apt-get update
        sudo apt-get install -y qemu-user-static qemu-system-aarch64
        sudo usermod -a -G kvm $USER
    - name: Install Nix
      uses: DeterminateSystems/nix-installer-action@v9
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        extra-conf: |
          extra-platforms = aarch64-linux
          extra-system-features = nixos-test kvm
          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
    - name: Set-up cachix to push the results to
      uses: cachix/cachix-action@v13
      with:
        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
        name: llama-cpp
    - name: Show all output paths
      run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.aarch64-linux"
    - name: Build
      run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --systems aarch64-linux
          --flake
          ".#checks.aarch64-linux"
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@ -1,79 +0,0 @@
 name: Nix CI
 on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
  pull_request:
    types: [opened, synchronize, reopened]
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 # Fine-grant permission
 # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
 permissions:
  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
  id-token: write
  contents: read
 jobs:
  nix-eval:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Install Nix
      uses: DeterminateSystems/nix-installer-action@v9
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        extra-conf: |
          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
    - name: List all flake outputs
      run: nix flake show --all-systems
    - name: Show all output paths
      run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Install Nix
      uses: DeterminateSystems/nix-installer-action@v9
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        extra-conf: |
          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
    - name: Set-up cachix to push the results to
      uses: cachix/cachix-action@v13
      with:
        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
        name: llama-cpp
    - name: Build
      run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --flake
          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
--- a/.github/workflows/nix-flake-update.yml
+++ b/.github/workflows/nix-flake-update.yml
@ -1,22 +0,0 @@
 name: update-flake-lock
 on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
 jobs:
  lockfile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@main
      - name: Update flake.lock
        uses: DeterminateSystems/update-flake-lock@main
        with:
          pr-title: "nix: update flake.lock"
          pr-labels: |
            nix
          pr-reviewers: philiptaron,SomeoneSerge
          token: ${{ secrets.FLAKE_TOKEN }}
--- a/.github/workflows/nix-publish-flake.yml
+++ b/.github/workflows/nix-publish-flake.yml
@ -1,36 +0,0 @@
 # Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
 name: "Publish a flake to flakestry & flakehub"
 on:
    push:
        tags:
        - "*"
    workflow_dispatch:
        inputs:
            tag:
                description: "The existing tag to publish"
                type: "string"
                required: true
 jobs:
    flakestry-publish:
        runs-on: ubuntu-latest
        permissions:
            id-token: "write"
            contents: "read"
        steps:
            - uses: flakestry/flakestry-publish@main
              with:
                version: "${{ inputs.tag || github.ref_name }}"
    flakehub-publish:
      runs-on: "ubuntu-latest"
      permissions:
        id-token: "write"
        contents: "read"
      steps:
        - uses: "actions/checkout@v4"
          with:
            ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
        - uses: "DeterminateSystems/nix-installer-action@main"
        - uses: "DeterminateSystems/flakehub-push@main"
          with:
            visibility: "public"
            tag: "${{ inputs.tag }}"
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@ -1,6 +1,13 @@
 name: flake8 Lint
-on: [push, pull_request]
+on:
  push:
    branches:
      - master
    paths: ['.github/workflows/python-lint.yml', '**/*.py']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/python-lint.yml', '**/*.py']
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -0,0 +1,755 @@
 name: Release
 on:
  workflow_dispatch: # allows manual triggering
    inputs:
      create_release:
        description: 'Create new release'
        required: true
        type: boolean
  push:
    branches:
      - master
    paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
 jobs:
  macOS-arm64:
    runs-on: macos-14
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d
      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update
          brew install curl
      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          cmake -B build \
            -DCMAKE_INSTALL_RPATH='@loader_path' \
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DGGML_RPC=ON \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name
      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
          name: llama-bin-macos-arm64.zip
  macOS-x64:
    runs-on: macos-13
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d
      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update
          brew install curl
      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
          cmake -B build \
            -DCMAKE_INSTALL_RPATH='@loader_path' \
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_METAL=OFF \
            -DGGML_RPC=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name
      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
          name: llama-bin-macos-x64.zip
  ubuntu-22-cpu:
    strategy:
      matrix:
        include:
          - build: 'x64'
            os: ubuntu-22.04
          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
          # - build: 'arm64'
          #   os: ubuntu-22.04-arm
    runs-on: ${{ matrix.os }}
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-cpu-cmake
          evict-old-files: 1d
      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libcurl4-openssl-dev
      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
            -DCMAKE_INSTALL_RPATH='$ORIGIN' \
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DGGML_BACKEND_DL=ON \
            -DGGML_NATIVE=OFF \
            -DGGML_CPU_ALL_VARIANTS=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)
      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name
      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
          name: llama-bin-ubuntu-${{ matrix.build }}.zip
  ubuntu-22-vulkan:
    runs-on: ubuntu-22.04
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-vulkan
          evict-old-files: 1d
      - name: Dependencies
        id: depends
        run: |
          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
          sudo apt-get update -y
          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
            -DCMAKE_INSTALL_RPATH='$ORIGIN' \
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DGGML_BACKEND_DL=ON \
            -DGGML_NATIVE=OFF \
            -DGGML_CPU_ALL_VARIANTS=ON \
            -DGGML_VULKAN=ON \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)
      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name
      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
          name: llama-bin-ubuntu-vulkan-x64.zip
  windows-cpu:
    runs-on: windows-2025
    strategy:
      matrix:
        include:
          - arch: 'x64'
          - arch: 'arm64'
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-cpu-${{ matrix.arch }}
          variant: ccache
          evict-old-files: 1d
      - name: Install Ninja
        run: |
          choco install ninja
      - name: libCURL
        id: get_libcurl
        uses: ./.github/actions/windows-setup-curl
        with:
          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
      - name: Build
        shell: cmd
        env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
            -DGGML_NATIVE=OFF ^
            -DGGML_BACKEND_DL=ON ^
            -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
            -DGGML_OPENMP=ON ^
            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release
      - name: Pack artifacts
        id: pack_artifacts
        env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
          7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-cpu-${{ matrix.arch }}.zip
          name: llama-bin-win-cpu-${{ matrix.arch }}.zip
  windows:
    runs-on: windows-2025
    env:
      OPENBLAS_VERSION: 0.3.23
      VULKAN_VERSION: 1.4.313.2
    strategy:
      matrix:
        include:
          - backend: 'vulkan'
            arch: 'x64'
            defines: '-DGGML_VULKAN=ON'
            target: 'ggml-vulkan'
          - backend: 'opencl-adreno'
            arch: 'arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
            target: 'ggml-opencl'
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
          variant: ccache
          evict-old-files: 1d
      - name: Install Vulkan SDK
        id: get_vulkan
        if: ${{ matrix.backend == 'vulkan' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
      - name: Install Ninja
        id: install_ninja
        run: |
          choco install ninja
      - name: Install OpenCL Headers and Libs
        id: install_opencl
        if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }}
        run: |
          git clone https://github.com/KhronosGroup/OpenCL-Headers
          cd OpenCL-Headers
          cmake -B build `
            -DBUILD_TESTING=OFF `
            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
          cmake --build build --target install
          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
          cd OpenCL-ICD-Loader
          cmake -B build-arm64-release `
            -A arm64 `
            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
          cmake --build build-arm64-release --target install --config release
      - name: Build
        id: cmake_build
        run: |
          cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
          cmake --build build --config Release --target ${{ matrix.target }}
      - name: Pack artifacts
        id: pack_artifacts
        run: |
          7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
  windows-cuda:
    runs-on: windows-2022
    strategy:
      matrix:
        cuda: ['12.4']
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
      - name: Install ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
          evict-old-files: 1d
      - name: Install Cuda Toolkit
        uses: ./.github/actions/windows-setup-cuda
        with:
          cuda_version: ${{ matrix.cuda }}
      - name: Install Ninja
        id: install_ninja
        run: |
          choco install ninja
      - name: Build
        id: cmake_build
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -DGGML_BACKEND_DL=ON ^
            -DGGML_NATIVE=OFF ^
            -DGGML_CPU=OFF ^
            -DGGML_CUDA=ON ^
            -DLLAMA_CURL=OFF
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
      - name: Pack artifacts
        id: pack_artifacts
        run: |
          7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
          name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
      - name: Copy and pack Cuda runtime
        run: |
          echo "Cuda install location: ${{ env.CUDA_PATH }}"
          $dst='.\build\bin\cudart\'
          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*
      - name: Upload Cuda runtime
        uses: actions/upload-artifact@v4
        with:
          path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
  windows-sycl:
    runs-on: windows-2022
    defaults:
      run:
        shell: bash
    env:
      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-sycl
          variant: ccache
          evict-old-files: 1d
      - name: Install
        run:  |
          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
      - name: Build
        id: cmake_build
        shell: cmd
        run: |
          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
          cmake -G "Ninja" -B build ^
            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
            -DCMAKE_BUILD_TYPE=Release ^
            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
            -DLLAMA_CURL=OFF
          cmake --build build --target ggml-sycl -j
      - name: Build the release package
        id: pack_artifacts
        run: |
          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-bin-win-sycl-x64.zip ./build/bin/*
      - name: Upload the release package
        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip
  windows-hip:
    runs-on: windows-2022
    strategy:
      matrix:
        include:
          - name: "radeon"
            gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
      - name: Clone rocWMMA repository
        id: clone_rocwmma
        run: |
          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-hip-${{ matrix.name }}-x64
          evict-old-files: 1d
      - name: Install
        id: depends
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"
      - name: Verify ROCm
        id: verify
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
      - name: Build
        id: cmake_build
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
          cmake -G "Unix Makefiles" -B build -S . `
            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
            -DCMAKE_BUILD_TYPE=Release `
            -DGGML_BACKEND_DL=ON `
            -DGGML_NATIVE=OFF `
            -DGGML_CPU=OFF `
            -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_HIP=ON `
            -DLLAMA_CURL=OFF
          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
      - name: Pack artifacts
        id: pack_artifacts
        run: |
          7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
  ios-xcode-build:
    runs-on: macos-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_CURL=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
      - name: xcodebuild for swift package
        id: xcodebuild
        run: |
          ./build-xcframework.sh
      - name: Build Xcode project
        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name
      - name: Pack artifacts
        id: pack_artifacts
        run: |
          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
          name: llama-${{ steps.tag.outputs.name }}-xcframework
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
    # Fine-grant permission
    # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
    permissions:
        contents: write # for creating release
    runs-on: ubuntu-latest
    needs:
      - windows
      - windows-cpu
      - windows-cuda
      - windows-sycl
      - windows-hip
      - ubuntu-22-cpu
      - ubuntu-22-vulkan
      - macOS-arm64
      - macOS-x64
      - ios-xcode-build
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name
      - name: Download artifacts
        id: download-artifact
        uses: actions/download-artifact@v4
        with:
          path: ./artifact
          merge-multiple: true
      - name: Move artifacts
        id: move_artifacts
        run: |
          mkdir -p release
          echo "Adding CPU backend files to existing zips..."
          for arch in x64 arm64; do
            cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip"
            temp_dir=$(mktemp -d)
            echo "Extracting CPU backend for $arch..."
            unzip "$cpu_zip" -d "$temp_dir"
            echo "Adding CPU files to $arch zips..."
            for target_zip in artifact/llama-bin-win-*-${arch}.zip; do
              if [[ "$target_zip" == "$cpu_zip" ]]; then
                continue
              fi
              echo "Adding CPU backend to $(basename "$target_zip")"
              realpath_target_zip=$(realpath "$target_zip")
              (cd "$temp_dir" && zip -r "$realpath_target_zip" .)
            done
            rm -rf "$temp_dir"
          done
          echo "Renaming and moving zips to release..."
          for zip_file in artifact/llama-bin-win-*.zip; do
            base_name=$(basename "$zip_file" .zip)
            zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip"
            echo "Moving $zip_file to release/$zip_name"
            mv "$zip_file" "release/$zip_name"
          done
          echo "Moving other artifacts..."
          mv -v artifact/*.zip release
      - name: Create release
        id: create_release
        uses: ggml-org/action-create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ steps.tag.outputs.name }}
      - name: Upload release
        id: upload_release
        uses: actions/github-script@v3
        with:
          github-token: ${{secrets.GITHUB_TOKEN}}
          script: |
            const path = require('path');
            const fs = require('fs');
            const release_id = '${{ steps.create_release.outputs.id }}';
            for (let file of await fs.readdirSync('./release')) {
              if (path.extname(file) === '.zip') {
                console.log('uploadReleaseAsset', file);
                await github.repos.uploadReleaseAsset({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  release_id: release_id,
                  name: file,
                  data: await fs.readFileSync(`./release/${file}`)
                });
              }
            }
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@ -15,10 +15,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
 env:
  LLAMA_LOG_COLORS: 1
@ -74,22 +74,51 @@ jobs:
      - name: Tests dependencies
        id: test_dependencies
        run: |
-          pip install -r examples/server/tests/requirements.txt
+          pip install -r tools/server/tests/requirements.txt
-      - name: Verify server deps
+      # Setup nodejs (to be used for verifying bundled index.html)
-        id: verify_server_deps
+      - uses: actions/setup-node@v4
        with:
          node-version: '22.11.0'
      - name: WebUI - Install dependencies
        id: webui_lint
        run: |
          cd tools/server/webui
          npm ci
      - name: WebUI - Check code format
        id: webui_format
        run: |
          git config --global --add safe.directory $(realpath .)
-          cd examples/server
+          cd tools/server/webui
          git ls-files --others --modified
          git status
-          ./deps.sh
+
          npm run format
          git status
-          not_ignored_files="$(git ls-files --others --modified)"
+          modified_files="$(git status -s)"
-          echo "Modified files: ${not_ignored_files}"
+          echo "Modified files: ${modified_files}"
-          if [ -n "${not_ignored_files}" ]; then
+          if [ -n "${modified_files}" ]; then
-            echo "Repository is dirty or server deps are not built as expected"
+            echo "Files do not follow coding style. To fix: npm run format"
-            echo "${not_ignored_files}"
+            echo "${modified_files}"
            exit 1
          fi
      - name: Verify bundled index.html
        id: verify_server_index_html
        run: |
          git config --global --add safe.directory $(realpath .)
          cd tools/server/webui
          git status
          npm run build
          git status
          modified_files="$(git status -s)"
          echo "Modified files: ${modified_files}"
          if [ -n "${modified_files}" ]; then
            echo "Repository is dirty or server/webui is not built as expected"
            echo "Hint: You may need to follow Web UI build guide in server/README.md"
            echo "${modified_files}"
            exit 1
          fi
@ -100,40 +129,58 @@ jobs:
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
              -DGGML_OPENMP=OFF ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-      - name: Build
+      - name: Build (sanitizers)
-        id: cmake_build
+        id: cmake_build_sanitizers
-        if: ${{ matrix.sanitizer != 'THREAD' }}
+        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
      - name: Build (sanitizers)
        id: cmake_build
        if: ${{ matrix.sanitizer == '' }}
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
      - name: Tests
        id: server_integration_tests
        if: ${{ matrix.sanitizer == '' }}
        env:
          GITHUB_ACTIONS: "true"
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
-          PORT=8888 ./tests.sh
+          ./tests.sh
      - name: Tests (sanitizers)
        id: server_integration_tests_sanitizers
        if: ${{ matrix.sanitizer != '' }}
        run: |
          cd tools/server/tests
          LLAMA_SANITIZE=1 ./tests.sh
      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
-          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
+          SLOW_TESTS=1 ./tests.sh
  server-windows:
-    runs-on: windows-2019
+    runs-on: windows-2022
    steps:
      - name: Clone
@ -145,17 +192,14 @@ jobs:
      - name: libCURL
        id: get_libcurl
-        env:
+        uses: ./.github/actions/windows-setup-curl
          CURL_VERSION: 8.6.0_6
        run: |
          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
          mkdir $env:RUNNER_TEMP/libcurl
          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
      - name: Build
        id: cmake_build
        env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
+          cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
      - name: Python setup
@ -167,24 +211,27 @@ jobs:
      - name: Tests dependencies
        id: test_dependencies
        run: |
-          pip install -r examples/server/tests/requirements.txt
+          pip install -r tools/server/tests/requirements.txt
      - name: Copy Libcurl
        id: prepare_libcurl
        env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
+          cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
          $env:PYTHONIOENCODING = ":replace"
-          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+          pytest -v -x -m "not slow"
      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
-          cd examples/server/tests
+          cd tools/server/tests
-          behave.exe --stop --no-skipped --no-capture --tags slow
+          $env:SLOW_TESTS = "1"
          pytest -v -x
--- a/.github/workflows/winget.yml
+++ b/.github/workflows/winget.yml
@ -0,0 +1,42 @@
 name: Update Winget Package
 on:
  workflow_dispatch: # allows manual triggering
  schedule:
    - cron: '28 5 * * *' # Update every day at 5:28 UTC
 jobs:
  update:
    name: Update Winget Package
    runs-on: ubuntu-latest
    steps:
      - name: Install cargo binstall
        uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
      - name: Install komac
        run: |
          cargo binstall komac@2.11.2 -y
      - name: Find latest release
        id: find_latest_release
        uses: actions/github-script@v6
        with:
          script: |
            const { data: releases } = await github.rest.repos.listReleases({
              owner: context.repo.owner,
              repo: context.repo.repo,
            });
            console.log("Latest release:", releases[0].tag_name);
            return releases[0].tag_name;
      - name: Update manifest
        env:
          VERSION: ${{ steps.find_latest_release.outputs.result }}
        run: |
          echo "Updating manifest..."
          komac update --version ${{ env.VERSION }} \
            --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
            --token ${{ secrets.WINGET_GITHUB_TOKEN }} \
            --submit \
            ggml.llamacpp
--- a/.gitignore
+++ b/.gitignore
@ -18,6 +18,7 @@
 *.metallib
 *.o
 *.so
 *.swp
 *.tmp
 # IDE / OS
@ -44,6 +45,8 @@ lcov-report/
 tags
 .build/
 build*
 release
 debug
 !build-info.cmake
 !build-info.cpp.in
 !build-info.sh
@ -93,10 +96,11 @@ perf-*.txt
 # Examples
 examples/jeopardy/results.txt
-examples/server/*.css.hpp
+tools/server/*.css.hpp
-examples/server/*.html.hpp
+tools/server/*.html.hpp
-examples/server/*.js.hpp
+tools/server/*.js.hpp
-examples/server/*.mjs.hpp
+tools/server/*.mjs.hpp
 tools/server/*.gz.hpp
 !build_64.sh
 !examples/*.bat
 !examples/*/*.kts
@ -104,6 +108,10 @@ examples/server/*.mjs.hpp
 !examples/sycl/*.bat
 !examples/sycl/*.sh
 # Server Web UI temporary files
 node_modules
 tools/server/webui/dist
 # Python
 /.venv
--- a/326
+++ b/326
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -16,6 +16,7 @@ endif()
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(LLAMA_STANDALONE ON)
@ -28,6 +29,8 @@ else()
    set(LLAMA_STANDALONE OFF)
 endif()
 option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
 if (EMSCRIPTEN)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)
@ -46,11 +49,11 @@ if (WIN32)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 endif()
-if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+if (MSVC)
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/source-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/source-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/execution-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/execution-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()
 #
@ -74,31 +77,35 @@ option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE
 # extra artifacts
 option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
 # 3rd party libs
-option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
+option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
 if (NOT DEFINED LLAMA_BUILD_NUMBER)
    set(LLAMA_BUILD_NUMBER        ${BUILD_NUMBER})
 endif()
 if (NOT DEFINED LLAMA_BUILD_COMMIT)
    set(LLAMA_BUILD_COMMIT        ${BUILD_COMMIT})
 endif()
 set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
 # override ggml options
-set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
+set(GGML_ALL_WARNINGS   ${LLAMA_ALL_WARNINGS})
-set(GGML_SANITIZE_ADDRESS   ${LLAMA_SANITIZE_ADDRESS})
+set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
 set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
 set(GGML_ALL_WARNINGS       ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS     ${LLAMA_FATAL_WARNINGS})
 # change the default for these ggml options
 if (NOT DEFINED GGML_LLAMAFILE)
    set(GGML_LLAMAFILE_DEFAULT ON)
 endif()
 if (NOT DEFINED GGML_AMX)
    set(GGML_AMX ON)
 endif()
 if (NOT DEFINED GGML_CUDA_GRAPHS)
    set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()
@ -122,16 +129,84 @@ llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
 llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)
 llama_option_depr(WARNING     LLAMA_CANN                GGML_CANN)
 if (NOT MSVC)
    if (LLAMA_SANITIZE_THREAD)
        message(STATUS "Using -fsanitize=thread")
        add_compile_options(-fsanitize=thread)
        link_libraries     (-fsanitize=thread)
    endif()
    if (LLAMA_SANITIZE_ADDRESS)
        message(STATUS "Using -fsanitize=address")
        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
        link_libraries     (-fsanitize=address)
    endif()
    if (LLAMA_SANITIZE_UNDEFINED)
        message(STATUS "Using -fsanitize=undefined")
        add_compile_options(-fsanitize=undefined)
        link_libraries     (-fsanitize=undefined)
    endif()
 endif()
 #
 # 3rd-party
 #
 if (LLAMA_USE_SYSTEM_GGML)
    message(STATUS "Using system-provided libggml, skipping ggml build")
    find_package(ggml REQUIRED)
    add_library(ggml ALIAS ggml::ggml)
 endif()
 if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
    set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
    set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
    add_subdirectory(ggml)
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
 if (MINGW)
    # Target Windows 8 for PrefetchVirtualMemory
    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()
 #
 # build the library
 #
 if (NOT TARGET ggml)
    add_subdirectory(ggml)
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
 add_subdirectory(src)
 #
 # utils, programs, examples and tests
 #
 if (NOT LLAMA_BUILD_COMMON)
    message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
    set(LLAMA_CURL OFF)
 endif()
 if (LLAMA_BUILD_COMMON)
    add_subdirectory(common)
 endif()
 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
    include(CTest)
    add_subdirectory(tests)
 endif()
 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
    add_subdirectory(examples)
    add_subdirectory(pocs)
 endif()
 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
    add_subdirectory(tools)
 endif()
 #
 # install
 #
@ -139,32 +214,18 @@ add_subdirectory(src)
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
 set(LLAMA_BUILD_NUMBER        ${BUILD_NUMBER})
 set(LLAMA_BUILD_COMMIT        ${BUILD_COMMIT})
 set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
 set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header  files")
 set(LLAMA_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
 set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")
-# At the moment some compile definitions are placed within the ggml/src
+set(LLAMA_PUBLIC_HEADERS
-# directory but not exported on the `ggml` target. This could be improved by
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
-# determining _precisely_ which defines are necessary for the llama-config
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
-# package.
+
-#
+set_target_properties(llama
-set(GGML_TRANSIENT_DEFINES)
+    PROPERTIES
-get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
+        PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
 get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
 if (GGML_DIR_DEFINES)
    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
 endif()
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
 if (GGML_TARGET_DEFINES)
    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
 endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
 install(TARGETS llama LIBRARY PUBLIC_HEADER)
 configure_package_config_file(
@ -201,22 +262,4 @@ configure_file(cmake/llama.pc.in
        @ONLY)
 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
-        DESTINATION lib/pkgconfig)
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
 #
 # utils, programs, examples and tests
 #
 if (LLAMA_BUILD_COMMON)
    add_subdirectory(common)
 endif()
 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
    include(CTest)
    add_subdirectory(tests)
 endif()
 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
    add_subdirectory(examples)
    add_subdirectory(pocs)
 endif()
--- a/CMakePresets.json
+++ b/CMakePresets.json
@ -32,11 +32,9 @@
    { "name": "vulkan",   "hidden": true, "cacheVariables": { "GGML_VULKAN":      "ON" } },
    {
-        "name": "arm64-windows-msvc", "hidden": true,
+        "name": "x64-windows-llvm", "hidden": true,
        "architecture": { "value": "arm64",    "strategy": "external" },
        "toolset":      { "value": "host=x64", "strategy": "external" },
        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
        }
    },
@ -66,9 +64,10 @@
    { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
    { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang",  "reldbg", "static" ] },
-    { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
+    { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
+    { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },
+    { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
    { "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
    { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
--- a/11
+++ b/11
@ -0,0 +1,11 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
 /ci/ @ggerganov
 /.devops/*.Dockerfile @ngxson
 /tools/server/ @ngxson
 /ggml/src/ggml-cuda/fattn* @JohannesGaessler
 /ggml/src/ggml-cuda/mmq.* @JohannesGaessler
 /ggml/src/ggml-cuda/mmv.* @JohannesGaessler
 /ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
 /ggml/src/ggml-opt.cpp @JohannesGaessler
 /ggml/src/gguf.cpp @JohannesGaessler
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,9 +1,12 @@
 # Pull requests (for contributors)
 - llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
-  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library
+    - Execute [the full CI locally on your machine](ci/README.md) before publishing
-  - Execute [the full CI locally on your machine](ci/README.md) before publishing
+    - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs
+    - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
    - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
 - Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
@ -11,23 +14,114 @@
 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
 - Consider adding yourself to [CODEOWNERS](CODEOWNERS)
 # Coding guidelines
 - Avoid adding third-party dependencies, extra files, extra headers, etc.
 - Always consider cross-compatibility with other operating systems and architectures
 - Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
+- Vertical alignment makes things more readable and easier to batch edit
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
+- Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets
 - Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
    - In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
    ```cpp
    // OK
    llama_context * ctx;
    const llama_rope_type rope_type;
    // not OK
    struct llama_context * ctx;
    const enum llama_rope_type rope_type;
    ```
    _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
 - Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` (from clang-tools v15+) to format the added code
 - For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
 ![matmul](media/matmul.png)
 # Naming guidelines
 - Use `snake_case` for function, variable and type names
 - Naming usually optimizes for longest common prefix (see https://github.com/ggml-org/ggml/pull/302#discussion_r1243240963)
    ```cpp
    // not OK
    int small_number;
    int big_number;
    // OK
    int number_small;
    int number_big;
    ```
 - Enum values are always in upper case and prefixed with the enum name
    ```cpp
    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_NONE = 0,
        LLAMA_VOCAB_TYPE_SPM  = 1,
        LLAMA_VOCAB_TYPE_BPE  = 2,
        LLAMA_VOCAB_TYPE_WPM  = 3,
        LLAMA_VOCAB_TYPE_UGM  = 4,
        LLAMA_VOCAB_TYPE_RWKV = 5,
    };
    ```
 - The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`
    ```cpp
    llama_model_init();           // class: "llama_model",         method: "init"
    llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
    llama_sampler_get_seed();     // class: "llama_sampler",       method: "get_seed"
    llama_set_embeddings();       // class: "llama_context",       method: "set_embeddings"
    llama_n_threads();            // class: "llama_context",       method: "n_threads"
    llama_adapter_lora_free();    // class: "llama_adapter_lora",  method: "free"
    ```
    - The `get` `<action>` can be omitted
    - The `<noun>` can be omitted if not necessary
    - The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
    - Use `init`/`free` for constructor/destructor `<action>`
 - Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else
    ```cpp
    typedef struct llama_context * llama_context_t;
    enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
    ```
    _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_
 - C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
 - Python filenames are all lowercase with underscores
 - _(TODO: abbreviations usage)_
 # Preprocessor directives
 - _(TODO: add guidelines with examples and apply them to the codebase)_
    ```cpp
    #ifdef FOO
    #endif // FOO
    ```
 # Documentation
 - Documentation is a community effort
 - When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference
 - When you notice incorrect or outdated documentation, please update it
 # Resources
 The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
-https://github.com/ggerganov/llama.cpp/projects
+https://github.com/ggml-org/llama.cpp/projects
--- a/192
+++ b/192
@ -1,3 +1,7 @@
 ifndef LLAMA_MAKEFILE
 $(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
 endif
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	libllava.a \
@ -18,6 +22,7 @@ BUILD_TARGETS = \
 	llama-infill \
 	llama-llava-cli \
 	llama-minicpmv-cli\
 	llama-qwen2vl-cli\
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
@ -34,6 +39,7 @@ BUILD_TARGETS = \
 	llama-server \
 	llama-simple \
 	llama-simple-chat \
 	llama-run \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
@ -46,6 +52,7 @@ TEST_TARGETS = \
 	tests/test-arg-parser \
 	tests/test-autorelease \
 	tests/test-backend-ops \
 	tests/test-chat \
 	tests/test-chat-template \
 	tests/test-double-float \
 	tests/test-grammar-integration \
@ -250,11 +257,11 @@ endif
 # Compile flags
 #
-# keep standard at C11 and C++11
+# keep standard at C11 and C++17
-MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
+MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
 MK_CFLAGS    = -std=c11   -fPIC
-MK_CXXFLAGS  = -std=c++11 -fPIC
+MK_CXXFLAGS  = -std=c++17 -fPIC
-MK_NVCCFLAGS = -std=c++11
+MK_NVCCFLAGS = -std=c++17
 ifdef LLAMA_NO_CCACHE
 GGML_NO_CCACHE := 1
@ -290,6 +297,7 @@ endif
 # some memory allocation are available on Linux through GNU extensions in libc
 ifeq ($(UNAME_S),Linux)
 	MK_CPPFLAGS += -D_GNU_SOURCE
 	MK_LDFLAGS  += -ldl
 endif
 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
@ -359,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
 endif
 ifndef GGML_NO_CPU_AARCH64
-	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
 endif
 # warnings
@ -439,6 +447,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	MK_CFLAGS     += -march=native -mtune=native
 	HOST_CXXFLAGS += -march=native -mtune=native
 	# Usage AMX build test
 	#MK_CFLAGS     += -march=graniterapids -mtune=graniterapids
 	#HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
 	# Usage AVX-only
 	#MK_CFLAGS   += -mfma -mf16c -mavx
 	#MK_CXXFLAGS += -mfma -mf16c -mavx
@ -451,7 +463,7 @@ endif
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
 	# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
 	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
-	# https://github.com/ggerganov/llama.cpp/issues/2922
+	# https://github.com/ggml-org/llama.cpp/issues/2922
 	MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
 	MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
@ -573,15 +585,18 @@ endif
 ifndef GGML_NO_AMX
 	MK_CPPFLAGS += -DGGML_USE_AMX
-	OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o
+	OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o
 endif
 # only necessary for the CPU backend files
 MK_CPPFLAGS += -Iggml/src/ggml-cpu
 ifdef GGML_RPC
 	MK_CPPFLAGS  += -DGGML_USE_RPC
 	OBJ_GGML_EXT += ggml/src/ggml-rpc.o
 endif # GGML_RPC
-OBJ_CUDA_TMPL      = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJ_CUDA_TMPL      = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-mma*.cu))
 OBJ_CUDA_TMPL     += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu))
 ifdef GGML_CUDA_FA_ALL_QUANTS
@ -665,6 +680,10 @@ ifdef GGML_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
 endif # GGML_CUDA_CCBIN
 ifdef GGML_CUDA_NO_FA
 	MK_NVCCFLAGS += -DGGML_CUDA_NO_FA
 endif # GGML_CUDA_NO_FA
 ifdef GGML_CUDA_FA_ALL_QUANTS
 	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
@ -750,7 +769,7 @@ vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
 endif # GGML_VULKAN
-ifdef GGML_HIPBLAS
+ifdef GGML_HIP
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH      ?= /usr
 		AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@ -761,10 +780,6 @@ ifdef GGML_HIPBLAS
 	MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
 ifdef GGML_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # GGML_HIP_UMA
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 	MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
@ -785,6 +800,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
 	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY
 ifdef GGML_CUDA_NO_FA
 	HIPFLAGS += -DGGML_CUDA_NO_FA
 endif # GGML_CUDA_NO_FA
 	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
 	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
 	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
@ -805,7 +824,7 @@ ggml/src/ggml-cuda/%.o: \
 	ggml/src/ggml-common.h \
 	ggml/src/ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-endif # GGML_HIPBLAS
+endif # GGML_HIP
 ifdef GGML_MUSA
 	ifeq ($(wildcard /opt/musa),)
@ -813,7 +832,7 @@ ifdef GGML_MUSA
 	else
 		MUSA_PATH ?= /opt/musa
 	endif
-	MTGPU_TARGETS ?= mp_21 mp_22
+	MUSA_ARCHITECTURES ?= 21;22;31
 	MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
 	MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
@ -832,7 +851,8 @@ ifdef GGML_MUSA
 	CXX := $(MUSA_PATH)/bin/clang++
 	MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
-	MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
+	MUSAFLAGS  = -fsigned-char -x musa -mtgpu
 	MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
 ifdef GGML_CUDA_FORCE_MMQ
 	MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
@ -860,6 +880,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
 	MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY
 ifdef GGML_CUDA_NO_FA
 	MUSAFLAGS += -DGGML_CUDA_NO_FA
 endif # GGML_CUDA_NO_FA
 ifdef GGML_CUDA_FA_ALL_QUANTS
 	MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
@ -876,14 +900,14 @@ ggml/src/ggml-cuda/ggml-cuda.o: \
 	ggml/src/ggml-backend-impl.h \
 	ggml/src/ggml-common.h \
 	$(wildcard ggml/src/ggml-cuda/*.cuh)
-	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
 ggml/src/ggml-cuda/%.o: \
 	ggml/src/ggml-cuda/%.cu \
 	ggml/include/ggml.h \
 	ggml/src/ggml-common.h \
 	ggml/src/ggml-cuda/common.cuh
-	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
 endif # GGML_MUSA
 ifdef GGML_METAL
@ -938,7 +962,6 @@ DIR_COMMON = common
 OBJ_GGML = \
 	$(DIR_GGML)/src/ggml.o \
 	$(DIR_GGML)/src/ggml-aarch64.o \
 	$(DIR_GGML)/src/ggml-alloc.o \
 	$(DIR_GGML)/src/ggml-backend.o \
 	$(DIR_GGML)/src/ggml-backend-reg.o \
@ -946,9 +969,11 @@ OBJ_GGML = \
 	$(DIR_GGML)/src/ggml-quants.o \
 	$(DIR_GGML)/src/ggml-threading.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+	$(DIR_GGML)/src/ggml-cpu/repack.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
 	$(OBJ_GGML_EXT)
 OBJ_LLAMA = \
@ -966,6 +991,8 @@ OBJ_COMMON = \
 	$(DIR_COMMON)/console.o \
 	$(DIR_COMMON)/ngram-cache.o \
 	$(DIR_COMMON)/sampling.o \
 	$(DIR_COMMON)/speculative.o \
 	$(DIR_COMMON)/chat.o \
 	$(DIR_COMMON)/build-info.o \
 	$(DIR_COMMON)/json-schema-to-grammar.o
@ -1059,8 +1086,8 @@ endif
 ifdef REMOVE_WARNING
 $(info !!! REMOVAL WARNING !!!)
 $(info The following LLAMA_ options have been removed and are no longer supported)
-$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggml-org/llama.cpp/pull/9418))
-$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggml-org/llama.cpp/pull/9418))
 $(info )
 endif
@ -1087,17 +1114,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
 # Default target
 all: $(BUILD_TARGETS)
 # force c++ build for source file that have same name as c file
 # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
-#       g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
+$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
-$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
 	ggml/src/ggml-cpu/ggml-cpu.cpp \
 	ggml/include/ggml-backend.h \
 	ggml/include/ggml.h \
 	ggml/include/ggml-alloc.h \
 	ggml/src/ggml-backend-impl.h \
 	ggml/include/ggml-cpu.h \
 	ggml/src/ggml-impl.h
 	$(CXX) $(CXXFLAGS)   -c $< -o $@
 # Rules for building object files
 $(DIR_GGML)/%.o: $(DIR_GGML)/%.c
@ -1134,8 +1154,15 @@ $(LIB_COMMON_S): $(OBJ_COMMON)
 # Include dependency files
 -include $(DEP_FILES)
 # Clean generated server assets
 clean-server-assets:
 	find tools/server -type f -name "*.js.hpp"   -delete
 	find tools/server -type f -name "*.mjs.hpp"  -delete
 	find tools/server -type f -name "*.css.hpp"  -delete
 	find tools/server -type f -name "*.html.hpp" -delete
 # Clean rule
-clean:
+clean: clean-server-assets
 	rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -rvf *.a *.dll *.so *.dot
 	find ggml src common tests examples pocs -type f -name "*.o" -delete
@ -1152,7 +1179,7 @@ clean:
 # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
-llama-cli: examples/main/main.cpp \
+llama-cli: tools/main/main.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1160,7 +1187,7 @@ llama-cli: examples/main/main.cpp \
 	@echo '====  Run ./llama-cli -h for help.  ===='
 	@echo
-llama-infill: examples/infill/infill.cpp \
+llama-run: tools/run/run.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1175,7 +1202,7 @@ llama-simple-chat: examples/simple-chat/simple-chat.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-tokenize: examples/tokenize/tokenize.cpp \
+llama-tokenize: tools/tokenize/tokenize.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1185,27 +1212,27 @@ llama-batched: examples/batched/batched.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-batched-bench: examples/batched-bench/batched-bench.cpp \
+llama-batched-bench: tools/batched-bench/batched-bench.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-quantize: examples/quantize/quantize.cpp \
+llama-quantize: tools/quantize/quantize.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
+llama-quantize-stats: tools/quantize-stats/quantize-stats.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-perplexity: examples/perplexity/perplexity.cpp \
+llama-perplexity: tools/perplexity/perplexity.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-imatrix: examples/imatrix/imatrix.cpp \
+llama-imatrix: tools/imatrix/imatrix.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1247,7 +1274,7 @@ llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/s
 	$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-gguf-split: examples/gguf-split/gguf-split.cpp \
+llama-gguf-split: tools/gguf-split/gguf-split.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1257,7 +1284,7 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
+llama-cvector-generator: tools/cvector-generator/cvector-generator.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1267,12 +1294,12 @@ llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-bench: examples/llama-bench/llama-bench.cpp \
+llama-bench: tools/llama-bench/llama-bench.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-export-lora: examples/export-lora/export-lora.cpp \
+llama-export-lora: tools/export-lora/export-lora.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1328,30 +1355,28 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 ifdef GGML_RPC
-rpc-server: examples/rpc/rpc-server.cpp \
+rpc-server: tools/rpc/rpc-server.cpp \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif # GGML_RPC
 llama-server: \
-	examples/server/server.cpp \
+	tools/server/server.cpp \
-	examples/server/utils.hpp \
+	tools/server/utils.hpp \
-	examples/server/httplib.h \
+	tools/server/httplib.h \
-	examples/server/index.html.hpp \
+	tools/server/index.html.hpp \
-	examples/server/completion.js.hpp \
+	tools/server/loading.html.hpp \
-	examples/server/loading.html.hpp \
+	common/chat.cpp \
-	examples/server/deps_daisyui.min.css.hpp \
+	common/chat.h \
-	examples/server/deps_markdown-it.js.hpp \
+	common/chat-template.hpp \
 	examples/server/deps_tailwindcss.js.hpp \
 	examples/server/deps_vue.esm-browser.js.hpp \
 	common/json.hpp \
-	common/stb_image.h \
+	common/minja.hpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Itools/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
-# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+# Portable equivalent of `cd tools/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
-examples/server/%.hpp: examples/server/public/% Makefile
+tools/server/%.hpp: tools/server/public/% FORCE Makefile
 	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
 		echo "unsigned char $${NAME}[] = {" && \
 		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
@ -1364,28 +1389,36 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-libllava.a: examples/llava/llava.cpp \
+libllava.a: tools/mtmd/llava.cpp \
-	examples/llava/llava.h \
+	tools/mtmd/llava.h \
-	examples/llava/clip.cpp \
+	tools/mtmd/clip.cpp \
-	examples/llava/clip.h \
+	tools/mtmd/clip.h \
 	common/stb_image.h \
 	common/base64.hpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
-llama-llava-cli: examples/llava/llava-cli.cpp \
+llama-llava-cli: tools/mtmd/llava-cli.cpp \
-	examples/llava/llava.cpp \
+	tools/mtmd/llava.cpp \
-	examples/llava/llava.h \
+	tools/mtmd/llava.h \
-	examples/llava/clip.cpp \
+	tools/mtmd/clip.cpp \
-	examples/llava/clip.h \
+	tools/mtmd/clip.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
-llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
+llama-minicpmv-cli: tools/mtmd/minicpmv-cli.cpp \
-	examples/llava/llava.cpp \
+	tools/mtmd/llava.cpp \
-	examples/llava/llava.h \
+	tools/mtmd/llava.h \
-	examples/llava/clip.cpp \
+	tools/mtmd/clip.cpp \
-	examples/llava/clip.h \
+	tools/mtmd/clip.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 llama-qwen2vl-cli: tools/mtmd/qwen2vl-cli.cpp \
 	tools/mtmd/llava.cpp \
 	tools/mtmd/llava.h \
 	tools/mtmd/clip.cpp \
 	tools/mtmd/clip.h \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
@ -1442,7 +1475,12 @@ tests/test-double-float: tests/test-double-float.cpp
 tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 tests/test-chat: tests/test-chat.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 tests/test-opt: tests/test-opt.cpp \
@ -1526,7 +1564,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
 #
 # Mark legacy binary targets as .PHONY so that they are always checked.
-.PHONY: main quantize perplexity embedding server
+.PHONY: FORCE main quantize perplexity embedding server
 # Define the object file target
 examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
--- a/Package.swift
+++ b/Package.swift
@ -1,89 +0,0 @@
 // swift-tools-version:5.5
 import PackageDescription
 var sources = [
    "src/llama.cpp",
    "src/llama-vocab.cpp",
    "src/llama-grammar.cpp",
    "src/llama-sampling.cpp",
    "src/unicode.cpp",
    "src/unicode-data.cpp",
    "ggml/src/ggml.c",
    "ggml/src/ggml-aarch64.c",
    "ggml/src/ggml-alloc.c",
    "ggml/src/ggml-backend.cpp",
    "ggml/src/ggml-backend-reg.cpp",
    "ggml/src/ggml-cpu/ggml-cpu.c",
    "ggml/src/ggml-cpu/ggml-cpu.cpp",
    "ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
    "ggml/src/ggml-cpu/ggml-cpu-quants.c",
    "ggml/src/ggml-threading.cpp",
    "ggml/src/ggml-quants.c",
 ]
 var resources: [Resource] = []
 var linkerSettings: [LinkerSetting] = []
 var cSettings: [CSetting] =  [
    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
    .unsafeFlags(["-fno-objc-arc"]),
    .headerSearchPath("ggml/src"),
    // NOTE: NEW_LAPACK will required iOS version 16.4+
    // We should consider add this in the future when we drop support for iOS 14
    // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
    // .define("ACCELERATE_NEW_LAPACK"),
    // .define("ACCELERATE_LAPACK_ILP64")
 ]
 #if canImport(Darwin)
 sources.append("ggml/src/ggml-common.h")
 sources.append("ggml/src/ggml-metal/ggml-metal.m")
 resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
    contentsOf: [
        .define("GGML_USE_ACCELERATE"),
        .define("GGML_USE_METAL")
    ]
 )
 #endif
 #if os(Linux)
    cSettings.append(.define("_GNU_SOURCE"))
 #endif
 let package = Package(
    name: "llama",
    platforms: [
        .macOS(.v12),
        .iOS(.v14),
        .watchOS(.v4),
        .tvOS(.v14)
    ],
    products: [
        .library(name: "llama", targets: ["llama"]),
    ],
    targets: [
        .target(
            name: "llama",
            path: ".",
            exclude: [
               "build",
               "cmake",
               "examples",
               "scripts",
               "models",
               "tests",
               "CMakeLists.txt",
               "Makefile",
               "ggml/src/ggml-metal-embed.metal"
            ],
            sources: sources,
            resources: resources,
            publicHeadersPath: "spm-headers",
            cSettings: cSettings,
            linkerSettings: linkerSettings
        )
    ],
    cxxLanguageStandard: .cxx11
 )
--- a/README.md
+++ b/README.md
@ -3,47 +3,79 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
-[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
+[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
 ## Recent API changes
- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
+- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
+- [Changelog for `llama-server` REST API](https://github.com/ggml-org/llama.cpp/issues/9291)
 ## Hot topics
- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
+- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
 - Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
 ----
 ## Quick start
 Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
 - Install `llama.cpp` using [brew, nix or winget](docs/install.md)
 - Run with Docker - see our [Docker documentation](docs/docker.md)
 - Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
 - Build from source by cloning this repository - check out [our build guide](docs/build.md)
 Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
 Example command:
 ```sh
 # Use a local model file
 llama-cli -m my_model.gguf
 # Or download and run a model directly from Hugging Face
 llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
 # Launch OpenAI-compatible API server
 llama-server -hf ggml-org/gemma-3-1b-it-GGUF
 ```
 ## Description
 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
-variety of hardware - locally and in the cloud.
+range of hardware - locally and in the cloud.
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
-Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
+The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggml-org/ggml) library.
 improved significantly thanks to many contributions. It is the main playground for developing new features for the
 [ggml](https://github.com/ggerganov/ggml) library.
-**Supported models:**
+<details>
 <summary>Models</summary>
 Typically finetunes of the base models below are supported as well.
 Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
 #### Text-only
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
 - [x] LLaMA 3 🦙🦙🦙
@ -53,22 +85,23 @@ Typically finetunes of the base models below are supported as well.
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
+- [X] [BERT](https://github.com/ggml-org/llama.cpp/pull/5423)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
+- [X] [Starcoder models](https://github.com/ggml-org/llama.cpp/pull/3187)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
+- [X] [MPT](https://github.com/ggml-org/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
+- [X] [Bloom](https://github.com/ggml-org/llama.cpp/pull/3553)
 - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
 - [X] [StableLM models](https://huggingface.co/stabilityai)
 - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
+- [x] [PLaMo-13B](https://github.com/ggml-org/llama.cpp/pull/3557)
 - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
 - [x] [PhiMoE](https://github.com/ggml-org/llama.cpp/pull/11003)
 - [x] [GPT-2](https://huggingface.co/gpt2)
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
+- [x] [Orion 14B](https://github.com/ggml-org/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
@ -79,6 +112,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
 - [x] [OLMo 2](https://allenai.org/olmo)
 - [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
 - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
@ -88,17 +122,20 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
 - [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
 - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
+- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
 - [x] [GLM-4-0414](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e)
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
 - [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
-(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
+#### Multimodal
 **Multimodal models:**
 - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
@ -109,9 +146,15 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
 - [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
 - [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
-**Bindings:**
+</details>
 <details>
 <summary>Bindings</summary>
 - Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
@ -123,6 +166,7 @@ Typically finetunes of the base models below are supported as well.
 - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
 - Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
@ -132,321 +176,341 @@ Typically finetunes of the base models below are supported as well.
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
+- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggml-org/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
 - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 - Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
-**UI:**
+</details>
-Unless otherwise noted these projects are open-source with permissive licensing:
+<details>
-
+<summary>UIs</summary>
 - [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
 - [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
 - [ramalama](https://github.com/containers/ramalama) (MIT)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
 - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
 - [ollama/ollama](https://github.com/ollama/ollama)
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
 - [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
 - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
 - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
 - [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
 - [RAGNA Desktop](https://ragna.app/) (proprietary)
 - [RecurseChat](https://recurse.chat/) (proprietary)
 - [semperai/amica](https://github.com/semperai/amica)
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
 - [Msty](https://msty.app) (proprietary)
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
 - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
 - [Dot](https://github.com/alexpinel/Dot) (GPL)
 - [MindMac](https://mindmac.app) (proprietary)
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
 - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
 - [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
 - [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
-**Tools:**
+- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
 - [Dot](https://github.com/alexpinel/Dot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
 - [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT)
 - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
 - [LARS](https://github.com/abgulati/LARS) (AGPL)
 - [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
 - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [MindMac](https://mindmac.app) (proprietary)
 - [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
 - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
 - [nat/openplayground](https://github.com/nat/openplayground) (MIT)
 - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
 - [ollama/ollama](https://github.com/ollama/ollama) (MIT)
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
 - [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
 - [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
 - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
 - [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
 - [ramalama](https://github.com/containers/ramalama) (MIT)
 - [semperai/amica](https://github.com/semperai/amica) (MIT)
 - [withcatai/catai](https://github.com/withcatai/catai) (MIT)
 - [Autopen](https://github.com/blackhole89/autopen) (GPL)
 </details>
 <details>
 <summary>Tools</summary>
 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
+- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
-**Infrastructure:**
+</details>
 <details>
 <summary>Infrastructure</summary>
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
 - [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
 - [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
 </details>
 <details>
 <summary>Games</summary>
 **Games:**
 - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
 ## Demo
 <details>
 <summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
 ```
 $ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
 I llama.cpp build info:
 I UNAME_S:  Darwin
 I UNAME_P:  arm
 I UNAME_M:  arm64
 I CFLAGS:   -I.            -O3 -std=c11   -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
 I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
 I LDFLAGS:   -framework Accelerate
 I CC:       Apple clang version 14.0.3 (clang-1403.0.22.14.1)
 I CXX:      Apple clang version 14.0.3 (clang-1403.0.22.14.1)
 make: Nothing to be done for `default'.
 main: build = 1041 (cf658ad)
 main: seed  = 1692823051
 llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
 llama_model_loader: - type  f32:   81 tensors
 llama_model_loader: - type q4_0:  281 tensors
 llama_model_loader: - type q6_K:    1 tensors
 llm_load_print_meta: format         = GGUF V1 (latest)
 llm_load_print_meta: arch           = llama
 llm_load_print_meta: vocab type     = SPM
 llm_load_print_meta: n_vocab        = 32000
 llm_load_print_meta: n_merges       = 0
 llm_load_print_meta: n_ctx_train    = 4096
 llm_load_print_meta: n_ctx          = 512
 llm_load_print_meta: n_embd         = 5120
 llm_load_print_meta: n_head         = 40
 llm_load_print_meta: n_head_kv      = 40
 llm_load_print_meta: n_layer        = 40
 llm_load_print_meta: n_rot          = 128
 llm_load_print_meta: n_gqa          = 1
 llm_load_print_meta: f_norm_eps     = 1.0e-05
 llm_load_print_meta: f_norm_rms_eps = 1.0e-05
 llm_load_print_meta: n_ff           = 13824
 llm_load_print_meta: freq_base      = 10000.0
 llm_load_print_meta: freq_scale     = 1
 llm_load_print_meta: model type     = 13B
 llm_load_print_meta: model ftype    = mostly Q4_0
 llm_load_print_meta: model size     = 13.02 B
 llm_load_print_meta: general.name   = LLaMA v2
 llm_load_print_meta: BOS token = 1 '<s>'
 llm_load_print_meta: EOS token = 2 '</s>'
 llm_load_print_meta: UNK token = 0 '<unk>'
 llm_load_print_meta: LF token  = 13 '<0x0A>'
 llm_load_tensors: ggml ctx size =    0.11 MB
 llm_load_tensors: mem required  = 7024.01 MB (+  400.00 MB per state)
 ...................................................................................................
 llama_new_context_with_model: kv self size  =  400.00 MB
 llama_new_context_with_model: compute buffer total size =   75.41 MB
 system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
 sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
 generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
 Building a website can be done in 10 simple steps:
 Step 1: Find the right website platform.
 Step 2: Choose your domain name and hosting plan.
 Step 3: Design your website layout.
 Step 4: Write your website content and add images.
 Step 5: Install security features to protect your site from hackers or spammers
 Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
 Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine!
 Step 8: Start marketing and promoting the website via social media channels or paid ads
 Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
 Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
 How does a Website Work?
 A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
 The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
 How to
 llama_print_timings:        load time =   576.45 ms
 llama_print_timings:      sample time =   283.10 ms /   400 runs   (    0.71 ms per token,  1412.91 tokens per second)
 llama_print_timings: prompt eval time =   599.83 ms /    19 tokens (   31.57 ms per token,    31.68 tokens per second)
 llama_print_timings:        eval time = 24513.59 ms /   399 runs   (   61.44 ms per token,    16.28 tokens per second)
 llama_print_timings:       total time = 25431.49 ms
 ```
 </details>
 <details>
 <summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>
 And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
 https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
 </details>
 ## Usage
 Here are the end-to-end binary build and model conversion steps for most supported models.
 ### Basic usage
 Firstly, you need to get the binary. There are different methods that you can follow:
 - Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
 - Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
 - Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
 - Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
 You can run a basic completion using this command:
 ```bash
 llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
 # Output:
 # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
 ```
 See [this page](./examples/main/README.md) for a full list of parameters.
 ### Conversation mode
 If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
 ```bash
 llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
 # Output:
 # > hi, who are you?
 # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
 #
 # > what is 1+1?
 # Easy peasy! The answer to 1+1 is... 2!
 ```
 By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
 ```bash
 ./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
 ```
 You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
 ```bash
 ./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
 ```
 ### Web server
 [llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
 Example usage:
 ```bash
 ./llama-server -m your_model.gguf --port 8080
 # Basic web UI can be accessed via browser: http://localhost:8080
 # Chat completion endpoint: http://localhost:8080/v1/chat/completions
 ```
 ### Interactive mode
 > [!NOTE]
 > If you prefer basic usage, please consider using conversation mode instead of interactive mode
 In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
 Here is an example of a few-shot interaction, invoked with the command
 ```bash
 # default arguments using a 7B model
 ./examples/chat.sh
 # advanced chat with a 13B model
 ./examples/chat-13B.sh
 # custom arguments using a 13B model
 ./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
 Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
 ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
 ### Persistent Interaction
 The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
 ```bash
 # Start a new chat
 PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
 # Resume that chat
 PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
 # Start a different chat with the same prompt/model
 PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
 # Different prompt cache for different prompt/model
 PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
    CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
 ```
 ### Constrained output with grammars
 `llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
 ```bash
 ./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
 ```
 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
 For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
 ## Build
 Please refer to [Build llama.cpp locally](./docs/build.md)
 ## Supported backends
 | Backend | Target devices |
 | --- | --- |
-| [Metal](./docs/build.md#metal-build) | Apple Silicon |
+| [Metal](docs/build.md#metal-build) | Apple Silicon |
-| [BLAS](./docs/build.md#blas-build) | All |
+| [BLAS](docs/build.md#blas-build) | All |
-| [BLIS](./docs/backend/BLIS.md) | All |
+| [BLIS](docs/backend/BLIS.md) | All |
-| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads GPU |
-| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
+| [CUDA](docs/build.md#cuda) | Nvidia GPU |
-| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
+| [HIP](docs/build.md#hip) | AMD GPU |
-| [Vulkan](./docs/build.md#vulkan) | GPU |
+| [Vulkan](docs/build.md#vulkan) | GPU |
-| [CANN](./docs/build.md#cann) | Ascend NPU |
+| [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
-## Tools
+## Obtaining and quantizing models
-### Prepare and Quantize
+The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
-> [!NOTE]
+- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
-> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
+- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
-To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
-Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
+```sh
-It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
 ```
-To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
+By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
-### Perplexity (measuring model quality)
+After downloading a model, use the CLI tools to run it locally - see below.
-You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
+`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
-For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
+
 The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
 - Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
 - Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123)
 - Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
 - Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)
 To learn more about model quantization, [read this documentation](tools/quantize/README.md)
 ## [`llama-cli`](tools/main)
 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
 - <details open>
    <summary>Run in conversation mode</summary>
    Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME`
    ```bash
    llama-cli -m model.gguf
    # > hi, who are you?
    # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
    #
    # > what is 1+1?
    # Easy peasy! The answer to 1+1 is... 2!
    ```
    </details>
 - <details>
    <summary>Run in conversation mode with custom chat template</summary>
    ```bash
    # use the "chatml" template (use -h to see the list of supported templates)
    llama-cli -m model.gguf -cnv --chat-template chatml
    # use a custom template
    llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
    ```
    </details>
 - <details>
    <summary>Run simple text completion</summary>
    To disable conversation mode explicitly, use `-no-cnv`
    ```bash
    llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
    # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
    ```
    </details>
 - <details>
    <summary>Constrain the output with a custom grammar</summary>
    ```bash
    llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
    # {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
    ```
    The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md).
    For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
    </details>
 ## [`llama-server`](tools/server)
 #### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
 - <details open>
    <summary>Start a local HTTP server with default configuration on port 8080</summary>
    ```bash
    llama-server -m model.gguf --port 8080
    # Basic web UI can be accessed via browser: http://localhost:8080
    # Chat completion endpoint: http://localhost:8080/v1/chat/completions
    ```
    </details>
 - <details>
    <summary>Support multiple-users and parallel decoding</summary>
    ```bash
    # up to 4 concurrent requests, each with 4096 max context
    llama-server -m model.gguf -c 16384 -np 4
    ```
    </details>
 - <details>
    <summary>Enable speculative decoding</summary>
    ```bash
    # the draft.gguf model should be a small variant of the target model.gguf
    llama-server -m model.gguf -md draft.gguf
    ```
    </details>
 - <details>
    <summary>Serve an embedding model</summary>
    ```bash
    # use the /embedding endpoint
    llama-server -m model.gguf --embedding --pooling cls -ub 8192
    ```
    </details>
 - <details>
    <summary>Serve a reranking model</summary>
    ```bash
    # use the /reranking endpoint
    llama-server -m model.gguf --reranking
    ```
    </details>
 - <details>
    <summary>Constrain all outputs with a grammar</summary>
    ```bash
    # custom grammar
    llama-server -m model.gguf --grammar-file grammar.gbnf
    # JSON
    llama-server -m model.gguf --grammar-file grammars/json.gbnf
    ```
    </details>
 ## [`llama-perplexity`](tools/perplexity)
 #### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
 - <details open>
    <summary>Measure the perplexity over a text file</summary>
    ```bash
    llama-perplexity -m model.gguf -f file.txt
    # [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ...
    # Final estimate: PPL = 5.4007 +/- 0.67339
    ```
    </details>
 - <details>
    <summary>Measure KL divergence</summary>
    ```bash
    # TODO
    ```
    </details>
 [^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
 ## [`llama-bench`](tools/llama-bench)
 #### Benchmark the performance of the inference for various parameters.
 - <details open>
    <summary>Run default benchmark</summary>
    ```bash
    llama-bench -m model.gguf
    # Output:
    # | model               |       size |     params | backend    | threads |          test |                  t/s |
    # | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |
    # | qwen2 1.5B Q4_0     | 885.97 MiB |     1.54 B | Metal,BLAS |      16 |         pp512 |      5765.41 ± 20.55 |
    # | qwen2 1.5B Q4_0     | 885.97 MiB |     1.54 B | Metal,BLAS |      16 |         tg128 |        197.71 ± 0.81 |
    #
    # build: 3e0ba0e60 (4229)
    ```
    </details>
 ## [`llama-run`](tools/run)
 #### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
 - <details>
    <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
    ```bash
    llama-run granite-code
    ```
    </details>
 [^3]: [RamaLama](https://github.com/containers/ramalama)
 ## [`llama-simple`](examples/simple)
 #### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
 - <details>
    <summary>Basic text completion</summary>
    ```bash
    llama-simple -m model.gguf
    # Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of
    ```
    </details>
 To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
 ## Contributing
@ -454,27 +518,26 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
 - Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
+- Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
 ## Other documentation
- [main (cli)](./examples/main/README.md)
+- [main (cli)](tools/main/README.md)
- [server](./examples/server/README.md)
+- [server](tools/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
+- [GBNF grammars](grammars/README.md)
 - [GBNF grammars](./grammars/README.md)
-**Development documentation**
+#### Development documentation
- [How to build](./docs/build.md)
+- [How to build](docs/build.md)
- [Running on Docker](./docs/docker.md)
+- [Running on Docker](docs/docker.md)
- [Build on Android](./docs/android.md)
+- [Build on Android](docs/android.md)
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
+- [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)
-**Seminal papers and background on the models**
+#### Seminal papers and background on the models
 If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
 - LLaMA:
@ -485,3 +548,56 @@ If your issue is with model generation quality, then please at least scan the fo
 - GPT-3.5 / InstructGPT / ChatGPT:
    - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
    - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
 ## XCFramework
 The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS,
 and macOS. It can be used in Swift projects without the need to compile the
 library from source. For example:
 ```swift
 // swift-tools-version: 5.10
 // The swift-tools-version declares the minimum version of Swift required to build this package.
 import PackageDescription
 let package = Package(
    name: "MyLlamaPackage",
    targets: [
        .executableTarget(
            name: "MyLlamaPackage",
            dependencies: [
                "LlamaFramework"
            ]),
        .binaryTarget(
            name: "LlamaFramework",
            url: "https://github.com/ggml-org/llama.cpp/releases/download/b5046/llama-b5046-xcframework.zip",
            checksum: "c19be78b5f00d8d29a25da41042cb7afa094cbf6280a225abe614b03b20029ab"
        )
    ]
 )
 ```
 The above example is using an intermediate build `b5046` of the library. This can be modified
 to use a different version by changing the URL and checksum.
 ## Completions
 Command-line completion is available for some environments.
 #### Bash Completion
 ```bash
 $ build/bin/llama-cli --completion-bash > ~/.llama-completion.bash
 $ source ~/.llama-completion.bash
 ```
 Optionally this can be added to your `.bashrc` or `.bash_profile` to load it
 automatically. For example:
 ```console
 $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 ```
 ## Dependencies
 - [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
 - [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
 - [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
 - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
 - [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
 - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
 - [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
--- a/SECURITY.md
+++ b/SECURITY.md
@ -40,7 +40,8 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
 ### Untrusted environments or networks
 If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
-* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
+* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
 * Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
 * Encrypt your data if sending it over the network.
 ### Multi-Tenant environments
@ -62,6 +63,6 @@ Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-
 <!-- normal version -->
 However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
-Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
+Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
 A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@ -0,0 +1,541 @@
 #!/usr/bin/env bash
 #
 # Options
 IOS_MIN_OS_VERSION=16.4
 MACOS_MIN_OS_VERSION=13.3
 VISIONOS_MIN_OS_VERSION=1.0
 TVOS_MIN_OS_VERSION=16.4
 BUILD_SHARED_LIBS=OFF
 LLAMA_BUILD_EXAMPLES=OFF
 LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
 GGML_METAL=ON
 GGML_METAL_EMBED_LIBRARY=ON
 GGML_BLAS_DEFAULT=ON
 GGML_METAL_USE_BF16=ON
 GGML_OPENMP=OFF
 COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
 COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g"
 # Common options for all builds
 COMMON_CMAKE_ARGS=(
    -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO
    -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY=""
    -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO
    -DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym"
    -DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES
    -DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO
    -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
    -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
    -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
    -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
    -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
    -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
    -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
    -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
    -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
    -DGGML_METAL=${GGML_METAL}
    -DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16}
    -DGGML_NATIVE=OFF
    -DGGML_OPENMP=${GGML_OPENMP}
 )
 XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
 MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
 MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
 echo "Detected Xcode version: $XCODE_VERSION"
 check_required_tool() {
    local tool=$1
    local install_message=$2
    if ! command -v $tool &> /dev/null; then
        echo "Error: $tool is required but not found."
        echo "$install_message"
        exit 1
    fi
 }
 echo "Checking for required tools..."
 check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)"
 check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
 check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)"
 check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
 set -e
 ## Clean up previous builds
 rm -rf build-apple
 rm -rf build-ios-sim
 rm -rf build-ios-device
 rm -rf build-macos
 rm -rf build-visionos
 rm -rf build-visionos-sim
 rm -rf build-tvos-sim
 rm -rf build-tvos-device
 # Setup the xcframework build directory structure
 setup_framework_structure() {
    local build_dir=$1
    local min_os_version=$2
    local platform=$3  # "ios", "macos", "visionos", or "tvos"
    local framework_name="llama"
    echo "Creating ${platform}-style framework structure for ${build_dir}"
    if [[ "$platform" == "macos" ]]; then
        # macOS versioned structure uses versioned directories
        mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers
        mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules
        mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources
        # Create symbolic links
        ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current
        ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers
        ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules
        ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources
        ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name}
        # Set header and module paths
        local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/
        local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/
    else
        # iOS/VisionOS/tvOS use a flat structure
        mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers
        mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules
        # Remove any existing structure to ensure clean build
        rm -rf ${build_dir}/framework/${framework_name}.framework/Versions
        # Set header and module paths
        local header_path=${build_dir}/framework/${framework_name}.framework/Headers/
        local module_path=${build_dir}/framework/${framework_name}.framework/Modules/
    fi
    # Copy all required headers (common for all platforms)
    cp include/llama.h             ${header_path}
    cp ggml/include/ggml.h         ${header_path}
    cp ggml/include/ggml-opt.h     ${header_path}
    cp ggml/include/ggml-alloc.h   ${header_path}
    cp ggml/include/ggml-backend.h ${header_path}
    cp ggml/include/ggml-metal.h   ${header_path}
    cp ggml/include/ggml-cpu.h     ${header_path}
    cp ggml/include/ggml-blas.h    ${header_path}
    cp ggml/include/gguf.h         ${header_path}
    # Create module map (common for all platforms)
    cat > ${module_path}module.modulemap << EOF
 framework module llama {
    header "llama.h"
    header "ggml.h"
    header "ggml-alloc.h"
    header "ggml-backend.h"
    header "ggml-metal.h"
    header "ggml-cpu.h"
    header "ggml-blas.h"
    header "gguf.h"
    link "c++"
    link framework "Accelerate"
    link framework "Metal"
    link framework "Foundation"
    export *
 }
 EOF
    # Platform-specific settings for Info.plist
    local platform_name=""
    local sdk_name=""
    local supported_platform=""
    case "$platform" in
        "ios")
            platform_name="iphoneos"
            sdk_name="iphoneos${min_os_version}"
            supported_platform="iPhoneOS"
            local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
            local device_family='    <key>UIDeviceFamily</key>
    <array>
        <integer>1</integer>
        <integer>2</integer>
    </array>'
            ;;
        "macos")
            platform_name="macosx"
            sdk_name="macosx${min_os_version}"
            supported_platform="MacOSX"
            local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist"
            local device_family=""
            ;;
        "visionos")
            platform_name="xros"
            sdk_name="xros${min_os_version}"
            supported_platform="XRPlatform"
            local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
            local device_family=""
            ;;
        "tvos")
            platform_name="appletvos"
            sdk_name="appletvos${min_os_version}"
            supported_platform="AppleTVOS"
            local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist"
            local device_family='    <key>UIDeviceFamily</key>
    <array>
        <integer>3</integer>
    </array>'
            ;;
    esac
    # Create Info.plist
    cat > ${plist_path} << EOF
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
 <plist version="1.0">
 <dict>
    <key>CFBundleDevelopmentRegion</key>
    <string>en</string>
    <key>CFBundleExecutable</key>
    <string>llama</string>
    <key>CFBundleIdentifier</key>
    <string>org.ggml.llama</string>
    <key>CFBundleInfoDictionaryVersion</key>
    <string>6.0</string>
    <key>CFBundleName</key>
    <string>llama</string>
    <key>CFBundlePackageType</key>
    <string>FMWK</string>
    <key>CFBundleShortVersionString</key>
    <string>1.0</string>
    <key>CFBundleVersion</key>
    <string>1</string>
    <key>MinimumOSVersion</key>
    <string>${min_os_version}</string>
    <key>CFBundleSupportedPlatforms</key>
    <array>
        <string>${supported_platform}</string>
    </array>${device_family}
    <key>DTPlatformName</key>
    <string>${platform_name}</string>
    <key>DTSDKName</key>
    <string>${sdk_name}</string>
 </dict>
 </plist>
 EOF
 }
 # Create dynamic libraries from static libraries.
 combine_static_libraries() {
    local build_dir="$1"
    local release_dir="$2"
    local platform="$3"  # "ios", "macos", "visionos", or "tvos"
    local is_simulator="$4"
    local base_dir="$(pwd)"
    local framework_name="llama"
    # Determine output path based on platform
    local output_lib=""
    if [[ "$platform" == "macos" ]]; then
        # macOS uses versioned structure
        output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}"
    else
        # iOS, visionOS, and tvOS use a directory flat structure
        output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}"
    fi
    local libs=(
        "${base_dir}/${build_dir}/src/${release_dir}/libllama.a"
        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a"
        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a"
        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
        "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
        "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
    )
    # Create temporary directory for processing
    local temp_dir="${base_dir}/${build_dir}/temp"
    mkdir -p "${temp_dir}"
    # Since we have multiple architectures libtool will find object files that do not
    # match the target architecture. We suppress these warnings.
    libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
    # Determine SDK, architectures, and install_name based on platform and simulator flag.
    local sdk=""
    local archs=""
    local min_version_flag=""
    local install_name=""
    case "$platform" in
        "ios")
            if [[ "$is_simulator" == "true" ]]; then
                sdk="iphonesimulator"
                archs="arm64 x86_64"
                min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}"
            else
                sdk="iphoneos"
                archs="arm64"
                min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}"
            fi
            install_name="@rpath/llama.framework/llama"
            ;;
        "macos")
            sdk="macosx"
            archs="arm64 x86_64"
            min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}"
            install_name="@rpath/llama.framework/Versions/Current/llama"
            ;;
        "visionos")
            if [[ "$is_simulator" == "true" ]]; then
                sdk="xrsimulator"
                archs="arm64 x86_64"
                min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator"
            else
                sdk="xros"
                archs="arm64"
                min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}"
            fi
            # Use flat structure for visionOS, same as iOS
            install_name="@rpath/llama.framework/llama"
            ;;
        "tvos")
            if [[ "$is_simulator" == "true" ]]; then
                sdk="appletvsimulator"
                archs="arm64 x86_64"
                min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}"
            else
                sdk="appletvos"
                archs="arm64"
                min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}"
            fi
            install_name="@rpath/llama.framework/llama"
            ;;
    esac
    # Build architecture flags
    local arch_flags=""
    for arch in $archs; do
        arch_flags+=" -arch $arch"
    done
    # Create dynamic library
    echo "Creating dynamic library for ${platform}."
    xcrun -sdk $sdk clang++ -dynamiclib \
        -isysroot $(xcrun --sdk $sdk --show-sdk-path) \
        $arch_flags \
        $min_version_flag \
        -Wl,-force_load,"${temp_dir}/combined.a" \
        -framework Foundation -framework Metal -framework Accelerate \
        -install_name "$install_name" \
        -o "${base_dir}/${output_lib}"
    # Platform-specific post-processing for device builds
    if [[ "$is_simulator" == "false" ]]; then
        if command -v xcrun vtool &>/dev/null; then
            case "$platform" in
                "ios")
                    echo "Marking binary as a framework binary for iOS..."
                    xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
                    ;;
                "visionos")
                    echo "Marking binary as a framework binary for visionOS..."
                    if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then
                        echo "Xcode version greater than 16.2, using visionOS."
                        VISION_OS_BUILD_VERSION="visionos"
                    else
                        echo "Xcode version less than or equal to 16.2, using xros."
                        VISION_OS_BUILD_VERSION="xros"
                    fi
                    xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
                    ;;
                "tvos")
                    echo "Marking binary as a framework binary for tvOS..."
                    xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
                    ;;
            esac
        else
            echo "Warning: vtool not found. Binary may not pass App Store validation."
        fi
    fi
    echo "Creating properly formatted dSYM..."
    # Create a separate directory for dSYMs for all platforms
    mkdir -p "${base_dir}/${build_dir}/dSYMs"
    # iOS and visionOS style dSYM (flat structure)
    if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then
        # Generate dSYM in the dSYMs directory
        xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
        # Create a copy of the binary that will be stripped
        cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip"
        # Strip debug symbols from the copy
        xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib"
        # Replace the original with the stripped version
        mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
    else
        # macOS style dSYM
        # First strip debug info to a separate file
        xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib"
        # Generate dSYM in the dSYMs directory
        xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM"
        # Replace original binary with stripped version
        mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}"
    fi
    # Remove any automatically generated dSYM files in the framework structure as they will
    # otherwise case Invalid Bundle Structure validation errors.
    if [ -d "${base_dir}/${output_lib}.dSYM" ]; then
        echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM"
        rm -rf "${base_dir}/${output_lib}.dSYM"
    fi
    # Clean up
    rm -rf "${temp_dir}"
 }
 echo "Building for iOS simulator..."
 cmake -B build-ios-sim -G Xcode \
    "${COMMON_CMAKE_ARGS[@]}" \
    -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
    -DIOS=ON \
    -DCMAKE_SYSTEM_NAME=iOS \
    -DCMAKE_OSX_SYSROOT=iphonesimulator \
    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-ios-sim --config Release -- -quiet
 echo "Building for iOS devices..."
 cmake -B build-ios-device -G Xcode \
    "${COMMON_CMAKE_ARGS[@]}" \
    -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
    -DCMAKE_OSX_SYSROOT=iphoneos \
    -DCMAKE_OSX_ARCHITECTURES="arm64" \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-ios-device --config Release -- -quiet
 echo "Building for macOS..."
 cmake -B build-macos -G Xcode \
    "${COMMON_CMAKE_ARGS[@]}" \
    -DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \
    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-macos --config Release -- -quiet
 echo "Building for visionOS..."
 cmake -B build-visionos -G Xcode \
    "${COMMON_CMAKE_ARGS[@]}" \
    -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
    -DCMAKE_OSX_ARCHITECTURES="arm64" \
    -DCMAKE_SYSTEM_NAME=visionOS \
    -DCMAKE_OSX_SYSROOT=xros \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-visionos --config Release -- -quiet
 echo "Building for visionOS simulator..."
 cmake -B build-visionos-sim -G Xcode \
    "${COMMON_CMAKE_ARGS[@]}" \
    -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \
    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
    -DCMAKE_SYSTEM_NAME=visionOS \
    -DCMAKE_OSX_SYSROOT=xrsimulator \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-visionos-sim --config Release -- -quiet
 # Add tvOS builds (might need the same u_int definitions as watchOS and visionOS)
 echo "Building for tvOS simulator..."
 cmake -B build-tvos-sim -G Xcode \
    "${COMMON_CMAKE_ARGS[@]}" \
    -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
    -DCMAKE_SYSTEM_NAME=tvOS \
    -DCMAKE_OSX_SYSROOT=appletvsimulator \
    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
    -DGGML_METAL=ON \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-tvos-sim --config Release -- -quiet
 echo "Building for tvOS devices..."
 cmake -B build-tvos-device -G Xcode \
    "${COMMON_CMAKE_ARGS[@]}" \
    -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \
    -DCMAKE_SYSTEM_NAME=tvOS \
    -DCMAKE_OSX_SYSROOT=appletvos \
    -DCMAKE_OSX_ARCHITECTURES="arm64" \
    -DGGML_METAL=ON \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-tvos-device --config Release -- -quiet
 # Setup frameworks and copy binaries and headers
 echo "Setting up framework structures..."
 setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios"
 setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios"
 setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos"
 setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos"
 setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos"
 setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos"
 setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos"
 # Create dynamic libraries from static libraries
 echo "Creating dynamic libraries from static libraries..."
 combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true"
 combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false"
 combine_static_libraries "build-macos" "Release" "macos" "false"
 combine_static_libraries "build-visionos" "Release-xros" "visionos" "false"
 combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true"
 combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true"
 combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"
 # Create XCFramework with correct debug symbols paths
 echo "Creating XCFramework..."
 xcodebuild -create-xcframework \
    -framework $(pwd)/build-ios-sim/framework/llama.framework \
    -debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \
    -framework $(pwd)/build-ios-device/framework/llama.framework \
    -debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \
    -framework $(pwd)/build-macos/framework/llama.framework \
    -debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \
    -framework $(pwd)/build-visionos/framework/llama.framework \
    -debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \
    -framework $(pwd)/build-visionos-sim/framework/llama.framework \
    -debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \
    -framework $(pwd)/build-tvos-device/framework/llama.framework \
    -debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \
    -framework $(pwd)/build-tvos-sim/framework/llama.framework \
    -debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \
    -output $(pwd)/build-apple/llama.xcframework
--- a/ci/README.md
+++ b/ci/README.md
@ -1,11 +1,11 @@
 # CI
-In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
+In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
 https://github.com/ggml-org/ci
 It monitors the `master` branch for new commits and runs the
-[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
 to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
 to cover various hardware architectures, including GPU and Apple Silicon instances.
@ -26,4 +26,43 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 # with SYCL support
 source /opt/intel/oneapi/setvars.sh
 GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 # with MUSA support
 GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```
 ## Running MUSA CI in a Docker Container
 Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
 ### 1. Create a local directory to store cached models, configuration files and venv:
 ```bash
 mkdir -p $HOME/llama.cpp/ci-cache
 ```
 ### 2. Create a local directory to store CI run results:
 ```bash
 mkdir -p $HOME/llama.cpp/ci-results
 ```
 ### 3. Start a Docker container and run the CI:
 ```bash
 docker run --privileged -it \
    -v $HOME/llama.cpp/ci-cache:/ci-cache \
    -v $HOME/llama.cpp/ci-results:/ci-results \
    -v $PWD:/ws -w /ws \
    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
 ```
 Inside the container, execute the following commands:
 ```bash
 apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
 git config --global --add safe.directory /ws
 GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
 ```
 This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
--- a/ci/run.sh
+++ b/ci/run.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 #
 # sample usage:
 #
@ -16,6 +16,9 @@
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
 # # with MUSA support
 # GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@ -36,14 +39,27 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`
-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
 fi
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
    if command -v nvidia-smi >/dev/null 2>&1; then
        CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
        if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
        else
            echo "Warning: Using fallback CUDA architectures"
            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
        fi
    else
        echo "Error: nvidia-smi not found, cannot build with CUDA"
        exit 1
    fi
 fi
 if [ ! -z ${GG_BUILD_SYCL} ]; then
@ -52,13 +68,24 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi
-
+    # Use only main GPU
    export ONEAPI_DEVICE_SELECTOR="level_zero:0"
    # Enable sysman for correct memory reporting
    export ZES_ENABLE_SYSMAN=1
    # to circumvent precision issues on CPY operations
    export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
 if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 fi
 if [ ! -z ${GG_BUILD_MUSA} ]; then
    # Use qy1 by default (MTT S80)
    MUSA_ARCH=${MUSA_ARCH:-21}
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
 fi
 ## helpers
 # download a file if it does not exist or if it is outdated
@ -173,8 +200,8 @@ function gg_run_test_scripts_debug {
    set -e
-    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
    set +e
 }
@ -197,8 +224,8 @@ function gg_run_test_scripts_release {
    set -e
-    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
    set +e
 }
@ -326,17 +353,17 @@ function gg_run_open_llama_7b_v2 {
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
-    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -352,10 +379,10 @@ function gg_run_open_llama_7b_v2 {
    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    function check_ppl {
        qnt="$1"
@ -460,17 +487,17 @@ function gg_run_pythia_1_4b {
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
-    (time ./bin/llama-cli --model ${model_f16}  -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -591,17 +618,17 @@ function gg_run_pythia_2_8b {
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
-    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -752,7 +779,7 @@ function gg_run_rerank_tiny {
    model_f16="${path_models}/ggml-model-f16.gguf"
    # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
    # sample output
    # rerank score 0:    0.029
@ -808,14 +835,17 @@ export LLAMA_LOG_PREFIX=1
 export LLAMA_LOG_TIMESTAMPS=1
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
+    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt
    # Create a fresh python3 venv and enter it
-    python3 -m venv "$MNT/venv"
+    if ! python3 -m venv "$MNT/venv"; then
        echo "Error: Failed to create Python virtual environment at $MNT/venv."
        exit 1
    fi
    source "$MNT/venv/bin/activate"
    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
@ -823,8 +853,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi
 ret=0
-
+if [ -z ${GG_BUILD_SYCL} ]; then
-test $ret -eq 0 && gg_run ctest_debug
+    # SYCL build breaks with debug build flags
    test $ret -eq 0 && gg_run ctest_debug
 fi
 test $ret -eq 0 && gg_run ctest_release
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@ -832,7 +864,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run rerank_tiny
    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-        test $ret -eq 0 && gg_run test_scripts_debug
+        if [ -z ${GG_BUILD_SYCL} ]; then
            test $ret -eq 0 && gg_run test_scripts_debug
        fi
        test $ret -eq 0 && gg_run test_scripts_release
    fi
@ -843,7 +877,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
            test $ret -eq 0 && gg_run pythia_2_8b
            #test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
-        test $ret -eq 0 && gg_run ctest_with_model_debug
+        if [ -z ${GG_BUILD_SYCL} ]; then
            test $ret -eq 0 && gg_run ctest_with_model_debug
        fi
        test $ret -eq 0 && gg_run ctest_with_model_release
    fi
 fi
--- a/cmake/arm64-windows-msvc.cmake
+++ b/cmake/arm64-windows-msvc.cmake
@ -1,6 +0,0 @@
 set( CMAKE_SYSTEM_NAME Windows )
 set( CMAKE_SYSTEM_PROCESSOR arm64 )
 set( target arm64-pc-windows-msvc )
 set( CMAKE_C_COMPILER_TARGET   ${target} )
 set( CMAKE_CXX_COMPILER_TARGET ${target} )
--- a/cmake/build-info.cmake
+++ b/cmake/build-info.cmake
@ -41,14 +41,20 @@ endif()
 if(MSVC)
    set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+    if (CMAKE_VS_PLATFORM_NAME)
        set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
    else()
        set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
    endif()
 else()
    execute_process(
-        COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
+        COMMAND ${CMAKE_C_COMPILER} --version
        OUTPUT_VARIABLE OUT
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
    set(BUILD_COMPILER ${OUT})
    execute_process(
        COMMAND ${CMAKE_C_COMPILER} -dumpmachine
        OUTPUT_VARIABLE OUT
--- a/cmake/common.cmake
+++ b/cmake/common.cmake
@ -0,0 +1,35 @@
 include("ggml/cmake/common.cmake")
 function(llama_add_compile_flags)
    if (LLAMA_FATAL_WARNINGS)
        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
            list(APPEND C_FLAGS   -Werror)
            list(APPEND CXX_FLAGS -Werror)
        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
            add_compile_options(/WX)
        endif()
    endif()
    if (LLAMA_ALL_WARNINGS)
        if (NOT MSVC)
            list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
                                -Werror=implicit-int -Werror=implicit-function-declaration)
            list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
            list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
            list(APPEND C_FLAGS   ${WARNING_FLAGS})
            list(APPEND CXX_FLAGS ${WARNING_FLAGS})
            ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
            add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
                                "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
        else()
            # todo : msvc
            set(C_FLAGS   "" PARENT_SCOPE)
            set(CXX_FLAGS "" PARENT_SCOPE)
        endif()
    endif()
 endfunction()
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@ -3,159 +3,13 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
 set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)
 set(GGML_STATIC @GGML_STATIC@)
 set(GGML_NATIVE @GGML_NATIVE@)
 set(GGML_LTO    @GGML_LTO@)
 set(GGML_CCACHE @GGML_CCACHE@)
 set(GGML_AVX    @GGML_AVX@)
 set(GGML_AVX2   @GGML_AVX2@)
 set(GGML_AVX512 @GGML_AVX512@)
 set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@)
 set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@)
 set(GGML_AVX512_BF16 @GGML_AVX512_BF16@)
 set(GGML_AMX_TILE @GGML_AMX_TILE@)
 set(GGML_AMX_INT8 @GGML_AMX_INT8@)
 set(GGML_AMX_BF16 @GGML_AMX_BF16@)
 set(GGML_FMA  @GGML_FMA@)
 set(GGML_LASX @GGML_LASX@)
 set(GGML_LSX  @GGML_LSX@)
 set(GGML_RVV  @GGML_RVV@)
 set(GGML_SVE  @GGML_SVE@)
 set(GGML_ACCELERATE @GGML_ACCELERATE@)
 set(GGML_OPENMP  @GGML_OPENMP@)
 set(GGML_CPU_HBM @GGML_CPU_HBM@)
 set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@)
 set(GGML_CUDA_FORCE_MMQ    @GGML_CUDA_FORCE_MMQ@)
 set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@)
 set(GGML_CUDA_F16          @GGML_CUDA_F16@)
 set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@)
 set(GGML_CUDA_NO_PEER_COPY  @GGML_CUDA_NO_PEER_COPY@)
 set(GGML_CUDA_NO_VMM        @GGML_CUDA_NO_VMM@)
 set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@)
 set(GGML_CUDA_GRAPHS        @GGML_CUDA_GRAPHS@)
 set(GGML_HIP_UMA @GGML_HIP_UMA@)
 set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
 set(GGML_VULKAN_DEBUG         @GGML_VULKAN_DEBUG@)
 set(GGML_VULKAN_MEMORY_DEBUG  @GGML_VULKAN_MEMORY_DEBUG@)
 set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@)
 set(GGML_VULKAN_PERF      @GGML_VULKAN_PERF@)
 set(GGML_VULKAN_VALIDATE  @GGML_VULKAN_VALIDATE@)
 set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@)
 set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@)
 set(GGML_METAL_NDEBUG   @GGML_METAL_NDEBUG@)
 set(GGML_METAL_SHADER_DEBUG  @GGML_METAL_SHADER_DEBUG@)
 set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@)
 set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@)
 set(GGML_METAL_STD @GGML_METAL_STD@)
 set(GGML_SYCL_F16    @GGML_SYCL_F16@)
 set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@)
 set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@)
@PACKAGE_INIT@
 set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
 set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
 set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
-find_package(Threads REQUIRED)
+find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
 set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
 set(_llama_link_deps "")
 set(_llama_link_opts "")
 foreach(_ggml_lib ggml ggml-base)
    string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
    find_library(${_ggml_lib_var} ${_ggml_lib}
        REQUIRED
        HINTS ${LLAMA_LIB_DIR}
        NO_CMAKE_FIND_ROOT_PATH
    )
    list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
    message(STATUS "Found ${${_ggml_lib_var}}")
 endforeach()
 foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan)
    string(TOUPPER "GGML_${backend}" backend_id)
    set(_ggml_lib "ggml-${backend}")
    string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
    find_library(${_ggml_lib_var} ${_ggml_lib}
        HINTS ${LLAMA_LIB_DIR}
        NO_CMAKE_FIND_ROOT_PATH
    )
    if(${_ggml_lib_var})
        list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
        set(${backend_id} ON)
        message(STATUS "Found backend ${${_ggml_lib_var}}")
    else()
        set(${backend_id} OFF)
    endif()
 endforeach()
 if (NOT LLAMA_SHARED_LIB)
    if (APPLE AND GGML_ACCELERATE)
        find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
        list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK})
    endif()
    if (GGML_OPENMP)
        find_package(OpenMP REQUIRED)
        list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
    endif()
    if (GGML_CPU_HBM)
        find_library(memkind memkind REQUIRED)
        list(APPEND _llama_link_deps memkind)
    endif()
    if (GGML_BLAS)
        find_package(BLAS REQUIRED)
        list(APPEND _llama_link_deps ${BLAS_LIBRARIES})
        list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS})
    endif()
    if (GGML_CUDA)
        find_package(CUDAToolkit REQUIRED)
    endif()
    if (GGML_METAL)
        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
        find_library(METAL_FRAMEWORK    Metal REQUIRED)
        find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
        list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY}
                                     ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
    endif()
    if (GGML_VULKAN)
        find_package(Vulkan REQUIRED)
        list(APPEND _llama_link_deps Vulkan::Vulkan)
    endif()
    if (GGML_HIP)
        find_package(hip     REQUIRED)
        find_package(hipblas REQUIRED)
        find_package(rocblas REQUIRED)
        list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas)
    endif()
    if (GGML_SYCL)
        find_package(DNNL)
        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
            list(APPEND _llama_link_deps DNNL::dnnl)
        endif()
        if (WIN32)
            find_package(IntelSYCL REQUIRED)
            find_package(MKL       REQUIRED)
            list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
        endif()
    endif()
 endif()
 find_library(llama_LIBRARY llama
    REQUIRED
@ -167,12 +21,10 @@ add_library(llama UNKNOWN IMPORTED)
 set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
-        INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
+        INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
        INTERFACE_LINK_OPTIONS   "${_llama_link_opts}"
        INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${llama_LIBRARY}"
-        INTERFACE_COMPILE_FEATURES cxx_std_11
+        INTERFACE_COMPILE_FEATURES c_std_90
-        POSITION_INDEPENDENT_CODE ON )
+        POSITION_INDEPENDENT_CODE ON)
 check_required_components(Llama)
--- a/cmake/llama.pc.in
+++ b/cmake/llama.pc.in
@ -1,10 +1,10 @@
 prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${prefix}
+exec_prefix=@CMAKE_INSTALL_PREFIX@
-libdir=${exec_prefix}/lib
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
-includedir=${prefix}/include
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 Name: llama
 Description: Port of Facebook's LLaMA model in C/C++
-Version: @PROJECT_VERSION@
+Version: @LLAMA_INSTALL_VERSION@
-Libs: -L${libdir} -lllama
+Libs: -L${libdir} -lggml -lggml-base -lllama
 Cflags: -I${includedir}
--- a/cmake/x64-windows-llvm.cmake
+++ b/cmake/x64-windows-llvm.cmake
@ -0,0 +1,5 @@
 set( CMAKE_SYSTEM_NAME Windows )
 set( CMAKE_SYSTEM_PROCESSOR x86_64 )
 set( CMAKE_C_COMPILER    clang )
 set( CMAKE_CXX_COMPILER  clang++ )
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -2,11 +2,13 @@
 find_package(Threads REQUIRED)
 llama_add_compile_flags()
 # Build info header
 #
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+    set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
    # Is git submodule
    if(NOT IS_DIRECTORY "${GIT_DIR}")
@ -16,34 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
        if (SLASH_POS EQUAL 0)
            set(GIT_DIR "${REAL_GIT_DIR}")
        else()
-            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+            set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
        endif()
    endif()
    if(EXISTS "${GIT_DIR}/index")
-        set(GIT_INDEX "${GIT_DIR}/index")
+        # For build-info.cpp below
        set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
    else()
        message(WARNING "Git index not found in git repository.")
        set(GIT_INDEX "")
    endif()
 else()
    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
    set(GIT_INDEX "")
 endif()
-# Add a custom command to rebuild build-info.cpp when .git/index changes
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
-add_custom_command(
+set(OUTPUT_FILE   "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
-    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
+configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-    COMMENT "Generating build details from Git"
+
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
 )
 set(TARGET build_info)
-add_library(${TARGET} OBJECT build-info.cpp)
+add_library(${TARGET} OBJECT ${OUTPUT_FILE})
 if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
@ -54,18 +48,28 @@ add_library(${TARGET} STATIC
    arg.cpp
    arg.h
    base64.hpp
    chat-parser.cpp
    chat-parser.h
    chat.cpp
    chat.h
    common.cpp
    common.h
    console.cpp
    console.h
    json-partial.cpp
    json-partial.h
    json-schema-to-grammar.cpp
-    json.hpp
+    llguidance.cpp
    log.cpp
    log.h
    ngram-cache.cpp
    ngram-cache.h
    regex-partial.cpp
    regex-partial.h
    sampling.cpp
    sampling.h
    speculative.cpp
    speculative.h
    )
 if (BUILD_SHARED_LIBS)
@ -76,13 +80,84 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
 # Use curl to download model url
 if (LLAMA_CURL)
-    find_package(CURL REQUIRED)
+    find_package(CURL)
-    add_definitions(-DLLAMA_USE_CURL)
+    if (NOT CURL_FOUND)
        message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
    endif()
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
    find_library(CURL_LIBRARY curl REQUIRED)
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
 endif ()
-target_include_directories(${TARGET} PUBLIC .)
+if (LLAMA_LLGUIDANCE)
-target_compile_features   (${TARGET} PUBLIC cxx_std_11)
+    include(ExternalProject)
    set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
    set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
    # Set the correct library file extension based on platform
    if (WIN32)
        set(LLGUIDANCE_LIB_NAME "llguidance.lib")
        # Add Windows-specific libraries
        set(LLGUIDANCE_PLATFORM_LIBS
            ws2_32    # Windows Sockets API
            userenv   # For GetUserProfileDirectoryW
            ntdll     # For NT functions
            bcrypt    # For BCryptGenRandom
        )
    else()
        set(LLGUIDANCE_LIB_NAME "libllguidance.a")
        set(LLGUIDANCE_PLATFORM_LIBS "")
    endif()
    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
        # v0.7.20 (+ fix to build on GCC 15):
        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND cargo build --release
        INSTALL_COMMAND ""
        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
        UPDATE_COMMAND ""
    )
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
    add_library(llguidance STATIC IMPORTED)
    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME})
    add_dependencies(llguidance llguidance_ext)
    target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
    # Add platform libraries to the main target
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
 endif ()
 target_include_directories(${TARGET} PUBLIC . ../vendor)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
 #
 # copy the license files
 #
 # Check if running in GitHub Actions
 if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
    message(STATUS "Running inside GitHub Actions - copying license files")
    # Copy all files from licenses/ to build/bin/
    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
    foreach(LICENSE_FILE ${LICENSE_FILES})
        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
        add_custom_command(
            POST_BUILD
            TARGET ${TARGET}
            COMMAND ${CMAKE_COMMAND} -E copy_if_different
                "${LICENSE_FILE}"
                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
    endforeach()
 endif()
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@ -12,6 +12,7 @@
 struct common_arg {
    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
    std::set<enum llama_example> excludes = {};
    std::vector<const char *> args;
    const char * value_hint   = nullptr; // help text or example for arg value
    const char * value_hint_2 = nullptr; // for second arg value
@ -53,9 +54,11 @@ struct common_arg {
    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
    common_arg & set_env(const char * env);
    common_arg & set_sparam();
    bool in_example(enum llama_example ex);
    bool is_exclude(enum llama_example ex);
    bool get_value_from_env(std::string & output);
    bool has_value_from_env();
    std::string to_string();
@ -75,3 +78,12 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
 // function to be used by test-arg-parser
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 bool common_has_curl();
 struct common_remote_params {
    std::vector<std::string> headers;
    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
 };
 // get remote file content, returns <http_code, raw_response_body>
 std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
--- a/common/build-info.cpp.in
+++ b/common/build-info.cpp.in
@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
+int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
-char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
+char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
 char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
 char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@ -0,0 +1,385 @@
 #include "chat-parser.h"
 #include "common.h"
 #include "log.h"
 #include "regex-partial.h"
 #include <optional>
 #include <stdexcept>
 #include <string>
 #include <vector>
 using json = nlohmann::ordered_json;
 common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
    : input_(input), is_partial_(is_partial), syntax_(syntax)
 {
    result_.role = "assistant";
    while (true) {
        std::string id = std::to_string(std::rand());
        if (input.find(id) == std::string::npos) {
            healing_marker_ = id;
            break;
        }
    }
 }
 std::string common_chat_msg_parser::str(const common_string_range & rng) const {
    GGML_ASSERT(rng.begin <= rng.end);
    return input_.substr(rng.begin, rng.end - rng.begin);
 }
 void common_chat_msg_parser::add_content(const std::string &content) {
    result_.content += content;
 }
 void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
    result_.reasoning_content += reasoning_content;
 }
 bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
    if (name.empty()) {
        return false;
    }
    common_chat_tool_call tool_call;
    tool_call.name = name;
    tool_call.arguments = arguments;
    tool_call.id = id;
    // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
    result_.tool_calls.emplace_back(tool_call);
    return true;
 }
 bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
    std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
    std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
    std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
    return add_tool_call(name, id, arguments);
 }
 bool common_chat_msg_parser::add_tool_calls(const json & arr) {
    for (const auto & item : arr) {
        if (!add_tool_call(item)) {
            return false;
        }
    }
    return true;
 }
 void common_chat_msg_parser::finish() {
    if (!is_partial_ && pos_ != input_.size()) {
        throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
    }
 }
 bool common_chat_msg_parser::consume_spaces() {
    const auto length = input_.size();
    auto consumed = false;
    while (pos_ < length && std::isspace(input_[pos_])) {
        ++pos_;
        consumed = true;
    }
    return consumed;
 }
 bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
    auto pos = pos_;
    for (auto i = 0u; i < literal.size(); ++i) {
        if (pos >= input_.size()) {
            return false;
        }
        if (input_[pos] != literal[i]) {
            return false;
        }
        ++pos;
    }
    pos_ = pos;
    return true;
 }
 std::optional<common_chat_msg_parser::find_regex_result>  common_chat_msg_parser::try_find_literal(const std::string & literal) {
    auto idx = input_.find(literal, pos_);
    if (idx != std::string::npos) {
        find_regex_result res;
        res.prelude = input_.substr(pos_, idx - pos_);
        auto end = idx + literal.size();
        res.groups.emplace_back(common_string_range{idx, end});
        move_to(end);
        return res;
    }
    if (is_partial_) {
        idx = string_find_partial_stop(input_, literal);
        if (idx != std::string::npos && idx >= pos_) {
            find_regex_result res;
            res.prelude = input_.substr(pos_, idx - pos_);
            auto end = input_.size();
            res.groups.emplace_back(common_string_range{idx, end});
            move_to(end);
            return res;
        }
    }
    return std::nullopt;
 }
 void common_chat_msg_parser::consume_literal(const std::string & literal) {
    if (!try_consume_literal(literal)) {
        throw common_chat_msg_partial_exception(literal);
    }
 }
 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
    auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
        auto stripped_reasoning = string_strip(reasoning);
        if (stripped_reasoning.empty()) {
            return;
        }
        if (syntax_.reasoning_in_content) {
            add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
            add_content(stripped_reasoning);
            if (closed) {
                add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
            }
        } else {
            add_reasoning_content(stripped_reasoning);
        }
    };
    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
            if (auto res = try_find_literal(end_think)) {
                handle_reasoning(res->prelude, /* closed */ true);
                consume_spaces();
                return true;
            }
            auto rest = consume_rest();
            if (!rest.empty()) {
                handle_reasoning(rest, /* closed */ !is_partial());
            }
            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
            // if (!syntax_.thinking_forced_open) {
            //     throw common_chat_msg_partial_exception(end_think);
            // }
            return true;
        }
    }
    return false;
 }
 std::string common_chat_msg_parser::consume_rest() {
    auto rest = input_.substr(pos_);
    pos_ = input_.size();
    return rest;
 }
 // Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
 std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
    auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
        return std::nullopt;
    }
    auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
    pos_ = m.groups[0].end;
    if (add_prelude_to_content) {
        add_content(prelude);
    }
    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
        if (is_partial()) {
            throw common_chat_msg_partial_exception(regex.str());
        }
        return std::nullopt;
    }
    return find_regex_result{prelude, m.groups};
 }
 common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
    if (auto result = try_consume_regex(regex)) {
        return *result;
    }
    throw common_chat_msg_partial_exception(regex.str());
 }
 std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
    auto m = regex.search(input_, pos_);
    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
        return std::nullopt;
    }
    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
        if (is_partial()) {
            throw common_chat_msg_partial_exception(regex.str());
        }
        return std::nullopt;
    }
    if (m.groups[0].begin != pos_) {
        // Didn't match at the current position.
        return std::nullopt;
    }
    pos_ = m.groups[0].end;
    return find_regex_result {
        /* .prelude = */ "",
        m.groups,
    };
 }
 std::optional<common_json> common_chat_msg_parser::try_consume_json() {
    auto it = input_.cbegin() + pos_;
    const auto end = input_.cend();
    common_json result;
    if (!common_json_parse(it, end, healing_marker_, result)) {
        return std::nullopt;
    }
    pos_ = std::distance(input_.cbegin(), it);
    if (result.healing_marker.marker.empty()) {
        // No healing marker, just return the parsed json
        return result;
    }
    if (!is_partial()) {
        throw common_chat_msg_partial_exception("JSON");
    }
    return result;
 }
 common_json common_chat_msg_parser::consume_json() {
    if (auto result = try_consume_json()) {
        return *result;
    }
    throw common_chat_msg_partial_exception("JSON");
 }
 common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
    const std::vector<std::vector<std::string>> & args_paths,
    const std::vector<std::vector<std::string>> & content_paths
 ) {
    if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
        return *result;
    }
    throw common_chat_msg_partial_exception("JSON");
 }
 std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
    const std::vector<std::vector<std::string>> & args_paths,
    const std::vector<std::vector<std::string>> & content_paths
 ) {
    auto partial = try_consume_json();
    if (!partial) {
        return std::nullopt;
    }
    auto is_arguments_path = [&](const std::vector<std::string> & path) {
        return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
    };
    auto is_content_path = [&](const std::vector<std::string> & path) {
        return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
    };
    if (partial->healing_marker.marker.empty()) {
        if (args_paths.empty()) {
            // No arguments to dump, and JSON was parsed fully.
            return consume_json_result {
                partial->json,
                /* .is_partial = */ false,
            };
        }
        if (is_arguments_path({})) {
            // Entire JSON is the arguments and was parsed fully.
            return consume_json_result {
                partial->json.dump(),
                /* .is_partial = */ false,
            };
        }
    }
    LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
    auto found_healing_marker = false;
    std::vector<std::string> path;
    std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
        if (is_arguments_path(path)) {
            auto arguments = j.dump();
            if (is_partial() && !partial->healing_marker.marker.empty()) {
                auto idx = arguments.find(partial->healing_marker.json_dump_marker);
                if (idx != std::string::npos) {
                    arguments.resize(idx);
                    found_healing_marker = true;
                }
                if (arguments == "\"") {
                    // This happens because of completing `:"$magic` after `"arguments"`
                    arguments = "";
                }
            }
            return arguments;
        }
        if (is_content_path(path)) {
            if (!j.is_string()) {
                throw std::runtime_error("Content path must be a string");
            }
            std::string str = j;
            auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
            if (idx != std::string::npos) {
                str.resize(idx);
                found_healing_marker = true;
            }
            return str;
        }
        if (j.is_object()) {
            auto obj = json::object();
            for (const auto & p : j.items()) {
                const auto & key = p.key();
                const auto & value = p.value();
                const std::string key_str = key; // NOLINT
                auto idx = key_str.find(healing_marker_);
                if (idx != std::string::npos) {
                    found_healing_marker = true;
                    break;
                }
                path.push_back(key_str);
                if (value.is_string()) {
                    const std::string value_str = value;
                    if (value_str.find(healing_marker_) != std::string::npos) {
                        found_healing_marker = true;
                        if (is_content_path(path)) {
                            if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
                                // The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
                                obj[key] = remove_unsupported_healings_and_dump_args(value);
                            }
                        }
                        break;
                    }
                    obj[key] = value;
                } else {
                    obj[key] = remove_unsupported_healings_and_dump_args(value);
                }
                path.pop_back();
            }
            return obj;
        }
        if (j.is_array()) {
            auto arr = json::array();
            for (const auto & value : j) {
                if (value.is_string()) {
                    std::string str = value;
                    auto idx = str.find(healing_marker_);
                    if (idx != std::string::npos) {
                        // Don't heal array values that aren't in the arguments.
                        found_healing_marker = true;
                        break;
                    }
                }
                arr.push_back(remove_unsupported_healings_and_dump_args(value));
            }
            return arr;
        }
        return j;
    };
    auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
    LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
    return consume_json_result {
        cleaned,
        /* .is_partial = */ found_healing_marker,
    };
 }
 void common_chat_msg_parser::clear_tools() {
    result_.tool_calls.clear();
 }
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@ -0,0 +1,120 @@
 #pragma once
 #include "chat.h"
 #include "json-partial.h"
 #include "regex-partial.h"
 #include <nlohmann/json.hpp>
 #include <optional>
 #include <string>
 #include <vector>
 class common_chat_msg_partial_exception : public std::runtime_error {
  public:
    common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
 };
 class common_chat_msg_parser {
    std::string input_;
    bool is_partial_;
    common_chat_syntax syntax_;
    std::string healing_marker_;
    size_t pos_ = 0;
    common_chat_msg result_;
  public:
    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
    const std::string & input() const { return input_; }
    size_t pos() const { return pos_; }
    const std::string & healing_marker() const { return healing_marker_; }
    const bool & is_partial() const { return is_partial_; }
    const common_chat_msg & result() const { return result_; }
    const common_chat_syntax & syntax() const { return syntax_; }
    void move_to(size_t pos) {
        if (pos > input_.size()) {
            throw std::runtime_error("Invalid position!");
        }
        pos_ = pos;
    }
    void move_back(size_t n) {
        if (pos_ < n) {
            throw std::runtime_error("Can't move back that far!");
        }
        pos_ -= n;
    }
    // Get the substring of the input at the given range
    std::string str(const common_string_range & rng) const;
    // Appends to the result.content field
    void add_content(const std::string & content);
    // Appends to the result.reasoning_content field
    void add_reasoning_content(const std::string & reasoning_content);
    // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
    bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
    // Adds a tool call using the "name", "id" and "arguments" fields of the json object
    bool add_tool_call(const nlohmann::ordered_json & tool_call);
    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
    bool add_tool_calls(const nlohmann::ordered_json & arr);
    void finish();
    bool consume_spaces();
    void consume_literal(const std::string & literal);
    bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
    std::string consume_rest();
    struct find_regex_result {
        std::string prelude;
        std::vector<common_string_range> groups;
    };
    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
    bool try_consume_literal(const std::string & literal);
    std::optional<find_regex_result> try_find_literal(const std::string & literal);
    find_regex_result consume_regex(const common_regex & regex);
    std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
    std::optional<common_json> try_consume_json();
    common_json consume_json();
    struct consume_json_result {
        nlohmann::ordered_json value;
        bool is_partial;
    };
    /*
        Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.
        By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
        e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`
        But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
        - with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
        - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
    */
    consume_json_result consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
    std::optional<consume_json_result> try_consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
    void clear_tools();
 };
--- a/common/chat.cpp
+++ b/common/chat.cpp
--- a/common/chat.h
+++ b/common/chat.h
@ -0,0 +1,204 @@
 // Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
 #pragma once
 #include "common.h"
 #include <functional>
 #include <chrono>
 #include <string>
 #include <vector>
 #include <map>
 struct common_chat_templates;
 struct common_chat_tool_call {
    std::string name;
    std::string arguments;
    std::string id;
    bool operator==(const common_chat_tool_call & other) const {
        return name == other.name && arguments == other.arguments && id == other.id;
    }
 };
 struct common_chat_msg_content_part {
    std::string type;
    std::string text;
    bool operator==(const common_chat_msg_content_part & other) const {
        return type == other.type && text == other.text;
    }
 };
 struct common_chat_msg {
    std::string role;
    std::string content;
    std::vector<common_chat_msg_content_part> content_parts = {};
    std::vector<common_chat_tool_call> tool_calls = {};
    std::string reasoning_content;
    std::string tool_name;
    std::string tool_call_id;
    template <class T> T to_json_oaicompat() const;
    bool empty() const {
        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
    }
    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
            if (ids_cache.size() <= i) {
                auto id = tool_calls[i].id;
                if (id.empty()) {
                    id = gen_tool_call_id();
                }
                ids_cache.push_back(id);
            }
            tool_calls[i].id = ids_cache[i];
        }
    }
    bool operator==(const common_chat_msg & other) const {
        return role == other.role
            && content == other.content
            && content_parts == other.content_parts
            && tool_calls == other.tool_calls
            && reasoning_content == other.reasoning_content
            && tool_name == other.tool_name
            && tool_call_id == other.tool_call_id;
    }
    bool operator!=(const common_chat_msg & other) const {
        return !(*this == other);
    }
 };
 struct common_chat_msg_diff {
    std::string reasoning_content_delta;
    std::string content_delta;
    size_t tool_call_index = std::string::npos;
    common_chat_tool_call tool_call_delta;
    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
    bool operator==(const common_chat_msg_diff & other) const {
        return content_delta == other.content_delta
        && tool_call_index == other.tool_call_index
        && tool_call_delta == other.tool_call_delta;
    }
 };
 struct common_chat_tool {
    std::string name;
    std::string description;
    std::string parameters;
 };
 enum common_chat_tool_choice {
    COMMON_CHAT_TOOL_CHOICE_AUTO,
    COMMON_CHAT_TOOL_CHOICE_REQUIRED,
    COMMON_CHAT_TOOL_CHOICE_NONE,
 };
 enum common_chat_format {
    COMMON_CHAT_FORMAT_CONTENT_ONLY,
    COMMON_CHAT_FORMAT_GENERIC,
    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
    COMMON_CHAT_FORMAT_LLAMA_3_X,
    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
    COMMON_CHAT_FORMAT_COMMAND_R7B,
    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
 struct common_chat_templates_inputs {
    std::vector<common_chat_msg> messages;
    std::string grammar;
    std::string json_schema;
    bool add_generation_prompt = true;
    bool use_jinja = true;
    // Parameters below only supported when use_jinja is true
    std::vector<common_chat_tool> tools;
    common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
    bool parallel_tool_calls = false;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
    bool enable_thinking = true;
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
    std::map<std::string, std::string> chat_template_kwargs;
 };
 struct common_chat_params {
    common_chat_format                  format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    std::string                         prompt;
    std::string                         grammar;
    bool                                grammar_lazy = false;
    bool                                thinking_forced_open = false;
    std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
 };
 struct common_chat_syntax {
    common_chat_format       format                = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    common_reasoning_format  reasoning_format      = COMMON_REASONING_FORMAT_NONE;
    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
    bool                     reasoning_in_content  = false;
    bool                     thinking_forced_open  = false;
    bool                     parse_tool_calls      = true;
 };
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
 void common_chat_templates_free(struct common_chat_templates * tmpls);
 struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
 typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
 common_chat_templates_ptr common_chat_templates_init(
                                    const struct llama_model * model,
                                           const std::string & chat_template_override,
                                           const std::string & bos_token_override = "",
                                           const std::string & eos_token_override = "");
 bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
 const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
 struct common_chat_params      common_chat_templates_apply(
    const struct common_chat_templates * tmpls,
    const struct common_chat_templates_inputs & inputs);
 // Format single message, while taking into account the position of that message in chat history
 std::string common_chat_format_single(
        const struct common_chat_templates * tmpls,
        const std::vector<common_chat_msg> & past_msg,
        const common_chat_msg & new_msg,
        bool add_ass,
        bool use_jinja);
 // Returns an example of formatted chat
 std::string common_chat_format_example(
    const struct common_chat_templates * tmpls,
    bool use_jinja);
 const char*               common_chat_format_name(common_chat_format format);
 const char*               common_reasoning_format_name(common_reasoning_format format);
 common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
 template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
 // Parses a JSON array of tools in OpenAI's chat completion tool call API format.
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
 template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
 template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
--- a/common/cmake/build-info-gen-cpp.cmake
+++ b/common/cmake/build-info-gen-cpp.cmake
@ -1,24 +0,0 @@
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
 set(OUTPUT_FILE   "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
 # Only write the build info if it changed
 if(EXISTS ${OUTPUT_FILE})
    file(READ ${OUTPUT_FILE} CONTENTS)
    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
    set(OLD_COMMIT ${CMAKE_MATCH_1})
    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
    set(OLD_COMPILER ${CMAKE_MATCH_1})
    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
    set(OLD_TARGET ${CMAKE_MATCH_1})
    if (
        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
        NOT OLD_TARGET   STREQUAL BUILD_TARGET
    )
        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
    endif()
 else()
    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
 endif()
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@ -2,10 +2,13 @@
 #pragma once
-#include "llama.h"
+#include "llama-cpp.h"
 #include <set>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
 #include <sstream>
 #ifdef _WIN32
@ -24,20 +27,20 @@
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
-struct common_lora_adapter_info {
+struct common_adapter_lora_info {
    std::string path;
    float scale;
    struct llama_adapter_lora * ptr;
 };
-struct common_lora_adapter_container : common_lora_adapter_info {
+using llama_tokens = std::vector<llama_token>;
    struct llama_lora_adapter * adapter;
 };
 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const * LLAMA_COMMIT;
+extern const char * LLAMA_COMMIT;
-extern char const * LLAMA_COMPILER;
+extern const char * LLAMA_COMPILER;
-extern char const * LLAMA_BUILD_TARGET;
+extern const char * LLAMA_BUILD_TARGET;
 struct common_control_vector_load_info;
@ -65,7 +68,6 @@ enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
    LLAMA_EXAMPLE_INFILL,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
@ -75,9 +77,10 @@ enum llama_example {
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_COUNT,
 };
@ -93,6 +96,8 @@ enum common_sampler_type {
    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
    COMMON_SAMPLER_TYPE_XTC         = 8,
    COMMON_SAMPLER_TYPE_INFILL      = 9,
    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };
 // dimensionality reduction methods, used by cvector-generator
@ -101,8 +106,27 @@ enum dimre_method {
    DIMRE_METHOD_MEAN,
 };
-// sampler parameters
+enum common_conversation_mode {
-struct common_sampler_params {
+    COMMON_CONVERSATION_MODE_DISABLED = 0,
    COMMON_CONVERSATION_MODE_ENABLED  = 1,
    COMMON_CONVERSATION_MODE_AUTO     = 2,
 };
 enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
 };
 struct common_grammar_trigger {
    common_grammar_trigger_type type;
    std::string value;
    llama_token token = LLAMA_TOKEN_NULL;
 };
 // sampling parameters
 struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
    int32_t n_prev             = 64;    // number of previous tokens to remember
@ -126,17 +150,20 @@ struct common_sampler_params {
    int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
    int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   top_n_sigma        = -1.00f;// -1.0 = disabled
    float   mirostat_tau       = 5.00f; // target entropy
    float   mirostat_eta       = 0.10f; // learning rate
    bool    penalize_nl        = false; // consider newlines as a repeatable token
    bool    ignore_eos         = false;
    bool    no_perf            = false; // disable performance metrics
    bool    timing_per_token   = false;
    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
@ -145,7 +172,10 @@ struct common_sampler_params {
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };
-    std::string grammar; // optional BNF-like grammar to constrain sampling
+    std::string                         grammar; // optional BNF-like grammar to constrain sampling
    bool                                grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
    std::set<llama_token>               preserved_tokens;
    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
@ -153,21 +183,55 @@ struct common_sampler_params {
    std::string print() const;
 };
 struct common_params_model {
    std::string path    = ""; // model local path                                           // NOLINT
    std::string url     = ""; // model url to download                                      // NOLINT
    std::string hf_repo = ""; // HF repo                                                    // NOLINT
    std::string hf_file = ""; // HF file                                                    // NOLINT
 };
 struct common_params_speculative {
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
    int32_t n_ctx        =     0; // draft context size
    int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
    int32_t n_min        =     0; // minimum number of draft tokens to use for speculative decoding
    int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    float   p_split      =  0.1f; // speculative decoding split probability
    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)
    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;
    struct common_params_model model;
 };
 struct common_params_vocoder {
    struct common_params_model model;
    std::string speaker_file = ""; // speaker file path                                      // NOLINT
    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
 };
 enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 struct common_params {
    int32_t n_predict             =    -1; // new tokens to predict
    int32_t n_ctx                 =  4096; // context size
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
    int32_t n_draft               =     5; // number of tokens to draft during speculative decoding
    int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel            =     1; // number of parallel sequences to decode
    int32_t n_sequences           =     1; // number of sequences to decode
    float   p_split               =  0.1f; // speculative decoding split probability
    int32_t n_gpu_layers          =    -1; // number of layers to store in VRAM (-1 - use default)
    int32_t n_gpu_layers_draft    =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors
    float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs
    int32_t grp_attn_n            =     1; // group-attention factor
    int32_t grp_attn_w            =   512; // group-attention width
    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
@ -180,31 +244,37 @@ struct common_params {
    int32_t yarn_orig_ctx         =     0; // YaRN original context length
    float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
    int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
    int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;
    struct cpu_params draft_cpuparams;
    struct cpu_params draft_cpuparams_batch;
    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;
    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
-    struct common_sampler_params sparams;
+    struct common_params_sampling    sampling;
    struct common_params_speculative speculative;
    struct common_params_vocoder     vocoder;
-    std::string model                = ""; // model path                                                    // NOLINT
+    struct common_params_model model;
-    std::string model_draft          = ""; // draft model for speculative decoding                          // NOLINT
+
-    std::string model_alias          = "unknown"; // model alias                                            // NOLINT
+    std::string model_alias          = ""; // model alias                                                   // NOLINT
    std::string model_url            = ""; // model url to download                                         // NOLINT
    std::string hf_token             = ""; // HF token                                                      // NOLINT
    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
    std::string hf_file              = ""; // HF file                                                       // NOLINT
    std::string prompt               = "";                                                                  // NOLINT
    std::string system_prompt        = "";                                                                  // NOLINT
    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
@ -212,20 +282,21 @@ struct common_params {
    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
    std::string rpc_servers          = ""; // comma separated list of RPC servers                           // NOLINT
    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
-    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
    int32_t verbosity                  = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector
    bool    offline                    = false;
    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@ -243,11 +314,11 @@ struct common_params {
    bool   kl_divergence    = false; // compute KL divergence
    bool usage             = false; // print usage
    bool completion        = false; // print source-able completion script
    bool use_color         = false; // use color to distinguish generations and inputs
    bool special           = false; // enable special token output
    bool interactive       = false; // interactive mode
    bool interactive_first = false; // wait for user input immediately
    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
@ -258,23 +329,29 @@ struct common_params {
    bool flash_attn        = false; // flash attention
    bool no_perf           = false; // disable performance metrics
    bool ctx_shift         = true;  // context shift on inifinite text generation
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data
    bool no_op_offload     = false; // globally disable offload host tensor operations to device
-    std::string cache_type_k = "f16"; // KV cache data type for the K
+    bool single_turn       = false; // single turn chat conversation
    std::string cache_type_v = "f16"; // KV cache data type for the V
-    // multimodal models (see examples/llava)
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
-    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
    // multimodal models (see tools/mtmd)
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj = false;         // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)
    // embedding
@ -282,7 +359,7 @@ struct common_params {
    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep   = "\n";  // separator of embeddings
-    bool reranking         = false; // enable reranking support on server
+    std::string cls_sep    = "\t";  // separator of classification sequences
    // server params
    int32_t port           = 8080;         // server listens on this network port
@ -294,13 +371,19 @@ struct common_params {
    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
    std::string chat_template = "";                                                                         // NOLINT
    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
    bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response
    std::vector<std::string> api_keys;
    std::string ssl_file_key  = "";                                                                         // NOLINT
    std::string ssl_file_cert = "";                                                                         // NOLINT
    std::map<std::string, std::string> default_template_kwargs;
    // "advanced" endpoints are disabled by default for better security
    bool webui            = true;
    bool endpoint_slots   = false;
@ -332,29 +415,33 @@ struct common_params {
    int32_t i_pos  = -1;  // position of the passkey in the junk text
    // imatrix params
    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk     =  0; // start processing from this chunk
    bool process_output = false; // collect data for the output tensor
    bool compute_ppl    = true;  // whether to compute perplexity
    bool parse_special  = false; // whether to parse special tokens during imatrix tokenization
    // cvector-generator params
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_outfile       = "control_vector.gguf";
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
    bool spm_infill = false; // suffix/prefix/middle pattern for infill
    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
    // batched-bench params
    bool batched_bench_output_jsonl = false;
    // common params
    std::string out_file; // output filename for all example programs
    // optional callback for model loading progress and cancellation:
    // called with a progress value between 0.0 and 1.0.
    // return false from callback to abort model loading or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void *                  load_progress_callback_user_data = NULL;
 };
 // call once at the start of a program if it uses libcommon
@ -373,13 +460,13 @@ bool set_process_priority(enum ggml_sched_priority prio);
 //
 #ifdef __GNUC__
-#ifdef __MINGW32__
+#    if defined(__MINGW32__) && !defined(__clang__)
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #    else
 #        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #    endif
 #else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
 #endif
 #else
 #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
 #endif
 LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
@ -388,8 +475,14 @@ std::string string_format(const char * fmt, ...);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
 std::string string_join(const std::vector<std::string> & values, const std::string & separator);
 std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
 std::string string_repeat(const std::string & str, size_t n);
 void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
 std::string regex_escape(const std::string & s);
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
@ -421,6 +514,15 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
    return parts;
 }
 static bool string_starts_with(const std::string & str,
                               const std::string & prefix) {  // While we wait for C++20's std::string::starts_with...
    return str.rfind(prefix, 0) == 0;
 }
 // While we wait for C++20's std::string::ends_with...
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
@ -443,25 +545,28 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 // note: defines object's lifetime
 struct common_init_result {
-    struct llama_model   * model   = nullptr;
+    llama_model_ptr   model;
-    struct llama_context * context = nullptr;
+    llama_context_ptr context;
-    std::vector<common_lora_adapter_container> lora_adapters;
+
    std::vector<llama_adapter_lora_ptr> lora;
 };
 struct common_init_result     common_init_from_params(common_params & params);
-struct llama_model_params     common_model_params_to_llama  (const common_params & params);
+struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 std::string                   get_model_endpoint();
 //
 // Batch utils
 //
 void common_batch_clear(struct llama_batch & batch);
@ -472,6 +577,16 @@ void common_batch_add(
    const std::vector<llama_seq_id> & seq_ids,
                               bool   logits);
 //
 // Token utils
 //
 // longest common prefix
 size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
 // longet common subsequence
 size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
 //
 // Vocab utils
 //
@ -485,7 +600,7 @@ std::vector<llama_token> common_tokenize(
                        bool   parse_special = false);
 std::vector<llama_token> common_tokenize(
-    const struct llama_model * model,
+    const struct llama_vocab * vocab,
           const std::string & text,
                        bool   add_special,
                        bool   parse_special = false);
@ -497,61 +612,30 @@ std::string common_token_to_piece(
                       llama_token   token,
                       bool          special = true);
 std::string common_token_to_piece(
          const struct llama_vocab * vocab,
                       llama_token   token,
                       bool          special = true);
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string common_detokenize(
-                         llama_context * ctx,
+            const struct llama_context * ctx,
        const std::vector<llama_token> & tokens,
                                  bool   special = true);
-//
+std::string common_detokenize(
-// Chat template utils
+              const struct llama_vocab * vocab,
-//
+        const std::vector<llama_token> & tokens,
-
+                                  bool   special = true);
 // same with llama_chat_message, but uses std::string
 struct common_chat_msg {
    std::string role;
    std::string content;
 };
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
 std::string common_chat_apply_template(const struct llama_model * model,
        const std::string & tmpl,
        const std::vector<common_chat_msg> & chat,
        bool add_ass);
 // Format single message, while taking into account the position of that message in chat history
 std::string common_chat_format_single(const struct llama_model * model,
        const std::string & tmpl,
        const std::vector<common_chat_msg> & past_msg,
        const common_chat_msg & new_msg,
        bool add_ass);
 // Returns an example of formatted chat
 std::string common_chat_format_example(const struct llama_model * model,
        const std::string & tmpl);
 //
 // KV cache utils
 //
 // Dump the KV cache view with the number of sequences per cell.
 void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 // Dump the KV cache view showing individual sequences in each cell (long output).
 void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
 //
 // Embedding utils
 //
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: repace embd_norm with an enum
 void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@ -580,6 +664,16 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
-static const char * const LLM_KV_SPLIT_NO            = "split.no";
+namespace {
-static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+const char * const LLM_KV_SPLIT_NO            = "split.no";
 const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 }
 //
 // training utils
 //
 ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
--- a/common/json-partial.cpp
+++ b/common/json-partial.cpp
@ -0,0 +1,256 @@
 #include "json-partial.h"
 #include "log.h"
 #include <nlohmann/json.hpp>
 #include <string>
 using json = nlohmann::ordered_json;
 enum common_json_stack_element_type {
    COMMON_JSON_STACK_ELEMENT_OBJECT,
    COMMON_JSON_STACK_ELEMENT_KEY,
    COMMON_JSON_STACK_ELEMENT_ARRAY,
 };
 struct common_json_stack_element {
    common_json_stack_element_type type;
    std::string key;
 };
 bool common_json_parse(
    const std::string & input,
    const std::string & healing_marker,
    common_json & out)
 {
    std::string::const_iterator it = input.begin();
    const auto end = input.end();
    return common_json_parse(it, end, healing_marker, out);
 }
 bool common_json_parse(
    std::string::const_iterator & it,
    const std::string::const_iterator & end,
    const std::string & healing_marker,
    common_json & out)
 {
    // // https://json.nlohmann.me/features/parsing/sax_interface/
    struct json_error_locator : public nlohmann::json_sax<json> {
        std::size_t position;
        bool found_error;
        std::string last_token;
        std::string exception_message;
        std::vector<common_json_stack_element> stack;
        json_error_locator() : position(0), found_error(false) {}
        bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
            this->position = position - 1;
            this->found_error = true;
            this->last_token = last_token;
            this->exception_message = ex.what();
            return false;
        }
        void close_value() {
            if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
                stack.pop_back();
            }
        }
        bool null() override { // NOLINT
            close_value();
            return true;
        }
        bool boolean(bool) override { // NOLINT
            close_value();
            return true;
        }
        bool number_integer(number_integer_t) override { // NOLINT
            close_value();
            return true;
        }
        bool number_unsigned(number_unsigned_t) override { // NOLINT
            close_value();
            return true;
        }
        bool number_float(number_float_t, const string_t &) override { // NOLINT
            close_value();
            return true;
        }
        bool string(string_t &) override { // NOLINT
            close_value();
            return true;
        }
        bool binary(binary_t &) override { // NOLINT
            close_value();
            return true;
        }
        bool start_object(std::size_t) override { // NOLINT
            stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
            return true;
        }
        bool end_object() override {
            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
            stack.pop_back();
            close_value();
            return true;
        }
        bool key(string_t & key) override { // NOLINT
            stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
            return true;
        }
        bool start_array(std::size_t) override { // NOLINT
            stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
            return true;
        }
        bool end_array() override {
            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
            stack.pop_back();
            close_value();
            return true;
        }
    };
    json_error_locator err_loc;
    auto start = it;
    json::sax_parse(it, end, &err_loc);
    if (err_loc.found_error) {
        it = start;
        auto temptative_end = it + err_loc.position;
        // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
        auto input = std::string(it, temptative_end);
        try {
            out.json = json::parse(input);
            // out.json = json::parse(it, temptative_end);
            it = temptative_end;
            return true;
        } catch (const std::exception & ex) {
            // No, needs healing.
            LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
        }
        auto can_parse = [](const std::string & str) {
            try {
                auto _ = json::parse(str); // NOLINT
                return true;
            } catch (const std::exception &) {
                return false;
            }
        };
        if (!healing_marker.empty() && !err_loc.stack.empty()) {
            std::string str(it, temptative_end);
            auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
            if (last_non_sp_pos == std::string::npos) {
                throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
            }
            auto last_non_sp_char = str[last_non_sp_pos];
            // Used to detect stops on a number, which may not be complete.
            auto was_maybe_number = [&]() {
                if (!str.empty() && std::isspace(str.back())) {
                    return false;
                }
                return std::isdigit(last_non_sp_char) ||
                    last_non_sp_char == '.' ||
                    last_non_sp_char == 'e' ||
                    last_non_sp_char == 'E' ||
                    last_non_sp_char == '-';
            };
            std::string closing;
            for (size_t i = err_loc.stack.size(); i > 0; i--) {
                auto & el = err_loc.stack[i - 1];
                if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
                    closing += "}";
                } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
                    closing += "]";
                } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
                    throw std::runtime_error("Unexpected stack element type");
                }
            }
            const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
            if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
                // We're inside an object value
                if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
                    // Was about to create an object value
                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
                } else if (can_parse(str + ": 1" + closing)) {
                    str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
                } else if (last_non_sp_char == '{' && can_parse(str + closing)) {
                    // Was about to create an object
                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
                } else if (can_parse(str + "\"" + closing)) {
                    // Was inside an object value string
                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
                    // Was inside an object value string after an escape
                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
                } else {
                    // find last :
                    auto last_pos = str.find_last_of(':');
                    if (last_pos == std::string::npos) {
                        throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
                    }
                    // Cutting back to opening : for object value
                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
                }
            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
                if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
                    // Was about to create an array value
                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
                } else if (can_parse(str + "\"" + closing)) {
                    // Was inside an array value string
                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
                    // Was inside an array value string after an escape
                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
                } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
                    // Had just finished a value
                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
                } else {
                    auto last_pos = str.find_last_of("[,");
                    if (last_pos == std::string::npos) {
                        throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
                    }
                    // Cutting back to last [ or , for array value
                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
                }
            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
                if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
                        (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
                    // Was about to create an object key+value
                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
                } else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
                    // Was about to create an object key+value
                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
                } else if (can_parse(str + "\": 1" + closing)) {
                    // Was inside an object key string
                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
                    // Was inside an object key string after an escape
                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
                } else {
                    auto last_pos = str.find_last_of(':');
                    if (last_pos == std::string::npos) {
                        throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
                    }
                    // fprintf(stderr, "Cutting back to last : for object key+value\n");
                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
                }
            } else {
                throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
            }
            // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
            out.json = json::parse(str);
            it = temptative_end;
            return true;
        }
        // TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
        // fprintf(stderr, "Closing: TODO\n");
        return false;
    }
    out.json = json::parse(it, end);
    it = end;
    return true;
 }
--- a/common/json-partial.h
+++ b/common/json-partial.h
@ -0,0 +1,38 @@
 #pragma once
 #include <nlohmann/json.hpp>
 // Healing marker (empty if the JSON was fully parsed / wasn't healed).
 struct common_healing_marker {
    // Raw marker.
    std::string marker;
    // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
    std::string json_dump_marker;
 };
 // Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
 struct common_json {
    nlohmann::ordered_json json;
    common_healing_marker healing_marker;
 };
 // Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
 //
 // Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
 // This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
 // (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
 //
 // For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
 bool common_json_parse(
    const std::string & input,
    const std::string & healing_marker,
    common_json & out);
 // Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
 bool common_json_parse(
    std::string::const_iterator & it,
    const std::string::const_iterator & end,
    const std::string & healing_marker,
    common_json & out);
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@ -1,6 +1,9 @@
 #include "json-schema-to-grammar.h"
 #include "common.h"
 #include <nlohmann/json.hpp>
 #include <algorithm>
 #include <fstream>
 #include <map>
 #include <regex>
 #include <sstream>
@ -11,14 +14,12 @@
 using json = nlohmann::ordered_json;
 template <typename Iterator>
 static std::string join(Iterator begin, Iterator end, const std::string & separator);
 static std::string repeat(const std::string & str, size_t n);
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
    auto has_max = max_items != std::numeric_limits<int>::max();
    if (max_items == 0) {
        return "";
    }
    if (min_items == 0 && max_items == 1) {
        return item_rule + "?";
    }
@ -40,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
    return result;
 }
 /* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
 class string_view {
    const std::string & _str;
    const size_t _start;
    const size_t _end;
 public:
    string_view(const std::string & str, size_t start = 0, size_t end  = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
    size_t size() const {
        return _end - _start;
    }
    size_t length() const {
        return size();
    }
    operator std::string() const {
        return str();
    }
    std::string str() const {
        return _str.substr(_start, _end - _start);
    }
    string_view substr(size_t pos, size_t len = std::string::npos) const {
        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
    }
    char operator[](size_t pos) const {
        auto index = _start + pos;
        if (index >= _end) {
            throw std::out_of_range("string_view index out of range");
        }
        return _str[_start + pos];
    }
    bool operator==(const string_view & other) const {
        std::string this_str = *this;
        std::string other_str = other;
        return this_str == other_str;
    }
 };
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
    auto has_min = min_value != std::numeric_limits<int>::min();
    auto has_max = max_value != std::numeric_limits<int>::max();
@ -111,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
        }
        out << "}";
    };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
+    std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
+        [&](const std::string_view & from, const std::string_view & to) {
            size_t i = 0;
            while (i < from.length() && i < to.length() && from[i] == to[i]) {
                i++;
            }
            if (i > 0) {
-                out << "\"" << from.substr(0, i).str() << "\"";
+                out << "\"" << from.substr(0, i) << "\"";
            }
            if (i < from.length() && i < to.length()) {
                if (i > 0) {
@ -128,8 +86,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
                if (sub_len > 0) {
                    auto from_sub = from.substr(i + 1);
                    auto to_sub = to.substr(i + 1);
-                    auto sub_zeros = repeat("0", sub_len);
+                    auto sub_zeros = string_repeat("0", sub_len);
-                    auto sub_nines = repeat("9", sub_len);
+                    auto sub_nines = string_repeat("9", sub_len);
                    auto to_reached = false;
                    out << "(";
@ -188,8 +146,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
        auto max_digits = max_s.length();
        for (auto digits = min_digits; digits < max_digits; digits++) {
-            uniform_range(min_s, repeat("9", digits));
+            uniform_range(min_s, string_repeat("9", digits));
-            min_s = "1" + repeat("0", digits);
+            min_s = "1" + string_repeat("0", digits);
            out << " | ";
        }
        uniform_range(min_s, max_s);
@ -267,7 +225,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
    throw std::runtime_error("At least one of min_value or max_value must be set");
 }
-const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
+const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";
 struct BuiltinRule {
    std::string content;
@ -318,49 +276,6 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
 std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
 template <typename Iterator>
 std::string join(Iterator begin, Iterator end, const std::string & separator) {
    std::ostringstream result;
    if (begin != end) {
        result << *begin;
        for (Iterator it = begin + 1; it != end; ++it) {
            result << separator << *it;
        }
    }
    return result.str();
 }
 static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
    std::vector<std::string> tokens;
    size_t start = 0;
    size_t end = str.find(delimiter);
    while (end != std::string::npos) {
        tokens.push_back(str.substr(start, end - start));
        start = end + delimiter.length();
        end = str.find(delimiter, start);
    }
    tokens.push_back(str.substr(start));
    return tokens;
 }
 static std::string repeat(const std::string & str, size_t n) {
    if (n == 0) {
        return "";
    }
    std::string result;
    result.reserve(str.length() * n);
    for (size_t i = 0; i < n; ++i) {
        result += str;
    }
    return result;
 }
 static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch  &)> & replacement) {
    std::smatch match;
    std::string result;
@ -389,6 +304,7 @@ static std::string format_literal(const std::string & literal) {
 class SchemaConverter {
 private:
    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
    std::function<json(const std::string &)> _fetch_json;
    bool _dotall;
    std::map<std::string, std::string> _rules;
@ -418,7 +334,7 @@ private:
        for (size_t i = 0; i < alt_schemas.size(); i++) {
            rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
        }
-        return join(rules.begin(), rules.end(), " | ");
+        return string_join(rules, " | ");
    }
    std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@ -481,7 +397,7 @@ private:
                for (const auto & item : ret) {
                    results.push_back(to_rule(item));
                }
-                return std::make_pair(join(results.begin(), results.end(), " "), false);
+                return std::make_pair(string_join(results, " "), false);
            };
            while (i < length) {
@ -539,7 +455,7 @@ private:
                    }
                    curly_brackets += '}';
                    i++;
-                    auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
+                    auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
                    int min_times = 0;
                    int max_times = std::numeric_limits<int>::max();
                    try {
@ -854,7 +770,7 @@ public:
                            return;
                        }
                        std::string pointer = ref.substr(ref.find('#') + 1);
-                        std::vector<std::string> tokens = split(pointer, "/");
+                        std::vector<std::string> tokens = string_split(pointer, "/");
                        for (size_t i = 1; i < tokens.size(); ++i) {
                            std::string sel = tokens[i];
                            if (target.is_null() || !target.contains(sel)) {
@ -905,7 +821,7 @@ public:
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
-            return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
+            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
        } else if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
                    (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@ -1019,10 +935,10 @@ public:
    void check_errors() {
        if (!_errors.empty()) {
-            throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
+            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
        }
        if (!_warnings.empty()) {
-            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
+            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
        }
    }
@ -1035,11 +951,35 @@ public:
    }
 };
-std::string json_schema_to_grammar(const json & schema) {
+std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
-    SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
+#ifdef LLAMA_USE_LLGUIDANCE
-    auto copy = schema;
+    if (!force_gbnf) {
-    converter.resolve_refs(copy, "input");
+        return "%llguidance {}\nstart: %json " + schema.dump();
-    converter.visit(copy, "");
+    }
 #else
    (void)force_gbnf;
 #endif // LLAMA_USE_LLGUIDANCE
    return build_grammar([&](const common_grammar_builder & callbacks) {
        auto copy = schema;
        callbacks.resolve_refs(copy);
        callbacks.add_schema("", copy);
    });
 }
 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
    common_grammar_builder builder {
        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
            return converter._add_rule(name, rule);
        },
        /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
            return converter.visit(schema, name == "root" ? "" : name);
        },
        /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
            converter.resolve_refs(schema, "");
        }
    };
    cb(builder);
    converter.check_errors();
    return converter.format_grammar();
 }
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@ -1,8 +1,21 @@
 #pragma once
-#include "ggml.h"
+#include <nlohmann/json_fwd.hpp>
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
-std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
+#include <functional>
 #include <string>
 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
                                   bool force_gbnf = false);
 struct common_grammar_builder {
    std::function<std::string(const std::string &, const std::string &)> add_rule;
    std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
    std::function<void(nlohmann::ordered_json &)> resolve_refs;
 };
 struct common_grammar_options {
    bool dotall = false;
 };
 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
--- a/common/llguidance.cpp
+++ b/common/llguidance.cpp
@ -0,0 +1,254 @@
 #include "sampling.h"
 #include "log.h"
 #ifdef LLAMA_USE_LLGUIDANCE
 #    include "llguidance.h"
 #    include <cmath>
 struct llama_sampler_llg {
    const llama_vocab * vocab;
    std::string         grammar_kind;
    std::string         grammar_data;
    LlgTokenizer *      tokenizer;
    LlgMatcher *        grammar;
 };
 static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
                                          const char * grammar_data) {
    LlgConstraintInit cinit;
    llg_constraint_init_set_defaults(&cinit, tokenizer);
    const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
    if (log_level && *log_level) {
        cinit.log_stderr_level = atoi(log_level);
    }
    auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
    if (llg_matcher_get_error(c)) {
        LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
        llg_free_matcher(c);
        return nullptr;
    }
    return c;
 }
 static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
    return "llguidance";
 }
 static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
    auto * ctx = (llama_sampler_llg *) smpl->ctx;
    if (ctx->grammar) {
        llg_matcher_consume_token(ctx->grammar, token);
    }
 }
 static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * ctx = (llama_sampler_llg *) smpl->ctx;
    if (ctx->grammar) {
        const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
        if (mask == nullptr) {
            if (llg_matcher_compute_mask(ctx->grammar) == 0) {
                mask = llg_matcher_get_mask(ctx->grammar);
            } else {
                LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
                llg_free_matcher(ctx->grammar);
                ctx->grammar = nullptr;
                return;
            }
        }
        for (size_t i = 0; i < cur_p->size; ++i) {
            auto token = cur_p->data[i].id;
            if ((mask[token / 32] & (1 << (token % 32))) == 0) {
                cur_p->data[i].logit = -INFINITY;
            }
        }
    }
 }
 static void llama_sampler_llg_reset(llama_sampler * smpl) {
    auto * ctx = (llama_sampler_llg *) smpl->ctx;
    if (ctx->grammar) {
        llg_matcher_reset(ctx->grammar);
    }
 }
 static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
    const auto * ctx = (const llama_sampler_llg *) smpl->ctx;
    auto * result = llama_sampler_init_llg(ctx->vocab, nullptr, nullptr);
    // copy the state
    {
        auto * result_ctx = (llama_sampler_llg *) result->ctx;
        if (ctx->grammar) {
            result_ctx->grammar_kind = ctx->grammar_kind;
            result_ctx->grammar_data = ctx->grammar_data;
            result_ctx->grammar      = llg_clone_matcher(ctx->grammar);
            result_ctx->tokenizer    = llg_clone_tokenizer(ctx->tokenizer);
        }
    }
    return result;
 }
 static void llama_sampler_llg_free(llama_sampler * smpl) {
    const auto * ctx = (llama_sampler_llg *) smpl->ctx;
    if (ctx->grammar) {
        llg_free_matcher(ctx->grammar);
        llg_free_tokenizer(ctx->tokenizer);
    }
    delete ctx;
 }
 static llama_sampler_i llama_sampler_llg_i = {
    /* .name   = */ llama_sampler_llg_name,
    /* .accept = */ llama_sampler_llg_accept_impl,
    /* .apply  = */ llama_sampler_llg_apply,
    /* .reset  = */ llama_sampler_llg_reset,
    /* .clone  = */ llama_sampler_llg_clone,
    /* .free   = */ llama_sampler_llg_free,
 };
 static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
                                            uint32_t * output_tokens, size_t output_tokens_len) {
    const llama_vocab * vocab = (const llama_vocab *) user_data;
    int                 r     = 0;
    try {
        r = llama_tokenize(vocab, (const char *) bytes, bytes_len, (int32_t *) output_tokens, output_tokens_len, false,
                           true);
    } catch (const std::exception & e) {
        GGML_ABORT("llama_tokenize failed: %s\n", e.what());
    }
    if (r < 0) {
        return -r;
    }
    return r;
 }
 static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab) {
    // TODO store the tokenizer in the vocab somehow
    static const llama_vocab * vocab_cache;
    static LlgTokenizer *      tokenizer_cache;
    if (vocab_cache == vocab) {
        return llg_clone_tokenizer(tokenizer_cache);
    }
    auto tok_eos = llama_vocab_eot(vocab);
    if (tok_eos == LLAMA_TOKEN_NULL) {
        tok_eos = llama_vocab_eos(vocab);
    }
    size_t vocab_size = llama_vocab_n_tokens(vocab);
    auto token_lens       = new uint32_t[vocab_size];
    // we typically have ~7 bytes per token; let's go on the safe side here
    auto token_bytes_size = vocab_size * 16 + 1024 * 1024;
    auto token_bytes      = new uint8_t[token_bytes_size];
    size_t offset = 0;
    for (size_t i = 0; i < vocab_size; i++) {
        size_t max_token = 1024;
        if (token_bytes_size - offset < max_token) {
            GGML_ABORT("token_bytes buffer too small\n");
        }
        llama_token token = i;
        auto        dp    = (char *) token_bytes + offset;
        auto        size  = llama_detokenize(vocab, &token, 1, dp, max_token, false, false);
        if (size < 0) {
            GGML_ABORT("llama_detokenize failed\n");
        }
        if (size == 0) {
            size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true);
            if (size < 0) {
                GGML_ABORT("llama_detokenize failed\n");
            }
            if (size != 0) {
                *dp = '\xff';  // special token prefix marker
                size += 1;
            }
        }
        token_lens[i] = size;
        offset += size;
    }
    LlgTokenizerInit tinit = {
        /* .vocab_size                         = */ (uint32_t) vocab_size,
        /* .tok_eos                            = */ (uint32_t) tok_eos,
        /* .token_lens                         = */ token_lens,
        /* .token_bytes                        = */ token_bytes,
        /* .tokenizer_json                     = */ nullptr,
        /* .tokenize_assumes_string            = */ true,
        /* .tokenize_fn                        = */ llama_sampler_llg_tokenize_fn,
        /* .use_approximate_greedy_tokenize_fn = */ false,
        /* .tokenize_user_data                 = */ vocab,
        /* .slices                             = */ nullptr,
    };
    char           error_buffer[1024];
    LlgTokenizer * tokenizer = llg_new_tokenizer(&tinit, error_buffer, sizeof(error_buffer));
    delete[] token_bytes;
    delete[] token_lens;
    if (tokenizer == nullptr) {
        LOG_ERR("llg tokenizer error: %s\n", error_buffer);
        return tokenizer;
    }
    if (tokenizer_cache) {
        llg_free_tokenizer(tokenizer_cache);
    }
    vocab_cache     = vocab;
    tokenizer_cache = tokenizer;
    return llg_clone_tokenizer(tokenizer_cache);
 }
 llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind,
                                       const char * grammar_data) {
    auto * ctx = new llama_sampler_llg;
    if (grammar_kind != nullptr && grammar_kind[0] != '\0') {
        auto tokenizer = llama_sampler_llg_new_tokenizer(vocab);
        *ctx           = {
            /* .vocab        = */ vocab,
            /* .grammar_kind = */ grammar_kind,
            /* .grammar_data = */ grammar_data,
            /* .tokenizer    = */ tokenizer,
            /* .grammar      = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
        };
        if (ctx->grammar) {
            GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
                        llg_matcher_get_mask_byte_size(ctx->grammar));
        }
    } else {
        *ctx = {
            /* .vocab        = */ vocab,
            /* .grammar_kind = */ {},
            /* .grammar_data = */ {},
            /* .tokenizer    = */ nullptr,
            /* .grammar      = */ nullptr,
        };
    }
    return llama_sampler_init(
        /* .iface = */ &llama_sampler_llg_i,
        /* .ctx   = */ ctx);
 }
 #else
 llama_sampler * llama_sampler_init_llg(const llama_vocab *, const char *, const char *) {
    LOG_WRN("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
    return nullptr;
 }
 #endif  // LLAMA_USE_LLGUIDANCE
--- a/common/log.cpp
+++ b/common/log.cpp
@ -1,5 +1,6 @@
 #include "log.h"
 #include <chrono>
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
@ -14,16 +15,6 @@ void common_log_set_verbosity_thold(int verbosity) {
    common_log_verbosity_thold = verbosity;
 }
 #define LOG_COL_DEFAULT "\033[0m"
 #define LOG_COL_BOLD    "\033[1m"
 #define LOG_COL_RED     "\033[31m"
 #define LOG_COL_GREEN   "\033[32m"
 #define LOG_COL_YELLOW  "\033[33m"
 #define LOG_COL_BLUE    "\033[34m"
 #define LOG_COL_MAGENTA "\033[35m"
 #define LOG_COL_CYAN    "\033[36m"
 #define LOG_COL_WHITE   "\033[37m"
 static int64_t t_us() {
    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@ -206,6 +197,7 @@ public:
                vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
            }
 #endif
            va_end(args_copy);
        }
        entry.level = level;
--- a/common/log.h
+++ b/common/log.h
@ -2,9 +2,20 @@
 #include "ggml.h" // for ggml_log_level
 #define LOG_CLR_TO_EOL  "\033[K\r"
 #define LOG_COL_DEFAULT "\033[0m"
 #define LOG_COL_BOLD    "\033[1m"
 #define LOG_COL_RED     "\033[31m"
 #define LOG_COL_GREEN   "\033[32m"
 #define LOG_COL_YELLOW  "\033[33m"
 #define LOG_COL_BLUE    "\033[34m"
 #define LOG_COL_MAGENTA "\033[35m"
 #define LOG_COL_CYAN    "\033[36m"
 #define LOG_COL_WHITE   "\033[37m"
 #ifndef __GNUC__
 #    define LOG_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
+#elif defined(__MINGW32__) && !defined(__clang__)
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@ -7,6 +7,7 @@
 #include <cstdio>
 #include <fstream>
 #include <thread>
 #include <algorithm>
 void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
@ -65,13 +66,13 @@ constexpr int     draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
 static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
    common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
    if (part_static_it == nc_static.end()) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
    }
    const common_ngram_cache_part part_static = part_static_it->second;
    int max_count_static  = 0;
    int sum_count_static  = 0;
-    llama_token max_token = -1;
+    llama_token max_token = LLAMA_TOKEN_NULL;
    for (std::pair<llama_token, int> token_count_static : part_static) {
        const llama_token token = token_count_static.first;
@ -85,10 +86,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
    }
    if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
    }
    if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
    }
    return max_token;
 }
@ -98,9 +99,9 @@ static llama_token try_draft(
    common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
    const int * min_sample_size, const int * min_percent) {
-    llama_token drafted_token = -1;
+    llama_token drafted_token = LLAMA_TOKEN_NULL;
-    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
+    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
        const common_ngram ngram_primary = ngrams_primary[i];
        common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
@ -112,7 +113,7 @@ static llama_token try_draft(
        int max_count_primary = 0;
        int max_count_static  = 0;
        int sum_count_primary = 0;
-        llama_token max_token = -1;
+        llama_token max_token = LLAMA_TOKEN_NULL;
        for (std::pair<llama_token, int> token_count_primary : part_primary) {
            const llama_token token = token_count_primary.first;
@ -154,7 +155,7 @@ void common_ngram_cache_draft(
    }
    while ((int) draft.size()-1 < n_draft) {
-        llama_token drafted_token = -1;
+        llama_token drafted_token = LLAMA_TOKEN_NULL;
        const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
        common_ngram ngram_static;
@ -177,17 +178,17 @@ void common_ngram_cache_draft(
            }
            ngrams_cd.push_back(ngram_cd);
        }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
            drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
        }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
            drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
        }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
            drafted_token = try_draft(nc_static, ngram_static);
        }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
            break;
        }
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@ -17,13 +17,13 @@ struct common_ngram {
    common_ngram() {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = -1;
+            tokens[i] = LLAMA_TOKEN_NULL;
        }
    }
    common_ngram(const llama_token * input, const int ngram_size) {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = i < ngram_size ? input[i] : -1;
+            tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
        }
    }
--- a/common/regex-partial.cpp
+++ b/common/regex-partial.cpp
@ -0,0 +1,204 @@
 #include "regex-partial.h"
 #include "common.h"
 #include <functional>
 #include <optional>
 common_regex::common_regex(const std::string & pattern) :
    pattern(pattern),
    rx(pattern),
    rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}
 common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
    std::smatch match;
    if (pos > input.size()) {
        throw std::runtime_error("Position out of bounds");
    }
    auto start = input.begin() + pos;
    auto found = as_match
        ? std::regex_match(start, input.end(), match, rx)
        : std::regex_search(start, input.end(), match, rx);
    if (found) {
        common_regex_match res;
        res.type = COMMON_REGEX_MATCH_TYPE_FULL;
        for (size_t i = 0; i < match.size(); ++i) {
            auto begin = pos + match.position(i);
            res.groups.emplace_back(begin, begin + match.length(i));
        }
        return res;
    }
    std::match_results<std::string::const_reverse_iterator> srmatch;
    if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
        auto group = srmatch[1].str();
        if (group.length() != 0) {
            auto it = srmatch[1].second.base();
            // auto position = static_cast<size_t>(std::distance(input.begin(), it));
            if ((!as_match) || it == input.begin()) {
                common_regex_match res;
                res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
                const size_t begin = std::distance(input.begin(), it);
                const size_t end = input.size();
                if (begin == std::string::npos || end == std::string::npos || begin > end) {
                    throw std::runtime_error("Invalid range");
                }
                res.groups.push_back({begin, end});
                return res;
            }
        }
    }
    return {};
 }
 /*
  Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.
  Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
  to see if a string ends with a partial regex match, but but it's not in std::regex yet.
  Instead, we'll the regex into a partial match regex operating as a full match on the reverse iterators of the input.
  - /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:(?:d)?c)?b)?a).*
  - /a|b/ -> (a|b).*
  - /a*?/ -> error, could match ""
  - /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
  - /.*?ab/ -> ((?:b)?a).* (merge .*)
  - /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
  - /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
  - /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
  - /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*
  The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern
  (i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
 */
 std::string regex_to_reversed_partial_regex(const std::string & pattern) {
    auto it = pattern.begin();
    const auto end = pattern.end();
    std::function<std::string()> process = [&]() {
        std::vector<std::vector<std::string>> alternatives(1);
        std::vector<std::string> * sequence = &alternatives.back();
        while (it != end) {
            if (*it == '[') {
                auto start = it;
                ++it;
                while (it != end) {
                    if ((*it == '\\') && (++it != end)) {
                        ++it;
                    } else if ((it != end) && (*it == ']')) {
                        break;
                    } else {
                        ++it;
                    }
                }
                if (it == end) {
                    throw std::runtime_error("Unmatched '[' in pattern");
                }
                ++it;
                sequence->push_back(std::string(start, it));
            } else if (*it == '*' || *it == '?' || *it == '+') {
                if (sequence->empty()) {
                    throw std::runtime_error("Quantifier without preceding element");
                }
                sequence->back() += *it;
                auto is_star = *it == '*';
                ++it;
                if (is_star) {
                    if (*it == '?') {
                        ++it;
                    }
                }
            } else if (*it == '{') {
                if (sequence->empty()) {
                    throw std::runtime_error("Repetition without preceding element");
                }
                ++it;
                auto start = it;
                while (it != end && *it != '}') {
                    ++it;
                }
                if (it == end) {
                    throw std::runtime_error("Unmatched '{' in pattern");
                }
                auto parts = string_split(std::string(start, it), ",");
                ++it;
                if (parts.size() > 2) {
                    throw std::runtime_error("Invalid repetition range in pattern");
                }
                auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
                    if (s.empty()) {
                        return def;
                    }
                    return std::stoi(s);
                };
                auto min = parseOptInt(parts[0], 0);
                auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
                if (min && max && *max < *min) {
                    throw std::runtime_error("Invalid repetition range in pattern");
                }
                // Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
                auto part = sequence->back();
                sequence->pop_back();
                for (int i = 0; i < *min; i++) {
                    sequence->push_back(part);
                }
                if (max) {
                    for (int i = *min; i < *max; i++) {
                        sequence->push_back(part + "?");
                    }
                } else {
                    sequence->push_back(part + "*");
                }
            } else if (*it == '(') {
                ++it;
                if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
                    it += 2;
                }
                auto sub = process();
                if (*it != ')') {
                    throw std::runtime_error("Unmatched '(' in pattern");
                }
                ++it;
                auto & part = sequence->emplace_back("(?:");
                part += sub;
                part += ")";
            } else if (*it == ')') {
                break;
            } else if (*it == '|') {
                ++it;
                alternatives.emplace_back();
                sequence = &alternatives.back();
            } else if (*it == '\\' && (++it != end)) {
                auto str = std::string("\\") + *it;
                sequence->push_back(str);
                ++it;
            } else if (it != end) {
                sequence->push_back(std::string(1, *it));
                ++it;
            }
        }
        // /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
        // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
        // We'll do the outermost capturing group and final .* in the enclosing function.
        std::vector<std::string> res_alts;
        for (const auto & parts : alternatives) {
            auto & res = res_alts.emplace_back();
            for (size_t i = 0; i < parts.size() - 1; i++) {
                res += "(?:";
            }
            for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
                res += *it;
                if (it != parts.rend() - 1) {
                    res += ")?";
                }
            }
        }
        return string_join(res_alts, "|");
    };
    auto res = process();
    if (it != end) {
        throw std::runtime_error("Unmatched '(' in pattern");
    }
    return "(" + res + ")[\\s\\S]*";
 }
--- a/common/regex-partial.h
+++ b/common/regex-partial.h
@ -0,0 +1,56 @@
 #pragma once
 #include <regex>
 #include <string>
 enum common_regex_match_type {
    COMMON_REGEX_MATCH_TYPE_NONE,
    COMMON_REGEX_MATCH_TYPE_PARTIAL,
    COMMON_REGEX_MATCH_TYPE_FULL,
 };
 struct common_string_range {
    size_t begin;
    size_t end;
    common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
        if (begin > end) {
            throw std::runtime_error("Invalid range");
        }
    }
    // prevent default ctor
    common_string_range() = delete;
    bool empty() const {
        return begin == end;
    }
    bool operator==(const common_string_range & other) const {
        return begin == other.begin && end == other.end;
    }
 };
 struct common_regex_match {
    common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE;
    std::vector<common_string_range> groups;
    bool operator==(const common_regex_match & other) const {
        return type == other.type && groups == other.groups;
    }
    bool operator!=(const common_regex_match & other) const {
        return !(*this == other);
    }
 };
 class common_regex {
    std::string pattern;
    std::regex rx;
    std::regex rx_reversed_partial;
  public:
    explicit common_regex(const std::string & pattern);
    common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const;
    const std::string & str() const { return pattern; }
 };
 // For testing only (pretty print of failures).
 std::string regex_to_reversed_partial_regex(const std::string & pattern);
--- a/Show More
+++ b/Show More