CLI: fixed adding cli and completion into docker containers, improved docs (#18003)
Co-authored-by: Andrew Aladjev <andrew.aladjev@gmail.com>
parent 5f5f9b4637
commit fb644247de
@@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ENTRYPOINT ["/app/llama-server"]

 ### Target: light
-# Lightweight image containing only llama-cli
+# Lightweight image containing only llama-cli and llama-completion
 # ==============================================================================
 FROM base AS light
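The `light` stage above can be built on its own with a multi-stage `--target` build. A minimal sketch, assuming the Dockerfile shown in this hunk lives at `.devops/cpu.Dockerfile` (the path and the `local/llama.cpp:light` tag are illustrative):

```bash
# build only the "light" stage, which now ships both llama-cli and llama-completion
docker build --target light -t local/llama.cpp:light -f .devops/cpu.Dockerfile .

# sanity-check that both binaries are present (assumes both accept --version)
docker run --rm --entrypoint /app/llama-cli        local/llama.cpp:light --version
docker run --rm --entrypoint /app/llama-completion local/llama.cpp:light --version
```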
@@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
     source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
     cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
-    cmake --build build --config Release --target llama-cli
+    cmake --build build --config Release --target llama-cli && \
+    cmake --build build --config Release --target llama-completion

 # TODO: use image with NNRT
 FROM ascendai/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /

 ENV LC_ALL=C.utf8
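With this change the CANN build stage produces both binaries and the runtime stage copies them to the image root, so either one can be selected with `--entrypoint`. A hedged sketch, where the `.devops/cann.Dockerfile` path and the local tag are assumptions, and the device pass-through flags needed for a real Ascend NPU are omitted:

```bash
# build the CANN (Ascend NPU) image locally; path and tag are illustrative
docker build -t local/llama.cpp:light-cann -f .devops/cann.Dockerfile .

# both binaries now live at the image root, per the COPY above
docker run --rm -v /path/to/models:/models --entrypoint /llama-completion \
    local/llama.cpp:light-cann -m /models/7B/ggml-model-q4_0.gguf -no-cnv -p "Hello" -n 64
```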
@@ -37,6 +37,7 @@ make -j GGML_CUDA=1
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
@@ -68,6 +69,7 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-completion
 %{_bindir}/llama-cuda-server
 %{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service
@@ -39,6 +39,7 @@ make -j
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
@@ -70,6 +71,7 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cli
+%{_bindir}/llama-completion
 %{_bindir}/llama-server
 %{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
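Both spec files now install and package `llama-completion` next to the existing binaries. A minimal sketch of verifying that locally, assuming the specs sit under `.devops/` with these illustrative names and the default `rpmbuild` output location:

```bash
# build the CPU package from its spec; the CUDA spec is handled the same way
rpmbuild -ba .devops/llama-cpp.srpm.spec

# the packaged file list should now include the new binary
rpm -qlp ~/rpmbuild/RPMS/x86_64/llama-cpp-*.rpm | grep llama-completion
```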
@@ -7,9 +7,9 @@
 ## Images
 We have three Docker images available for this project:

-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the `llama-cli` and `llama-completion` executables. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the `llama-server` executable. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)

 Additionally, there the following images, similar to the above:
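For reference, the three published images listed above can be pulled straight from the GitHub Container Registry:

```bash
docker pull ghcr.io/ggml-org/llama.cpp:full
docker pull ghcr.io/ggml-org/llama.cpp:light
docker pull ghcr.io/ggml-org/llama.cpp:server
```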
@@ -44,13 +44,15 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-o
 On completion, you are ready to play!

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run-legacy -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
 ```

 or with a light image:

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models --entrypoint /app/llama-cli ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf
+docker run -v /path/to/models:/models --entrypoint /app/llama-completion ghcr.io/ggml-org/llama.cpp:light -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512
 ```

 or with a server image:
@@ -59,6 +61,8 @@ or with a server image:
 docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
 ```

+In the above examples, `--entrypoint /app/llama-cli` is specified for clarity, but you can safely omit it since it's the default entrypoint in the container.
+
 ## Docker With CUDA

 Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
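To illustrate the note added above: the light-image example still works with `--entrypoint` dropped entirely, because `/app/llama-cli` is the image's default entrypoint:

```bash
# same as the first light-image example, relying on the default /app/llama-cli entrypoint
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf
```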
@@ -80,9 +84,9 @@ The defaults are:

 The resulting images, are essentially the same as the non-CUDA images:

-1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
-3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
+1. `local/llama.cpp:full-cuda`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-cuda`: This image only includes the `llama-cli` and `llama-completion` executables.
+3. `local/llama.cpp:server-cuda`: This image only includes the `llama-server` executable.

 ## Usage
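These `local/llama.cpp:*-cuda` tags come from building the CUDA Dockerfile targets yourself. A hedged sketch, where the `.devops/cuda.Dockerfile` path is an assumption and the `full`/`server` target names are assumed to follow the `light` stage naming shown earlier (the MUSA images in the next hunk are built the same way from their own Dockerfile):

```bash
docker build --target full   -t local/llama.cpp:full-cuda   -f .devops/cuda.Dockerfile .
docker build --target light  -t local/llama.cpp:light-cuda  -f .devops/cuda.Dockerfile .
docker build --target server -t local/llama.cpp:server-cuda -f .devops/cuda.Dockerfile .
```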
@@ -114,9 +118,9 @@ The defaults are:

 The resulting images, are essentially the same as the non-MUSA images:

-1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
-3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
+1. `local/llama.cpp:full-musa`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-musa`: This image only includes the `llama-cli` and `llama-completion` executables.
+3. `local/llama.cpp:server-musa`: This image only includes the `llama-server` executable.

 ## Usage