From 7479388116f55065f03f8d15f8d5127bdfad457a Mon Sep 17 00:00:00 2001
From: Andrew Aladjev
Date: Mon, 15 Dec 2025 13:14:21 +0300
Subject: [PATCH] CLI: llama cli and completion cosmetics

---
 .dockerignore                           | 1 +
 .github/ISSUE_TEMPLATE/019-bug-misc.yml | 1 +
 .github/copilot-instructions.md         | 3 ++-
 README.md                               | 2 +-
 docs/backend/SYCL.md                    | 2 +-
 docs/backend/hexagon/README.md          | 1 +
 docs/backend/hexagon/developer.md       | 2 +-
 scripts/fetch_server_test_models.py     | 8 ++++----
 8 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index 064b7c7be8..c479caa759 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -13,6 +13,7 @@ build*/
 models/*
 
 /llama-cli
+/llama-completion
 /llama-quantize
 
 arm_neon.h
diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
index 1904e31fdc..8e6dc3e568 100644
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -44,6 +44,7 @@ body:
         - Documentation/Github
         - libllama (core library)
         - llama-cli
+        - llama-completion
        - llama-server
         - llama-bench
         - llama-quantize
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index ad13c6ea8d..930ef6d000 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -183,7 +183,8 @@ Add `ggml-ci` to commit message to trigger heavy CI workloads on the custom CI i
 
 ### Built Executables (in `build/bin/`)
 Primary tools:
-- **`llama-cli`**: Main inference tool
+- **`llama-cli`**: Main CLI tool
+- **`llama-completion`**: Legacy CLI tool
 - **`llama-server`**: OpenAI-compatible HTTP server
 - **`llama-quantize`**: Model quantization utility
 - **`llama-perplexity`**: Model evaluation tool
diff --git a/README.md b/README.md
index b7d24c9dd7..38ab316c92 100644
--- a/README.md
+++ b/README.md
@@ -583,7 +583,7 @@ Command-line completion is available for some environments.
 #### Bash Completion
 
 ```bash
-$ build/bin/llama-cli --completion-bash > ~/.llama-completion.bash
+$ build/bin/llama-completion --completion-bash > ~/.llama-completion.bash
 $ source ~/.llama-completion.bash
 ```
 Optionally this can be added to your `.bashrc` or `.bash_profile` to load it
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 02a72a9d51..578bf54c5f 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -116,7 +116,7 @@ SYCL backend supports Intel GPU Family:
 *Notes:*
 
 - **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-completion`.
   - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
 
 - **Execution Unit (EU)**
diff --git a/docs/backend/hexagon/README.md b/docs/backend/hexagon/README.md
index 85f136ef9e..7d80414138 100644
--- a/docs/backend/hexagon/README.md
+++ b/docs/backend/hexagon/README.md
@@ -62,6 +62,7 @@ To generate an installable "package" simply use cmake --install:
 ...
 -- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench
 -- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli
+-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-completion
 ...
 ```
 
diff --git a/docs/backend/hexagon/developer.md b/docs/backend/hexagon/developer.md
index 200a7aabc0..37272c433e 100644
--- a/docs/backend/hexagon/developer.md
+++ b/docs/backend/hexagon/developer.md
@@ -53,7 +53,7 @@ M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapd
 ...
 LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
 ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
-GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
+GGML_HEXAGON_NDEV=4 ./bin/llama-completion --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
   -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt
 ...
 llama_model_loader: - type f32: 289 tensors
diff --git a/scripts/fetch_server_test_models.py b/scripts/fetch_server_test_models.py
index ac483ef5d7..9efcf37bea 100755
--- a/scripts/fetch_server_test_models.py
+++ b/scripts/fetch_server_test_models.py
@@ -74,11 +74,11 @@ if __name__ == '__main__':
     for m in models:
         logging.info(f'  - {m.hf_repo} / {m.hf_file}')
 
-    cli_path = os.environ.get(
+    completion_path = os.environ.get(
         'LLAMA_CLI_BIN_PATH',
         os.path.join(
             os.path.dirname(__file__),
-            '../build/bin/Release/llama-cli.exe' if os.name == 'nt' else '../build/bin/llama-cli'))
+            '../build/bin/Release/llama-completion.exe' if os.name == 'nt' else '../build/bin/llama-completion'))
 
     for m in models:
         if '<' in m.hf_repo or (m.hf_file is not None and '<' in m.hf_file):
@@ -86,9 +86,9 @@ if __name__ == '__main__':
         if m.hf_file is not None and '-of-' in m.hf_file:
             logging.warning(f'Skipping model at {m.hf_repo} / {m.hf_file} because it is a split file')
             continue
-        logging.info(f'Using llama-cli to ensure model {m.hf_repo}/{m.hf_file} was fetched')
+        logging.info(f'Using llama-completion to ensure model {m.hf_repo}/{m.hf_file} was fetched')
         cmd = [
-            cli_path,
+            completion_path,
             '-hfr', m.hf_repo,
             *([] if m.hf_file is None else ['-hff', m.hf_file]),
             '-n', '1',