diff --git a/.dockerignore b/.dockerignore
index 064b7c7be8..c479caa759 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -13,6 +13,7 @@ build*/
 models/*
 
 /llama-cli
+/llama-completion
 /llama-quantize
 
 arm_neon.h
diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
index 1904e31fdc..8e6dc3e568 100644
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -44,6 +44,7 @@ body:
         - Documentation/Github
         - libllama (core library)
         - llama-cli
+        - llama-completion
         - llama-server
         - llama-bench
         - llama-quantize
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index ad13c6ea8d..930ef6d000 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -183,7 +183,8 @@ Add `ggml-ci` to commit message to trigger heavy CI workloads on the custom CI i
 
 ### Built Executables (in `build/bin/`)
 Primary tools:
-- **`llama-cli`**: Main inference tool
+- **`llama-cli`**: Main CLI tool
+- **`llama-completion`**: Legacy CLI tool
 - **`llama-server`**: OpenAI-compatible HTTP server
 - **`llama-quantize`**: Model quantization utility
 - **`llama-perplexity`**: Model evaluation tool
diff --git a/README.md b/README.md
index 5f2076d0a3..ebcb549366 100644
--- a/README.md
+++ b/README.md
@@ -584,7 +584,7 @@ Command-line completion is available for some environments.
 
 #### Bash Completion
 ```bash
-$ build/bin/llama-cli --completion-bash > ~/.llama-completion.bash
+$ build/bin/llama-completion --completion-bash > ~/.llama-completion.bash
 $ source ~/.llama-completion.bash
 ```
 Optionally this can be added to your `.bashrc` or `.bash_profile` to load it
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index f44458ed3b..b3c19e21d8 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -118,7 +118,7 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
 *Notes:*
 
 - **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
+  - Device memory is a limiting factor when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-completion`.
   - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
 
 - **Execution Unit (EU)**
diff --git a/docs/backend/hexagon/README.md b/docs/backend/hexagon/README.md
index 85f136ef9e..7d80414138 100644
--- a/docs/backend/hexagon/README.md
+++ b/docs/backend/hexagon/README.md
@@ -62,6 +62,7 @@ To generate an installable "package" simply use cmake --install:
 ...
 -- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench
 -- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli
+-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-completion
 ...
 ```
 
diff --git a/docs/backend/hexagon/developer.md b/docs/backend/hexagon/developer.md
index 200a7aabc0..37272c433e 100644
--- a/docs/backend/hexagon/developer.md
+++ b/docs/backend/hexagon/developer.md
@@ -53,7 +53,7 @@ M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapd
 ...
 LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
 ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
-GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
+GGML_HEXAGON_NDEV=4 ./bin/llama-completion --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
       -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt
 ...
 llama_model_loader: - type  f32:  289 tensors
diff --git a/scripts/fetch_server_test_models.py b/scripts/fetch_server_test_models.py
index ac483ef5d7..9efcf37bea 100755
--- a/scripts/fetch_server_test_models.py
+++ b/scripts/fetch_server_test_models.py
@@ -74,11 +74,11 @@ if __name__ == '__main__':
     for m in models:
         logging.info(f'  - {m.hf_repo} / {m.hf_file}')
 
-    cli_path = os.environ.get(
+    completion_path = os.environ.get(
         'LLAMA_CLI_BIN_PATH',
         os.path.join(
             os.path.dirname(__file__),
-            '../build/bin/Release/llama-cli.exe' if os.name == 'nt' else '../build/bin/llama-cli'))
+            '../build/bin/Release/llama-completion.exe' if os.name == 'nt' else '../build/bin/llama-completion'))
 
     for m in models:
         if '<' in m.hf_repo or (m.hf_file is not None and '<' in m.hf_file):
@@ -86,9 +86,9 @@ if __name__ == '__main__':
         if m.hf_file is not None and '-of-' in m.hf_file:
             logging.warning(f'Skipping model at {m.hf_repo} / {m.hf_file} because it is a split file')
             continue
-        logging.info(f'Using llama-cli to ensure model {m.hf_repo}/{m.hf_file} was fetched')
+        logging.info(f'Using llama-completion to ensure model {m.hf_repo}/{m.hf_file} was fetched')
         cmd = [
-            cli_path,
+            completion_path,
             '-hfr', m.hf_repo,
             *([] if m.hf_file is None else ['-hff', m.hf_file]),
             '-n', '1',