From c494c70a0664e0f45c6bcfeac42d802e89c2c196 Mon Sep 17 00:00:00 2001 From: David Baker Date: Fri, 6 Mar 2026 16:40:36 +0000 Subject: [PATCH 1/4] Implement output flag on cli --- common/arg.cpp | 2 +- tools/cli/cli.cpp | 65 ++++++++++++++++++++++++++++++++++++-- tools/server/server-task.h | 3 +- 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index cd73d96420..331a06fbdc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2659,7 +2659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.out_file = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 13bedf31eb..c5f35886d3 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -56,6 +56,7 @@ struct cli_context { std::vector input_files; task_params defaults; bool verbose_prompt; + common_reasoning_format reasoning_format; // thread for showing "loading" animation std::atomic loading_show; @@ -66,15 +67,17 @@ struct cli_context { defaults.n_keep = params.n_keep; defaults.n_predict = params.n_predict; defaults.antiprompt = params.antiprompt; + defaults.special_characters = params.special; defaults.stream = true; // make sure we always use streaming mode defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way // defaults.return_progress = true; // TODO: show progress verbose_prompt = params.verbose_prompt; + reasoning_format = params.reasoning_format; } - std::string 
generate_completion(result_timings & out_timings) { + std::string generate_completion(result_timings & out_timings, std::ofstream * file_out = nullptr) { server_response_reader rd = ctx_server.get_response_reader(); auto chat_params = format_chat(); { @@ -89,7 +92,7 @@ struct cli_context { // chat template settings task.params.chat_parser_params = common_chat_parser_params(chat_params); - task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + task.params.chat_parser_params.reasoning_format = reasoning_format; if (!chat_params.parser.empty()) { task.params.chat_parser_params.parser.load(chat_params.parser); } @@ -103,6 +106,18 @@ struct cli_context { console::set_display(DISPLAY_TYPE_RESET); } + // check if we are doing file output + bool file_streaming = (file_out != nullptr && file_out->is_open()); + if (file_streaming) { + if (defaults.special_characters) { + *file_out << chat_params.prompt; + } + else { + *file_out << "[Prompt]: " << messages.back()["content"].get() << "\n\n"; + } + file_out->flush(); + } + // wait for first result console::spinner::start(); server_task_result_ptr result = rd.next(should_stop); @@ -110,6 +125,7 @@ struct cli_context { console::spinner::stop(); std::string curr_content; bool is_thinking = false; + bool content_started = false; while (result) { if (should_stop()) { @@ -132,26 +148,60 @@ struct cli_context { if (is_thinking) { console::log("\n[End thinking]\n\n"); console::set_display(DISPLAY_TYPE_RESET); + if (file_streaming && is_thinking) { + if (defaults.special_characters) { + *file_out << "<\\think>"; + } + else { + *file_out << "\n\n"; + } + } is_thinking = false; } curr_content += diff.content_delta; console::log("%s", diff.content_delta.c_str()); console::flush(); + if (file_streaming) { + if (!content_started && !defaults.special_characters) { + *file_out << "[Assistant]: "; + } + content_started = true; + *file_out << diff.content_delta; + file_out->flush(); + } } if 
(!diff.reasoning_content_delta.empty()) { console::set_display(DISPLAY_TYPE_REASONING); if (!is_thinking) { console::log("[Start thinking]\n"); + if (file_streaming) { + if (defaults.special_characters) { + *file_out << ""; + } + else { + *file_out << "[Thinking]: "; + } + } } is_thinking = true; console::log("%s", diff.reasoning_content_delta.c_str()); console::flush(); + if (file_streaming) { + *file_out << diff.reasoning_content_delta; + file_out->flush(); + } } } } auto res_final = dynamic_cast(result.get()); if (res_final) { out_timings = std::move(res_final->timings); + if (file_streaming) { + if (!defaults.special_characters) { + *file_out << "\n\n"; + } + file_out->flush(); + } break; } result = rd.next(should_stop); @@ -341,6 +391,15 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); + // open output file early to fail fast + std::ofstream output_file; + if (!params.out_file.empty()) { + output_file.open(params.out_file, std::ios::binary); + if (!output_file) { + console::error("Failed to open output file '%s'\n", params.out_file.c_str()); + return 1; + } + } console::set_display(DISPLAY_TYPE_RESET); console::set_completion_callback(auto_completion_callback); @@ -531,7 +590,7 @@ int main(int argc, char ** argv) { cur_msg.clear(); } result_timings timings; - std::string assistant_content = ctx_cli.generate_completion(timings); + std::string assistant_content = ctx_cli.generate_completion(timings, params.out_file.empty() ? 
nullptr : &output_file); ctx_cli.messages.push_back({ {"role", "assistant"}, {"content", assistant_content} diff --git a/tools/server/server-task.h b/tools/server/server-task.h index e2e3e5a582..102b43d246 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -51,7 +51,8 @@ struct task_params { bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt bool return_tokens = false; bool return_progress = false; - + bool special_characters = false; // whether to include special tokens in the output (e.g. , , , etc.) + int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half int32_t n_predict = -1; // new tokens to predict From 96fc9a91ef47d60de04dae0f9eba68402a3aedd4 Mon Sep 17 00:00:00 2001 From: David Baker Date: Fri, 6 Mar 2026 16:54:23 +0000 Subject: [PATCH 2/4] Added documentation line --- tools/cli/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cli/README.md b/tools/cli/README.md index 22d3fc87e9..7681917bae 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -39,6 +39,7 @@ | `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)
(env: LLAMA_ARG_PERF) | | `-f, --file FNAME` | a file containing the prompt (default: none) | | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | +| `-o, --output FNAME` | a file to which to save the output (default: none) | | `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | | `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N
(env: LLAMA_ARG_ROPE_SCALE) | From c2baff91615bfba425247833f4c2713215f2a703 Mon Sep 17 00:00:00 2001 From: David Baker Date: Wed, 11 Mar 2026 13:53:31 +0000 Subject: [PATCH 3/4] Refactor to use a common function to do file output, which both outputs to file and selects different outputs for special token and plain text cases --- tools/cli/cli.cpp | 76 ++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 43 deletions(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index c929946861..2b8d9298ef 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -58,6 +58,8 @@ struct cli_context { task_params defaults; bool verbose_prompt; common_reasoning_format reasoning_format; + bool file_streaming = false; + std::ofstream * file_out = nullptr; // thread for showing "loading" animation std::atomic loading_show; @@ -78,7 +80,7 @@ struct cli_context { reasoning_format = params.reasoning_format; } - std::string generate_completion(result_timings & out_timings, std::ofstream * file_out = nullptr) { + std::string generate_completion(result_timings & out_timings, std::ofstream * file_to_use = nullptr) { server_response_reader rd = ctx_server.get_response_reader(); auto chat_params = format_chat(); { @@ -108,16 +110,12 @@ struct cli_context { } // check if we are doing file output - bool file_streaming = (file_out != nullptr && file_out->is_open()); - if (file_streaming) { - if (defaults.special_characters) { - *file_out << chat_params.prompt; - } - else { - *file_out << "[Prompt]: " << messages.back()["content"].get() << "\n\n"; - } - file_out->flush(); - } + file_out = file_to_use; + file_streaming = (file_out != nullptr && file_out->is_open()); + append_file_out( + "[Prompt]: " + messages.back()["content"].get() + "\n\n", + chat_params.prompt + ); // wait for first result console::spinner::start(); @@ -149,60 +147,40 @@ struct cli_context { if (is_thinking) { console::log("\n[End thinking]\n\n"); console::set_display(DISPLAY_TYPE_RESET); - if 
(file_streaming && is_thinking) { - if (defaults.special_characters) { - *file_out << "<\\think>"; - } - else { - *file_out << "\n\n"; - } - } + append_file_out("\n\n", "</think>
"); + is_thinking = false; } curr_content += diff.content_delta; console::log("%s", diff.content_delta.c_str()); console::flush(); - if (file_streaming) { - if (!content_started && !defaults.special_characters) { - *file_out << "[Assistant]: "; - } + if (!content_started) { + append_file_out("[Assistant]: ", ""); content_started = true; - *file_out << diff.content_delta; - file_out->flush(); } + append_file_out(diff.content_delta); } if (!diff.reasoning_content_delta.empty()) { console::set_display(DISPLAY_TYPE_REASONING); + std::string reasoning_delta = diff.reasoning_content_delta; if (!is_thinking) { console::log("[Start thinking]\n"); - if (file_streaming) { - if (defaults.special_characters) { - *file_out << ""; - } - else { - *file_out << "[Thinking]: "; - } + append_file_out("[Thinking]: ", ""); + if (reasoning_delta == "") { + reasoning_delta = ""; } } is_thinking = true; - console::log("%s", diff.reasoning_content_delta.c_str()); + console::log("%s", reasoning_delta.c_str()); console::flush(); - if (file_streaming) { - *file_out << diff.reasoning_content_delta; - file_out->flush(); - } + append_file_out(reasoning_delta); } } } auto res_final = dynamic_cast(result.get()); if (res_final) { out_timings = std::move(res_final->timings); - if (file_streaming) { - if (!defaults.special_characters) { - *file_out << "\n\n"; - } - file_out->flush(); - } + append_file_out("\n\n",""); break; } result = rd.next(should_stop); @@ -229,6 +207,18 @@ struct cli_context { } } + void append_file_out(const std::string & content, const std::optional & special_characters_content = std::nullopt) { + if (!file_streaming) { + return; + } + if (defaults.special_characters && special_characters_content.has_value()) { + *file_out << special_characters_content.value(); + } else { + *file_out << content; + } + file_out->flush(); + } + common_chat_params format_chat() { auto meta = ctx_server.get_meta(); auto & chat_params = meta.chat_params; From c522288ab662adf1c848a0eab73df315778b237c 
Mon Sep 17 00:00:00 2001 From: David Baker Date: Fri, 13 Mar 2026 19:22:02 +0000 Subject: [PATCH 4/4] Switch to storing the pointer in a std::optional as part of the context class. --- tools/cli/cli.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index e1ce4416ea..94890e572e 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -60,8 +60,7 @@ struct cli_context { int reasoning_budget = -1; std::string reasoning_budget_message; common_reasoning_format reasoning_format; - bool file_streaming = false; - std::ofstream * file_out = nullptr; + std::optional file_out = std::nullopt; // thread for showing "loading" animation std::atomic loading_show; @@ -84,7 +83,7 @@ struct cli_context { reasoning_format = params.reasoning_format; } - std::string generate_completion(result_timings & out_timings, std::ofstream * file_to_use = nullptr) { + std::string generate_completion(result_timings & out_timings) { server_response_reader rd = ctx_server.get_response_reader(); auto chat_params = format_chat(); { @@ -131,9 +130,6 @@ struct cli_context { console::set_display(DISPLAY_TYPE_RESET); } - // check if we are doing file output - file_out = file_to_use; - file_streaming = (file_out != nullptr && file_out->is_open()); append_file_out( "[Prompt]: " + messages.back()["content"].get() + "\n\n", chat_params.prompt @@ -230,15 +226,15 @@ struct cli_context { } void append_file_out(const std::string & content, const std::optional & special_characters_content = std::nullopt) { - if (!file_streaming) { + if (!file_out.has_value()) { return; } if (defaults.special_characters && special_characters_content.has_value()) { - *file_out << special_characters_content.value(); + *file_out.value() << special_characters_content.value(); } else { - *file_out << content; + *file_out.value() << content; } - file_out->flush(); + file_out.value()->flush(); } common_chat_params format_chat() { @@ -409,11 +405,13 @@ int 
main(int argc, char ** argv) { std::ofstream output_file; if (!params.out_file.empty()) { output_file.open(params.out_file, std::ios::binary); - if (!output_file) { + if (!output_file || !output_file.is_open()) { console::error("Failed to open output file '%s'\n", params.out_file.c_str()); return 1; } + ctx_cli.file_out = &output_file; } + console::set_display(DISPLAY_TYPE_RESET); console::set_completion_callback(auto_completion_callback); @@ -604,7 +602,7 @@ int main(int argc, char ** argv) { cur_msg.clear(); } result_timings timings; - std::string assistant_content = ctx_cli.generate_completion(timings, params.out_file.empty() ? nullptr : &output_file); + std::string assistant_content = ctx_cli.generate_completion(timings); ctx_cli.messages.push_back({ {"role", "assistant"}, {"content", assistant_content}