From c494c70a0664e0f45c6bcfeac42d802e89c2c196 Mon Sep 17 00:00:00 2001 From: David Baker Date: Fri, 6 Mar 2026 16:40:36 +0000 Subject: [PATCH 1/4] Implement output flag on cli --- common/arg.cpp | 2 +- tools/cli/cli.cpp | 65 ++++++++++++++++++++++++++++++++++++-- tools/server/server-task.h | 3 +- 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index cd73d96420..331a06fbdc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2659,7 +2659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.out_file = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 13bedf31eb..c5f35886d3 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -56,6 +56,7 @@ struct cli_context { std::vector input_files; task_params defaults; bool verbose_prompt; + common_reasoning_format reasoning_format; // thread for showing "loading" animation std::atomic loading_show; @@ -66,15 +67,17 @@ struct cli_context { defaults.n_keep = params.n_keep; defaults.n_predict = params.n_predict; defaults.antiprompt = params.antiprompt; + defaults.special_characters = params.special; defaults.stream = true; // make sure we always use streaming mode defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way // defaults.return_progress = true; // TODO: show progress verbose_prompt = params.verbose_prompt; + reasoning_format = params.reasoning_format; } - std::string 
generate_completion(result_timings & out_timings) { + std::string generate_completion(result_timings & out_timings, std::ofstream * file_out = nullptr) { server_response_reader rd = ctx_server.get_response_reader(); auto chat_params = format_chat(); { @@ -89,7 +92,7 @@ struct cli_context { // chat template settings task.params.chat_parser_params = common_chat_parser_params(chat_params); - task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + task.params.chat_parser_params.reasoning_format = reasoning_format; if (!chat_params.parser.empty()) { task.params.chat_parser_params.parser.load(chat_params.parser); } @@ -103,6 +106,18 @@ struct cli_context { console::set_display(DISPLAY_TYPE_RESET); } + // check if we are doing file output + bool file_streaming = (file_out != nullptr && file_out->is_open()); + if (file_streaming) { + if (defaults.special_characters) { + *file_out << chat_params.prompt; + } + else { + *file_out << "[Prompt]: " << messages.back()["content"].get() << "\n\n"; + } + file_out->flush(); + } + // wait for first result console::spinner::start(); server_task_result_ptr result = rd.next(should_stop); @@ -110,6 +125,7 @@ struct cli_context { console::spinner::stop(); std::string curr_content; bool is_thinking = false; + bool content_started = false; while (result) { if (should_stop()) { @@ -132,26 +148,60 @@ struct cli_context { if (is_thinking) { console::log("\n[End thinking]\n\n"); console::set_display(DISPLAY_TYPE_RESET); + if (file_streaming && is_thinking) { + if (defaults.special_characters) { + *file_out << "<\\think>"; + } + else { + *file_out << "\n\n"; + } + } is_thinking = false; } curr_content += diff.content_delta; console::log("%s", diff.content_delta.c_str()); console::flush(); + if (file_streaming) { + if (!content_started && !defaults.special_characters) { + *file_out << "[Assistant]: "; + } + content_started = true; + *file_out << diff.content_delta; + file_out->flush(); + } } if 
(!diff.reasoning_content_delta.empty()) { console::set_display(DISPLAY_TYPE_REASONING); if (!is_thinking) { console::log("[Start thinking]\n"); + if (file_streaming) { + if (defaults.special_characters) { + *file_out << ""; + } + else { + *file_out << "[Thinking]: "; + } + } } is_thinking = true; console::log("%s", diff.reasoning_content_delta.c_str()); console::flush(); + if (file_streaming) { + *file_out << diff.reasoning_content_delta; + file_out->flush(); + } } } } auto res_final = dynamic_cast(result.get()); if (res_final) { out_timings = std::move(res_final->timings); + if (file_streaming) { + if (!defaults.special_characters) { + *file_out << "\n\n"; + } + file_out->flush(); + } break; } result = rd.next(should_stop); @@ -341,6 +391,15 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); + // open output file early to fail fast + std::ofstream output_file; + if (!params.out_file.empty()) { + output_file.open(params.out_file, std::ios::binary); + if (!output_file) { + console::error("Failed to open output file '%s'\n", params.out_file.c_str()); + return 1; + } + } console::set_display(DISPLAY_TYPE_RESET); console::set_completion_callback(auto_completion_callback); @@ -531,7 +590,7 @@ int main(int argc, char ** argv) { cur_msg.clear(); } result_timings timings; - std::string assistant_content = ctx_cli.generate_completion(timings); + std::string assistant_content = ctx_cli.generate_completion(timings, params.out_file.empty() ? 
nullptr : &output_file); ctx_cli.messages.push_back({ {"role", "assistant"}, {"content", assistant_content} diff --git a/tools/server/server-task.h b/tools/server/server-task.h index e2e3e5a582..102b43d246 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -51,7 +51,8 @@ struct task_params { bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt bool return_tokens = false; bool return_progress = false; - + bool special_characters = false; // whether to include special tokens in the output (e.g. , , , etc.) + int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half int32_t n_predict = -1; // new tokens to predict From 96fc9a91ef47d60de04dae0f9eba68402a3aedd4 Mon Sep 17 00:00:00 2001 From: David Baker Date: Fri, 6 Mar 2026 16:54:23 +0000 Subject: [PATCH 2/4] Added documentation line --- tools/cli/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cli/README.md b/tools/cli/README.md index 22d3fc87e9..7681917bae 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -39,6 +39,7 @@ | `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)
(env: LLAMA_ARG_PERF) | | `-f, --file FNAME` | a file containing the prompt (default: none) | | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | +| `-o, --output FNAME` | a file to which to save the output (default: none) | | `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | | `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N
(env: LLAMA_ARG_ROPE_SCALE) | From c2baff91615bfba425247833f4c2713215f2a703 Mon Sep 17 00:00:00 2001 From: David Baker Date: Wed, 11 Mar 2026 13:53:31 +0000 Subject: [PATCH 3/4] Refactor to use a common function to do file output, which both outputs to file and selects different outputs for special token and plain text cases --- tools/cli/cli.cpp | 76 ++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 43 deletions(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index c929946861..2b8d9298ef 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -58,6 +58,8 @@ struct cli_context { task_params defaults; bool verbose_prompt; common_reasoning_format reasoning_format; + bool file_streaming = false; + std::ofstream * file_out = nullptr; // thread for showing "loading" animation std::atomic loading_show; @@ -78,7 +80,7 @@ struct cli_context { reasoning_format = params.reasoning_format; } - std::string generate_completion(result_timings & out_timings, std::ofstream * file_out = nullptr) { + std::string generate_completion(result_timings & out_timings, std::ofstream * file_to_use = nullptr) { server_response_reader rd = ctx_server.get_response_reader(); auto chat_params = format_chat(); { @@ -108,16 +110,12 @@ struct cli_context { } // check if we are doing file output - bool file_streaming = (file_out != nullptr && file_out->is_open()); - if (file_streaming) { - if (defaults.special_characters) { - *file_out << chat_params.prompt; - } - else { - *file_out << "[Prompt]: " << messages.back()["content"].get() << "\n\n"; - } - file_out->flush(); - } + file_out = file_to_use; + file_streaming = (file_out != nullptr && file_out->is_open()); + append_file_out( + "[Prompt]: " + messages.back()["content"].get() + "\n\n", + chat_params.prompt + ); // wait for first result console::spinner::start(); @@ -149,60 +147,40 @@ struct cli_context { if (is_thinking) { console::log("\n[End thinking]\n\n"); console::set_display(DISPLAY_TYPE_RESET); - if 
(file_streaming && is_thinking) { - if (defaults.special_characters) { - *file_out << "<\\think>"; - } - else { - *file_out << "\n\n"; - } - } + append_file_out("\n\n", "</think>
"); + is_thinking = false; } curr_content += diff.content_delta; console::log("%s", diff.content_delta.c_str()); console::flush(); - if (file_streaming) { - if (!content_started && !defaults.special_characters) { - *file_out << "[Assistant]: "; - } + if (!content_started) { + append_file_out("[Assistant]: ", ""); content_started = true; - *file_out << diff.content_delta; - file_out->flush(); } + append_file_out(diff.content_delta); } if (!diff.reasoning_content_delta.empty()) { console::set_display(DISPLAY_TYPE_REASONING); + std::string reasoning_delta = diff.reasoning_content_delta; if (!is_thinking) { console::log("[Start thinking]\n"); - if (file_streaming) { - if (defaults.special_characters) { - *file_out << ""; - } - else { - *file_out << "[Thinking]: "; - } + append_file_out("[Thinking]: ", ""); + if (reasoning_delta == "") { + reasoning_delta = ""; } } is_thinking = true; - console::log("%s", diff.reasoning_content_delta.c_str()); + console::log("%s", reasoning_delta.c_str()); console::flush(); - if (file_streaming) { - *file_out << diff.reasoning_content_delta; - file_out->flush(); - } + append_file_out(reasoning_delta); } } } auto res_final = dynamic_cast(result.get()); if (res_final) { out_timings = std::move(res_final->timings); - if (file_streaming) { - if (!defaults.special_characters) { - *file_out << "\n\n"; - } - file_out->flush(); - } + append_file_out("\n\n",""); break; } result = rd.next(should_stop); @@ -229,6 +207,18 @@ struct cli_context { } } + void append_file_out(const std::string & content, const std::optional & special_characters_content = std::nullopt) { + if (!file_streaming) { + return; + } + if (defaults.special_characters && special_characters_content.has_value()) { + *file_out << special_characters_content.value(); + } else { + *file_out << content; + } + file_out->flush(); + } + common_chat_params format_chat() { auto meta = ctx_server.get_meta(); auto & chat_params = meta.chat_params; From c522288ab662adf1c848a0eab73df315778b237c 
Mon Sep 17 00:00:00 2001 From: David Baker Date: Fri, 13 Mar 2026 19:22:02 +0000 Subject: [PATCH 4/4] Switch to storing the pointer in a std::optional as part of the context class. --- tools/cli/cli.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index e1ce4416ea..94890e572e 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -60,8 +60,7 @@ struct cli_context { int reasoning_budget = -1; std::string reasoning_budget_message; common_reasoning_format reasoning_format; - bool file_streaming = false; - std::ofstream * file_out = nullptr; + std::optional file_out = std::nullopt; // thread for showing "loading" animation std::atomic loading_show; @@ -84,7 +83,7 @@ struct cli_context { reasoning_format = params.reasoning_format; } - std::string generate_completion(result_timings & out_timings, std::ofstream * file_to_use = nullptr) { + std::string generate_completion(result_timings & out_timings) { server_response_reader rd = ctx_server.get_response_reader(); auto chat_params = format_chat(); { @@ -131,9 +130,6 @@ struct cli_context { console::set_display(DISPLAY_TYPE_RESET); } - // check if we are doing file output - file_out = file_to_use; - file_streaming = (file_out != nullptr && file_out->is_open()); append_file_out( "[Prompt]: " + messages.back()["content"].get() + "\n\n", chat_params.prompt @@ -230,15 +226,15 @@ struct cli_context { } void append_file_out(const std::string & content, const std::optional & special_characters_content = std::nullopt) { - if (!file_streaming) { + if (!file_out.has_value()) { return; } if (defaults.special_characters && special_characters_content.has_value()) { - *file_out << special_characters_content.value(); + *file_out.value() << special_characters_content.value(); } else { - *file_out << content; + *file_out.value() << content; } - file_out->flush(); + file_out.value()->flush(); } common_chat_params format_chat() { @@ -409,11 +405,13 @@ int 
main(int argc, char ** argv) { std::ofstream output_file; if (!params.out_file.empty()) { output_file.open(params.out_file, std::ios::binary); - if (!output_file) { + if (!output_file || !output_file.is_open()) { console::error("Failed to open output file '%s'\n", params.out_file.c_str()); return 1; } + ctx_cli.file_out = &output_file; } + console::set_display(DISPLAY_TYPE_RESET); console::set_completion_callback(auto_completion_callback); @@ -604,7 +602,7 @@ int main(int argc, char ** argv) { cur_msg.clear(); } result_timings timings; - std::string assistant_content = ctx_cli.generate_completion(timings, params.out_file.empty() ? nullptr : &output_file); + std::string assistant_content = ctx_cli.generate_completion(timings); ctx_cli.messages.push_back({ {"role", "assistant"}, {"content", assistant_content}