From c494c70a0664e0f45c6bcfeac42d802e89c2c196 Mon Sep 17 00:00:00 2001 From: David Baker Date: Fri, 6 Mar 2026 16:40:36 +0000 Subject: [PATCH] Implement output flag on cli --- common/arg.cpp | 2 +- tools/cli/cli.cpp | 65 ++++++++++++++++++++++++++++++++++++-- tools/server/server-task.h | 3 +- 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index cd73d96420..331a06fbdc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2659,7 +2659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.out_file = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 13bedf31eb..c5f35886d3 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -56,6 +56,7 @@ struct cli_context { std::vector input_files; task_params defaults; bool verbose_prompt; + common_reasoning_format reasoning_format; // thread for showing "loading" animation std::atomic loading_show; @@ -66,15 +67,17 @@ struct cli_context { defaults.n_keep = params.n_keep; defaults.n_predict = params.n_predict; defaults.antiprompt = params.antiprompt; + defaults.special_characters = params.special; defaults.stream = true; // make sure we always use streaming mode defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way // defaults.return_progress = true; // TODO: show progress verbose_prompt = params.verbose_prompt; + reasoning_format = params.reasoning_format; } - std::string 
generate_completion(result_timings & out_timings) { + std::string generate_completion(result_timings & out_timings, std::ofstream * file_out = nullptr) { server_response_reader rd = ctx_server.get_response_reader(); auto chat_params = format_chat(); { @@ -89,7 +92,7 @@ struct cli_context { // chat template settings task.params.chat_parser_params = common_chat_parser_params(chat_params); - task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + task.params.chat_parser_params.reasoning_format = reasoning_format; if (!chat_params.parser.empty()) { task.params.chat_parser_params.parser.load(chat_params.parser); } @@ -103,6 +106,18 @@ struct cli_context { console::set_display(DISPLAY_TYPE_RESET); } + // check if we are doing file output + bool file_streaming = (file_out != nullptr && file_out->is_open()); + if (file_streaming) { + if (defaults.special_characters) { + *file_out << chat_params.prompt; + } + else { + *file_out << "[Prompt]: " << messages.back()["content"].get<std::string>() << "\n\n"; + } + file_out->flush(); + } + // wait for first result console::spinner::start(); server_task_result_ptr result = rd.next(should_stop); @@ -110,6 +125,7 @@ struct cli_context { console::spinner::stop(); std::string curr_content; bool is_thinking = false; + bool content_started = false; while (result) { if (should_stop()) { @@ -132,26 +148,60 @@ struct cli_context { if (is_thinking) { console::log("\n[End thinking]\n\n"); console::set_display(DISPLAY_TYPE_RESET); + if (file_streaming && is_thinking) { + if (defaults.special_characters) { + *file_out << "</think>"; + } + else { + *file_out << "\n\n"; + } + } is_thinking = false; } curr_content += diff.content_delta; console::log("%s", diff.content_delta.c_str()); console::flush(); + if (file_streaming) { + if (!content_started && !defaults.special_characters) { + *file_out << "[Assistant]: "; + } + content_started = true; + *file_out << diff.content_delta; + file_out->flush(); + } } if
(!diff.reasoning_content_delta.empty()) { console::set_display(DISPLAY_TYPE_REASONING); if (!is_thinking) { console::log("[Start thinking]\n"); + if (file_streaming) { + if (defaults.special_characters) { + *file_out << "<think>"; + } + else { + *file_out << "[Thinking]: "; + } + } } is_thinking = true; console::log("%s", diff.reasoning_content_delta.c_str()); console::flush(); + if (file_streaming) { + *file_out << diff.reasoning_content_delta; + file_out->flush(); + } } } } auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get()); if (res_final) { out_timings = std::move(res_final->timings); + if (file_streaming) { + if (!defaults.special_characters) { + *file_out << "\n\n"; + } + file_out->flush(); + } break; } result = rd.next(should_stop); @@ -341,6 +391,15 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); + // open output file early to fail fast + std::ofstream output_file; + if (!params.out_file.empty()) { + output_file.open(params.out_file, std::ios::binary); + if (!output_file) { + console::error("Failed to open output file '%s'\n", params.out_file.c_str()); + return 1; + } + } console::set_display(DISPLAY_TYPE_RESET); console::set_completion_callback(auto_completion_callback); @@ -531,7 +590,7 @@ int main(int argc, char ** argv) { cur_msg.clear(); } result_timings timings; - std::string assistant_content = ctx_cli.generate_completion(timings); + std::string assistant_content = ctx_cli.generate_completion(timings, params.out_file.empty() ?
nullptr : &output_file); ctx_cli.messages.push_back({ {"role", "assistant"}, {"content", assistant_content} diff --git a/tools/server/server-task.h b/tools/server/server-task.h index e2e3e5a582..102b43d246 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -51,7 +51,8 @@ struct task_params { bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt bool return_tokens = false; bool return_progress = false; - + bool special_characters = false; // whether to include special tokens in the output (e.g. <s>, </s>, <think>, etc.) + int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half int32_t n_predict = -1; // new tokens to predict