Implement output flag on cli

2026-03-06 16:40:36 +00:00 · 2026-03-06 16:40:36 +00:00 · c494c70a06
parent f6235a41ef
commit c494c70a06
3 changed files with 65 additions and 5 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2659,7 +2659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.out_file = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"-ofreq", "--output-frequency"}, "N",
        string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -56,6 +56,7 @@ struct cli_context {
    std::vector<raw_buffer> input_files;
    task_params defaults;
    bool verbose_prompt;
+    common_reasoning_format reasoning_format;

    // thread for showing "loading" animation
    std::atomic<bool> loading_show;
@@ -66,15 +67,17 @@ struct cli_context {
        defaults.n_keep      = params.n_keep;
        defaults.n_predict   = params.n_predict;
        defaults.antiprompt  = params.antiprompt;
+        defaults.special_characters = params.special;

        defaults.stream = true; // make sure we always use streaming mode
        defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
        // defaults.return_progress = true; // TODO: show progress

        verbose_prompt = params.verbose_prompt;
+        reasoning_format = params.reasoning_format;
    }

-    std::string generate_completion(result_timings & out_timings) {
+    std::string generate_completion(result_timings & out_timings, std::ofstream * file_out = nullptr) {
        server_response_reader rd = ctx_server.get_response_reader();
        auto chat_params = format_chat();
        {
@@ -89,7 +92,7 @@ struct cli_context {

            // chat template settings
            task.params.chat_parser_params = common_chat_parser_params(chat_params);
-            task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+            task.params.chat_parser_params.reasoning_format = reasoning_format;
            if (!chat_params.parser.empty()) {
                task.params.chat_parser_params.parser.load(chat_params.parser);
            }
@@ -103,6 +106,18 @@ struct cli_context {
            console::set_display(DISPLAY_TYPE_RESET);
        }

+        // check if we are doing file output
+        bool file_streaming  = (file_out != nullptr && file_out->is_open());
+        if (file_streaming) {
+            if (defaults.special_characters) {
+                *file_out << chat_params.prompt;
+            }
+            else {
+                *file_out << "[Prompt]: " << messages.back()["content"].get<std::string>() << "\n\n";
+            }
+            file_out->flush();
+        }
+
        // wait for first result
        console::spinner::start();
        server_task_result_ptr result = rd.next(should_stop);
@@ -110,6 +125,7 @@ struct cli_context {
        console::spinner::stop();
        std::string curr_content;
        bool is_thinking = false;
+        bool content_started  = false;

        while (result) {
            if (should_stop()) {
@@ -132,26 +148,60 @@ struct cli_context {
                        if (is_thinking) {
                            console::log("\n[End thinking]\n\n");
                            console::set_display(DISPLAY_TYPE_RESET);
+                            if (file_streaming) { // is_thinking is already guaranteed by the enclosing branch
+                                if (defaults.special_characters) {
+                                    *file_out << "</think>"; // closes the "<think>" tag written when reasoning started
+                                }
+                                else {
+                                    *file_out << "\n\n"; // plain-text mode: blank line separates reasoning from the answer
+                                }
+                            }
                            is_thinking = false;
                        }
                        curr_content += diff.content_delta;
                        console::log("%s", diff.content_delta.c_str());
                        console::flush();
+                        if (file_streaming) {
+                            if (!content_started && !defaults.special_characters) {
+                                *file_out << "[Assistant]: ";
+                            }
+                            content_started = true;
+                            *file_out << diff.content_delta;
+                            file_out->flush();
+                        }
                    }
                    if (!diff.reasoning_content_delta.empty()) {
                        console::set_display(DISPLAY_TYPE_REASONING);
                        if (!is_thinking) {
                            console::log("[Start thinking]\n");
+                            if (file_streaming) {
+                                if (defaults.special_characters) {
+                                    *file_out << "<think>";
+                                }
+                                else {
+                                    *file_out << "[Thinking]: ";
+                                }
+                            }
                        }
                        is_thinking = true;
                        console::log("%s", diff.reasoning_content_delta.c_str());
                        console::flush();
+                        if (file_streaming) {
+                            *file_out << diff.reasoning_content_delta;
+                            file_out->flush();
+                        }
                    }
                }
            }
            auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get());
            if (res_final) {
                out_timings = std::move(res_final->timings);
+                if (file_streaming) {
+                    if (!defaults.special_characters) {
+                        *file_out << "\n\n";
+                    }
+                    file_out->flush();
+                }
                break;
            }
            result = rd.next(should_stop);
@@ -341,6 +391,15 @@ int main(int argc, char ** argv) {
    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });

+    // open output file early to fail fast
+    std::ofstream output_file;
+    if (!params.out_file.empty()) {
+        output_file.open(params.out_file, std::ios::binary);
+        if (!output_file) {
+            console::error("Failed to open output file '%s'\n", params.out_file.c_str());
+            return 1;
+        }
+    }
    console::set_display(DISPLAY_TYPE_RESET);
    console::set_completion_callback(auto_completion_callback);

@@ -531,7 +590,7 @@ int main(int argc, char ** argv) {
            cur_msg.clear();
        }
        result_timings timings;
-        std::string assistant_content = ctx_cli.generate_completion(timings);
+        std::string assistant_content = ctx_cli.generate_completion(timings, params.out_file.empty() ? nullptr : &output_file);
        ctx_cli.messages.push_back({
            {"role",    "assistant"},
            {"content", assistant_content}
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -51,7 +51,8 @@ struct task_params {
    bool cache_prompt    = true; // remember the prompt to avoid reprocessing all prompt
    bool return_tokens   = false;
    bool return_progress = false;
-
+    bool special_characters = false; // whether to include special tokens in the output (e.g. <s>, </s>, <pad>, etc.)
+    
    int32_t n_keep    =  0; // number of tokens to keep from initial prompt
    int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
    int32_t n_predict = -1; // new tokens to predict