Implement `--output` (`-o`) flag on the CLI

This commit is contained in:
David Baker 2026-03-06 16:40:36 +00:00
parent f6235a41ef
commit c494c70a06
No known key found for this signature in database
GPG Key ID: 89298D31E5B7B548
3 changed files with 65 additions and 5 deletions

View File

@ -2659,7 +2659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.out_file = value; params.out_file = value;
} }
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE})); ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg( add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N", {"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),

View File

@ -56,6 +56,7 @@ struct cli_context {
std::vector<raw_buffer> input_files; std::vector<raw_buffer> input_files;
task_params defaults; task_params defaults;
bool verbose_prompt; bool verbose_prompt;
common_reasoning_format reasoning_format;
// thread for showing "loading" animation // thread for showing "loading" animation
std::atomic<bool> loading_show; std::atomic<bool> loading_show;
@ -66,15 +67,17 @@ struct cli_context {
defaults.n_keep = params.n_keep; defaults.n_keep = params.n_keep;
defaults.n_predict = params.n_predict; defaults.n_predict = params.n_predict;
defaults.antiprompt = params.antiprompt; defaults.antiprompt = params.antiprompt;
defaults.special_characters = params.special;
defaults.stream = true; // make sure we always use streaming mode defaults.stream = true; // make sure we always use streaming mode
defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
// defaults.return_progress = true; // TODO: show progress // defaults.return_progress = true; // TODO: show progress
verbose_prompt = params.verbose_prompt; verbose_prompt = params.verbose_prompt;
reasoning_format = params.reasoning_format;
} }
std::string generate_completion(result_timings & out_timings) { std::string generate_completion(result_timings & out_timings, std::ofstream * file_out = nullptr) {
server_response_reader rd = ctx_server.get_response_reader(); server_response_reader rd = ctx_server.get_response_reader();
auto chat_params = format_chat(); auto chat_params = format_chat();
{ {
@ -89,7 +92,7 @@ struct cli_context {
// chat template settings // chat template settings
task.params.chat_parser_params = common_chat_parser_params(chat_params); task.params.chat_parser_params = common_chat_parser_params(chat_params);
task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; task.params.chat_parser_params.reasoning_format = reasoning_format;
if (!chat_params.parser.empty()) { if (!chat_params.parser.empty()) {
task.params.chat_parser_params.parser.load(chat_params.parser); task.params.chat_parser_params.parser.load(chat_params.parser);
} }
@ -103,6 +106,18 @@ struct cli_context {
console::set_display(DISPLAY_TYPE_RESET); console::set_display(DISPLAY_TYPE_RESET);
} }
// check if we are doing file output
bool file_streaming = (file_out != nullptr && file_out->is_open());
if (file_streaming) {
if (defaults.special_characters) {
*file_out << chat_params.prompt;
}
else {
*file_out << "[Prompt]: " << messages.back()["content"].get<std::string>() << "\n\n";
}
file_out->flush();
}
// wait for first result // wait for first result
console::spinner::start(); console::spinner::start();
server_task_result_ptr result = rd.next(should_stop); server_task_result_ptr result = rd.next(should_stop);
@ -110,6 +125,7 @@ struct cli_context {
console::spinner::stop(); console::spinner::stop();
std::string curr_content; std::string curr_content;
bool is_thinking = false; bool is_thinking = false;
bool content_started = false;
while (result) { while (result) {
if (should_stop()) { if (should_stop()) {
@ -132,26 +148,60 @@ struct cli_context {
if (is_thinking) { if (is_thinking) {
console::log("\n[End thinking]\n\n"); console::log("\n[End thinking]\n\n");
console::set_display(DISPLAY_TYPE_RESET); console::set_display(DISPLAY_TYPE_RESET);
if (file_streaming) { // already inside the is_thinking branch, no need to re-check
if (defaults.special_characters) {
*file_out << "</think>";
}
else {
*file_out << "\n\n";
}
}
is_thinking = false; is_thinking = false;
} }
curr_content += diff.content_delta; curr_content += diff.content_delta;
console::log("%s", diff.content_delta.c_str()); console::log("%s", diff.content_delta.c_str());
console::flush(); console::flush();
if (file_streaming) {
if (!content_started && !defaults.special_characters) {
*file_out << "[Assistant]: ";
}
content_started = true;
*file_out << diff.content_delta;
file_out->flush();
}
} }
if (!diff.reasoning_content_delta.empty()) { if (!diff.reasoning_content_delta.empty()) {
console::set_display(DISPLAY_TYPE_REASONING); console::set_display(DISPLAY_TYPE_REASONING);
if (!is_thinking) { if (!is_thinking) {
console::log("[Start thinking]\n"); console::log("[Start thinking]\n");
if (file_streaming) {
if (defaults.special_characters) {
*file_out << "<think>";
}
else {
*file_out << "[Thinking]: ";
}
}
} }
is_thinking = true; is_thinking = true;
console::log("%s", diff.reasoning_content_delta.c_str()); console::log("%s", diff.reasoning_content_delta.c_str());
console::flush(); console::flush();
if (file_streaming) {
*file_out << diff.reasoning_content_delta;
file_out->flush();
}
} }
} }
} }
auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get()); auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get());
if (res_final) { if (res_final) {
out_timings = std::move(res_final->timings); out_timings = std::move(res_final->timings);
if (file_streaming) {
if (!defaults.special_characters) {
*file_out << "\n\n";
}
file_out->flush();
}
break; break;
} }
result = rd.next(should_stop); result = rd.next(should_stop);
@ -341,6 +391,15 @@ int main(int argc, char ** argv) {
console::init(params.simple_io, params.use_color); console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); }); atexit([]() { console::cleanup(); });
// open output file early to fail fast
std::ofstream output_file;
if (!params.out_file.empty()) {
output_file.open(params.out_file, std::ios::binary);
if (!output_file) {
console::error("Failed to open output file '%s'\n", params.out_file.c_str());
return 1;
}
}
console::set_display(DISPLAY_TYPE_RESET); console::set_display(DISPLAY_TYPE_RESET);
console::set_completion_callback(auto_completion_callback); console::set_completion_callback(auto_completion_callback);
@ -531,7 +590,7 @@ int main(int argc, char ** argv) {
cur_msg.clear(); cur_msg.clear();
} }
result_timings timings; result_timings timings;
std::string assistant_content = ctx_cli.generate_completion(timings); std::string assistant_content = ctx_cli.generate_completion(timings, params.out_file.empty() ? nullptr : &output_file);
ctx_cli.messages.push_back({ ctx_cli.messages.push_back({
{"role", "assistant"}, {"role", "assistant"},
{"content", assistant_content} {"content", assistant_content}

View File

@ -51,7 +51,8 @@ struct task_params {
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
bool return_tokens = false; bool return_tokens = false;
bool return_progress = false; bool return_progress = false;
bool special_characters = false; // whether to include special tokens in the output (e.g. <s>, </s>, <pad>, etc.)
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict