From e489a5ca0e0eb864c83111afaec8529fa8c4bb14 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 14 Apr 2026 11:09:52 +0200 Subject: [PATCH] server: support OAI /v1/audio/transcriptions API (#21863) * server: support OAI /v1/audio/transcriptions API * address autoreview comments * correct default response_format value --- tools/server/server-common.cpp | 54 +++++++++++++++++++++++ tools/server/server-common.h | 6 +++ tools/server/server-context.cpp | 27 ++++++++++++ tools/server/server-context.h | 1 + tools/server/server-http.cpp | 30 ++++++++++++- tools/server/server-http.h | 4 ++ tools/server/server-task.cpp | 27 ++++++++++++ tools/server/server-task.h | 5 +++ tools/server/server.cpp | 78 +++++++++++++++++---------------- 9 files changed, 194 insertions(+), 38 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index ed5e306fc5..e3f2439023 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1433,6 +1433,60 @@ json convert_responses_to_chatcmpl(const json & response_body) { return chatcmpl_body; } +json convert_transcriptions_to_chatcmpl( + const json & inp_body, + const std::map & in_files, + std::vector & out_files) { + // TODO @ngxson : this function may need to be improved in the future + // handle input files + out_files.clear(); + auto it = in_files.find("file"); + if (it != in_files.end()) { + out_files.push_back(it->second); + } else { + throw std::invalid_argument("No input file found for transcription"); + } + + // handle input data + std::string prompt = json_value(inp_body, "prompt", std::string()); + std::string language = json_value(inp_body, "language", std::string()); + std::string response_format = json_value(inp_body, "response_format", std::string("json")); + if (response_format != "json") { + throw std::invalid_argument("Only 'json' response_format is supported for transcription"); + } + if (prompt.empty()) { + prompt = "Transcribe audio to text"; + } + if (!language.empty()) { + prompt += string_format(" (language: %s)", language.c_str()); + } + prompt += mtmd_default_marker(); + + json chatcmpl_body = inp_body; // copy all fields + chatcmpl_body["messages"] = json::array({ + { + {"role", "user"}, + {"content", prompt}, + }, + }); + + // because input from form-data, everything is string, we need to correct the types here + std::string stream = json_value(inp_body, "stream", std::string("false")); + chatcmpl_body["stream"] = stream == "true"; + + if (inp_body.contains("max_tokens")) { + std::string inp = inp_body["max_tokens"].get(); + chatcmpl_body["max_tokens"] = std::stoul(inp); + } + + if (inp_body.contains("temperature")) { + std::string inp = inp_body["temperature"].get(); + chatcmpl_body["temperature"] = std::stof(inp); + } + + return chatcmpl_body; +} + json convert_anthropic_to_oai(const json & body) { json oai_body; diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 213ae52bb0..440ebc597a 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -305,6 +305,12 @@ json oaicompat_chat_params_parse( // convert OpenAI Responses API format to OpenAI Chat Completions API format json convert_responses_to_chatcmpl(const json & body); +// convert OpenAI transcriptions API format to OpenAI Chat Completions API format +json convert_transcriptions_to_chatcmpl( + const json & body, + const std::map & in_files, + std::vector & out_files); + // convert Anthropic Messages API format to OpenAI Chat Completions API format json convert_anthropic_to_oai(const json & body); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b31981c562..e134b3cfb2 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3732,6 +3732,33 @@ void server_routes::init_routes() { TASK_RESPONSE_TYPE_OAI_RESP); }; + this->post_transcriptions_oai = [this](const server_http_req & req) { + auto res = create_response(); + + if (!meta->has_mtmd || !meta->chat_params.allow_audio) { + res->error(format_error_response("The current model does not support audio input.", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + std::vector files; + json body = convert_transcriptions_to_chatcmpl( + json::parse(req.body), + req.files, + files); + SRV_DBG("%s\n", "Request converted: OpenAI Transcriptions -> OpenAI Chat Completions"); + SRV_DBG("converted request: %s\n", body.dump().c_str()); + json body_parsed = oaicompat_chat_params_parse( + body, + meta->chat_params, + files); + return handle_completions_impl( + req, + SERVER_TASK_TYPE_COMPLETION, + body_parsed, + files, + TASK_RESPONSE_TYPE_OAI_ASR); + }; + this->post_anthropic_messages = [this](const server_http_req & req) { auto res = create_response(); std::vector files; diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 6ea9afc0a5..6856043fad 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -111,6 +111,7 @@ struct server_routes { server_http_context::handler_t post_completions_oai; server_http_context::handler_t post_chat_completions; server_http_context::handler_t post_responses_oai; + server_http_context::handler_t post_transcriptions_oai; server_http_context::handler_t post_anthropic_messages; server_http_context::handler_t post_anthropic_count_tokens; server_http_context::handler_t post_apply_template; diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 37e7cbe9c4..83f656f5c9 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -428,6 +428,7 @@ void server_http_context::get(const std::string & path, const server_http_contex req.path, build_query_string(req), req.body, + {}, req.is_connection_closed }); server_http_res_ptr response = handler(*request); @@ -437,12 +438,39 @@ void server_http_context::get(const std::string & path, const server_http_contex void server_http_context::post(const std::string & path, const server_http_context::handler_t & handler) const { pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { + std::string body = req.body; + std::map files; + + if (req.is_multipart_form_data()) { + // translate text fields to a JSON object and use it as the body + json form_json = json::object(); + for (const auto & [key, field] : req.form.fields) { + if (form_json.contains(key)) { + // if the key already exists, convert it to an array + if (!form_json[key].is_array()) { + json existing_value = form_json[key]; + form_json[key] = json::array({existing_value}); + } + form_json[key].push_back(field.content); + } else { + form_json[key] = field.content; + } + } + body = form_json.dump(); + + // populate files from multipart form + for (const auto & [key, file] : req.form.files) { + files[key] = raw_buffer(file.content.begin(), file.content.end()); + } + } + server_http_req_ptr request = std::make_unique(server_http_req{ get_params(req), get_headers(req), req.path, build_query_string(req), - req.body, + body, + std::move(files), req.is_connection_closed }); server_http_res_ptr response = handler(*request); diff --git a/tools/server/server-http.h b/tools/server/server-http.h index f8a174c440..68ae2170cf 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -5,6 +5,8 @@ #include #include #include +#include +#include struct common_params; @@ -32,6 +34,7 @@ struct server_http_res { // unique pointer, used by set_chunked_content_provider // httplib requires the stream provider to be stored in heap using server_http_res_ptr = std::unique_ptr; +using raw_buffer = std::vector; struct server_http_req { std::map params; // path_params + query_params @@ -39,6 +42,7 @@ struct server_http_req { std::string path; std::string query_string; // query parameters string (e.g. "action=save") std::string body; + std::map files; // used for file uploads (form data) const std::function & should_stop; std::string get_param(const std::string & key, const std::string & def = "") const { diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 6a06171d76..0312f098a3 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -725,6 +725,8 @@ json server_task_result_cmpl_final::to_json() { return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); case TASK_RESPONSE_TYPE_OAI_RESP: return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp(); + case TASK_RESPONSE_TYPE_OAI_ASR: + return to_json_oaicompat_asr(); case TASK_RESPONSE_TYPE_ANTHROPIC: return stream ? to_json_anthropic_stream() : to_json_anthropic(); default: @@ -1102,6 +1104,21 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { return server_sent_events; } +json server_task_result_cmpl_final::to_json_oaicompat_asr() { + json event = json { + {"type", "transcript.text.done"}, + {"text", content}, + {"usage", json { + {"type", "tokens"}, + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded}, + {"total_tokens", n_decoded + n_prompt_tokens}, + {"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }}, + }}, + }; + return event; +} + json server_task_result_cmpl_final::to_json_anthropic() { std::string stop_reason = "max_tokens"; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { @@ -1400,6 +1417,8 @@ json server_task_result_cmpl_partial::to_json() { return to_json_oaicompat_chat(); case TASK_RESPONSE_TYPE_OAI_RESP: return to_json_oaicompat_resp(); + case TASK_RESPONSE_TYPE_OAI_ASR: + return to_json_oaicompat_asr(); case TASK_RESPONSE_TYPE_ANTHROPIC: return to_json_anthropic(); default: @@ -1650,6 +1669,14 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { return events; } +json server_task_result_cmpl_partial::to_json_oaicompat_asr() { + json event = json { + {"type", "transcript.text.delta"}, + {"delta", content}, + }; + return event; +} + json server_task_result_cmpl_partial::to_json_anthropic() { json events = json::array(); bool first = (n_decoded == 1); diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 243e47a8ed..95f39207b1 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -34,6 +34,7 @@ enum task_response_type { TASK_RESPONSE_TYPE_OAI_CHAT, TASK_RESPONSE_TYPE_OAI_CMPL, TASK_RESPONSE_TYPE_OAI_RESP, + TASK_RESPONSE_TYPE_OAI_ASR, // transcriptions API TASK_RESPONSE_TYPE_OAI_EMBD, TASK_RESPONSE_TYPE_ANTHROPIC, }; @@ -401,6 +402,8 @@ struct server_task_result_cmpl_final : server_task_result { json to_json_oaicompat_resp_stream(); + json to_json_oaicompat_asr(); + json to_json_anthropic(); json to_json_anthropic_stream(); @@ -457,6 +460,8 @@ struct server_task_result_cmpl_partial : server_task_result { json to_json_oaicompat_resp(); + json to_json_oaicompat_asr(); + json to_json_anthropic(); }; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index b9e320d9cb..fe640b978b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -145,6 +145,7 @@ int main(int argc, char ** argv) { routes.post_completions_oai = models_routes->proxy_post; routes.post_chat_completions = models_routes->proxy_post; routes.post_responses_oai = models_routes->proxy_post; + routes.post_transcriptions_oai = models_routes->proxy_post; routes.post_anthropic_messages = models_routes->proxy_post; routes.post_anthropic_count_tokens = models_routes->proxy_post; routes.post_infill = models_routes->proxy_post; @@ -160,48 +161,51 @@ int main(int argc, char ** argv) { routes.post_slots = models_routes->proxy_post; // custom routes for router - routes.get_props = models_routes->get_router_props; - routes.get_models = models_routes->get_router_models; - ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load)); - ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload)); + routes.get_props = models_routes->get_router_props; + routes.get_models = models_routes->get_router_models; + + ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load)); + ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload)); } - ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) - ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) - ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics)); - ctx_http.get ("/props", ex_wrapper(routes.get_props)); - ctx_http.post("/props", ex_wrapper(routes.post_props)); - ctx_http.post("/api/show", ex_wrapper(routes.get_api_show)); - ctx_http.get ("/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) - ctx_http.get ("/v1/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) - ctx_http.get ("/api/tags", ex_wrapper(routes.get_models)); // ollama specific endpoint. public endpoint (no API key check) - ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy - ctx_http.post("/completions", ex_wrapper(routes.post_completions)); - ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai)); - ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); - ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions)); - ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint - ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai)); - ctx_http.post("/responses", ex_wrapper(routes.post_responses_oai)); - ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API + ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) + ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) + ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics)); + ctx_http.get ("/props", ex_wrapper(routes.get_props)); + ctx_http.post("/props", ex_wrapper(routes.post_props)); + ctx_http.post("/api/show", ex_wrapper(routes.get_api_show)); + ctx_http.get ("/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) + ctx_http.get ("/v1/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) + ctx_http.get ("/api/tags", ex_wrapper(routes.get_models)); // ollama specific endpoint. public endpoint (no API key check) + ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy + ctx_http.post("/completions", ex_wrapper(routes.post_completions)); + ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai)); + ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); + ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions)); + ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint + ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai)); + ctx_http.post("/responses", ex_wrapper(routes.post_responses_oai)); + ctx_http.post("/v1/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai)); + ctx_http.post("/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai)); + ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting - ctx_http.post("/infill", ex_wrapper(routes.post_infill)); - ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy - ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); - ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai)); - ctx_http.post("/rerank", ex_wrapper(routes.post_rerank)); - ctx_http.post("/reranking", ex_wrapper(routes.post_rerank)); - ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank)); - ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank)); - ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); - ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); - ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); + ctx_http.post("/infill", ex_wrapper(routes.post_infill)); + ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy + ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); + ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai)); + ctx_http.post("/rerank", ex_wrapper(routes.post_rerank)); + ctx_http.post("/reranking", ex_wrapper(routes.post_rerank)); + ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank)); + ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank)); + ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); + ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); + ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); // LoRA adapters hotswap - ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); - ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters)); + ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); + ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters)); // Save & load slots - ctx_http.get ("/slots", ex_wrapper(routes.get_slots)); - ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots)); + ctx_http.get ("/slots", ex_wrapper(routes.get_slots)); + ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots)); // CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP) if (params.webui_mcp_proxy) { SRV_WRN("%s", "-----------------\n");