From e489a5ca0e0eb864c83111afaec8529fa8c4bb14 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Tue, 14 Apr 2026 11:09:52 +0200
Subject: [PATCH] server: support OAI /v1/audio/transcriptions API (#21863)

* server: support OAI /v1/audio/transcriptions API

* address autoreview comments

* correct default response_format value
---
 tools/server/server-common.cpp  | 54 +++++++++++++++++++++++
 tools/server/server-common.h    |  6 +++
 tools/server/server-context.cpp | 27 ++++++++++++
 tools/server/server-context.h   |  1 +
 tools/server/server-http.cpp    | 30 ++++++++++++-
 tools/server/server-http.h      |  4 ++
 tools/server/server-task.cpp    | 27 ++++++++++++
 tools/server/server-task.h      |  5 +++
 tools/server/server.cpp         | 78 +++++++++++++++++----------------
 9 files changed, 194 insertions(+), 38 deletions(-)

diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index ed5e306fc5..e3f2439023 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1433,6 +1433,60 @@ json convert_responses_to_chatcmpl(const json & response_body) {
     return chatcmpl_body;
 }
 
+// Build a Chat Completions body from a transcription request (form-data: every text field arrives as a string).
+// Copies the "file" upload into out_files; throws std::invalid_argument on a missing file or malformed field.
+json convert_transcriptions_to_chatcmpl(
+        const json & inp_body,
+        const std::map<std::string, raw_buffer> & in_files,
+        std::vector<raw_buffer> & out_files) {
+    // TODO @ngxson : this function may need to be improved in the future
+    // handle input files
+    out_files.clear();
+    auto it = in_files.find("file");
+    if (it == in_files.end()) {
+        throw std::invalid_argument("No input file found for transcription");
+    }
+    out_files.push_back(it->second);
+
+    // handle input data
+    std::string prompt = json_value(inp_body, "prompt", std::string());
+    std::string language = json_value(inp_body, "language", std::string());
+    std::string response_format = json_value(inp_body, "response_format", std::string("json"));
+    if (response_format != "json") {
+        throw std::invalid_argument("Only 'json' response_format is supported for transcription");
+    }
+    if (prompt.empty()) {
+        prompt = "Transcribe audio to text";
+    }
+    if (!language.empty()) {
+        prompt += string_format(" (language: %s)", language.c_str());
+    }
+    prompt += mtmd_default_marker();
+
+    json chatcmpl_body = inp_body; // copy all fields
+    chatcmpl_body["messages"] = json::array({ { {"role", "user"}, {"content", prompt} } });
+
+    // because input from form-data, everything is string, we need to correct the types here
+    std::string stream = json_value(inp_body, "stream", std::string("false"));
+    chatcmpl_body["stream"] = stream == "true";
+
+    // numeric fields must parse completely: non-string values, trailing garbage and range errors are rejected
+    auto fix_number = [&](const char * key, auto parse) {
+        if (!inp_body.contains(key)) { return; }
+        const json & raw = inp_body.at(key);
+        const std::string txt = raw.is_string() ? raw.get<std::string>() : std::string();
+        size_t pos = 0;
+        try { chatcmpl_body[key] = parse(txt, &pos); } catch (const std::exception &) { pos = std::string::npos; }
+        if (pos != txt.size()) {
+            throw std::invalid_argument(string_format("Invalid value for '%s': expected a number", key));
+        }
+    };
+    // stoll rather than stoul: stoul would silently wrap a negative value into a huge unsigned one
+    fix_number("max_tokens",  [](const std::string & s, size_t * p) { return std::stoll(s, p); });
+    fix_number("temperature", [](const std::string & s, size_t * p) { return std::stof(s, p); });
+    return chatcmpl_body;
+}
+
 json convert_anthropic_to_oai(const json & body) {
     json oai_body;
 
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 213ae52bb0..440ebc597a 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -305,6 +305,12 @@ json oaicompat_chat_params_parse(
 // convert OpenAI Responses API format to OpenAI Chat Completions API format
 json convert_responses_to_chatcmpl(const json & body);
 
+// convert OpenAI transcriptions API format (text fields in body, uploads in in_files) to OpenAI Chat Completions API format; the "file" upload is copied into out_files
+json convert_transcriptions_to_chatcmpl(
+    const json & body,
+    const std::map<std::string, raw_buffer> & in_files,
+    std::vector<raw_buffer> & out_files);
+
 // convert Anthropic Messages API format to OpenAI Chat Completions API format
 json convert_anthropic_to_oai(const json & body);
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index b31981c562..e134b3cfb2 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -3732,6 +3732,33 @@ void server_routes::init_routes() {
             TASK_RESPONSE_TYPE_OAI_RESP);
     };
 
+    // OAI-compatible transcription handler: rewrites the request as a chat completion with the upload attached
+    this->post_transcriptions_oai = [this](const server_http_req & req) {
+        auto res = create_response();
+
+        // reject early unless both has_mtmd and chat_params.allow_audio are set
+        const bool audio_ok = meta->has_mtmd && meta->chat_params.allow_audio;
+        if (!audio_ok) {
+            res->error(format_error_response("The current model does not support audio input.", ERROR_TYPE_NOT_SUPPORTED));
+            return res;
+        }
+
+        // step 1: request fields + uploaded file -> chat completions request (upload ends up in audio_files)
+        std::vector<raw_buffer> audio_files;
+        json chat_body = convert_transcriptions_to_chatcmpl(json::parse(req.body), req.files, audio_files);
+        SRV_DBG("%s\n", "Request converted: OpenAI Transcriptions -> OpenAI Chat Completions");
+        SRV_DBG("converted request: %s\n", chat_body.dump().c_str());
+
+        // step 2: hand over to the shared completion handler, asking for transcription-style responses
+        json task_params = oaicompat_chat_params_parse(chat_body, meta->chat_params, audio_files);
+        return handle_completions_impl(
+            req,
+            SERVER_TASK_TYPE_COMPLETION,
+            task_params,
+            audio_files,
+            TASK_RESPONSE_TYPE_OAI_ASR);
+    };
+
     this->post_anthropic_messages = [this](const server_http_req & req) {
         auto res = create_response();
         std::vector<raw_buffer> files;
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
index 6ea9afc0a5..6856043fad 100644
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -111,6 +111,7 @@ struct server_routes {
     server_http_context::handler_t post_completions_oai;
     server_http_context::handler_t post_chat_completions;
     server_http_context::handler_t post_responses_oai;
+    server_http_context::handler_t post_transcriptions_oai;
     server_http_context::handler_t post_anthropic_messages;
     server_http_context::handler_t post_anthropic_count_tokens;
     server_http_context::handler_t post_apply_template;
diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index 37e7cbe9c4..83f656f5c9 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -428,6 +428,7 @@ void server_http_context::get(const std::string & path, const server_http_contex
             req.path,
             build_query_string(req),
             req.body,
+            {},
             req.is_connection_closed
         });
         server_http_res_ptr response = handler(*request);
@@ -437,12 +438,39 @@ void server_http_context::get(const std::string & path, const server_http_contex
 
 void server_http_context::post(const std::string & path, const server_http_context::handler_t & handler) const {
     pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
+        std::string body = req.body;
+        std::map<std::string, raw_buffer> files;
+
+        if (req.is_multipart_form_data()) {
+            // translate text fields to a JSON object and use it as the body
+            json form_json = json::object();
+            for (const auto & [key, field] : req.form.fields) {
+                if (form_json.contains(key)) {
+                    // if the key already exists, convert it to an array
+                    if (!form_json[key].is_array()) {
+                        json existing_value = form_json[key];
+                        form_json[key] = json::array({existing_value});
+                    }
+                    form_json[key].push_back(field.content);
+                } else {
+                    form_json[key] = field.content;
+                }
+            }
+            body = form_json.dump();
+
+            // populate files from multipart form
+            for (const auto & [key, file] : req.form.files) {
+                files[key] = raw_buffer(file.content.begin(), file.content.end());
+            }
+        }
+
         server_http_req_ptr request = std::make_unique<server_http_req>(server_http_req{
             get_params(req),
             get_headers(req),
             req.path,
             build_query_string(req),
-            req.body,
+            body,
+            std::move(files),
             req.is_connection_closed
         });
         server_http_res_ptr response = handler(*request);
diff --git a/tools/server/server-http.h b/tools/server/server-http.h
index f8a174c440..68ae2170cf 100644
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -5,6 +5,8 @@
 #include <map>
 #include <string>
 #include <thread>
+#include <vector>
+#include <cstdint>
 
 struct common_params;
 
@@ -32,6 +34,7 @@ struct server_http_res {
 // unique pointer, used by set_chunked_content_provider
 // httplib requires the stream provider to be stored in heap
 using server_http_res_ptr = std::unique_ptr<server_http_res>;
+using raw_buffer = std::vector<uint8_t>;
 
 struct server_http_req {
     std::map<std::string, std::string> params; // path_params + query_params
@@ -39,6 +42,7 @@ struct server_http_req {
     std::string path;
     std::string query_string; // query parameters string (e.g. "action=save")
     std::string body;
+    std::map<std::string, raw_buffer> files; // used for file uploads (form data)
     const std::function<bool()> & should_stop;
 
     std::string get_param(const std::string & key, const std::string & def = "") const {
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index 6a06171d76..0312f098a3 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -725,6 +725,8 @@ json server_task_result_cmpl_final::to_json() {
             return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
         case TASK_RESPONSE_TYPE_OAI_RESP:
             return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
+        case TASK_RESPONSE_TYPE_OAI_ASR:
+            return to_json_oaicompat_asr();
         case TASK_RESPONSE_TYPE_ANTHROPIC:
             return stream ? to_json_anthropic_stream() : to_json_anthropic();
         default:
@@ -1102,6 +1104,21 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
     return server_sent_events;
 }
 
+// Builds the "transcript.text.done" event: `content` as the text plus token usage counters.
+json server_task_result_cmpl_final::to_json_oaicompat_asr() {
+    json usage = json::object();
+    usage["type"]                 = "tokens";
+    usage["input_tokens"]         = n_prompt_tokens;
+    usage["output_tokens"]        = n_decoded;
+    usage["total_tokens"]         = n_decoded + n_prompt_tokens;
+    usage["input_tokens_details"] = json { {"cached_tokens", n_prompt_tokens_cache} };
+    return json {
+        {"type",  "transcript.text.done"},
+        {"text",  content},
+        {"usage", usage},
+    };
+}
+
 json server_task_result_cmpl_final::to_json_anthropic() {
     std::string stop_reason = "max_tokens";
     if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
@@ -1400,6 +1417,8 @@ json server_task_result_cmpl_partial::to_json() {
             return to_json_oaicompat_chat();
         case TASK_RESPONSE_TYPE_OAI_RESP:
             return to_json_oaicompat_resp();
+        case TASK_RESPONSE_TYPE_OAI_ASR:
+            return to_json_oaicompat_asr();
         case TASK_RESPONSE_TYPE_ANTHROPIC:
             return to_json_anthropic();
         default:
@@ -1650,6 +1669,14 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
     return events;
 }
 
+// Builds the "transcript.text.delta" event, exposing this partial result's `content` under "delta".
+json server_task_result_cmpl_partial::to_json_oaicompat_asr() {
+    json delta_event = json::object();
+    delta_event["type"]  = "transcript.text.delta";
+    delta_event["delta"] = content;
+    return delta_event;
+}
+
 json server_task_result_cmpl_partial::to_json_anthropic() {
     json events = json::array();
     bool first = (n_decoded == 1);
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index 243e47a8ed..95f39207b1 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -34,6 +34,7 @@ enum task_response_type {
     TASK_RESPONSE_TYPE_OAI_CHAT,
     TASK_RESPONSE_TYPE_OAI_CMPL,
     TASK_RESPONSE_TYPE_OAI_RESP,
+    TASK_RESPONSE_TYPE_OAI_ASR, // transcriptions API
     TASK_RESPONSE_TYPE_OAI_EMBD,
     TASK_RESPONSE_TYPE_ANTHROPIC,
 };
@@ -401,6 +402,8 @@ struct server_task_result_cmpl_final : server_task_result {
 
     json to_json_oaicompat_resp_stream();
 
+    json to_json_oaicompat_asr();
+
     json to_json_anthropic();
 
     json to_json_anthropic_stream();
@@ -457,6 +460,8 @@ struct server_task_result_cmpl_partial : server_task_result {
 
     json to_json_oaicompat_resp();
 
+    json to_json_oaicompat_asr();
+
     json to_json_anthropic();
 };
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index b9e320d9cb..fe640b978b 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -145,6 +145,7 @@ int main(int argc, char ** argv) {
         routes.post_completions_oai        = models_routes->proxy_post;
         routes.post_chat_completions       = models_routes->proxy_post;
         routes.post_responses_oai          = models_routes->proxy_post;
+        routes.post_transcriptions_oai     = models_routes->proxy_post;
         routes.post_anthropic_messages     = models_routes->proxy_post;
         routes.post_anthropic_count_tokens = models_routes->proxy_post;
         routes.post_infill                 = models_routes->proxy_post;
@@ -160,48 +161,51 @@ int main(int argc, char ** argv) {
         routes.post_slots                  = models_routes->proxy_post;
 
         // custom routes for router
-        routes.get_props  = models_routes->get_router_props;
-        routes.get_models = models_routes->get_router_models;
-        ctx_http.post("/models/load",   ex_wrapper(models_routes->post_router_models_load));
-        ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
+        routes.get_props                   = models_routes->get_router_props;
+        routes.get_models                  = models_routes->get_router_models;
+
+        ctx_http.post("/models/load",          ex_wrapper(models_routes->post_router_models_load));
+        ctx_http.post("/models/unload",        ex_wrapper(models_routes->post_router_models_unload));
     }
 
-    ctx_http.get ("/health",              ex_wrapper(routes.get_health)); // public endpoint (no API key check)
-    ctx_http.get ("/v1/health",           ex_wrapper(routes.get_health)); // public endpoint (no API key check)
-    ctx_http.get ("/metrics",             ex_wrapper(routes.get_metrics));
-    ctx_http.get ("/props",               ex_wrapper(routes.get_props));
-    ctx_http.post("/props",               ex_wrapper(routes.post_props));
-    ctx_http.post("/api/show",            ex_wrapper(routes.get_api_show));
-    ctx_http.get ("/models",              ex_wrapper(routes.get_models)); // public endpoint (no API key check)
-    ctx_http.get ("/v1/models",           ex_wrapper(routes.get_models)); // public endpoint (no API key check)
-    ctx_http.get ("/api/tags",            ex_wrapper(routes.get_models)); // ollama specific endpoint. public endpoint (no API key check)
-    ctx_http.post("/completion",          ex_wrapper(routes.post_completions)); // legacy
-    ctx_http.post("/completions",         ex_wrapper(routes.post_completions));
-    ctx_http.post("/v1/completions",      ex_wrapper(routes.post_completions_oai));
-    ctx_http.post("/chat/completions",    ex_wrapper(routes.post_chat_completions));
-    ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions));
-    ctx_http.post("/api/chat",            ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
-    ctx_http.post("/v1/responses",        ex_wrapper(routes.post_responses_oai));
-    ctx_http.post("/responses",           ex_wrapper(routes.post_responses_oai));
-    ctx_http.post("/v1/messages",         ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
+    ctx_http.get ("/health",                   ex_wrapper(routes.get_health)); // public endpoint (no API key check)
+    ctx_http.get ("/v1/health",                ex_wrapper(routes.get_health)); // public endpoint (no API key check)
+    ctx_http.get ("/metrics",                  ex_wrapper(routes.get_metrics));
+    ctx_http.get ("/props",                    ex_wrapper(routes.get_props));
+    ctx_http.post("/props",                    ex_wrapper(routes.post_props));
+    ctx_http.post("/api/show",                 ex_wrapper(routes.get_api_show));
+    ctx_http.get ("/models",                   ex_wrapper(routes.get_models)); // public endpoint (no API key check)
+    ctx_http.get ("/v1/models",                ex_wrapper(routes.get_models)); // public endpoint (no API key check)
+    ctx_http.get ("/api/tags",                 ex_wrapper(routes.get_models)); // ollama specific endpoint. public endpoint (no API key check)
+    ctx_http.post("/completion",               ex_wrapper(routes.post_completions)); // legacy
+    ctx_http.post("/completions",              ex_wrapper(routes.post_completions));
+    ctx_http.post("/v1/completions",           ex_wrapper(routes.post_completions_oai));
+    ctx_http.post("/chat/completions",         ex_wrapper(routes.post_chat_completions));
+    ctx_http.post("/v1/chat/completions",      ex_wrapper(routes.post_chat_completions));
+    ctx_http.post("/api/chat",                 ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint
+    ctx_http.post("/v1/responses",             ex_wrapper(routes.post_responses_oai));
+    ctx_http.post("/responses",                ex_wrapper(routes.post_responses_oai));
+    ctx_http.post("/v1/audio/transcriptions",  ex_wrapper(routes.post_transcriptions_oai));
+    ctx_http.post("/audio/transcriptions",     ex_wrapper(routes.post_transcriptions_oai));
+    ctx_http.post("/v1/messages",              ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
     ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
-    ctx_http.post("/infill",              ex_wrapper(routes.post_infill));
-    ctx_http.post("/embedding",           ex_wrapper(routes.post_embeddings)); // legacy
-    ctx_http.post("/embeddings",          ex_wrapper(routes.post_embeddings));
-    ctx_http.post("/v1/embeddings",       ex_wrapper(routes.post_embeddings_oai));
-    ctx_http.post("/rerank",              ex_wrapper(routes.post_rerank));
-    ctx_http.post("/reranking",           ex_wrapper(routes.post_rerank));
-    ctx_http.post("/v1/rerank",           ex_wrapper(routes.post_rerank));
-    ctx_http.post("/v1/reranking",        ex_wrapper(routes.post_rerank));
-    ctx_http.post("/tokenize",            ex_wrapper(routes.post_tokenize));
-    ctx_http.post("/detokenize",          ex_wrapper(routes.post_detokenize));
-    ctx_http.post("/apply-template",      ex_wrapper(routes.post_apply_template));
+    ctx_http.post("/infill",                   ex_wrapper(routes.post_infill));
+    ctx_http.post("/embedding",                ex_wrapper(routes.post_embeddings)); // legacy
+    ctx_http.post("/embeddings",               ex_wrapper(routes.post_embeddings));
+    ctx_http.post("/v1/embeddings",            ex_wrapper(routes.post_embeddings_oai));
+    ctx_http.post("/rerank",                   ex_wrapper(routes.post_rerank));
+    ctx_http.post("/reranking",                ex_wrapper(routes.post_rerank));
+    ctx_http.post("/v1/rerank",                ex_wrapper(routes.post_rerank));
+    ctx_http.post("/v1/reranking",             ex_wrapper(routes.post_rerank));
+    ctx_http.post("/tokenize",                 ex_wrapper(routes.post_tokenize));
+    ctx_http.post("/detokenize",               ex_wrapper(routes.post_detokenize));
+    ctx_http.post("/apply-template",           ex_wrapper(routes.post_apply_template));
     // LoRA adapters hotswap
-    ctx_http.get ("/lora-adapters",       ex_wrapper(routes.get_lora_adapters));
-    ctx_http.post("/lora-adapters",       ex_wrapper(routes.post_lora_adapters));
+    ctx_http.get ("/lora-adapters",            ex_wrapper(routes.get_lora_adapters));
+    ctx_http.post("/lora-adapters",            ex_wrapper(routes.post_lora_adapters));
     // Save & load slots
-    ctx_http.get ("/slots",               ex_wrapper(routes.get_slots));
-    ctx_http.post("/slots/:id_slot",      ex_wrapper(routes.post_slots));
+    ctx_http.get ("/slots",                    ex_wrapper(routes.get_slots));
+    ctx_http.post("/slots/:id_slot",           ex_wrapper(routes.post_slots));
     // CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP)
     if (params.webui_mcp_proxy) {
         SRV_WRN("%s", "-----------------\n");