server: remove default "gpt-3.5-turbo" model name (#17668)

* server: remove default "gpt-3.5-turbo" model name

* do not reflect back model name from request

* fix test
Author: Xuan-Son Nguyen, 2025-12-02 11:38:57 +01:00 (committed by GitHub)
parent fd3abe849e
commit 5d6bd842ea
5 changed files with 40 additions and 18 deletions
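
With the hard-coded default gone, the server resolves a single model name at load time and reports it wherever the API exposes a model field. Based on the hunk below that populates the new `model_name` member, the precedence is: explicit model alias, then the registry-style model name, then the GGUF file name taken from the model path. The following is a minimal standalone sketch of that fallback order; the struct and its fields are illustrative stand-ins for `params_base.model_alias`, `params_base.model.name` and `params_base.model.path`, and the file path in `main` is hypothetical.

```cpp
#include <filesystem>
#include <iostream>
#include <string>

// Illustrative stand-in for the fields the server reads: the model alias,
// the registry-style model name, and the path to the GGUF file.
struct model_params {
    std::string alias;
    std::string name;
    std::string path;
};

// Same precedence as the new load-time code: explicit alias, then the
// registry-style name, then the file name derived from the model path.
static std::string resolve_model_name(const model_params & p) {
    if (!p.alias.empty()) {
        return p.alias;
    }
    if (!p.name.empty()) {
        return p.name;
    }
    return std::filesystem::path(p.path).filename().string();
}

int main() {
    // hypothetical path, for illustration only
    std::cout << resolve_model_name({"", "", "/models/qwen2.5-1.5b-q4_k_m.gguf"}) << "\n";
    // prints: qwen2.5-1.5b-q4_k_m.gguf
}
```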

@@ -1263,7 +1263,11 @@ json convert_anthropic_to_oai(const json & body) {
     return oai_body;
 }
 
-json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64) {
+json format_embeddings_response_oaicompat(
+    const json & request,
+    const std::string & model_name,
+    const json & embeddings,
+    bool use_base64) {
     json data = json::array();
     int32_t n_tokens = 0;
     int i = 0;
@@ -1293,7 +1297,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
     }
 
     json res = json {
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json {
             {"prompt_tokens", n_tokens},
@@ -1307,6 +1311,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
 
 json format_response_rerank(
     const json & request,
+    const std::string & model_name,
     const json & ranks,
     bool is_tei_format,
     std::vector<std::string> & texts,
@@ -1338,7 +1343,7 @@ json format_response_rerank(
     if (is_tei_format) return results;
 
     json res = json{
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json{
             {"prompt_tokens", n_tokens},

@@ -13,8 +13,6 @@
 #include <vector>
 #include <cinttypes>
 
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
-
 const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
 
 using json = nlohmann::ordered_json;
@@ -298,11 +296,16 @@ json oaicompat_chat_params_parse(
 json convert_anthropic_to_oai(const json & body);
 
 // TODO: move it to server-task.cpp
-json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);
+json format_embeddings_response_oaicompat(
+    const json & request,
+    const std::string & model_name,
+    const json & embeddings,
+    bool use_base64 = false);
 
 // TODO: move it to server-task.cpp
 json format_response_rerank(
     const json & request,
+    const std::string & model_name,
     const json & ranks,
     bool is_tei_format,
     std::vector<std::string> & texts,

@@ -17,6 +17,7 @@
 #include <cinttypes>
 #include <memory>
 #include <unordered_set>
+#include <filesystem>
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -518,6 +519,8 @@ struct server_context_impl {
     // Necessary similarity of prompt for slot selection
     float slot_prompt_similarity = 0.0f;
 
+    std::string model_name; // name of the loaded model, to be used by API
+
     common_chat_templates_ptr chat_templates;
 
     oaicompat_parser_options oai_parser_opt;
@@ -758,6 +761,18 @@ struct server_context_impl {
         }
         SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
 
+        if (!params_base.model_alias.empty()) {
+            // user explicitly specified model name
+            model_name = params_base.model_alias;
+        } else if (!params_base.model.name.empty()) {
+            // use model name in registry format (for models in cache)
+            model_name = params_base.model.name;
+        } else {
+            // fallback: derive model name from file name
+            auto model_path = std::filesystem::path(params_base.model.path);
+            model_name = model_path.filename().string();
+        }
+
         // thinking is enabled if:
         // 1. It's not explicitly disabled (reasoning_budget == 0)
         // 2. The chat template supports it
@@ -2611,7 +2626,7 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
             // OAI-compat
             task.params.res_type = res_type;
             task.params.oaicompat_cmpl_id = completion_id;
-            // oaicompat_model is already populated by params_from_json_cmpl
+            task.params.oaicompat_model = ctx_server.model_name;
 
             tasks.push_back(std::move(task));
         }
@@ -2939,7 +2954,7 @@ void server_routes::init_routes() {
         json data = {
             { "default_generation_settings", default_generation_settings_for_props },
             { "total_slots", ctx_server.params_base.n_parallel },
-            { "model_alias", ctx_server.params_base.model_alias },
+            { "model_alias", ctx_server.model_name },
             { "model_path", ctx_server.params_base.model.path },
             { "modalities", json {
                 {"vision", ctx_server.oai_parser_opt.allow_image},
@@ -3181,8 +3196,8 @@ void server_routes::init_routes() {
         json models = {
             {"models", {
                 {
-                    {"name", params.model_alias.empty() ? params.model.path : params.model_alias},
-                    {"model", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"name", ctx_server.model_name},
+                    {"model", ctx_server.model_name},
                     {"modified_at", ""},
                     {"size", ""},
                     {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
@@ -3204,7 +3219,7 @@ void server_routes::init_routes() {
             {"object", "list"},
             {"data", {
                 {
-                    {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"id", ctx_server.model_name},
                     {"object", "model"},
                     {"created", std::time(0)},
                     {"owned_by", "llamacpp"},
@@ -3351,6 +3366,7 @@ void server_routes::init_routes() {
         // write JSON response
         json root = format_response_rerank(
             body,
+            ctx_server.model_name,
             responses,
             is_tei_format,
             documents,
@@ -3613,7 +3629,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
 
     // write JSON response
     json root = res_type == TASK_RESPONSE_TYPE_OAI_EMBD
-        ? format_embeddings_response_oaicompat(body, responses, use_base64)
+        ? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64)
         : json(responses);
     res->ok(root);
     return res;

@@ -450,9 +450,6 @@ task_params server_task::params_from_json_cmpl(
 
         }
     }
 
-    std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias;
-    params.oaicompat_model = json_value(data, "model", model_name);
-
     return params;
 }
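
For completions, the request's `"model"` field used to be echoed back via `params_from_json_cmpl` (the lines removed above); the completions path now always stamps the task with `ctx_server.model_name` (see the `handle_completions_impl` hunk earlier). A small contrast of the two behaviors, assuming `nlohmann::json` for the request body; the request's `"gpt-4o"` value is made up, while `"llama-test-model"` matches the alias used in the updated test below.

```cpp
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    const std::string server_model_name = "llama-test-model";      // server-side name (e.g. set via an alias)
    const json request = {{"model", "gpt-4o"}, {"stream", false}};  // hypothetical client request

    // Old behavior (removed): reflect whatever "model" the client sent,
    // falling back to the hard-coded "gpt-3.5-turbo" default.
    const std::string old_reported = request.value("model", std::string("gpt-3.5-turbo"));

    // New behavior: always report the name resolved at load time.
    const std::string new_reported = server_model_name;

    std::cout << old_reported << " -> " << new_reported << "\n"; // gpt-4o -> llama-test-model
}
```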

@@ -41,7 +41,8 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
     assert res.status_code == 200
     assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
     assert res.body["system_fingerprint"].startswith("b")
-    assert res.body["model"] == model if model is not None else server.model_alias
+    # we no longer reflect back the model name, see https://github.com/ggml-org/llama.cpp/pull/17668
+    # assert res.body["model"] == model if model is not None else server.model_alias
     assert res.body["usage"]["prompt_tokens"] == n_prompt
     assert res.body["usage"]["completion_tokens"] == n_predicted
     choice = res.body["choices"][0]
@@ -59,7 +60,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
 )
 def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
     global server
-    server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL
+    server.model_alias = "llama-test-model"
     server.start()
     res = server.make_stream_request("POST", "/chat/completions", data={
         "max_tokens": max_tokens,
@@ -81,7 +82,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
         else:
             assert "role" not in choice["delta"]
         assert data["system_fingerprint"].startswith("b")
-        assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
+        assert data["model"] == "llama-test-model"
         if last_cmpl_id is None:
             last_cmpl_id = data["id"]
         assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream