diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
index 3a6fb1dc80..6c5e898ba6 100644
Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ
diff --git a/tools/server/webui/src/lib/services/completion.ts b/tools/server/webui/src/lib/services/completion.ts
index 096c9bc838..52ce257afd 100644
--- a/tools/server/webui/src/lib/services/completion.ts
+++ b/tools/server/webui/src/lib/services/completion.ts
@@ -72,8 +72,7 @@ export class CompletionService {
if (temperature !== undefined) requestBody.temperature = temperature;
if (max_tokens !== undefined) {
- // On the completion endpoint, max_tokens is called n_predict
- requestBody.n_predict = max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
+ requestBody.max_tokens = max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
}
if (dynatemp_range !== undefined) requestBody.dynatemp_range = dynatemp_range;
@@ -83,7 +82,10 @@ export class CompletionService {
if (min_p !== undefined) requestBody.min_p = min_p;
if (xtc_probability !== undefined) requestBody.xtc_probability = xtc_probability;
if (xtc_threshold !== undefined) requestBody.xtc_threshold = xtc_threshold;
- if (typ_p !== undefined) requestBody.typ_p = typ_p;
+ if (typ_p !== undefined) {
+ // On the completion endpoint, typ_p is called typical_p
+ requestBody.typical_p = typ_p;
+ }
if (repeat_last_n !== undefined) requestBody.repeat_last_n = repeat_last_n;
if (repeat_penalty !== undefined) requestBody.repeat_penalty = repeat_penalty;
diff --git a/tools/server/webui/src/lib/types/api.d.ts b/tools/server/webui/src/lib/types/api.d.ts
index 1a304f2b7c..b8a742f23e 100644
--- a/tools/server/webui/src/lib/types/api.d.ts
+++ b/tools/server/webui/src/lib/types/api.d.ts
@@ -219,23 +219,35 @@ export interface ApiChatCompletionRequest {
timings_per_token?: boolean;
}
+// Reference: https://github.com/ggml-org/llama.cpp/tree/master/tools/server#post-completion-given-a-prompt-it-returns-the-predicted-completion
export interface ApiCompletionRequest {
prompt: string;
stream?: boolean;
- cache_prompt?: boolean;
model?: string;
+ // Response configuration
+ return_progress?: boolean;
+ return_tokens?: boolean;
+ timings_per_token?: boolean;
+ post_sampling_probs?: boolean;
+ response_fields?: string[];
// Generation parameters
temperature?: number;
- n_predict?: number;
// Sampling parameters
dynatemp_range?: number;
dynatemp_exponent?: number;
top_k?: number;
top_p?: number;
min_p?: number;
+ // The endpoint accepts either n_predict or max_tokens
+ max_tokens?: number;
+ n_indent?: number;
+ n_keep?: number;
+ n_cmpl?: number;
+ n_cache_reuse?: number;
+ stop?: string[];
+ typical_p?: number;
xtc_probability?: number;
xtc_threshold?: number;
- typ_p?: number;
// Penalty parameters
repeat_last_n?: number;
repeat_penalty?: number;
@@ -245,12 +257,24 @@ export interface ApiCompletionRequest {
dry_base?: number;
dry_allowed_length?: number;
dry_penalty_last_n?: number;
+ dry_sequence_breakers?: string[];
+ mirostat?: number;
+ mirostat_tau?: number;
+ mirostat_eta?: number;
+ grammar?: string;
+ json_schema?: string;
+ seed?: number;
+ ignore_eos?: boolean;
+ n_probs?: number;
+ min_keep?: number;
+ t_max_predict_ms?: number;
+ id_slot?: number;
+ cache_prompt?: boolean;
// Sampler configuration
samplers?: string[];
backend_sampling?: boolean;
// Custom parameters (JSON string)
custom?: Record<string, unknown>;
- timings_per_token?: boolean;
}
export interface ApiChatCompletionToolCallFunctionDelta {