diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
index 73ed15b55b..c90fae85c8 100644
Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ
diff --git a/tools/server/webui/src/lib/services/completion.ts b/tools/server/webui/src/lib/services/completion.ts
index 016ca4838c..02dc4bb995 100644
--- a/tools/server/webui/src/lib/services/completion.ts
+++ b/tools/server/webui/src/lib/services/completion.ts
@@ -1,11 +1,7 @@
 import { getJsonHeaders } from '$lib/utils';
 import { ChatService } from '$lib/services/chat';
 
-import type {
-	ApiCompletionRequest,
-	ApiCompletionResponse,
-	ApiCompletionStreamChunk
-} from '$lib/types/api';
+import type { ApiCompletionRequest, ApiCompletionStreamChunk } from '$lib/types/api';
 import type { ChatMessageTimings, ChatMessagePromptProgress } from '$lib/types/chat';
 import type { SettingsChatServiceOptions } from '$lib/types/settings';
 
@@ -16,11 +12,11 @@ import type { SettingsChatServiceOptions } from '$lib/types/settings';
 export class CompletionService {
 	/**
 	 * Sends a completion request to the llama.cpp server.
-	 * Supports both streaming and non-streaming responses.
+	 * Supports only streaming responses.
 	 *
 	 * @param prompt - The text prompt to complete
 	 * @param options - Configuration options for the completion request
-	 * @returns {Promise} that resolves to the complete response string (non-streaming) or void (streaming)
+	 * @returns {Promise} that resolves to void
 	 * @throws {Error} if the request fails or is aborted
 	 */
 	static async sendCompletion(
@@ -29,7 +25,6 @@ export class CompletionService {
 		signal?: AbortSignal
 	): Promise {
 		const {
-			stream,
 			onChunk,
 			onComplete,
 			onError,
@@ -63,9 +58,14 @@ export class CompletionService {
 			timings_per_token
 		} = options;
 
+		// We only support streaming responses
+		const stream: boolean = true;
+		const cache_prompt: boolean = true;
+
 		const requestBody: ApiCompletionRequest = {
 			prompt,
-			stream
+			stream,
+			cache_prompt
 		};
 
 		// Include model in request if provided
@@ -75,7 +75,8 @@ export class CompletionService {
 		if (temperature !== undefined) requestBody.temperature = temperature;
 
 		if (max_tokens !== undefined) {
-			requestBody.max_tokens = max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
+			// On the completion endpoint, max_tokens is called n_predict
+			requestBody.n_predict = max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
 		}
 
 		if (dynatemp_range !== undefined) requestBody.dynatemp_range = dynatemp_range;
@@ -131,25 +132,16 @@ export class CompletionService {
 				throw error;
 			}
 
-			if (stream) {
-				await CompletionService.handleCompletionStreamResponse(
-					response,
-					onChunk,
-					onComplete,
-					onError,
-					onModel,
-					onTimings,
-					signal
-				);
-				return;
-			} else {
-				return CompletionService.handleCompletionNonStreamResponse(
-					response,
-					onComplete,
-					onError,
-					onModel
-				);
-			}
+			await CompletionService.handleCompletionStreamResponse(
+				response,
+				onChunk,
+				onComplete,
+				onError,
+				onModel,
+				onTimings,
+				signal
+			);
+			return;
 		} catch (error) {
 			if (error instanceof Error && error.name === 'AbortError') {
 				console.log('Completion request was aborted');
@@ -299,49 +291,4 @@ export class CompletionService {
 			reader.releaseLock();
 		}
 	}
-
-	/**
-	 * Handles non-streaming response from the completion API
-	 */
-	private static async handleCompletionNonStreamResponse(
-		response: Response,
-		onComplete?: (
-			response: string,
-			reasoningContent?: string,
-			timings?: ChatMessageTimings,
-			toolCalls?: string
-		) => void,
-		onError?: (error: Error) => void,
-		onModel?: (model: string) => void
-	): Promise {
-		try {
-			const responseText = await response.text();
-
-			if (!responseText.trim()) {
-				const noResponseError = new Error('No response received from server. Please try again.');
-				throw noResponseError;
-			}
-
-			const data: ApiCompletionResponse = JSON.parse(responseText);
-
-			if (data.model) {
-				onModel?.(data.model);
-			}
-
-			const content = data.content || '';
-
-			if (!content.trim()) {
-				const noResponseError = new Error('No response received from server. Please try again.');
-				throw noResponseError;
-			}
-
-			onComplete?.(content, undefined, data.timings, undefined);
-
-			return content;
-		} catch (error) {
-			const err = error instanceof Error ? error : new Error('Parse error');
-			onError?.(err);
-			throw err;
-		}
-	}
 }
diff --git a/tools/server/webui/src/lib/types/api.d.ts b/tools/server/webui/src/lib/types/api.d.ts
index 33e4a92c86..1a304f2b7c 100644
--- a/tools/server/webui/src/lib/types/api.d.ts
+++ b/tools/server/webui/src/lib/types/api.d.ts
@@ -222,10 +222,11 @@ export interface ApiChatCompletionRequest {
 export interface ApiCompletionRequest {
 	prompt: string;
 	stream?: boolean;
+	cache_prompt?: boolean;
 	model?: string;
 	// Generation parameters
 	temperature?: number;
-	max_tokens?: number;
+	n_predict?: number;
 	// Sampling parameters
 	dynatemp_range?: number;
 	dynatemp_exponent?: number;
@@ -305,19 +306,6 @@ export interface ApiCompletionStreamChunk {
 	prompt_progress?: ChatMessagePromptProgress;
 }
 
-export interface ApiCompletionResponse {
-	content: string;
-	stop: boolean;
-	model: string;
-	timings?: {
-		prompt_n?: number;
-		prompt_ms?: number;
-		predicted_n?: number;
-		predicted_ms?: number;
-		cache_n?: number;
-	};
-}
-
 export interface ApiChatCompletionResponse {
 	model?: string;
 	choices: Array<{
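
Below is a minimal caller sketch, not part of the patch, showing how the streaming-only sendCompletion is expected to be used after this change. The method shape and the option/callback names (temperature, max_tokens, onChunk, onComplete, onError) come from the diff; the prompt text, sampling values, and the assumption that onChunk receives incremental text are illustrative only.

// Illustrative usage sketch — not part of the patch.
// Assumes these fields exist on SettingsChatServiceOptions (the diff destructures
// temperature, max_tokens, onChunk, onComplete, onError from options) and that
// onChunk is invoked with incremental text from the streamed /completion response.
import { CompletionService } from '$lib/services/completion';

const controller = new AbortController();

await CompletionService.sendCompletion(
	'Write a haiku about llamas:',
	{
		temperature: 0.7,
		// Callers keep passing max_tokens; the service maps it to n_predict for /completion
		max_tokens: 128,
		onChunk: (chunk) => console.log('partial:', chunk),
		onComplete: (finalText) => console.log('done:', finalText),
		onError: (err) => console.error('completion failed:', err)
	},
	controller.signal
);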