diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
index e2f96687d0..faa4e5cc7a 100644
Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ
diff --git a/tools/server/webui/src/lib/services/chat.ts b/tools/server/webui/src/lib/services/chat.ts
index a7f2e0b6b0..cd9193c666 100644
--- a/tools/server/webui/src/lib/services/chat.ts
+++ b/tools/server/webui/src/lib/services/chat.ts
@@ -259,176 +259,7 @@ export class ChatService {
}
}
- /**
- * Sends a completion request to the llama.cpp server.
- * Supports both streaming and non-streaming responses.
- *
- * @param prompt - The text prompt to complete
- * @param options - Configuration options for the completion request
- * @returns {Promise<string | void>} that resolves to the complete response string (non-streaming) or void (streaming)
- * @throws {Error} if the request fails or is aborted
- */
- static async sendCompletion(
- prompt: string,
- options: SettingsChatServiceOptions = {},
- signal?: AbortSignal
- ): Promise<string | void> {
- const {
- stream,
- onChunk,
- onComplete,
- onError,
- onModel,
- onTimings,
- // Generation parameters
- temperature,
- max_tokens,
- // Sampling parameters
- dynatemp_range,
- dynatemp_exponent,
- top_k,
- top_p,
- min_p,
- xtc_probability,
- xtc_threshold,
- typ_p,
- // Penalty parameters
- repeat_last_n,
- repeat_penalty,
- presence_penalty,
- frequency_penalty,
- dry_multiplier,
- dry_base,
- dry_allowed_length,
- dry_penalty_last_n,
- // Other parameters
- samplers,
- backend_sampling,
- custom,
- timings_per_token
- } = options;
- const requestBody: ApiCompletionRequest = {
- prompt,
- stream
- };
-
- // Include model in request if provided
- if (options.model) {
- requestBody.model = options.model;
- }
-
- if (temperature !== undefined) requestBody.temperature = temperature;
- if (max_tokens !== undefined) {
- requestBody.max_tokens = max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
- }
-
- if (dynatemp_range !== undefined) requestBody.dynatemp_range = dynatemp_range;
- if (dynatemp_exponent !== undefined) requestBody.dynatemp_exponent = dynatemp_exponent;
- if (top_k !== undefined) requestBody.top_k = top_k;
- if (top_p !== undefined) requestBody.top_p = top_p;
- if (min_p !== undefined) requestBody.min_p = min_p;
- if (xtc_probability !== undefined) requestBody.xtc_probability = xtc_probability;
- if (xtc_threshold !== undefined) requestBody.xtc_threshold = xtc_threshold;
- if (typ_p !== undefined) requestBody.typ_p = typ_p;
-
- if (repeat_last_n !== undefined) requestBody.repeat_last_n = repeat_last_n;
- if (repeat_penalty !== undefined) requestBody.repeat_penalty = repeat_penalty;
- if (presence_penalty !== undefined) requestBody.presence_penalty = presence_penalty;
- if (frequency_penalty !== undefined) requestBody.frequency_penalty = frequency_penalty;
- if (dry_multiplier !== undefined) requestBody.dry_multiplier = dry_multiplier;
- if (dry_base !== undefined) requestBody.dry_base = dry_base;
- if (dry_allowed_length !== undefined) requestBody.dry_allowed_length = dry_allowed_length;
- if (dry_penalty_last_n !== undefined) requestBody.dry_penalty_last_n = dry_penalty_last_n;
-
- if (samplers !== undefined) {
- requestBody.samplers =
- typeof samplers === 'string'
- ? samplers.split(';').filter((s: string) => s.trim())
- : samplers;
- }
-
- if (backend_sampling !== undefined) requestBody.backend_sampling = backend_sampling;
- if (timings_per_token !== undefined) requestBody.timings_per_token = timings_per_token;
-
- if (custom) {
- try {
- const customParams = typeof custom === 'string' ? JSON.parse(custom) : custom;
- Object.assign(requestBody, customParams);
- } catch (error) {
- console.warn('Failed to parse custom parameters:', error);
- }
- }
-
- try {
- const response = await fetch(`./completion`, {
- method: 'POST',
- headers: getJsonHeaders(),
- body: JSON.stringify(requestBody),
- signal
- });
-
- if (!response.ok) {
- const error = await ChatService.parseErrorResponse(response);
- if (onError) {
- onError(error);
- }
- throw error;
- }
-
- if (stream) {
- await ChatService.handleCompletionStreamResponse(
- response,
- onChunk,
- onComplete,
- onError,
- onModel,
- onTimings,
- signal
- );
- return;
- } else {
- return ChatService.handleCompletionNonStreamResponse(
- response,
- onComplete,
- onError,
- onModel
- );
- }
- } catch (error) {
- if (error instanceof Error && error.name === 'AbortError') {
- console.log('Completion request was aborted');
- return;
- }
-
- let userFriendlyError: Error;
-
- if (error instanceof Error) {
- if (error.name === 'TypeError' && error.message.includes('fetch')) {
- userFriendlyError = new Error(
- 'Unable to connect to server - please check if the server is running'
- );
- userFriendlyError.name = 'NetworkError';
- } else if (error.message.includes('ECONNREFUSED')) {
- userFriendlyError = new Error('Connection refused - server may be offline');
- userFriendlyError.name = 'NetworkError';
- } else if (error.message.includes('ETIMEDOUT')) {
- userFriendlyError = new Error('Request timed out - the server took too long to respond');
- userFriendlyError.name = 'TimeoutError';
- } else {
- userFriendlyError = error;
- }
- } else {
- userFriendlyError = new Error('Unknown error occurred while sending completion');
- }
-
- console.error('Error in sendCompletion:', error);
- if (onError) {
- onError(userFriendlyError);
- }
- throw userFriendlyError;
- }
- }
// ─────────────────────────────────────────────────────────────────────────────
// Streaming
@@ -869,7 +700,7 @@ export class ChatService {
* @param response - HTTP response object
* @returns Promise<Error> - Parsed error with context info if available
*/
- private static async parseErrorResponse(
+ public static async parseErrorResponse(
response: Response
): Promise<Error> {
try {
@@ -912,7 +743,7 @@ export class ChatService {
* @returns Model name string if found, undefined otherwise
- * @private
+ * @public
*/
- private static extractModelName(data: unknown): string | undefined {
+ public static extractModelName(data: unknown): string | undefined {
const asRecord = (value: unknown): Record<string, unknown> | undefined => {
return typeof value === 'object' && value !== null
? (value as Record<string, unknown>)
@@ -953,7 +784,7 @@ export class ChatService {
* @param onTimingsCallback - Callback function to invoke with timing data
- * @private
+ * @public
*/
- private static notifyTimings(
+ public static notifyTimings(
timings: ChatMessageTimings | undefined,
promptProgress: ChatMessagePromptProgress | undefined,
onTimingsCallback:
@@ -965,168 +796,6 @@ export class ChatService {
onTimingsCallback(timings, promptProgress);
}
- /**
- * Handles streaming response from the completion API
- */
- private static async handleCompletionStreamResponse(
- response: Response,
- onChunk?: (chunk: string) => void,
- onComplete?: (
- response: string,
- reasoningContent?: string,
- timings?: ChatMessageTimings,
- toolCalls?: string
- ) => void,
- onError?: (error: Error) => void,
- onModel?: (model: string) => void,
- onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
- abortSignal?: AbortSignal
- ): Promise<void> {
- const reader = response.body?.getReader();
- if (!reader) {
- throw new Error('No response body');
- }
-
- const decoder = new TextDecoder();
- let aggregatedContent = '';
- let lastTimings: ChatMessageTimings | undefined;
- let streamFinished = false;
- let modelEmitted = false;
-
- try {
- let chunk = '';
- while (true) {
- if (abortSignal?.aborted) {
- break;
- }
-
- const { done, value } = await reader.read();
- if (done) {
- break;
- }
-
- if (abortSignal?.aborted) {
- break;
- }
-
- chunk += decoder.decode(value, { stream: true });
- const lines = chunk.split('\n');
- chunk = lines.pop() || '';
-
- for (const line of lines) {
- if (abortSignal?.aborted) {
- break;
- }
-
- if (line.startsWith('data: ')) {
- const data = line.slice(6);
- if (data === '[DONE]') {
- streamFinished = true;
- continue;
- }
-
- try {
- const parsed: ApiCompletionStreamChunk = JSON.parse(data);
- const content = parsed.content;
- const timings = parsed.timings;
- const model = parsed.model;
- const promptProgress = parsed.prompt_progress;
-
- if (parsed.stop) {
- streamFinished = true;
- }
-
- if (model && !modelEmitted) {
- modelEmitted = true;
- onModel?.(model);
- }
-
- if (promptProgress) {
- ChatService.notifyTimings(undefined, promptProgress, onTimings);
- }
-
- if (timings) {
- ChatService.notifyTimings(timings, promptProgress, onTimings);
- lastTimings = timings;
- }
-
- if (content) {
- aggregatedContent += content;
- if (!abortSignal?.aborted) {
- onChunk?.(content);
- }
- }
- } catch (e) {
- console.error('Error parsing JSON chunk:', e);
- }
- }
- }
-
- if (streamFinished) {
- break;
- }
- }
-
- if (abortSignal?.aborted) {
- return;
- }
-
- if (streamFinished) {
- onComplete?.(aggregatedContent, undefined, lastTimings, undefined);
- }
- } catch (error) {
- const err = error instanceof Error ? error : new Error('Stream error');
- onError?.(err);
- throw err;
- } finally {
- reader.releaseLock();
- }
- }
-
- /**
- * Handles non-streaming response from the completion API
- */
- private static async handleCompletionNonStreamResponse(
- response: Response,
- onComplete?: (
- response: string,
- reasoningContent?: string,
- timings?: ChatMessageTimings,
- toolCalls?: string
- ) => void,
- onError?: (error: Error) => void,
- onModel?: (model: string) => void
- ): Promise<string> {
- try {
- const responseText = await response.text();
-
- if (!responseText.trim()) {
- const noResponseError = new Error('No response received from server. Please try again.');
- throw noResponseError;
- }
-
- const data: ApiCompletionResponse = JSON.parse(responseText);
-
- if (data.model) {
- onModel?.(data.model);
- }
-
- const content = data.content || '';
-
- if (!content.trim()) {
- const noResponseError = new Error('No response received from server. Please try again.');
- throw noResponseError;
- }
-
- onComplete?.(content, undefined, data.timings, undefined);
-
- return content;
- } catch (error) {
- const err = error instanceof Error ? error : new Error('Parse error');
- onError?.(err);
- throw err;
- }
- }
}
diff --git a/tools/server/webui/src/lib/services/completion.ts b/tools/server/webui/src/lib/services/completion.ts
new file mode 100644
index 0000000000..d74457a3bd
--- /dev/null
+++ b/tools/server/webui/src/lib/services/completion.ts
@@ -0,0 +1,353 @@
+import { getJsonHeaders } from '$lib/utils';
+import { ChatService } from '$lib/services/chat';
+
+import type {
+ ApiCompletionRequest,
+ ApiCompletionResponse,
+ ApiCompletionStreamChunk
+} from '$lib/types/api';
+import type { ChatMessageTimings, ChatMessagePromptProgress } from '$lib/types/chat';
+import type { SettingsChatServiceOptions } from '$lib/types/settings';
+
+/**
+ * CompletionService - Low-level API communication layer for raw text completions.
+ * Used in the notebook page.
+ */
+export class CompletionService {
+ /**
+ * Sends a completion request to the llama.cpp server.
+ * Supports both streaming and non-streaming responses.
+ *
+ * @param prompt - The text prompt to complete
+ * @param options - Configuration options for the completion request
+ * @returns {Promise<string | void>} that resolves to the complete response string (non-streaming) or void (streaming)
+ * @throws {Error} if the request fails or is aborted
+ */
+ static async sendCompletion(
+ prompt: string,
+ options: SettingsChatServiceOptions = {},
+ signal?: AbortSignal
+ ): Promise<string | void> {
+ const {
+ stream,
+ onChunk,
+ onComplete,
+ onError,
+ onModel,
+ onTimings,
+ // Generation parameters
+ temperature,
+ max_tokens,
+ // Sampling parameters
+ dynatemp_range,
+ dynatemp_exponent,
+ top_k,
+ top_p,
+ min_p,
+ xtc_probability,
+ xtc_threshold,
+ typ_p,
+ // Penalty parameters
+ repeat_last_n,
+ repeat_penalty,
+ presence_penalty,
+ frequency_penalty,
+ dry_multiplier,
+ dry_base,
+ dry_allowed_length,
+ dry_penalty_last_n,
+ // Other parameters
+ samplers,
+ backend_sampling,
+ custom,
+ timings_per_token
+ } = options;
+
+ const requestBody: ApiCompletionRequest = {
+ prompt,
+ stream
+ };
+
+ // Include model in request if provided
+ if (options.model) {
+ requestBody.model = options.model;
+ }
+
+ if (temperature !== undefined) requestBody.temperature = temperature;
+ if (max_tokens !== undefined) {
+ requestBody.max_tokens = max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
+ }
+
+ if (dynatemp_range !== undefined) requestBody.dynatemp_range = dynatemp_range;
+ if (dynatemp_exponent !== undefined) requestBody.dynatemp_exponent = dynatemp_exponent;
+ if (top_k !== undefined) requestBody.top_k = top_k;
+ if (top_p !== undefined) requestBody.top_p = top_p;
+ if (min_p !== undefined) requestBody.min_p = min_p;
+ if (xtc_probability !== undefined) requestBody.xtc_probability = xtc_probability;
+ if (xtc_threshold !== undefined) requestBody.xtc_threshold = xtc_threshold;
+ if (typ_p !== undefined) requestBody.typ_p = typ_p;
+
+ if (repeat_last_n !== undefined) requestBody.repeat_last_n = repeat_last_n;
+ if (repeat_penalty !== undefined) requestBody.repeat_penalty = repeat_penalty;
+ if (presence_penalty !== undefined) requestBody.presence_penalty = presence_penalty;
+ if (frequency_penalty !== undefined) requestBody.frequency_penalty = frequency_penalty;
+ if (dry_multiplier !== undefined) requestBody.dry_multiplier = dry_multiplier;
+ if (dry_base !== undefined) requestBody.dry_base = dry_base;
+ if (dry_allowed_length !== undefined) requestBody.dry_allowed_length = dry_allowed_length;
+ if (dry_penalty_last_n !== undefined) requestBody.dry_penalty_last_n = dry_penalty_last_n;
+
+ if (samplers !== undefined) {
+ requestBody.samplers =
+ typeof samplers === 'string'
+ ? samplers.split(';').filter((s: string) => s.trim())
+ : samplers;
+ }
+
+ if (backend_sampling !== undefined) requestBody.backend_sampling = backend_sampling;
+ if (timings_per_token !== undefined) requestBody.timings_per_token = timings_per_token;
+
+ if (custom) {
+ try {
+ const customParams = typeof custom === 'string' ? JSON.parse(custom) : custom;
+ Object.assign(requestBody, customParams);
+ } catch (error) {
+ console.warn('Failed to parse custom parameters:', error);
+ }
+ }
+
+ try {
+ const response = await fetch(`./completion`, {
+ method: 'POST',
+ headers: getJsonHeaders(),
+ body: JSON.stringify(requestBody),
+ signal
+ });
+
+ if (!response.ok) {
+ const error = await ChatService.parseErrorResponse(response);
+ if (onError) {
+ onError(error);
+ }
+ throw error;
+ }
+
+ if (stream) {
+ await CompletionService.handleCompletionStreamResponse(
+ response,
+ onChunk,
+ onComplete,
+ onError,
+ onModel,
+ onTimings,
+ signal
+ );
+ return;
+ } else {
+ return CompletionService.handleCompletionNonStreamResponse(
+ response,
+ onComplete,
+ onError,
+ onModel
+ );
+ }
+ } catch (error) {
+ if (error instanceof Error && error.name === 'AbortError') {
+ console.log('Completion request was aborted');
+ return;
+ }
+
+ let userFriendlyError: Error;
+
+ if (error instanceof Error) {
+ if (error.name === 'TypeError' && error.message.includes('fetch')) {
+ userFriendlyError = new Error(
+ 'Unable to connect to server - please check if the server is running'
+ );
+ userFriendlyError.name = 'NetworkError';
+ } else if (error.message.includes('ECONNREFUSED')) {
+ userFriendlyError = new Error('Connection refused - server may be offline');
+ userFriendlyError.name = 'NetworkError';
+ } else if (error.message.includes('ETIMEDOUT')) {
+ userFriendlyError = new Error('Request timed out - the server took too long to respond');
+ userFriendlyError.name = 'TimeoutError';
+ } else {
+ userFriendlyError = error;
+ }
+ } else {
+ userFriendlyError = new Error('Unknown error occurred while sending completion');
+ }
+
+ console.error('Error in sendCompletion:', error);
+ if (onError) {
+ onError(userFriendlyError);
+ }
+ throw userFriendlyError;
+ }
+ }
+
+ /**
+ * Handles streaming response from the completion API
+ */
+ private static async handleCompletionStreamResponse(
+ response: Response,
+ onChunk?: (chunk: string) => void,
+ onComplete?: (
+ response: string,
+ reasoningContent?: string,
+ timings?: ChatMessageTimings,
+ toolCalls?: string
+ ) => void,
+ onError?: (error: Error) => void,
+ onModel?: (model: string) => void,
+ onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
+ abortSignal?: AbortSignal
+ ): Promise<void> {
+ const reader = response.body?.getReader();
+
+ if (!reader) {
+ throw new Error('No response body');
+ }
+
+ const decoder = new TextDecoder();
+ let aggregatedContent = '';
+ let lastTimings: ChatMessageTimings | undefined;
+ let streamFinished = false;
+ let modelEmitted = false;
+
+ try {
+ let chunk = '';
+ while (true) {
+ if (abortSignal?.aborted) {
+ break;
+ }
+
+ const { done, value } = await reader.read();
+ if (done) {
+ break;
+ }
+
+ if (abortSignal?.aborted) {
+ break;
+ }
+
+ chunk += decoder.decode(value, { stream: true });
+ const lines = chunk.split('\n');
+ chunk = lines.pop() || '';
+
+ for (const line of lines) {
+ if (abortSignal?.aborted) {
+ break;
+ }
+
+ if (line.startsWith('data: ')) {
+ const data = line.slice(6);
+ if (data === '[DONE]') {
+ streamFinished = true;
+ continue;
+ }
+
+ try {
+ const parsed: ApiCompletionStreamChunk = JSON.parse(data);
+ const content = parsed.content;
+ const timings = parsed.timings;
+ const model = parsed.model;
+ const promptProgress = parsed.prompt_progress;
+
+ if (parsed.stop) {
+ streamFinished = true;
+ }
+
+ if (model && !modelEmitted) {
+ modelEmitted = true;
+ onModel?.(model);
+ }
+
+ if (promptProgress) {
+ ChatService.notifyTimings(undefined, promptProgress, onTimings);
+ }
+
+ if (timings) {
+ ChatService.notifyTimings(timings, promptProgress, onTimings);
+ lastTimings = timings;
+ }
+
+ if (content) {
+ aggregatedContent += content;
+ if (!abortSignal?.aborted) {
+ onChunk?.(content);
+ }
+ }
+ } catch (e) {
+ console.error('Error parsing JSON chunk:', e);
+ }
+ }
+ }
+
+ if (streamFinished) {
+ break;
+ }
+ }
+
+ if (abortSignal?.aborted) {
+ return;
+ }
+
+ if (streamFinished) {
+ onComplete?.(aggregatedContent, undefined, lastTimings, undefined);
+ }
+ } catch (error) {
+ const err = error instanceof Error ? error : new Error('Stream error');
+ onError?.(err);
+ throw err;
+ } finally {
+ reader.releaseLock();
+ }
+ }
+
+ /**
+ * Handles non-streaming response from the completion API
+ */
+ private static async handleCompletionNonStreamResponse(
+ response: Response,
+ onComplete?: (
+ response: string,
+ reasoningContent?: string,
+ timings?: ChatMessageTimings,
+ toolCalls?: string
+ ) => void,
+ onError?: (error: Error) => void,
+ onModel?: (model: string) => void
+ ): Promise<string> {
+ try {
+ const responseText = await response.text();
+
+ if (!responseText.trim()) {
+ const noResponseError = new Error('No response received from server. Please try again.');
+ throw noResponseError;
+ }
+
+ const data: ApiCompletionResponse = JSON.parse(responseText);
+
+ if (data.model) {
+ onModel?.(data.model);
+ }
+
+ const content = data.content || '';
+
+ if (!content.trim()) {
+ const noResponseError = new Error('No response received from server. Please try again.');
+ throw noResponseError;
+ }
+
+ onComplete?.(content, undefined, data.timings, undefined);
+
+ return content;
+ } catch (error) {
+ const err = error instanceof Error ? error : new Error('Parse error');
+ onError?.(err);
+ throw err;
+ }
+ }
+
+}
+
diff --git a/tools/server/webui/src/lib/stores/notebook.svelte.ts b/tools/server/webui/src/lib/stores/notebook.svelte.ts
index 70917def9c..4f7672db19 100644
--- a/tools/server/webui/src/lib/stores/notebook.svelte.ts
+++ b/tools/server/webui/src/lib/stores/notebook.svelte.ts
@@ -1,4 +1,4 @@
-import { ChatService } from '$lib/services/chat';
+import { CompletionService } from '$lib/services/completion';
import { config } from '$lib/stores/settings.svelte';
import { tokenize } from '$lib/services/tokenize';
@@ -45,7 +45,7 @@ export class NotebookStore {
try {
const currentConfig = config();
- await ChatService.sendCompletion(
+ await CompletionService.sendCompletion(
this.content,
{
...currentConfig,
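
For reviewers: a minimal sketch of how the relocated service is driven, mirroring the notebook store's call above. The method, option names, and callback signatures come from this diff; the prompt, sampling values, variable names, and logging are illustrative, and the call assumes an async context.

```ts
import { CompletionService } from '$lib/services/completion';

// Cancellation: passing an AbortSignal lets the caller stop generation
// mid-stream; sendCompletion resolves quietly on AbortError.
const controller = new AbortController();
let output = '';

await CompletionService.sendCompletion(
	'Once upon a time',
	{
		stream: true,
		// Any SettingsChatServiceOptions field shown in the destructuring
		// above (temperature, max_tokens, top_p, samplers, ...) passes through.
		temperature: 0.8,
		max_tokens: 256,
		onChunk: (chunk) => {
			output += chunk; // partial content as it arrives over SSE
		},
		onComplete: (response, _reasoning, timings) => {
			console.log(`done: ${response.length} chars`, timings);
		},
		onError: (err) => console.error('completion failed:', err)
	},
	controller.signal
);
```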