webui: Add option to pre-encode conversation for faster next turns (#21034)
This commit is contained in:
parent
b54cb2e3d0
commit
75511a8d7e
File diff suppressed because one or more lines are too long
|
|
@ -18,7 +18,7 @@
|
|||
<div style="display: contents">
|
||||
<script>
|
||||
{
|
||||
__sveltekit_1610ad9 = {
|
||||
__sveltekit_nl4lme = {
|
||||
base: new URL('.', location).pathname.slice(0, -1)
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -291,14 +291,19 @@
|
|||
title: SETTINGS_SECTION_TITLES.DEVELOPER,
|
||||
icon: Code,
|
||||
fields: [
|
||||
{
|
||||
key: SETTINGS_KEYS.PRE_ENCODE_CONVERSATION,
|
||||
label: 'Pre-fill KV cache after response',
|
||||
type: SettingsFieldType.CHECKBOX
|
||||
},
|
||||
{
|
||||
key: SETTINGS_KEYS.DISABLE_REASONING_PARSING,
|
||||
label: 'Disable reasoning content parsing',
|
||||
label: 'Disable server-side thinking extraction',
|
||||
type: SettingsFieldType.CHECKBOX
|
||||
},
|
||||
{
|
||||
key: SETTINGS_KEYS.EXCLUDE_REASONING_FROM_CONTEXT,
|
||||
label: 'Exclude reasoning from context',
|
||||
label: 'Strip thinking from message history',
|
||||
type: SettingsFieldType.CHECKBOX
|
||||
},
|
||||
{
|
||||
|
|
|
|||
|
|
@ -56,6 +56,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean |
|
|||
dry_penalty_last_n: undefined,
|
||||
max_tokens: undefined,
|
||||
custom: '', // custom json-stringified object
|
||||
preEncodeConversation: false,
|
||||
// experimental features
|
||||
pyInterpreterEnabled: false,
|
||||
enableContinueGeneration: false
|
||||
|
|
@ -106,9 +107,9 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
|
|||
custom: 'Custom JSON parameters to send to the API. Must be valid JSON format.',
|
||||
showThoughtInProgress: 'Expand thought process by default when generating messages.',
|
||||
disableReasoningParsing:
|
||||
'Send reasoning_format=none to prevent server-side extraction of reasoning tokens into separate field',
|
||||
'Send reasoning_format=none so the server returns thinking tokens inline instead of extracting them into a separate field.',
|
||||
excludeReasoningFromContext:
|
||||
'Strip reasoning content from previous messages before sending to the model. When unchecked, reasoning is sent back via the reasoning_content field so the model can see its own chain-of-thought across turns.',
|
||||
'Strip thinking from previous messages before sending. When off, thinking is sent back via the reasoning_content field so the model sees its own chain-of-thought across turns.',
|
||||
showRawOutputSwitch:
|
||||
'Show toggle button to display messages as plain text instead of Markdown-formatted content',
|
||||
keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
|
||||
|
|
@ -143,6 +144,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
|
|||
'Automatically expand tool call details while executing and keep them expanded after completion.',
|
||||
pyInterpreterEnabled:
|
||||
'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.',
|
||||
preEncodeConversation:
|
||||
'After each response, re-submit the conversation to pre-fill the server KV cache. Makes the next turn faster since the prompt is already encoded while you read the response.',
|
||||
enableContinueGeneration:
|
||||
'Enable "Continue" button for assistant messages. Currently works only with non-reasoning models.'
|
||||
};
|
||||
|
|
|
|||
|
|
@ -52,6 +52,8 @@ export const SETTINGS_KEYS = {
|
|||
ALWAYS_SHOW_AGENTIC_TURNS: 'alwaysShowAgenticTurns',
|
||||
AGENTIC_MAX_TOOL_PREVIEW_LINES: 'agenticMaxToolPreviewLines',
|
||||
SHOW_TOOL_CALL_IN_PROGRESS: 'showToolCallInProgress',
|
||||
// Performance
|
||||
PRE_ENCODE_CONVERSATION: 'preEncodeConversation',
|
||||
// Developer
|
||||
DISABLE_REASONING_PARSING: 'disableReasoningParsing',
|
||||
EXCLUDE_REASONING_FROM_CONTEXT: 'excludeReasoningFromContext',
|
||||
|
|
|
|||
|
|
@ -4,7 +4,8 @@ import { isAbortError } from '$lib/utils/abort';
|
|||
import {
|
||||
ATTACHMENT_LABEL_PDF_FILE,
|
||||
ATTACHMENT_LABEL_MCP_PROMPT,
|
||||
ATTACHMENT_LABEL_MCP_RESOURCE
|
||||
ATTACHMENT_LABEL_MCP_RESOURCE,
|
||||
LEGACY_AGENTIC_REGEX
|
||||
} from '$lib/constants';
|
||||
import {
|
||||
AttachmentType,
|
||||
|
|
@ -279,6 +280,107 @@ export class ChatService {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Checks whether every server slot is currently idle (not processing a request).
 * Queries the /slots endpoint (requires --slots flag on the server).
 *
 * This is a best-effort probe: when the endpoint answers with a non-OK status,
 * returns something other than an array, or the request throws for any reason
 * (including an abort), the method resolves to true.
 *
 * @param model - Optional model name to check slots for (required in ROUTER mode)
 * @param signal - Optional AbortSignal to cancel the request if needed
 * @returns Promise that resolves to true if all slots are idle (or their state could
 *   not be determined), false if any slot is processing
 */
static async areAllSlotsIdle(model?: string | null, signal?: AbortSignal): Promise<boolean> {
	try {
		const url = model ? `./slots?model=${encodeURIComponent(model)}` : './slots';
		const res = await fetch(url, { signal });
		if (!res.ok) return true;

		// The body comes from the network, so validate its shape before iterating.
		const payload: unknown = await res.json();
		if (!Array.isArray(payload)) return true;

		return payload.every((slot: { is_processing: boolean }) => !slot.is_processing);
	} catch {
		// Network errors, aborts and malformed payloads all fall back to "idle".
		return true;
	}
}
|
||||
|
||||
/**
 * Sends a fire-and-forget request to pre-encode the conversation in the server's KV cache.
 * After a response completes, this re-submits the full conversation using n_predict=0 and
 * stream=false so the server processes the prompt without generating tokens.
 * This warms the cache for the next turn, making it faster.
 *
 * When excludeReasoning is true, reasoning content is stripped from the messages
 * to match what sendMessage would send on the next turn (avoiding cache misses).
 * When false, reasoning_content is preserved so the cached prompt matches the next request.
 *
 * The method never throws: aborts are ignored silently, while network failures and
 * non-OK HTTP responses are reported with console.warn.
 *
 * @param messages - The full conversation including the latest assistant response
 * @param model - Optional model name (required in ROUTER mode)
 * @param excludeReasoning - Whether to strip reasoning content (should match the
 *   excludeReasoningFromContext setting)
 * @param signal - Optional AbortSignal to cancel the pre-encode request
 */
static async preEncode(
	messages: ApiChatMessageData[] | (DatabaseMessage & { extra?: DatabaseMessageExtra[] })[],
	model?: string | null,
	excludeReasoning?: boolean,
	signal?: AbortSignal
): Promise<void> {
	const normalizedMessages: ApiChatMessageData[] = messages
		.map((msg) => {
			// Database messages carry id/convId/timestamp; convert those to the API shape.
			if ('id' in msg && 'convId' in msg && 'timestamp' in msg) {
				return ChatService.convertDbMessageToApiChatMessageData(
					msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] }
				);
			}

			return msg as ApiChatMessageData;
		})
		.filter((msg) => {
			// Drop system messages whose string content is empty or whitespace-only.
			if (msg.role === MessageRole.SYSTEM) {
				const content = typeof msg.content === 'string' ? msg.content : '';

				return content.trim().length > 0;
			}

			return true;
		});

	const requestBody: Record<string, unknown> = {
		messages: normalizedMessages.map((msg: ApiChatMessageData) => {
			const mapped: Record<string, unknown> = {
				role: msg.role,
				content: excludeReasoning ? ChatService.stripReasoningContent(msg.content) : msg.content,
				tool_calls: msg.tool_calls,
				tool_call_id: msg.tool_call_id
			};

			if (!excludeReasoning && msg.reasoning_content) {
				mapped.reasoning_content = msg.reasoning_content;
			}

			return mapped;
		}),
		stream: false,
		n_predict: 0
	};

	if (model) {
		requestBody.model = model;
	}

	try {
		const response = await fetch(`./v1/chat/completions`, {
			method: 'POST',
			headers: getJsonHeaders(),
			body: JSON.stringify(requestBody),
			signal
		});

		// fetch only rejects on network failure, so HTTP errors must be checked explicitly.
		if (!response.ok) {
			console.warn(
				'[ChatService] Pre-encode request failed:',
				`HTTP ${response.status} ${response.statusText}`
			);
		}
	} catch (error) {
		if (!isAbortError(error)) {
			console.warn('[ChatService] Pre-encode request failed:', error);
		}
	}
}
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
|
|
@ -799,6 +901,28 @@ export class ChatService {
|
|||
*
|
||||
*/
|
||||
|
||||
/**
 * Removes legacy inline reasoning blocks from message content.
 * A plain string is cleaned and trimmed; for a multipart array, every text part with
 * non-empty text is cleaned the same way and all other parts are returned unchanged.
 */
private static stripReasoningContent(
	content: string | ApiChatMessageContentPart[]
): string | ApiChatMessageContentPart[] {
	function withoutReasoning(value: string): string {
		const cleaned = value.replace(LEGACY_AGENTIC_REGEX.REASONING_BLOCK, '');

		return cleaned.trim();
	}

	if (typeof content !== 'string') {
		return content.map((piece) =>
			piece.type === ContentPartType.TEXT && piece.text
				? { ...piece, text: withoutReasoning(piece.text) }
				: piece
		);
	}

	return withoutReasoning(content);
}
|
||||
|
||||
/**
|
||||
* Parses error response and creates appropriate error with context information
|
||||
* @param response - HTTP response object
|
||||
|
|
|
|||
|
|
@ -58,6 +58,7 @@ class ChatStore {
|
|||
chatLoadingStates = new SvelteMap<string, boolean>();
|
||||
chatStreamingStates = new SvelteMap<string, { response: string; messageId: string }>();
|
||||
private abortControllers = new SvelteMap<string, AbortController>();
|
||||
private preEncodeAbortController: AbortController | null = null;
|
||||
private processingStates = new SvelteMap<string, ApiProcessingState | null>();
|
||||
private conversationStateTimestamps = new SvelteMap<string, ConversationStateEntry>();
|
||||
private activeConversationId = $state<string | null>(null);
|
||||
|
|
@ -462,6 +463,9 @@ class ChatStore {
|
|||
const activeConv = conversationsStore.activeConversation;
|
||||
if (activeConv && this.isChatLoadingInternal(activeConv.id)) return;
|
||||
|
||||
// Cancel any in-flight pre-encode request
|
||||
this.cancelPreEncode();
|
||||
|
||||
// Consume MCP resource attachments - converts them to extras and clears the live store
|
||||
const resourceExtras = mcpStore.consumeResourceAttachmentsAsExtras();
|
||||
const allExtras = resourceExtras.length > 0 ? [...(extras || []), ...resourceExtras] : extras;
|
||||
|
|
@ -724,6 +728,16 @@ class ChatStore {
|
|||
|
||||
if (onComplete) onComplete(streamedContent);
|
||||
if (isRouterMode()) modelsStore.fetchRouterModels().catch(console.error);
|
||||
// Pre-encode conversation in KV cache for faster next turn
|
||||
if (config().preEncodeConversation) {
|
||||
this.triggerPreEncode(
|
||||
allMessages,
|
||||
assistantMessage,
|
||||
streamedContent,
|
||||
effectiveModel,
|
||||
!!config().excludeReasoningFromContext
|
||||
);
|
||||
}
|
||||
},
|
||||
onError: (error: Error) => {
|
||||
this.setStreamingActive(false);
|
||||
|
|
@ -911,6 +925,7 @@ class ChatStore {
|
|||
async regenerateMessage(messageId: string): Promise<void> {
|
||||
const activeConv = conversationsStore.activeConversation;
|
||||
if (!activeConv || this.isChatLoadingInternal(activeConv.id)) return;
|
||||
this.cancelPreEncode();
|
||||
const result = this.getMessageByIdWithRole(messageId, MessageRole.ASSISTANT);
|
||||
if (!result) return;
|
||||
const { index: messageIndex } = result;
|
||||
|
|
@ -940,6 +955,7 @@ class ChatStore {
|
|||
async regenerateMessageWithBranching(messageId: string, modelOverride?: string): Promise<void> {
|
||||
const activeConv = conversationsStore.activeConversation;
|
||||
if (!activeConv || this.isChatLoadingInternal(activeConv.id)) return;
|
||||
this.cancelPreEncode();
|
||||
try {
|
||||
const idx = conversationsStore.findMessageIndex(messageId);
|
||||
if (idx === -1) return;
|
||||
|
|
@ -1616,6 +1632,42 @@ class ChatStore {
|
|||
|
||||
return apiOptions;
|
||||
}
|
||||
|
||||
/**
 * Aborts the in-flight pre-encode request, if there is one, and forgets its controller.
 */
private cancelPreEncode(): void {
	const pending = this.preEncodeAbortController;
	if (!pending) return;

	pending.abort();
	this.preEncodeAbortController = null;
}
|
||||
|
||||
/**
 * Starts a background pre-encode of the conversation, replacing any previous one.
 *
 * Any pending pre-encode is cancelled first. The request is then only sent when
 * ChatService.areAllSlotsIdle reports that the server is idle and the new signal has
 * not been aborted in the meantime. Failures other than aborts are logged, never thrown.
 *
 * @param allMessages - Conversation messages preceding the assistant response
 * @param assistantMessage - The assistant message record for the completed response
 * @param assistantContent - Final content to use for the assistant message
 * @param model - Optional model name forwarded to the slot check and pre-encode request
 * @param excludeReasoning - Whether reasoning content is stripped from the request
 */
private async triggerPreEncode(
	allMessages: DatabaseMessage[],
	assistantMessage: DatabaseMessage,
	assistantContent: string,
	model?: string | null,
	excludeReasoning?: boolean
): Promise<void> {
	this.cancelPreEncode();

	const controller = new AbortController();
	this.preEncodeAbortController = controller;

	const { signal } = controller;

	try {
		const allIdle = await ChatService.areAllSlotsIdle(model, signal);
		if (!allIdle || signal.aborted) return;

		const messagesWithAssistant: DatabaseMessage[] = [
			...allMessages,
			{ ...assistantMessage, content: assistantContent }
		];

		await ChatService.preEncode(messagesWithAssistant, model, excludeReasoning, signal);
	} catch (err) {
		if (!isAbortError(err)) {
			console.warn('[ChatStore] Pre-encode failed:', err);
		}
	} finally {
		// Release the finished controller, unless a newer pre-encode has already replaced it.
		if (this.preEncodeAbortController === controller) {
			this.preEncodeAbortController = null;
		}
	}
}
|
||||
}
|
||||
|
||||
export const chatStore = new ChatStore();
|
||||
|
|
|
|||
Loading…
Reference in New Issue