webui: Add option to pre-encode conversation for faster next turns (#21034)

This commit is contained in:
Aleksander Grygier 2026-04-09 09:10:18 +02:00 committed by GitHub
parent b54cb2e3d0
commit 75511a8d7e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 267 additions and 81 deletions

File diff suppressed because one or more lines are too long

View File

@ -18,7 +18,7 @@
<div style="display: contents">
<script>
{
__sveltekit_1610ad9 = {
__sveltekit_nl4lme = {
base: new URL('.', location).pathname.slice(0, -1)
};

View File

@ -291,14 +291,19 @@
title: SETTINGS_SECTION_TITLES.DEVELOPER,
icon: Code,
fields: [
{
key: SETTINGS_KEYS.PRE_ENCODE_CONVERSATION,
label: 'Pre-fill KV cache after response',
type: SettingsFieldType.CHECKBOX
},
{
key: SETTINGS_KEYS.DISABLE_REASONING_PARSING,
label: 'Disable reasoning content parsing',
label: 'Disable server-side thinking extraction',
type: SettingsFieldType.CHECKBOX
},
{
key: SETTINGS_KEYS.EXCLUDE_REASONING_FROM_CONTEXT,
label: 'Exclude reasoning from context',
label: 'Strip thinking from message history',
type: SettingsFieldType.CHECKBOX
},
{

View File

@ -56,6 +56,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean |
dry_penalty_last_n: undefined,
max_tokens: undefined,
custom: '', // custom json-stringified object
preEncodeConversation: false,
// experimental features
pyInterpreterEnabled: false,
enableContinueGeneration: false
@ -106,9 +107,9 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
custom: 'Custom JSON parameters to send to the API. Must be valid JSON format.',
showThoughtInProgress: 'Expand thought process by default when generating messages.',
disableReasoningParsing:
'Send reasoning_format=none to prevent server-side extraction of reasoning tokens into separate field',
'Send reasoning_format=none so the server returns thinking tokens inline instead of extracting them into a separate field.',
excludeReasoningFromContext:
'Strip reasoning content from previous messages before sending to the model. When unchecked, reasoning is sent back via the reasoning_content field so the model can see its own chain-of-thought across turns.',
'Strip thinking from previous messages before sending. When off, thinking is sent back via the reasoning_content field so the model sees its own chain-of-thought across turns.',
showRawOutputSwitch:
'Show toggle button to display messages as plain text instead of Markdown-formatted content',
keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
@ -143,6 +144,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
'Automatically expand tool call details while executing and keep them expanded after completion.',
pyInterpreterEnabled:
'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.',
preEncodeConversation:
'After each response, re-submit the conversation to pre-fill the server KV cache. Makes the next turn faster since the prompt is already encoded while you read the response.',
enableContinueGeneration:
'Enable "Continue" button for assistant messages. Currently works only with non-reasoning models.'
};

View File

@ -52,6 +52,8 @@ export const SETTINGS_KEYS = {
ALWAYS_SHOW_AGENTIC_TURNS: 'alwaysShowAgenticTurns',
AGENTIC_MAX_TOOL_PREVIEW_LINES: 'agenticMaxToolPreviewLines',
SHOW_TOOL_CALL_IN_PROGRESS: 'showToolCallInProgress',
// Performance
PRE_ENCODE_CONVERSATION: 'preEncodeConversation',
// Developer
DISABLE_REASONING_PARSING: 'disableReasoningParsing',
EXCLUDE_REASONING_FROM_CONTEXT: 'excludeReasoningFromContext',

View File

@ -4,7 +4,8 @@ import { isAbortError } from '$lib/utils/abort';
import {
ATTACHMENT_LABEL_PDF_FILE,
ATTACHMENT_LABEL_MCP_PROMPT,
ATTACHMENT_LABEL_MCP_RESOURCE
ATTACHMENT_LABEL_MCP_RESOURCE,
LEGACY_AGENTIC_REGEX
} from '$lib/constants';
import {
AttachmentType,
@ -279,6 +280,107 @@ export class ChatService {
}
}
/**
 * Checks whether all server slots are currently idle (not processing any requests).
 * Queries the /slots endpoint (requires the --slots flag on the server).
 * If the endpoint is unavailable or errors out, resolves true (best-effort fallback).
 *
 * @param model - Optional model name to check slots for (required in ROUTER mode)
 * @param signal - Optional AbortSignal to cancel the request if needed
 * @returns Promise resolving to true when every slot is idle (or on any error), false otherwise
 */
static async areAllSlotsIdle(model?: string | null, signal?: AbortSignal): Promise<boolean> {
  try {
    let endpoint = './slots';
    if (model) {
      endpoint += `?model=${encodeURIComponent(model)}`;
    }
    const response = await fetch(endpoint, { signal });
    if (!response.ok) {
      return true;
    }
    const slotList: { is_processing: boolean }[] = await response.json();
    return !slotList.some((slot) => slot.is_processing);
  } catch {
    // Endpoint missing, aborted, or network failure — assume idle (best effort).
    return true;
  }
}
/**
* Sends a fire-and-forget request to pre-encode the conversation in the server's KV cache.
* After a response completes, this re-submits the full conversation
* using n_predict=0 and stream=false so the server processes the prompt without generating tokens.
* This warms the cache for the next turn, making it faster.
*
* When excludeReasoningFromContext is true, reasoning content is stripped from the messages
* to match what sendMessage would send on the next turn (avoiding cache misses).
* When false, reasoning_content is preserved so the cached prompt matches the next request.
*
* @param messages - The full conversation including the latest assistant response
* @param model - Optional model name (required in ROUTER mode)
* @param excludeReasoning - Whether to strip reasoning content (should match excludeReasoningFromContext setting)
* @param signal - Optional AbortSignal to cancel the pre-encode request
*/
static async preEncode(
messages: ApiChatMessageData[] | (DatabaseMessage & { extra?: DatabaseMessageExtra[] })[],
model?: string | null,
excludeReasoning?: boolean,
signal?: AbortSignal
): Promise<void> {
const normalizedMessages: ApiChatMessageData[] = messages
.map((msg) => {
if ('id' in msg && 'convId' in msg && 'timestamp' in msg) {
return ChatService.convertDbMessageToApiChatMessageData(
msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] }
);
}
return msg as ApiChatMessageData;
})
.filter((msg) => {
if (msg.role === MessageRole.SYSTEM) {
const content = typeof msg.content === 'string' ? msg.content : '';
return content.trim().length > 0;
}
return true;
});
const requestBody: Record<string, unknown> = {
messages: normalizedMessages.map((msg: ApiChatMessageData) => {
const mapped: Record<string, unknown> = {
role: msg.role,
content: excludeReasoning ? ChatService.stripReasoningContent(msg.content) : msg.content,
tool_calls: msg.tool_calls,
tool_call_id: msg.tool_call_id
};
if (!excludeReasoning && msg.reasoning_content) {
mapped.reasoning_content = msg.reasoning_content;
}
return mapped;
}),
stream: false,
n_predict: 0
};
if (model) {
requestBody.model = model;
}
try {
await fetch(`./v1/chat/completions`, {
method: 'POST',
headers: getJsonHeaders(),
body: JSON.stringify(requestBody),
signal
});
} catch (error) {
if (!isAbortError(error)) {
console.warn('[ChatService] Pre-encode request failed:', error);
}
}
}
/**
*
*
@ -799,6 +901,28 @@ export class ChatService {
*
*/
/**
 * Removes legacy inline reasoning tags from message content.
 * Handles both plain string content and multipart content arrays;
 * non-text parts are passed through untouched.
 */
private static stripReasoningContent(
  content: string | ApiChatMessageContentPart[]
): string | ApiChatMessageContentPart[] {
  const clean = (text: string): string =>
    text.replace(LEGACY_AGENTIC_REGEX.REASONING_BLOCK, '').trim();
  if (typeof content !== 'string') {
    return content.map((part) =>
      part.type === ContentPartType.TEXT && part.text
        ? { ...part, text: clean(part.text) }
        : part
    );
  }
  return clean(content);
}
/**
* Parses error response and creates appropriate error with context information
* @param response - HTTP response object

View File

@ -58,6 +58,7 @@ class ChatStore {
chatLoadingStates = new SvelteMap<string, boolean>();
chatStreamingStates = new SvelteMap<string, { response: string; messageId: string }>();
private abortControllers = new SvelteMap<string, AbortController>();
private preEncodeAbortController: AbortController | null = null;
private processingStates = new SvelteMap<string, ApiProcessingState | null>();
private conversationStateTimestamps = new SvelteMap<string, ConversationStateEntry>();
private activeConversationId = $state<string | null>(null);
@ -462,6 +463,9 @@ class ChatStore {
const activeConv = conversationsStore.activeConversation;
if (activeConv && this.isChatLoadingInternal(activeConv.id)) return;
// Cancel any in-flight pre-encode request
this.cancelPreEncode();
// Consume MCP resource attachments - converts them to extras and clears the live store
const resourceExtras = mcpStore.consumeResourceAttachmentsAsExtras();
const allExtras = resourceExtras.length > 0 ? [...(extras || []), ...resourceExtras] : extras;
@ -724,6 +728,16 @@ class ChatStore {
if (onComplete) onComplete(streamedContent);
if (isRouterMode()) modelsStore.fetchRouterModels().catch(console.error);
// Pre-encode conversation in KV cache for faster next turn
if (config().preEncodeConversation) {
this.triggerPreEncode(
allMessages,
assistantMessage,
streamedContent,
effectiveModel,
!!config().excludeReasoningFromContext
);
}
},
onError: (error: Error) => {
this.setStreamingActive(false);
@ -911,6 +925,7 @@ class ChatStore {
async regenerateMessage(messageId: string): Promise<void> {
const activeConv = conversationsStore.activeConversation;
if (!activeConv || this.isChatLoadingInternal(activeConv.id)) return;
this.cancelPreEncode();
const result = this.getMessageByIdWithRole(messageId, MessageRole.ASSISTANT);
if (!result) return;
const { index: messageIndex } = result;
@ -940,6 +955,7 @@ class ChatStore {
async regenerateMessageWithBranching(messageId: string, modelOverride?: string): Promise<void> {
const activeConv = conversationsStore.activeConversation;
if (!activeConv || this.isChatLoadingInternal(activeConv.id)) return;
this.cancelPreEncode();
try {
const idx = conversationsStore.findMessageIndex(messageId);
if (idx === -1) return;
@ -1616,6 +1632,42 @@ class ChatStore {
return apiOptions;
}
private cancelPreEncode(): void {
if (this.preEncodeAbortController) {
this.preEncodeAbortController.abort();
this.preEncodeAbortController = null;
}
}
private async triggerPreEncode(
allMessages: DatabaseMessage[],
assistantMessage: DatabaseMessage,
assistantContent: string,
model?: string | null,
excludeReasoning?: boolean
): Promise<void> {
this.cancelPreEncode();
this.preEncodeAbortController = new AbortController();
const signal = this.preEncodeAbortController.signal;
try {
const allIdle = await ChatService.areAllSlotsIdle(model, signal);
if (!allIdle || signal.aborted) return;
const messagesWithAssistant: DatabaseMessage[] = [
...allMessages,
{ ...assistantMessage, content: assistantContent }
];
await ChatService.preEncode(messagesWithAssistant, model, excludeReasoning, signal);
} catch (err) {
if (!isAbortError(err)) {
console.warn('[ChatStore] Pre-encode failed:', err);
}
}
}
}
export const chatStore = new ChatStore();