- {#if hasPromptStats}
+ {#if hasPromptStats || isLive}
- Generation (token output)
+
+ {isGenerationDisabled
+ ? 'Generation (waiting for tokens...)'
+ : 'Generation (token output)'}
+
- {#if activeView === ChatMessageStatsView.GENERATION}
+ {#if activeView === ChatMessageStatsView.GENERATION && hasGenerationStats}
 let lastKnownState = $state<ApiProcessingState | null>(null);
+ let lastKnownProcessingStats = $state<LiveProcessingStats | null>(null);
// Derive processing state reactively from chatStore's direct state
const processingState = $derived.by(() => {
@@ -46,6 +64,34 @@ export function useProcessingState(): UseProcessingStateReturn {
}
});
+ // Track last known processing stats for when promptProgress disappears
+ $effect(() => {
+ if (processingState?.promptProgress) {
+ const { processed, total, time_ms, cache } = processingState.promptProgress;
+ const actualProcessed = processed - cache;
+ const actualTotal = total - cache;
+
+ if (actualProcessed > 0 && time_ms > 0) {
+ const tokensPerSecond = actualProcessed / (time_ms / 1000);
+ lastKnownProcessingStats = {
+ tokensProcessed: actualProcessed,
+ totalTokens: actualTotal,
+ timeMs: time_ms,
+ tokensPerSecond
+ };
+ }
+ }
+ });
+
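+ /**
+ * Estimates the remaining prompt-processing time by extrapolating the elapsed
+ * time over the fraction of tokens still to process: elapsed * (total / done - 1)
+ */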
+ function getETASecs(done: number, total: number, elapsedMs: number): number | undefined {
+ const elapsedSecs = elapsedMs / 1000;
+ const progressETASecs =
+ done === 0 || elapsedSecs < 0.5
+ ? undefined // done can be 0 for the initial 0% progress report
+ : elapsedSecs * (total / done - 1);
+ return progressETASecs;
+ }
+
function startMonitoring(): void {
if (isMonitoring) return;
isMonitoring = true;
@@ -59,28 +105,25 @@ export function useProcessingState(): UseProcessingStateReturn {
const currentConfig = config();
if (!currentConfig.keepStatsVisible) {
lastKnownState = null;
+ lastKnownProcessingStats = null;
}
}
function getProcessingMessage(): string {
- const state = processingState;
- if (!state) {
+ if (!processingState) {
return 'Processing...';
}
- switch (state.status) {
+ switch (processingState.status) {
case 'initializing':
return 'Initializing...';
case 'preparing':
- if (state.progressPercent !== undefined) {
- return `Processing (${state.progressPercent}%)`;
+ if (processingState.progressPercent !== undefined) {
+ return `Processing (${processingState.progressPercent}%)`;
}
return 'Preparing response...';
case 'generating':
- if (state.tokensDecoded > 0) {
- return `Generating... (${state.tokensDecoded} tokens)`;
- }
- return 'Generating...';
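+ // live generation stats are rendered via getLiveGenerationStats(), so no status text is needed here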
+ return '';
default:
return 'Processing...';
}
@@ -131,8 +174,76 @@ export function useProcessingState(): UseProcessingStateReturn {
}
function shouldShowDetails(): boolean {
- const state = processingState;
- return state !== null && state.status !== 'idle';
+ return processingState !== null && processingState.status !== 'idle';
+ }
+
+ /**
+ * Returns a short progress message with percent
+ */
+ function getPromptProgressText(): string | null {
+ if (!processingState?.promptProgress) return null;
+
+ const { processed, total, cache } = processingState.promptProgress;
+
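+ // exclude cached tokens so the percentage reflects only the tokens actually being processed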
+ const actualProcessed = processed - cache;
+ const actualTotal = total - cache;
+ const percent = Math.round((actualProcessed / actualTotal) * 100);
+ const eta = getETASecs(actualProcessed, actualTotal, processingState.promptProgress.time_ms);
+
+ if (eta !== undefined) {
+ const etaSecs = Math.ceil(eta);
+ return `Processing ${percent}% (ETA: ${etaSecs}s)`;
+ }
+
+ return `Processing ${percent}%`;
+ }
+
+ /**
+ * Returns live processing statistics for display (prompt processing phase)
+ * Returns last known stats when promptProgress becomes unavailable
+ */
+ function getLiveProcessingStats(): LiveProcessingStats | null {
+ if (processingState?.promptProgress) {
+ const { processed, total, time_ms, cache } = processingState.promptProgress;
+
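+ // as above, "processed" and "total" include cached tokens; subtract the cache before computing throughput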
+ const actualProcessed = processed - cache;
+ const actualTotal = total - cache;
+
+ if (actualProcessed > 0 && time_ms > 0) {
+ const tokensPerSecond = actualProcessed / (time_ms / 1000);
+
+ return {
+ tokensProcessed: actualProcessed,
+ totalTokens: actualTotal,
+ timeMs: time_ms,
+ tokensPerSecond
+ };
+ }
+ }
+
+ // Return last known stats if promptProgress is no longer available
+ return lastKnownProcessingStats;
+ }
+
+ /**
+ * Returns live generation statistics for display (token generation phase)
+ */
+ function getLiveGenerationStats(): LiveGenerationStats | null {
+ if (!processingState) return null;
+
+ const { tokensDecoded, tokensPerSecond } = processingState;
+
+ if (tokensDecoded <= 0) return null;
+
+ // Calculate time from tokens and speed
+ const timeMs =
+ tokensPerSecond && tokensPerSecond > 0 ? (tokensDecoded / tokensPerSecond) * 1000 : 0;
+
+ return {
+ tokensGenerated: tokensDecoded,
+ timeMs,
+ tokensPerSecond: tokensPerSecond || 0
+ };
}
return {
@@ -141,6 +252,9 @@ export function useProcessingState(): UseProcessingStateReturn {
},
getProcessingDetails,
getProcessingMessage,
+ getPromptProgressText,
+ getLiveProcessingStats,
+ getLiveGenerationStats,
shouldShowDetails,
startMonitoring,
stopMonitoring
diff --git a/tools/server/webui/src/lib/services/chat.ts b/tools/server/webui/src/lib/services/chat.ts
index fb98d2c995..02fc6381c0 100644
--- a/tools/server/webui/src/lib/services/chat.ts
+++ b/tools/server/webui/src/lib/services/chat.ts
@@ -118,7 +118,8 @@ export class ChatService {
role: msg.role,
content: msg.content
})),
- stream
+ stream,
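+ // ask the server to include prompt_progress chunks in the streamed response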
+ return_progress: stream ? true : undefined
};
// Include model in request if provided (required in ROUTER mode)
@@ -274,7 +275,7 @@ export class ChatService {
onReasoningChunk?: (chunk: string) => void,
onToolCallChunk?: (chunk: string) => void,
onModel?: (model: string) => void,
- onTimings?: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
+ onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
conversationId?: string,
abortSignal?: AbortSignal
): Promise<void> {
@@ -369,11 +370,13 @@ export class ChatService {
onModel?.(chunkModel);
}
- if (timings || promptProgress) {
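+ // notify prompt progress on its own so the UI can update during the prompt-processing phase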
+ if (promptProgress) {
+ ChatService.notifyTimings(undefined, promptProgress, onTimings);
+ }
+
+ if (timings) {
ChatService.notifyTimings(timings, promptProgress, onTimings);
- if (timings) {
- lastTimings = timings;
- }
+ lastTimings = timings;
}
if (content) {
@@ -771,10 +774,11 @@ export class ChatService {
timings: ChatMessageTimings | undefined,
promptProgress: ChatMessagePromptProgress | undefined,
onTimingsCallback:
- | ((timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void)
+ | ((timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void)
| undefined
): void {
- if (!timings || !onTimingsCallback) return;
+ if (!onTimingsCallback || (!timings && !promptProgress)) return;
+
onTimingsCallback(timings, promptProgress);
}
}
diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts
index cc7dabdafc..879b2f3245 100644
--- a/tools/server/webui/src/lib/stores/chat.svelte.ts
+++ b/tools/server/webui/src/lib/stores/chat.svelte.ts
@@ -303,11 +303,17 @@ class ChatStore {
const currentConfig = config();
const outputTokensMax = currentConfig.max_tokens || -1;
+ // Note: for timings data, the n_prompt does NOT include cache tokens
const contextUsed = promptTokens + cacheTokens + predictedTokens;
const outputTokensUsed = predictedTokens;
+ // Note: for prompt progress, the "processed" DOES include cache tokens
+ // we need to exclude them to get the actual number of prompt tokens processed
+ const progressCache = promptProgress?.cache || 0;
+ const progressActualDone = (promptProgress?.processed ?? 0) - progressCache;
+ const progressActualTotal = (promptProgress?.total ?? 0) - progressCache;
const progressPercent = promptProgress
- ? Math.round((promptProgress.processed / promptProgress.total) * 100)
+ ? Math.round((progressActualDone / progressActualTotal) * 100)
: undefined;
return {
@@ -324,6 +330,7 @@ class ChatStore {
topP: currentConfig.top_p ?? 0.95,
speculative: false,
progressPercent,
+ promptProgress,
promptTokens,
promptMs,
cacheTokens
@@ -534,7 +541,7 @@ class ChatStore {
conversationsStore.updateMessageAtIndex(idx, { toolCalls: streamedToolCallContent });
},
onModel: (modelName: string) => recordModel(modelName),
- onTimings: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
+ onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
const tokensPerSecond =
timings?.predicted_ms && timings?.predicted_n
? (timings.predicted_n / timings.predicted_ms) * 1000
@@ -1032,7 +1039,7 @@ class ChatStore {
});
},
- onTimings: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
+ onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
const tokensPerSecond =
timings?.predicted_ms && timings?.predicted_n
? (timings.predicted_n / timings.predicted_ms) * 1000
diff --git a/tools/server/webui/src/lib/types/api.d.ts b/tools/server/webui/src/lib/types/api.d.ts
index babca4691c..714509f024 100644
--- a/tools/server/webui/src/lib/types/api.d.ts
+++ b/tools/server/webui/src/lib/types/api.d.ts
@@ -187,6 +187,7 @@ export interface ApiChatCompletionRequest {
}>;
stream?: boolean;
model?: string;
+ return_progress?: boolean;
// Reasoning parameters
reasoning_format?: string;
// Generation parameters
@@ -344,6 +345,7 @@ export interface ApiProcessingState {
tokensPerSecond?: number;
// Progress information from prompt_progress
progressPercent?: number;
+ promptProgress?: ChatMessagePromptProgress;
promptTokens?: number;
promptMs?: number;
cacheTokens?: number;
diff --git a/tools/server/webui/src/lib/types/settings.d.ts b/tools/server/webui/src/lib/types/settings.d.ts
index ecd5802fb6..38b3047dd0 100644
--- a/tools/server/webui/src/lib/types/settings.d.ts
+++ b/tools/server/webui/src/lib/types/settings.d.ts
@@ -52,7 +52,7 @@ export interface SettingsChatServiceOptions {
onReasoningChunk?: (chunk: string) => void;
onToolCallChunk?: (chunk: string) => void;
onModel?: (model: string) => void;
- onTimings?: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
+ onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
onComplete?: (
response: string,
reasoningContent?: string,