diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index fad15e38e9..d1c10eed91 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts b/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts index 4b24cfc691..c06cf28864 100644 --- a/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +++ b/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts @@ -6,6 +6,7 @@ export interface LiveProcessingStats { totalTokens: number; timeMs: number; tokensPerSecond: number; + etaSecs?: number; } export interface LiveGenerationStats { @@ -82,6 +83,15 @@ export function useProcessingState(): UseProcessingStateReturn { } }); + /** Estimates the remaining time in seconds by linear extrapolation: elapsedSecs * (total / done - 1), where elapsedSecs is elapsedMs / 1000. Returns undefined when done is 0 or when less than 0.5 seconds have elapsed, so no estimate is produced in those cases. */ function getETASecs(done: number, total: number, elapsedMs: number): number | undefined { + const elapsedSecs = elapsedMs / 1000; + const progressETASecs = + done === 0 || elapsedSecs < 0.5 + ? undefined // can be the case for the 0% progress report + : elapsedSecs * (total / done - 1); + return progressETASecs; + } + function startMonitoring(): void { if (isMonitoring) return; isMonitoring = true; @@ -178,6 +188,12 @@ export function useProcessingState(): UseProcessingStateReturn { const actualProcessed = processed - cache; const actualTotal = total - cache; const percent = Math.round((actualProcessed / actualTotal) * 100); + /* NOTE(review): actualTotal is total - cache; if that is 0 the percent computed just before this is NaN or Infinity - confirm the caller never reports total equal to cache. The ETA is rounded up with Math.ceil before display. */ const eta = getETASecs(actualProcessed, actualTotal, processingState.promptProgress.time_ms); + + if (eta !== undefined) { + const etaSecs = Math.ceil(eta); + return `Processing ${percent}% (ETA: ${etaSecs}s)`; + } return `Processing ${percent}%`; } diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts index 86d034e8be..67157e36ac 100644 --- a/tools/server/webui/src/lib/stores/chat.svelte.ts +++ b/tools/server/webui/src/lib/stores/chat.svelte.ts @@ -303,11 +303,17 @@ class ChatStore { const 
currentConfig = config(); const outputTokensMax = currentConfig.max_tokens || -1; + // Note: for timings data, the n_prompt does NOT include cache tokens const contextUsed = promptTokens + cacheTokens + predictedTokens; const outputTokensUsed = predictedTokens; + // Note: for prompt progress, the "processed" DOES include cache tokens + // we need to exclude them to get the real prompt tokens processed count + const progressCache = promptProgress?.cache || 0; + const progressActualDone = (promptProgress?.processed ?? 0) - progressCache; + /* NOTE(review): this value is 0 when total equals cache, which makes the progressPercent division that follows yield NaN or Infinity - confirm that case cannot be reported. */ const progressActualTotal = (promptProgress?.total ?? 0) - progressCache; const progressPercent = promptProgress - ? Math.round((promptProgress.processed / promptProgress.total) * 100) + ? Math.round((progressActualDone / progressActualTotal) * 100) : undefined; return {