diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
index fad15e38e9..d1c10eed91 100644
Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ
diff --git a/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts b/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts
index 4b24cfc691..c06cf28864 100644
--- a/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts
+++ b/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts
@@ -6,6 +6,7 @@ export interface LiveProcessingStats {
totalTokens: number;
timeMs: number;
tokensPerSecond: number;
+ etaSecs?: number;
}
export interface LiveGenerationStats {
@@ -82,6 +83,15 @@ export function useProcessingState(): UseProcessingStateReturn {
}
});
+ function getETASecs(done: number, total: number, elapsedMs: number): number | undefined {
+ const elapsedSecs = elapsedMs / 1000;
+ // No estimate at 0% progress (avoids divide-by-zero) or before 0.5s has elapsed.
+ if (done === 0 || elapsedSecs < 0.5) {
+ return undefined;
+ }
+ return elapsedSecs * (total / done - 1);
+ }
+
function startMonitoring(): void {
if (isMonitoring) return;
isMonitoring = true;
@@ -178,6 +188,12 @@ export function useProcessingState(): UseProcessingStateReturn {
const actualProcessed = processed - cache;
const actualTotal = total - cache;
const percent = Math.round((actualProcessed / actualTotal) * 100);
+ const eta = getETASecs(actualProcessed, actualTotal, processingState.promptProgress.time_ms);
+
+ if (eta !== undefined) {
+ const etaSecs = Math.ceil(eta);
+ return `Processing ${percent}% (ETA: ${etaSecs}s)`;
+ }
return `Processing ${percent}%`;
}
diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts
index 86d034e8be..67157e36ac 100644
--- a/tools/server/webui/src/lib/stores/chat.svelte.ts
+++ b/tools/server/webui/src/lib/stores/chat.svelte.ts
@@ -303,11 +303,17 @@ class ChatStore {
const currentConfig = config();
const outputTokensMax = currentConfig.max_tokens || -1;
+ // Note: for timings data, the n_prompt does NOT include cache tokens
const contextUsed = promptTokens + cacheTokens + predictedTokens;
const outputTokensUsed = predictedTokens;
+ // Note: for prompt progress, the "processed" DOES include cache tokens
+ // we must exclude them to get the actual number of prompt tokens processed
+ const progressCache = promptProgress?.cache || 0;
+ const progressActualDone = (promptProgress?.processed ?? 0) - progressCache;
+ const progressActualTotal = (promptProgress?.total ?? 0) - progressCache;
const progressPercent = promptProgress
- ? Math.round((promptProgress.processed / promptProgress.total) * 100)
+ ? Math.round((progressActualDone / progressActualTotal) * 100)
: undefined;
return {