Send reasoning content back to the model across turns via the reasoning_content API field (#21036)

* webui: send reasoning_content back to model in context

  Preserve assistant reasoning across turns by extracting it from internal tags and sending it as a separate reasoning_content field in the API payload. The server and Jinja templates handle native formatting (e.g. <think> tags for Qwen, GLM, DeepSeek...).

  Adds an "Exclude reasoning from context" toggle in Settings > Developer (off by default, so reasoning is preserved). Includes unit tests.

* webui: add syncable parameter for excludeReasoningFromContext

* chore: update webui build output
2026-03-27 08:17:35 +01:00 · 2026-03-27 08:17:35 +01:00 · d0fa2c9fbb
parent 9bcb4eff4d
commit d0fa2c9fbb
11 changed files with 281 additions and 9 deletions
--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte
@ -296,6 +296,11 @@
 					label: 'Disable reasoning content parsing',
 					type: SettingsFieldType.CHECKBOX
 				},
+				{
+					key: SETTINGS_KEYS.EXCLUDE_REASONING_FROM_CONTEXT,
+					label: 'Exclude reasoning from context',
+					type: SettingsFieldType.CHECKBOX
+				},
 				{
 					key: SETTINGS_KEYS.SHOW_RAW_OUTPUT_SWITCH,
 					label: 'Enable raw output toggle',
--- a/tools/server/webui/src/lib/constants/agentic.ts
+++ b/tools/server/webui/src/lib/constants/agentic.ts
@ -50,6 +50,8 @@ export const AGENTIC_REGEX = {
 	PARTIAL_MARKER: /<<<[A-Za-z_]*$/,
 	// Matches reasoning content blocks (including tags)
 	REASONING_BLOCK: /<<<reasoning_content_start>>>[\s\S]*?<<<reasoning_content_end>>>/g,
+	// Captures the reasoning text between start/end tags
+	REASONING_EXTRACT: /<<<reasoning_content_start>>>([\s\S]*?)<<<reasoning_content_end>>>/,
 	// Matches an opening reasoning tag and any remaining content (unterminated)
 	REASONING_OPEN: /<<<reasoning_content_start>>>[\s\S]*$/,
 	// Matches a complete agentic tool call display block (start to end marker)
--- a/tools/server/webui/src/lib/constants/settings-config.ts
+++ b/tools/server/webui/src/lib/constants/settings-config.ts
@ -10,6 +10,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean |
 	theme: ColorMode.SYSTEM,
 	showThoughtInProgress: false,
 	disableReasoningParsing: false,
+	excludeReasoningFromContext: false,
 	showRawOutputSwitch: false,
 	keepStatsVisible: false,
 	showMessageStats: true,
@ -106,6 +107,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
 	showThoughtInProgress: 'Expand thought process by default when generating messages.',
 	disableReasoningParsing:
 		'Send reasoning_format=none to prevent server-side extraction of reasoning tokens into separate field',
+	excludeReasoningFromContext:
+		'Strip reasoning content from previous messages before sending to the model. When unchecked, reasoning is sent back via the reasoning_content field so the model can see its own chain-of-thought across turns.',
 	showRawOutputSwitch:
 		'Show toggle button to display messages as plain text instead of Markdown-formatted content',
 	keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
--- a/tools/server/webui/src/lib/constants/settings-keys.ts
+++ b/tools/server/webui/src/lib/constants/settings-keys.ts
@ -54,6 +54,7 @@ export const SETTINGS_KEYS = {
 	SHOW_TOOL_CALL_IN_PROGRESS: 'showToolCallInProgress',
 	// Developer
 	DISABLE_REASONING_PARSING: 'disableReasoningParsing',
+	EXCLUDE_REASONING_FROM_CONTEXT: 'excludeReasoningFromContext',
 	SHOW_RAW_OUTPUT_SWITCH: 'showRawOutputSwitch',
 	CUSTOM: 'custom'
 } as const;
--- a/tools/server/webui/src/lib/services/chat.service.ts
+++ b/tools/server/webui/src/lib/services/chat.service.ts
@ -57,6 +57,46 @@ export class ChatService {
 	 *
 	 */

+	/**
+	 * Collects the text captured by AGENTIC_REGEX.REASONING_EXTRACT from a message's content.
+	 *
+	 * Only blocks that have both a start and an end tag are captured; the captures are
+	 * concatenated in order of appearance with no separator.
+	 *
+	 * @param content - Message content: a plain string, an array of content parts, or nullish.
+	 * @returns The concatenated reasoning text, or `undefined` when nothing was captured.
+	 */
+	private static extractReasoningFromContent(
+		content: ApiChatMessageData['content'] | null | undefined
+	): string | undefined {
+		if (!content) return undefined;
+
+		// Scan with a private global copy of the pattern so the shared constant's
+		// lastIndex is never read or written.
+		const collect = (text: string): string => {
+			const pattern = new RegExp(AGENTIC_REGEX.REASONING_EXTRACT.source, 'g');
+			let captured = '';
+			for (const found of text.matchAll(pattern)) {
+				captured += found[1];
+			}
+			return captured;
+		};
+
+		if (typeof content === 'string') {
+			return collect(content) || undefined;
+		}
+
+		if (!Array.isArray(content)) return undefined;
+
+		// Only text parts can carry reasoning tags; every other part type is skipped.
+		let combined = '';
+		for (const part of content) {
+			if (part.type !== ContentPartType.TEXT || !part.text) continue;
+			combined += collect(part.text);
+		}
+		return combined || undefined;
+	}
+
 	/**
 	 * Sends a chat completion request to the llama.cpp server.
 	 * Supports both streaming and non-streaming responses with comprehensive parameter configuration.
@ -111,7 +151,8 @@ export class ChatService {
 			custom,
 			timings_per_token,
 			// Config options
-			disableReasoningParsing
+			disableReasoningParsing,
+			excludeReasoningFromContext
 		} = options;

 		const normalizedMessages: ApiChatMessageData[] = messages
@ -159,14 +200,24 @@ export class ChatService {
 		}

 		const requestBody: ApiChatCompletionRequest = {
-			messages: normalizedMessages.map((msg: ApiChatMessageData) => ({
-				role: msg.role,
-				// Strip reasoning tags/content from the prompt to avoid polluting KV cache.
-				// TODO: investigate backend expectations for reasoning tags and add a toggle if needed.
-				content: ChatService.stripReasoningContent(msg.content),
-				tool_calls: msg.tool_calls,
-				tool_call_id: msg.tool_call_id
-			})),
+			messages: normalizedMessages.map((msg: ApiChatMessageData) => {
+				// Always strip internal reasoning/agentic tags from content
+				const cleanedContent = ChatService.stripReasoningContent(msg.content);
+				const mapped: ApiChatCompletionRequest['messages'][0] = {
+					role: msg.role,
+					content: cleanedContent,
+					tool_calls: msg.tool_calls,
+					tool_call_id: msg.tool_call_id
+				};
+				// When preserving reasoning, extract it from raw content and send as separate field
+				if (!excludeReasoningFromContext) {
+					const reasoning = ChatService.extractReasoningFromContent(msg.content);
+					if (reasoning) {
+						mapped.reasoning_content = reasoning;
+					}
+				}
+				return mapped;
+			}),
 			stream,
 			return_progress: stream ? true : undefined,
 			tools: tools && tools.length > 0 ? tools : undefined
--- a/tools/server/webui/src/lib/services/parameter-sync.service.ts
+++ b/tools/server/webui/src/lib/services/parameter-sync.service.ts
@ -227,6 +227,12 @@ export const SYNCABLE_PARAMETERS: SyncableParameter[] = [
 		serverKey: 'alwaysShowAgenticTurns',
 		type: SyncableParameterType.BOOLEAN,
 		canSync: true
+	},
+	{
+		key: 'excludeReasoningFromContext',
+		serverKey: 'excludeReasoningFromContext',
+		type: SyncableParameterType.BOOLEAN,
+		canSync: true
 	}
 ];

--- a/tools/server/webui/src/lib/stores/chat.svelte.ts
+++ b/tools/server/webui/src/lib/stores/chat.svelte.ts
@ -1479,6 +1479,8 @@ class ChatStore {

 		if (currentConfig.disableReasoningParsing) apiOptions.disableReasoningParsing = true;

+		if (currentConfig.excludeReasoningFromContext) apiOptions.excludeReasoningFromContext = true;
+
 		if (hasValue(currentConfig.temperature))
 			apiOptions.temperature = Number(currentConfig.temperature);

--- a/tools/server/webui/src/lib/types/api.d.ts
+++ b/tools/server/webui/src/lib/types/api.d.ts
@ -45,6 +45,7 @@ export interface ApiErrorResponse {
 export interface ApiChatMessageData {
 	role: ChatRole;
 	content: string | ApiChatMessageContentPart[];
+	reasoning_content?: string;
 	tool_calls?: ApiChatCompletionToolCall[];
 	tool_call_id?: string;
 	timestamp?: number;
@ -201,6 +202,9 @@ export interface ApiChatCompletionRequest {
 	messages: Array<{
 		role: ChatRole;
 		content: string | ApiChatMessageContentPart[];
+		reasoning_content?: string;
+		tool_calls?: ApiChatCompletionToolCall[];
+		tool_call_id?: string;
 	}>;
 	stream?: boolean;
 	model?: string;
--- a/tools/server/webui/src/lib/types/settings.d.ts
+++ b/tools/server/webui/src/lib/types/settings.d.ts
@ -24,6 +24,8 @@ export interface SettingsChatServiceOptions {
 	systemMessage?: string;
 	// Disable reasoning parsing (use 'none' instead of 'auto')
 	disableReasoningParsing?: boolean;
+	// Strip reasoning content from context before sending
+	excludeReasoningFromContext?: boolean;
 	tools?: OpenAIToolDefinition[];
 	// Generation parameters
 	temperature?: number;
--- a/tools/server/webui/tests/unit/reasoning-context.test.ts
+++ b/tools/server/webui/tests/unit/reasoning-context.test.ts
@ -0,0 +1,196 @@
+import { describe, it, expect } from 'vitest';
+import { AGENTIC_REGEX, REASONING_TAGS } from '$lib/constants/agentic';
+import { ContentPartType } from '$lib/enums';
+
+// Replicate ChatService.extractReasoningFromContent (private static).
+// Returns the concatenated captures of AGENTIC_REGEX.REASONING_EXTRACT, or undefined
+// when the content is empty or holds no complete reasoning block.
+function extractReasoningFromContent(
+	content: string | Array<{ type: string; text?: string }> | null | undefined
+): string | undefined {
+	if (!content) return undefined;
+
+	// Scan with a private global copy of the pattern so no lastIndex state is shared.
+	const collect = (text: string): string => {
+		const pattern = new RegExp(AGENTIC_REGEX.REASONING_EXTRACT.source, 'g');
+		let captured = '';
+		for (const found of text.matchAll(pattern)) {
+			captured += found[1];
+		}
+		return captured;
+	};
+
+	if (typeof content === 'string') {
+		return collect(content) || undefined;
+	}
+
+	if (!Array.isArray(content)) return undefined;
+
+	// Only text parts are inspected; other part types are skipped.
+	let combined = '';
+	for (const part of content) {
+		if (part.type !== ContentPartType.TEXT || !part.text) continue;
+		combined += collect(part.text);
+	}
+	return combined || undefined;
+}
+
+// Replicate ChatService.stripReasoningContent (private static).
+// Removes reasoning blocks and agentic tool-call blocks from string content or from the
+// text parts of a content array; nullish/empty and non-array values pass through untouched.
+function stripReasoningContent(
+	content: string | Array<{ type: string; text?: string }> | null | undefined
+): typeof content {
+	// Apply the four removal patterns in a fixed order to a single string.
+	const scrub = (text: string): string =>
+		[
+			AGENTIC_REGEX.REASONING_BLOCK,
+			AGENTIC_REGEX.REASONING_OPEN,
+			AGENTIC_REGEX.AGENTIC_TOOL_CALL_BLOCK,
+			AGENTIC_REGEX.AGENTIC_TOOL_CALL_OPEN
+		].reduce((acc, pattern) => acc.replace(pattern, ''), text);
+
+	if (!content) return content;
+	if (typeof content === 'string') return scrub(content);
+	if (!Array.isArray(content)) return content;
+
+	// Non-text parts (and text parts with no text) are returned as the same object.
+	return content.map((part) =>
+		part.type === ContentPartType.TEXT && part.text ? { ...part, text: scrub(part.text) } : part
+	);
+}
+
+// Simulate the message mapping logic from ChatService.sendMessage:
+// content is always stripped of internal tags, and reasoning_content is attached
+// only when reasoning is being preserved and some reasoning was actually extracted.
+function buildApiMessage(
+	content: string,
+	excludeReasoningFromContext: boolean
+): { role: string; content: string; reasoning_content?: string } {
+	const visibleText = stripReasoningContent(content) as string;
+	const reasoning = excludeReasoningFromContext
+		? undefined
+		: extractReasoningFromContent(content);
+
+	return {
+		role: 'assistant',
+		content: visibleText,
+		// Leave the key out entirely (rather than setting it to undefined) when there is nothing to send
+		...(reasoning ? { reasoning_content: reasoning } : {})
+	};
+}
+
+// Helper: build stored-message text the same way the chat store does during streaming,
+// i.e. START tag + reasoning + END tag, immediately followed by the visible content.
+function wrapReasoning(reasoning: string, content: string): string {
+	return [REASONING_TAGS.START, reasoning, REASONING_TAGS.END, content].join('');
+}
+
+// Covers extractReasoningFromContent for string input, content-part arrays and empty input.
+describe('reasoning content extraction', () => {
+	it('extracts reasoning from tagged string content', () => {
+		const tagged = wrapReasoning('step 1, step 2', 'The answer is 42.');
+		expect(extractReasoningFromContent(tagged)).toBe('step 1, step 2');
+	});
+
+	it('returns undefined when no reasoning tags present', () => {
+		const extracted = extractReasoningFromContent('Just a normal response.');
+		expect(extracted).toBeUndefined();
+	});
+
+	it('returns undefined for null/empty input', () => {
+		for (const empty of [null, undefined, '']) {
+			expect(extractReasoningFromContent(empty)).toBeUndefined();
+		}
+	});
+
+	it('extracts reasoning from content part arrays', () => {
+		const textPart = {
+			type: ContentPartType.TEXT,
+			text: wrapReasoning('thinking hard', 'result')
+		};
+		expect(extractReasoningFromContent([textPart])).toBe('thinking hard');
+	});
+
+	it('handles multiple reasoning blocks', () => {
+		// Two closed blocks separated by visible text; captures are joined with no separator
+		const { START, END } = REASONING_TAGS;
+		const input = `${START}block1${END}middle${START}block2${END}end`;
+		expect(extractReasoningFromContent(input)).toBe('block1block2');
+	});
+
+	it('ignores non-text content parts', () => {
+		const imagePart = { type: 'image_url', text: wrapReasoning('hidden', 'img') };
+		expect(extractReasoningFromContent([imagePart])).toBeUndefined();
+	});
+});
+
+// Covers stripReasoningContent for both supported content shapes.
+describe('strip reasoning content', () => {
+	it('removes reasoning tags from string content', () => {
+		const stripped = stripReasoningContent(wrapReasoning('internal thoughts', 'visible answer'));
+		expect(stripped).toBe('visible answer');
+	});
+
+	it('removes reasoning from content part arrays', () => {
+		const parts = [{ type: ContentPartType.TEXT, text: wrapReasoning('thoughts', 'answer') }];
+		const [first] = stripReasoningContent(parts) as Array<{ type: string; text?: string }>;
+		expect(first.text).toBe('answer');
+	});
+});
+
+// Covers the combined strip + extract mapping for both values of excludeReasoningFromContext.
+describe('API message building with reasoning preservation', () => {
+	const reasoningText = 'Let me think: 2+2=4, basic arithmetic.';
+	const answerText = 'The answer is 4.';
+	const storedContent = wrapReasoning(reasoningText, answerText);
+
+	it('preserves reasoning_content when excludeReasoningFromContext is false', () => {
+		const { content, reasoning_content } = buildApiMessage(storedContent, false);
+		expect(content).toBe('The answer is 4.');
+		expect(reasoning_content).toBe('Let me think: 2+2=4, basic arithmetic.');
+		// no internal tags leak into either field
+		expect(content).not.toContain('<<<');
+		expect(reasoning_content).not.toContain('<<<');
+	});
+
+	it('strips reasoning_content when excludeReasoningFromContext is true', () => {
+		const { content, reasoning_content } = buildApiMessage(storedContent, true);
+		expect(content).toBe('The answer is 4.');
+		expect(reasoning_content).toBeUndefined();
+	});
+
+	it('handles content with no reasoning in both modes', () => {
+		const plain = 'No reasoning here.';
+		for (const exclude of [false, true]) {
+			const msg = buildApiMessage(plain, exclude);
+			expect(msg.content).toBe(plain);
+			expect(msg.reasoning_content).toBeUndefined();
+		}
+	});
+
+	it('cleans agentic tool call blocks from content even when preserving reasoning', () => {
+		// A closed tool-call display block appended after the visible text
+		const toolCallBlock = [
+			'\n\n<<<AGENTIC_TOOL_CALL_START>>>\n',
+			'<<<TOOL_NAME:bash>>>\n',
+			'<<<TOOL_ARGS_START>>>\n{}\n<<<TOOL_ARGS_END>>>\nout\n',
+			'<<<AGENTIC_TOOL_CALL_END>>>\n'
+		].join('');
+		const msg = buildApiMessage(wrapReasoning('plan', 'text') + toolCallBlock, false);
+		expect(msg.content).not.toContain('<<<');
+		expect(msg.reasoning_content).toBe('plan');
+	});
+});