fix(markdown): use rune-based truncation to prevent invalid UTF-8

2025-11-21 17:04:24 +01:00 · 2025-11-21 17:04:24 +01:00 · b8e491882c
parent d69435c97c
commit b8e491882c
2 changed files with 64 additions and 4 deletions
--- a/plugin/markdown/markdown.go
+++ b/plugin/markdown/markdown.go
@ -389,15 +389,18 @@ func uniqueLowercase(strs []string) []string {
 }

 // truncateAtWord truncates a string at the last word boundary before maxLength.
+// maxLength is counted in runes (Unicode code points), not bytes, so the cut never lands inside a multi-byte UTF-8 sequence.
 func truncateAtWord(s string, maxLength int) string {
-	if len(s) <= maxLength {
+	// Convert to runes to handle multi-byte UTF-8 characters correctly
+	runes := []rune(s)
+	if len(runes) <= maxLength {
 		return s
 	}

-	// Truncate to max length
-	truncated := s[:maxLength]
+	// Truncate to max length (by rune count, not byte count)
+	truncated := string(runes[:maxLength])

-	// Find last space
+	// Find the last whitespace character (space, tab, newline, or carriage return) to avoid cutting mid-word
 	lastSpace := strings.LastIndexAny(truncated, " \t\n\r")
 	if lastSpace > 0 {
 		truncated = truncated[:lastSpace]
--- a/plugin/markdown/markdown_test.go
+++ b/plugin/markdown/markdown_test.go
@ -382,16 +382,73 @@ func TestTruncateAtWord(t *testing.T) {
 			maxLength: 10,
 			expected:  "supercalif ...",
 		},
+		{
+			name:      "Chinese characters - no truncation",
+			input:     "这是一段中文测试文本",
+			maxLength: 50,
+			expected:  "这是一段中文测试文本",
+		},
+		{
+			name:      "Chinese characters - truncate at word boundary",
+			input:     "这是一段比较长的中文测试文本，用来验证截断功能是否正常工作",
+			maxLength: 20,
+			expected:  "这是一段比较长的中文测试文本，用来验证截 ...",
+		},
+		{
+			name:      "Chinese characters - truncate mid-sentence",
+			input:     "这是一段比较长的中文测试文本，用来验证截断功能是否正常工作",
+			maxLength: 15,
+			expected:  "这是一段比较长的中文测试文本， ...",
+		},
+		{
+			name:      "Mixed English and Chinese",
+			input:     "This is a test 这是一个测试 with mixed content",
+			maxLength: 20,
+			expected:  "This is a test ...",
+		},
+		{
+			name:      "Japanese characters",
+			input:     "日本語のテキストを切り詰めるテスト",
+			maxLength: 10,
+			expected:  "日本語のテキストを切 ...",
+		},
+		{
+			name:      "Korean characters",
+			input:     "한국어 텍스트 잘라내기 테스트입니다",
+			maxLength: 10,
+			expected:  "한국어 텍스트 ...",
+		},
+		{
+			name:      "Emoji characters",
+			input:     "Hello 👋 World 🌍 with emoji 😊",
+			maxLength: 15,
+			expected:  "Hello 👋 World ...",
+		},
+		{
+			name:      "UTF-8 boundary test - exactly at character",
+			input:     "测试",
+			maxLength: 2,
+			expected:  "测试",
+		},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			result := truncateAtWord(tt.input, tt.maxLength)
 			assert.Equal(t, tt.expected, result)
+			// Verify result is always valid UTF-8
+			assert.True(t, isValidUTF8(result), "Result should be valid UTF-8: %q", result)
 		})
 	}
 }

+// isValidUTF8 reports whether s is valid UTF-8 by checking that it survives a round trip through []rune unchanged.
+func isValidUTF8(s string) bool {
+	// Converting to []rune turns every invalid byte into U+FFFD, so re-encoding yields a string that differs from s
+	runes := []rune(s)
+	return string(runes) == s
+}
+
 // Benchmark tests.
 func BenchmarkGenerateSnippet(b *testing.B) {
 	svc := NewService()