diff --git a/plugin/markdown/markdown.go b/plugin/markdown/markdown.go index c6498beb4..54f972e60 100644 --- a/plugin/markdown/markdown.go +++ b/plugin/markdown/markdown.go @@ -389,15 +389,18 @@ func uniqueLowercase(strs []string) []string { } // truncateAtWord truncates a string at the last word boundary before maxLength. +// maxLength is counted in runes (Unicode characters), not bytes, to safely handle multi-byte UTF-8 characters. func truncateAtWord(s string, maxLength int) string { - if len(s) <= maxLength { + // Convert to runes to handle multi-byte UTF-8 characters correctly + runes := []rune(s) + if len(runes) <= maxLength { return s } - // Truncate to max length - truncated := s[:maxLength] + // Truncate to max length (by rune count, not byte count) + truncated := string(runes[:maxLength]) - // Find last space + // Find last space to avoid cutting mid-word lastSpace := strings.LastIndexAny(truncated, " \t\n\r") if lastSpace > 0 { truncated = truncated[:lastSpace] diff --git a/plugin/markdown/markdown_test.go b/plugin/markdown/markdown_test.go index 21a9f08cf..62fa616d4 100644 --- a/plugin/markdown/markdown_test.go +++ b/plugin/markdown/markdown_test.go @@ -382,16 +382,73 @@ func TestTruncateAtWord(t *testing.T) { maxLength: 10, expected: "supercalif ...", }, + { + name: "Chinese characters - no truncation", + input: "这是一段中文测试文本", + maxLength: 50, + expected: "这是一段中文测试文本", + }, + { + name: "Chinese characters - truncate at word boundary", + input: "这是一段比较长的中文测试文本,用来验证截断功能是否正常工作", + maxLength: 20, + expected: "这是一段比较长的中文测试文本,用来验证截 ...", + }, + { + name: "Chinese characters - truncate mid-sentence", + input: "这是一段比较长的中文测试文本,用来验证截断功能是否正常工作", + maxLength: 15, + expected: "这是一段比较长的中文测试文本, ...", + }, + { + name: "Mixed English and Chinese", + input: "This is a test 这是一个测试 with mixed content", + maxLength: 20, + expected: "This is a test ...", + }, + { + name: "Japanese characters", + input: "日本語のテキストを切り詰めるテスト", + maxLength: 10, + expected: "日本語のテキストを切 ...", + }, 
// isValidUTF8 reports whether s consists solely of well-formed UTF-8.
//
// Ranging over a string yields U+FFFD for every byte that does not start a
// valid encoding, so the only way to tell a malformed byte from a genuinely
// encoded replacement character is to look at the bytes at that offset.
func isValidUTF8(s string) bool {
	const replacement = "\uFFFD" // UTF-8 encoding of the replacement character

	for offset, r := range s {
		if r != '\uFFFD' {
			continue
		}
		// A decoded U+FFFD is legitimate only when the input really
		// contains its encoding here; otherwise it marks a malformed byte.
		remaining := s[offset:]
		if len(remaining) < len(replacement) || remaining[:len(replacement)] != replacement {
			return false
		}
	}
	return true
}