fix(markdown): use rune-based truncation to prevent invalid UTF-8

This commit is contained in:
gitkeniwo 2025-11-21 17:04:24 +01:00
parent d69435c97c
commit b8e491882c
2 changed files with 64 additions and 4 deletions

View File

@ -389,15 +389,18 @@ func uniqueLowercase(strs []string) []string {
}
// truncateAtWord truncates a string at the last word boundary before maxLength.
// maxLength is counted in runes (Unicode characters), not bytes, to safely handle multi-byte UTF-8 characters.
func truncateAtWord(s string, maxLength int) string {
if len(s) <= maxLength {
// Convert to runes to handle multi-byte UTF-8 characters correctly
runes := []rune(s)
if len(runes) <= maxLength {
return s
}
// Truncate to max length
truncated := s[:maxLength]
// Truncate to max length (by rune count, not byte count)
truncated := string(runes[:maxLength])
// Find last space
// Find last space to avoid cutting mid-word
lastSpace := strings.LastIndexAny(truncated, " \t\n\r")
if lastSpace > 0 {
truncated = truncated[:lastSpace]

View File

@ -382,16 +382,73 @@ func TestTruncateAtWord(t *testing.T) {
maxLength: 10,
expected: "supercalif ...",
},
{
name: "Chinese characters - no truncation",
input: "这是一段中文测试文本",
maxLength: 50,
expected: "这是一段中文测试文本",
},
{
name: "Chinese characters - truncate at word boundary",
input: "这是一段比较长的中文测试文本,用来验证截断功能是否正常工作",
maxLength: 20,
expected: "这是一段比较长的中文测试文本,用来验证截 ...",
},
{
name: "Chinese characters - truncate mid-sentence",
input: "这是一段比较长的中文测试文本,用来验证截断功能是否正常工作",
maxLength: 15,
expected: "这是一段比较长的中文测试文本, ...",
},
{
name: "Mixed English and Chinese",
input: "This is a test 这是一个测试 with mixed content",
maxLength: 20,
expected: "This is a test ...",
},
{
name: "Japanese characters",
input: "日本語のテキストを切り詰めるテスト",
maxLength: 10,
expected: "日本語のテキストを切 ...",
},
{
name: "Korean characters",
input: "한국어 텍스트 잘라내기 테스트입니다",
maxLength: 10,
expected: "한국어 텍스트 ...",
},
{
name: "Emoji characters",
input: "Hello 👋 World 🌍 with emoji 😊",
maxLength: 15,
expected: "Hello 👋 World ...",
},
{
name: "UTF-8 boundary test - exactly at character",
input: "测试",
maxLength: 2,
expected: "测试",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := truncateAtWord(tt.input, tt.maxLength)
assert.Equal(t, tt.expected, result)
// Verify result is always valid UTF-8
assert.True(t, isValidUTF8(result), "Result should be valid UTF-8: %q", result)
})
}
}
// isValidUTF8 reports whether s consists entirely of well-formed UTF-8.
//
// Ranging over a string decodes every malformed byte as U+FFFD while
// advancing a single byte, whereas a genuine U+FFFD is stored as the three
// bytes "\xef\xbf\xbd". Any decoded U+FFFD that is not backed by those exact
// bytes therefore marks invalid input.
func isValidUTF8(s string) bool {
	const replacement = "\uFFFD"
	for i, r := range s {
		if r != '\uFFFD' {
			continue
		}
		// Distinguish a literal replacement character from a decode failure.
		if len(s)-i < len(replacement) || s[i:i+len(replacement)] != replacement {
			return false
		}
	}
	return true
}
// Benchmark tests.
func BenchmarkGenerateSnippet(b *testing.B) {
svc := NewService()