mirror of https://github.com/usememos/memos.git
fix(markdown): use rune-based truncation to prevent invalid UTF-8
This commit is contained in:
parent
d69435c97c
commit
b8e491882c
|
|
@ -389,15 +389,18 @@ func uniqueLowercase(strs []string) []string {
|
|||
}
|
||||
|
||||
// truncateAtWord truncates a string at the last word boundary before maxLength.
|
||||
// maxLength is counted in runes (Unicode characters), not bytes, to safely handle multi-byte UTF-8 characters.
|
||||
func truncateAtWord(s string, maxLength int) string {
|
||||
if len(s) <= maxLength {
|
||||
// Convert to runes to handle multi-byte UTF-8 characters correctly
|
||||
runes := []rune(s)
|
||||
if len(runes) <= maxLength {
|
||||
return s
|
||||
}
|
||||
|
||||
// Truncate to max length
|
||||
truncated := s[:maxLength]
|
||||
// Truncate to max length (by rune count, not byte count)
|
||||
truncated := string(runes[:maxLength])
|
||||
|
||||
// Find last space
|
||||
// Find last space to avoid cutting mid-word
|
||||
lastSpace := strings.LastIndexAny(truncated, " \t\n\r")
|
||||
if lastSpace > 0 {
|
||||
truncated = truncated[:lastSpace]
|
||||
|
|
|
|||
|
|
@ -382,16 +382,73 @@ func TestTruncateAtWord(t *testing.T) {
|
|||
maxLength: 10,
|
||||
expected: "supercalif ...",
|
||||
},
|
||||
{
|
||||
name: "Chinese characters - no truncation",
|
||||
input: "这是一段中文测试文本",
|
||||
maxLength: 50,
|
||||
expected: "这是一段中文测试文本",
|
||||
},
|
||||
{
|
||||
name: "Chinese characters - truncate at word boundary",
|
||||
input: "这是一段比较长的中文测试文本,用来验证截断功能是否正常工作",
|
||||
maxLength: 20,
|
||||
expected: "这是一段比较长的中文测试文本,用来验证截 ...",
|
||||
},
|
||||
{
|
||||
name: "Chinese characters - truncate mid-sentence",
|
||||
input: "这是一段比较长的中文测试文本,用来验证截断功能是否正常工作",
|
||||
maxLength: 15,
|
||||
expected: "这是一段比较长的中文测试文本, ...",
|
||||
},
|
||||
{
|
||||
name: "Mixed English and Chinese",
|
||||
input: "This is a test 这是一个测试 with mixed content",
|
||||
maxLength: 20,
|
||||
expected: "This is a test ...",
|
||||
},
|
||||
{
|
||||
name: "Japanese characters",
|
||||
input: "日本語のテキストを切り詰めるテスト",
|
||||
maxLength: 10,
|
||||
expected: "日本語のテキストを切 ...",
|
||||
},
|
||||
{
|
||||
name: "Korean characters",
|
||||
input: "한국어 텍스트 잘라내기 테스트입니다",
|
||||
maxLength: 10,
|
||||
expected: "한국어 텍스트 ...",
|
||||
},
|
||||
{
|
||||
name: "Emoji characters",
|
||||
input: "Hello 👋 World 🌍 with emoji 😊",
|
||||
maxLength: 15,
|
||||
expected: "Hello 👋 World ...",
|
||||
},
|
||||
{
|
||||
name: "UTF-8 boundary test - exactly at character",
|
||||
input: "测试",
|
||||
maxLength: 2,
|
||||
expected: "测试",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := truncateAtWord(tt.input, tt.maxLength)
|
||||
assert.Equal(t, tt.expected, result)
|
||||
// Verify result is always valid UTF-8
|
||||
assert.True(t, isValidUTF8(result), "Result should be valid UTF-8: %q", result)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// isValidUTF8 reports whether s consists entirely of well-formed UTF-8.
//
// Ranging over a string yields U+FFFD for every byte that cannot be decoded,
// consuming only that single byte. A U+FFFD that is genuinely present in s
// occupies the three bytes "\xef\xbf\xbd", so any U+FFFD not backed by those
// exact bytes marks invalid input.
func isValidUTF8(s string) bool {
	const replacement = "\uFFFD"
	for i, r := range s {
		if r != '\uFFFD' {
			continue
		}
		end := i + len(replacement)
		if end > len(s) || s[i:end] != replacement {
			return false
		}
	}
	return true
}
|
||||
|
||||
// Benchmark tests.
|
||||
func BenchmarkGenerateSnippet(b *testing.B) {
|
||||
svc := NewService()
|
||||
|
|
|
|||
Loading…
Reference in New Issue