mirror of https://github.com/usememos/memos.git
fix(markdown): fix UTF-8 truncation for CJK characters in snippet generation
The truncateAtWord function was slicing strings by byte position instead of character (rune) position. When truncating text that contains multi-byte UTF-8 characters (such as CJK), this could cut in the middle of a character, producing invalid UTF-8 and causing gRPC marshaling errors.

The fix converts the string to runes before truncation, so the cut always falls on a character boundary. Added test cases for CJK characters.

Fixes #5276

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
e17cd163c6
commit
68c17469a3
|
|
@ -389,15 +389,18 @@ func uniqueLowercase(strs []string) []string {
|
||||||
}
|
}
|
||||||
|
|
||||||
// truncateAtWord truncates a string at the last word boundary before maxLength.
|
// truncateAtWord truncates a string at the last word boundary before maxLength.
|
||||||
|
// maxLength is treated as a rune (character) count to properly handle UTF-8 multi-byte characters.
|
||||||
func truncateAtWord(s string, maxLength int) string {
|
func truncateAtWord(s string, maxLength int) string {
|
||||||
if len(s) <= maxLength {
|
// Convert to runes to properly handle multi-byte UTF-8 characters
|
||||||
|
runes := []rune(s)
|
||||||
|
if len(runes) <= maxLength {
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
// Truncate to max length
|
// Truncate to max length (by character count, not byte count)
|
||||||
truncated := s[:maxLength]
|
truncated := string(runes[:maxLength])
|
||||||
|
|
||||||
// Find last space
|
// Find last space to avoid cutting in the middle of a word
|
||||||
lastSpace := strings.LastIndexAny(truncated, " \t\n\r")
|
lastSpace := strings.LastIndexAny(truncated, " \t\n\r")
|
||||||
if lastSpace > 0 {
|
if lastSpace > 0 {
|
||||||
truncated = truncated[:lastSpace]
|
truncated = truncated[:lastSpace]
|
||||||
|
|
|
||||||
|
|
@ -382,6 +382,18 @@ func TestTruncateAtWord(t *testing.T) {
|
||||||
maxLength: 10,
|
maxLength: 10,
|
||||||
expected: "supercalif ...",
|
expected: "supercalif ...",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "CJK characters without spaces",
|
||||||
|
input: "这是一个很长的中文句子没有空格的情况下也要正确处理",
|
||||||
|
maxLength: 15,
|
||||||
|
expected: "这是一个很长的中文句子没有空格 ...",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "mixed CJK and Latin",
|
||||||
|
input: "这是中文mixed with English文字",
|
||||||
|
maxLength: 10,
|
||||||
|
expected: "这是中文mixed ...",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue