mirror of https://github.com/usememos/memos.git
fix(parser): support Unicode characters in tags
Fixes #5264.

Chinese, Japanese, Korean, and other Unicode characters are now properly recognized in hashtags, following the standard hashtag parsing conventions used by Twitter, Instagram, and GitHub.

Changes:
- Updated tag parser to allow Unicode letters and digits
- Tags stop at whitespace and punctuation (both ASCII and CJK)
- Allow dash, underscore, forward slash in tags
- Added comprehensive tests for CJK characters and emoji

Examples:
- #测试 → recognized as tag '测试'
- #日本語 → recognized as tag '日本語'
- #한국어 → recognized as tag '한국어'
- #测试。 → recognized as tag '测试' (stops at punctuation)
- #work/测试/项目 → hierarchical tag with Unicode
This commit is contained in:
parent
4de8712cb0
commit
64e9d82d67
|
|
@ -261,6 +261,42 @@ func TestExtractTags(t *testing.T) {
|
|||
withExt: false,
|
||||
expected: []string{},
|
||||
},
|
||||
{
|
||||
name: "Chinese tag",
|
||||
content: "Text with #测试",
|
||||
withExt: true,
|
||||
expected: []string{"测试"},
|
||||
},
|
||||
{
|
||||
name: "Chinese tag followed by punctuation",
|
||||
content: "Text #测试。 More text",
|
||||
withExt: true,
|
||||
expected: []string{"测试"},
|
||||
},
|
||||
{
|
||||
name: "mixed Chinese and ASCII tag",
|
||||
content: "#测试test123 content",
|
||||
withExt: true,
|
||||
expected: []string{"测试test123"},
|
||||
},
|
||||
{
|
||||
name: "Japanese tag",
|
||||
content: "#日本語 content",
|
||||
withExt: true,
|
||||
expected: []string{"日本語"},
|
||||
},
|
||||
{
|
||||
name: "Korean tag",
|
||||
content: "#한국어 content",
|
||||
withExt: true,
|
||||
expected: []string{"한국어"},
|
||||
},
|
||||
{
|
||||
name: "hierarchical tag with Chinese",
|
||||
content: "#work/测试/项目",
|
||||
withExt: true,
|
||||
expected: []string{"work/测试/项目"},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
|
|
|
|||
|
|
@ -45,20 +45,68 @@ func (*tagParser) Parse(_ gast.Node, block text.Reader, _ parser.Context) gast.N
|
|||
}
|
||||
|
||||
// Scan tag characters
|
||||
// Valid: alphanumeric, dash, underscore, forward slash
|
||||
// Tags include Unicode letters, digits, underscore, hyphen, forward slash
|
||||
// Stop at: whitespace, punctuation (except - _ /)
|
||||
// This follows the Twitter/social media standard for hashtag parsing
|
||||
tagEnd := 1 // Start after #
|
||||
for tagEnd < len(line) {
|
||||
c := line[tagEnd]
|
||||
|
||||
isValid := (c >= 'a' && c <= 'z') ||
|
||||
(c >= 'A' && c <= 'Z') ||
|
||||
(c >= '0' && c <= '9') ||
|
||||
c == '-' || c == '_' || c == '/'
|
||||
// ASCII fast path for common characters
|
||||
if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
|
||||
(c >= '0' && c <= '9') || c == '_' || c == '-' || c == '/' {
|
||||
tagEnd++
|
||||
continue
|
||||
}
|
||||
|
||||
if !isValid {
|
||||
// Stop at whitespace
|
||||
if c == ' ' || c == '\t' || c == '\n' || c == '\r' {
|
||||
break
|
||||
}
|
||||
|
||||
// Stop at common ASCII punctuation
|
||||
if c == '.' || c == ',' || c == ';' || c == ':' ||
|
||||
c == '!' || c == '?' || c == '(' || c == ')' ||
|
||||
c == '[' || c == ']' || c == '{' || c == '}' ||
|
||||
c == '<' || c == '>' || c == '"' || c == '\'' ||
|
||||
c == '`' || c == '|' || c == '\\' || c == '@' ||
|
||||
c == '&' || c == '*' || c == '+' || c == '=' ||
|
||||
c == '^' || c == '%' || c == '$' || c == '~' || c == '#' {
|
||||
break
|
||||
}
|
||||
|
||||
// For UTF-8 multibyte sequences, check for Unicode punctuation
|
||||
// U+3000 (IDEOGRAPHIC SPACE) - treat as space
|
||||
// U+3001-U+303F - CJK punctuation
|
||||
// U+FF00-U+FFEF - Halfwidth and Fullwidth Forms; only the common fullwidth punctuation marks checked below end a tag
|
||||
if c >= 0x80 && tagEnd+2 < len(line) {
|
||||
b1, b2, b3 := line[tagEnd], line[tagEnd+1], line[tagEnd+2]
|
||||
|
||||
// U+3000 IDEOGRAPHIC SPACE (E3 80 80)
|
||||
if b1 == 0xE3 && b2 == 0x80 && b3 == 0x80 {
|
||||
break
|
||||
}
|
||||
|
||||
// U+3001-U+303F CJK punctuation (E3 80 81 to E3 80 BF)
|
||||
if b1 == 0xE3 && b2 == 0x80 && b3 >= 0x81 && b3 <= 0xBF {
|
||||
break
|
||||
}
|
||||
|
||||
// Common fullwidth punctuation: ！？，．；：（）
|
||||
// U+FF01 ！ (EF BC 81), U+FF1F ？ (EF BC 9F)
|
||||
// U+FF0C ， (EF BC 8C), U+FF0E ． (EF BC 8E) - the ideographic full stop 。 is U+3002 (E3 80 82), in the CJK punctuation range
|
||||
// U+FF1A ： (EF BC 9A), U+FF1B ； (EF BC 9B)
|
||||
// U+FF08 （ (EF BC 88), U+FF09 ） (EF BC 89)
|
||||
if b1 == 0xEF && b2 == 0xBC {
|
||||
if b3 == 0x81 || b3 == 0x88 || b3 == 0x89 ||
|
||||
b3 == 0x8C || b3 == 0x8E ||
|
||||
b3 == 0x9A || b3 == 0x9B || b3 == 0x9F {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Allow Unicode letters and other characters
|
||||
tagEnd++
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -88,7 +88,7 @@ func TestTagParser(t *testing.T) {
|
|||
name: "special characters",
|
||||
input: "#tag@special",
|
||||
expectedTag: "tag",
|
||||
shouldParse: true, // Stops at @
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "mixed case",
|
||||
|
|
@ -126,6 +126,48 @@ func TestTagParser(t *testing.T) {
|
|||
expectedTag: "work-log/2024/q1",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "Chinese characters",
|
||||
input: "#测试",
|
||||
expectedTag: "测试",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "Chinese tag followed by space",
|
||||
input: "#测试 some text",
|
||||
expectedTag: "测试",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "Chinese tag followed by punctuation",
|
||||
input: "#测试。",
|
||||
expectedTag: "测试",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "mixed Chinese and ASCII",
|
||||
input: "#测试test123",
|
||||
expectedTag: "测试test123",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "Japanese characters",
|
||||
input: "#テスト",
|
||||
expectedTag: "テスト",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "Korean characters",
|
||||
input: "#테스트",
|
||||
expectedTag: "테스트",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "emoji",
|
||||
input: "#test🚀",
|
||||
expectedTag: "test🚀",
|
||||
shouldParse: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
|
|
|
|||
|
|
@ -21,9 +21,28 @@ import { visit } from "unist-util-visit";
|
|||
|
||||
/**
|
||||
* Check if character is valid for tag content
|
||||
* Follows Twitter/social media standard: Unicode letters/digits, underscore, hyphen, slash
|
||||
* Stops at whitespace and punctuation
|
||||
*/
|
||||
function isTagChar(char: string): boolean {
|
||||
return /[a-zA-Z0-9_\-/]/.test(char);
|
||||
// Allow: letters (Unicode), digits (Unicode), underscore, hyphen, forward slash
|
||||
// Stop at: whitespace, punctuation
|
||||
|
||||
// Stop at whitespace
|
||||
if (/\s/.test(char)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Stop at common punctuation (ASCII and Unicode)
|
||||
// U+3000-U+303F: CJK punctuation
|
||||
// U+FF00-U+FF65: Fullwidth punctuation subset
|
||||
const punctuation = /[.,;:!?()[\]{}<>"'`|\\@&*+=^%$~#\u3000-\u303F\uFF00-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65]/;
|
||||
if (punctuation.test(char)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Allow everything else (Unicode letters, digits, and allowed symbols like - _ /)
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in New Issue