mirror of https://github.com/usememos/memos.git
refactor(markdown): use Unicode categories for tag validation
Replace custom character whitelist with Unicode standards-based validation: - Use unicode.IsLetter/IsNumber/IsSymbol instead of hardcoded lists - Remove manual UTF-8 byte checking for CJK punctuation - Add proper rune-based length limiting (MAX_TAG_LENGTH = 100) - Improve international character support (CJK, Arabic, Cyrillic, etc.) - Add emoji support via unicode.IsSymbol Benefits: - Cleaner, more maintainable code (~50 lines removed) - Standards-based approach following Unicode categories - Better UTF-8 safety with utf8.DecodeRune - Consistent validation between Go backend and TypeScript frontend All existing tests pass with improved Unicode handling.
This commit is contained in:
parent
d69435c97c
commit
b78d4c2568
|
|
@ -1,6 +1,9 @@
|
|||
package parser
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
gast "github.com/yuin/goldmark/ast"
|
||||
"github.com/yuin/goldmark/parser"
|
||||
"github.com/yuin/goldmark/text"
|
||||
|
|
@ -8,6 +11,11 @@ import (
|
|||
mast "github.com/usememos/memos/plugin/markdown/ast"
|
||||
)
|
||||
|
||||
const (
|
||||
// MaxTagLength defines the maximum number of runes allowed in a tag
|
||||
MaxTagLength = 100
|
||||
)
|
||||
|
||||
type tagParser struct{}
|
||||
|
||||
// NewTagParser creates a new inline parser for #tag syntax.
|
||||
|
|
@ -20,7 +28,42 @@ func (*tagParser) Trigger() []byte {
|
|||
return []byte{'#'}
|
||||
}
|
||||
|
||||
// Parse parses #tag syntax.
|
||||
// isValidTagRune checks if a Unicode rune is valid in a tag.
|
||||
// Uses Unicode categories for proper international character support.
|
||||
func isValidTagRune(r rune) bool {
|
||||
// Allow Unicode letters (any script: Latin, CJK, Arabic, Cyrillic, etc.)
|
||||
if unicode.IsLetter(r) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Allow Unicode digits
|
||||
if unicode.IsNumber(r) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Allow emoji and symbols (So category: Symbol, Other)
|
||||
// This includes emoji, which are essential for social media-style tagging
|
||||
if unicode.IsSymbol(r) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Allow specific ASCII symbols for tag structure
|
||||
// Underscore: word separation (snake_case)
|
||||
// Hyphen: word separation (kebab-case)
|
||||
// Forward slash: hierarchical tags (category/subcategory)
|
||||
if r == '_' || r == '-' || r == '/' {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// Parse parses #tag syntax using Unicode-aware validation.
|
||||
// Tags support international characters and follow these rules:
|
||||
// - Must start with # followed by valid tag characters
|
||||
// - Valid characters: Unicode letters, Unicode digits, underscore (_), hyphen (-), forward slash (/)
|
||||
// - Maximum length: 100 runes (Unicode characters)
|
||||
// - Stops at: whitespace, punctuation, or other invalid characters
|
||||
func (*tagParser) Parse(_ gast.Node, block text.Reader, _ parser.Context) gast.Node {
|
||||
line, _ := block.PeekLine()
|
||||
|
||||
|
|
@ -44,86 +87,47 @@ func (*tagParser) Parse(_ gast.Node, block text.Reader, _ parser.Context) gast.N
|
|||
return nil
|
||||
}
|
||||
|
||||
// Scan tag characters
|
||||
// Tags include Unicode letters, digits, underscore, hyphen, forward slash
|
||||
// Stop at: whitespace, punctuation (except - _ /)
|
||||
// This follows the Twitter/social media standard for hashtag parsing
|
||||
tagEnd := 1 // Start after #
|
||||
for tagEnd < len(line) {
|
||||
c := line[tagEnd]
|
||||
// Parse tag using UTF-8 aware rune iteration
|
||||
tagStart := 1
|
||||
pos := tagStart
|
||||
runeCount := 0
|
||||
|
||||
// ASCII fast path for common characters
|
||||
if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
|
||||
(c >= '0' && c <= '9') || c == '_' || c == '-' || c == '/' {
|
||||
tagEnd++
|
||||
continue
|
||||
}
|
||||
for pos < len(line) {
|
||||
r, size := utf8.DecodeRune(line[pos:])
|
||||
|
||||
// Stop at whitespace
|
||||
if c == ' ' || c == '\t' || c == '\n' || c == '\r' {
|
||||
// Stop at invalid UTF-8
|
||||
if r == utf8.RuneError && size == 1 {
|
||||
break
|
||||
}
|
||||
|
||||
// Stop at common ASCII punctuation
|
||||
if c == '.' || c == ',' || c == ';' || c == ':' ||
|
||||
c == '!' || c == '?' || c == '(' || c == ')' ||
|
||||
c == '[' || c == ']' || c == '{' || c == '}' ||
|
||||
c == '<' || c == '>' || c == '"' || c == '\'' ||
|
||||
c == '`' || c == '|' || c == '\\' || c == '@' ||
|
||||
c == '&' || c == '*' || c == '+' || c == '=' ||
|
||||
c == '^' || c == '%' || c == '$' || c == '~' || c == '#' {
|
||||
// Validate character using Unicode categories
|
||||
if !isValidTagRune(r) {
|
||||
break
|
||||
}
|
||||
|
||||
// For UTF-8 multibyte sequences, check for Unicode punctuation
|
||||
// U+3000 (IDEOGRAPHIC SPACE) - treat as space
|
||||
// U+3001-U+303F - CJK punctuation
|
||||
// U+FF00-U+FFEF - Fullwidth punctuation
|
||||
if c >= 0x80 && tagEnd+2 < len(line) {
|
||||
b1, b2, b3 := line[tagEnd], line[tagEnd+1], line[tagEnd+2]
|
||||
|
||||
// U+3000 IDEOGRAPHIC SPACE (E3 80 80)
|
||||
if b1 == 0xE3 && b2 == 0x80 && b3 == 0x80 {
|
||||
break
|
||||
}
|
||||
|
||||
// U+3001-U+303F CJK punctuation (E3 80 81 to E3 80 BF)
|
||||
if b1 == 0xE3 && b2 == 0x80 && b3 >= 0x81 && b3 <= 0xBF {
|
||||
break
|
||||
}
|
||||
|
||||
// Common fullwidth punctuation: !?,。;:()
|
||||
// U+FF01 ! (EF BC 81), U+FF1F ? (EF BC 9F)
|
||||
// U+FF0C , (EF BC 8C), U+FF0E 。 (EF BC 8E)
|
||||
// U+FF1A : (EF BC 9A), U+FF1B ; (EF BC 9B)
|
||||
// U+FF08 ( (EF BC 88), U+FF09 ) (EF BC 89)
|
||||
if b1 == 0xEF && b2 == 0xBC {
|
||||
if b3 == 0x81 || b3 == 0x88 || b3 == 0x89 ||
|
||||
b3 == 0x8C || b3 == 0x8E ||
|
||||
b3 == 0x9A || b3 == 0x9B || b3 == 0x9F {
|
||||
break
|
||||
}
|
||||
}
|
||||
// Enforce max length (by rune count, not byte count)
|
||||
runeCount++
|
||||
if runeCount > MaxTagLength {
|
||||
break
|
||||
}
|
||||
|
||||
// Allow Unicode letters and other characters
|
||||
tagEnd++
|
||||
pos += size
|
||||
}
|
||||
|
||||
// Must have at least one character after #
|
||||
if tagEnd == 1 {
|
||||
if pos <= tagStart {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Extract tag (without #)
|
||||
tagName := line[1:tagEnd]
|
||||
tagName := line[tagStart:pos]
|
||||
|
||||
// Make a copy of the tag name
|
||||
tagCopy := make([]byte, len(tagName))
|
||||
copy(tagCopy, tagName)
|
||||
|
||||
// Advance reader
|
||||
block.Advance(tagEnd)
|
||||
block.Advance(pos)
|
||||
|
||||
// Create node
|
||||
node := &mast.TagNode{
|
||||
|
|
|
|||
|
|
@ -14,35 +14,52 @@ import { visit } from "unist-util-visit";
|
|||
* #tag1/subtag/subtag2 → <span class="tag" data-tag="tag1/subtag/subtag2">#tag1/subtag/subtag2</span>
|
||||
*
|
||||
* Rules:
|
||||
* - Tag must start with # followed by alphanumeric, underscore, hyphen, or forward slash
|
||||
* - Tag ends at whitespace, punctuation (except -, _, /), or end of line
|
||||
* - Tag must start with # followed by valid tag characters
|
||||
* - Valid characters: Unicode letters, Unicode digits, underscore (_), hyphen (-), forward slash (/)
|
||||
* - Maximum length: 100 characters
|
||||
* - Stops at: whitespace, punctuation, or other invalid characters
|
||||
* - Tags at start of line after ## are headings, not tags
|
||||
*/
|
||||
|
||||
const MAX_TAG_LENGTH = 100;
|
||||
|
||||
/**
|
||||
* Check if character is valid for tag content
|
||||
* Follows Twitter/social media standard: Unicode letters/digits, underscore, hyphen, slash
|
||||
* Stops at whitespace and punctuation
|
||||
* Check if character is valid for tag content using Unicode categories.
|
||||
* Uses Unicode property escapes for proper international character support.
|
||||
*
|
||||
* Valid characters:
|
||||
* - \p{L}: Unicode letters (any script: Latin, CJK, Arabic, Cyrillic, etc.)
|
||||
* - \p{N}: Unicode numbers/digits
|
||||
* - \p{S}: Unicode symbols (includes emoji)
|
||||
* - Special symbols: underscore (_), hyphen (-), forward slash (/)
|
||||
*/
|
||||
function isTagChar(char: string): boolean {
|
||||
// Allow: letters (Unicode), digits (Unicode), underscore, hyphen, forward slash
|
||||
// Stop at: whitespace, punctuation
|
||||
|
||||
// Stop at whitespace
|
||||
if (/\s/.test(char)) {
|
||||
return false;
|
||||
// Allow Unicode letters (any script)
|
||||
if (/\p{L}/u.test(char)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Stop at common punctuation (ASCII and Unicode)
|
||||
// U+3000-U+303F: CJK punctuation
|
||||
// U+FF00-U+FF65: Fullwidth punctuation subset
|
||||
const punctuation = /[.,;:!?()[\]{}<>"'`|\\@&*+=^%$~#\u3000-\u303F\uFF00-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65]/;
|
||||
if (punctuation.test(char)) {
|
||||
return false;
|
||||
// Allow Unicode digits
|
||||
if (/\p{N}/u.test(char)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Allow everything else (Unicode letters, digits, and allowed symbols like - _ /)
|
||||
return true;
|
||||
// Allow Unicode symbols (includes emoji)
|
||||
// This makes tags compatible with social media platforms
|
||||
if (/\p{S}/u.test(char)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Allow specific symbols for tag structure
|
||||
// Underscore: word separation (snake_case)
|
||||
// Hyphen: word separation (kebab-case)
|
||||
// Forward slash: hierarchical tags (category/subcategory)
|
||||
if (char === "_" || char === "-" || char === "/") {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Everything else is invalid (whitespace, punctuation, control chars)
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -74,8 +91,8 @@ function parseTagsFromText(text: string): Array<{ type: "text" | "tag"; value: s
|
|||
|
||||
const tagContent = text.slice(i + 1, j);
|
||||
|
||||
// Validate tag length
|
||||
if (tagContent.length > 0 && tagContent.length <= 100) {
|
||||
// Validate tag length (must match backend MAX_TAG_LENGTH)
|
||||
if (tagContent.length > 0 && tagContent.length <= MAX_TAG_LENGTH) {
|
||||
segments.push({ type: "tag", value: tagContent });
|
||||
i = j;
|
||||
continue;
|
||||
|
|
|
|||
Loading…
Reference in New Issue