// isValidTagRune reports whether r may appear inside a #tag.
//
// Accepted runes:
//   - '_', '-' and '/' (snake_case, kebab-case and hierarchical tags)
//   - Unicode letters and numbers of any script
//   - Unicode combining marks (Mn/Mc/Me), so that scripts which attach vowel
//     signs or diacritics to a base letter (Devanagari, Thai, Arabic, ...) and
//     emoji followed by a variation selector are not cut off mid-word
//   - non-ASCII Unicode symbols, which is where emoji live
//
// Everything else ends the tag: whitespace, punctuation, control characters
// and the ASCII symbols $ + < = > ^ ` | ~. The ASCII symbols are rejected
// explicitly because unicode.IsSymbol covers the Sm, Sc and Sk categories as
// well as So, and those characters are Markdown delimiters or ordinary
// punctuation in running text rather than tag content.
func isValidTagRune(r rune) bool {
	switch {
	case r == '_' || r == '-' || r == '/':
		// Structural separators explicitly permitted inside tags.
		return true
	case unicode.IsLetter(r), unicode.IsNumber(r):
		return true
	case unicode.IsMark(r):
		// Combining marks belong to the preceding letter or emoji.
		return true
	case unicode.IsSymbol(r):
		// Emoji and other pictographs are non-ASCII; ASCII symbols terminate the tag.
		return r > unicode.MaxASCII
	default:
		return false
	}
}
+// Tags support international characters and follow these rules: +// - Must start with # followed by valid tag characters +// - Valid characters: Unicode letters, Unicode digits, underscore (_), hyphen (-), forward slash (/) +// - Maximum length: 100 runes (Unicode characters) +// - Stops at: whitespace, punctuation, or other invalid characters func (*tagParser) Parse(_ gast.Node, block text.Reader, _ parser.Context) gast.Node { line, _ := block.PeekLine() @@ -44,86 +87,47 @@ func (*tagParser) Parse(_ gast.Node, block text.Reader, _ parser.Context) gast.N return nil } - // Scan tag characters - // Tags include Unicode letters, digits, underscore, hyphen, forward slash - // Stop at: whitespace, punctuation (except - _ /) - // This follows the Twitter/social media standard for hashtag parsing - tagEnd := 1 // Start after # - for tagEnd < len(line) { - c := line[tagEnd] - - // ASCII fast path for common characters - if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || c == '_' || c == '-' || c == '/' { - tagEnd++ - continue - } + // Parse tag using UTF-8 aware rune iteration + tagStart := 1 + pos := tagStart + runeCount := 0 - // Stop at whitespace - if c == ' ' || c == '\t' || c == '\n' || c == '\r' { + for pos < len(line) { + r, size := utf8.DecodeRune(line[pos:]) + + // Stop at invalid UTF-8 + if r == utf8.RuneError && size == 1 { break } - // Stop at common ASCII punctuation - if c == '.' || c == ',' || c == ';' || c == ':' || - c == '!' || c == '?' 
|| c == '(' || c == ')' || - c == '[' || c == ']' || c == '{' || c == '}' || - c == '<' || c == '>' || c == '"' || c == '\'' || - c == '`' || c == '|' || c == '\\' || c == '@' || - c == '&' || c == '*' || c == '+' || c == '=' || - c == '^' || c == '%' || c == '$' || c == '~' || c == '#' { + // Validate character using Unicode categories + if !isValidTagRune(r) { break } - // For UTF-8 multibyte sequences, check for Unicode punctuation - // U+3000 (IDEOGRAPHIC SPACE) - treat as space - // U+3001-U+303F - CJK punctuation - // U+FF00-U+FFEF - Fullwidth punctuation - if c >= 0x80 && tagEnd+2 < len(line) { - b1, b2, b3 := line[tagEnd], line[tagEnd+1], line[tagEnd+2] - - // U+3000 IDEOGRAPHIC SPACE (E3 80 80) - if b1 == 0xE3 && b2 == 0x80 && b3 == 0x80 { - break - } - - // U+3001-U+303F CJK punctuation (E3 80 81 to E3 80 BF) - if b1 == 0xE3 && b2 == 0x80 && b3 >= 0x81 && b3 <= 0xBF { - break - } - - // Common fullwidth punctuation: !?,。;:() - // U+FF01 ! (EF BC 81), U+FF1F ? (EF BC 9F) - // U+FF0C , (EF BC 8C), U+FF0E 。 (EF BC 8E) - // U+FF1A : (EF BC 9A), U+FF1B ; (EF BC 9B) - // U+FF08 ( (EF BC 88), U+FF09 ) (EF BC 89) - if b1 == 0xEF && b2 == 0xBC { - if b3 == 0x81 || b3 == 0x88 || b3 == 0x89 || - b3 == 0x8C || b3 == 0x8E || - b3 == 0x9A || b3 == 0x9B || b3 == 0x9F { - break - } - } + // Enforce max length (by rune count, not byte count) + runeCount++ + if runeCount > MaxTagLength { + break } - // Allow Unicode letters and other characters - tagEnd++ + pos += size } // Must have at least one character after # - if tagEnd == 1 { + if pos <= tagStart { return nil } // Extract tag (without #) - tagName := line[1:tagEnd] + tagName := line[tagStart:pos] // Make a copy of the tag name tagCopy := make([]byte, len(tagName)) copy(tagCopy, tagName) // Advance reader - block.Advance(tagEnd) + block.Advance(pos) // Create node node := &mast.TagNode{ diff --git a/web/src/utils/remark-plugins/remark-tag.ts b/web/src/utils/remark-plugins/remark-tag.ts index c2bfaf855..b21e5a9a4 
/**
 * Check whether a character is allowed inside a tag.
 *
 * Accepted:
 * - underscore (_), hyphen (-), forward slash (/) for snake_case, kebab-case and hierarchical tags
 * - \p{L}: Unicode letters of any script
 * - \p{N}: Unicode numbers
 * - \p{M}: Unicode combining marks, so scripts that attach vowel signs or diacritics
 *   to a base letter (Devanagari, Thai, Arabic, ...) are not cut off mid-word
 * - \p{S} outside ASCII: symbols such as emoji
 *
 * Rejected (ends the tag): whitespace, punctuation, control characters and the ASCII
 * symbols $ + < = > ^ ` | ~. Those ASCII characters are in \p{S} but act as Markdown
 * delimiters or ordinary punctuation in running text.
 *
 * NOTE(review): if the caller passes single UTF-16 code units (e.g. text[j]), emoji
 * outside the BMP arrive as lone surrogates and will not match \p{S}; confirm the
 * caller iterates by code point.
 *
 * @param char - the character to test
 * @returns true if the character may be part of a tag
 */
function isTagChar(char: string): boolean {
  // Structural separators explicitly permitted inside tags
  if (char === "_" || char === "-" || char === "/") {
    return true;
  }

  // Letters, numbers and the combining marks that attach to them
  if (/[\p{L}\p{N}\p{M}]/u.test(char)) {
    return true;
  }

  // Symbols (includes emoji), excluding the ASCII range
  if (/(?!\p{ASCII})\p{S}/u.test(char)) {
    return true;
  }

  // Everything else is invalid (whitespace, punctuation, ASCII symbols, control chars)
  return false;
}