fix(parser): support Unicode characters in tags

Fixes #5264

Chinese, Japanese, Korean, and other Unicode characters are now properly recognized in hashtags, following the standard hashtag parsing conventions used by Twitter, Instagram, and GitHub.

Changes:
- Updated tag parser to allow Unicode letters and digits
- Tags stop at whitespace and punctuation (both ASCII and CJK)
- Allow dash, underscore, forward slash in tags
- Added comprehensive tests for CJK characters and emoji

Examples:
- #测试 → recognized as tag '测试'
- #日本語 → recognized as tag '日本語'
- #한국어 → recognized as tag '한국어'
- #测试。 → recognized as tag '测试' (stops at punctuation)
- #work/测试/项目 → hierarchical tag with Unicode
6 months ago · 64e9d82d67
parent 4de8712cb0
commit 64e9d82d67
4 changed files with 153 additions and 8 deletions
--- a/plugin/markdown/markdown_test.go
+++ b/plugin/markdown/markdown_test.go
@@ -261,6 +261,42 @@ func TestExtractTags(t *testing.T) {
 			withExt:  false,
 			expected: []string{},
 		},
+		{
+			name:     "Chinese tag",
+			content:  "Text with #测试",
+			withExt:  true,
+			expected: []string{"测试"},
+		},
+		{
+			name:     "Chinese tag followed by punctuation",
+			content:  "Text #测试。 More text",
+			withExt:  true,
+			expected: []string{"测试"},
+		},
+		{
+			name:     "mixed Chinese and ASCII tag",
+			content:  "#测试test123 content",
+			withExt:  true,
+			expected: []string{"测试test123"},
+		},
+		{
+			name:     "Japanese tag",
+			content:  "#日本語 content",
+			withExt:  true,
+			expected: []string{"日本語"},
+		},
+		{
+			name:     "Korean tag",
+			content:  "#한국어 content",
+			withExt:  true,
+			expected: []string{"한국어"},
+		},
+		{
+			name:     "hierarchical tag with Chinese",
+			content:  "#work/测试/项目",
+			withExt:  true,
+			expected: []string{"work/测试/项目"},
+		},
 	}

 	for _, tt := range tests {
--- a/plugin/markdown/parser/tag.go
+++ b/plugin/markdown/parser/tag.go
@@ -45,20 +45,68 @@ func (*tagParser) Parse(_ gast.Node, block text.Reader, _ parser.Context) gast.N
 	}

 	// Scan tag characters
-	// Valid: alphanumeric, dash, underscore, forward slash
+	// Tags include Unicode letters, digits, underscore, hyphen, forward slash
+	// Stop at: whitespace, punctuation (except - _ /)
+	// This follows the Twitter/social media standard for hashtag parsing
 	tagEnd := 1 // Start after #
 	for tagEnd < len(line) {
 		c := line[tagEnd]

-		isValid := (c >= 'a' && c <= 'z') ||
-			(c >= 'A' && c <= 'Z') ||
-			(c >= '0' && c <= '9') ||
-			c == '-' || c == '_' || c == '/'
+		// ASCII fast path for common characters
+		if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
+			(c >= '0' && c <= '9') || c == '_' || c == '-' || c == '/' {
+			tagEnd++
+			continue
+		}
+
+		// Stop at whitespace
+		if c == ' ' || c == '\t' || c == '\n' || c == '\r' {
+			break
+		}
+
+		// Stop at common ASCII punctuation
+		if c == '.' || c == ',' || c == ';' || c == ':' ||
+			c == '!' || c == '?' || c == '(' || c == ')' ||
+			c == '[' || c == ']' || c == '{' || c == '}' ||
+			c == '<' || c == '>' || c == '"' || c == '\'' ||
+			c == '`' || c == '|' || c == '\\' || c == '@' ||
+			c == '&' || c == '*' || c == '+' || c == '=' ||
+			c == '^' || c == '%' || c == '$' || c == '~' || c == '#' {
+			break
+		}
+
+		// For UTF-8 multibyte sequences, check for Unicode punctuation
+		// U+3000 (IDEOGRAPHIC SPACE) - treat as space
+		// U+3001-U+303F - CJK punctuation
+		// U+FF01-U+FF1F - Fullwidth punctuation (only the common marks listed below)
+		if c >= 0x80 && tagEnd+2 < len(line) {
+			b1, b2, b3 := line[tagEnd], line[tagEnd+1], line[tagEnd+2]
 			
-		if !isValid {
+			// U+3000 IDEOGRAPHIC SPACE (E3 80 80)
+			if b1 == 0xE3 && b2 == 0x80 && b3 == 0x80 {
 				break
 			}
 			
+			// U+3001-U+303F CJK punctuation (E3 80 81 to E3 80 BF)
+			if b1 == 0xE3 && b2 == 0x80 && b3 >= 0x81 && b3 <= 0xBF {
+				break
+			}
+			
+			// Common fullwidth punctuation: ！？，．；：（）
+			// U+FF01 ！ (EF BC 81), U+FF1F ？ (EF BC 9F)
+			// U+FF0C ， (EF BC 8C), U+FF0E ． (EF BC 8E; the ideographic 。 is U+3002, caught by the CJK range above)
+			// U+FF1A ： (EF BC 9A), U+FF1B ； (EF BC 9B)
+			// U+FF08 （ (EF BC 88), U+FF09 ） (EF BC 89)
+			if b1 == 0xEF && b2 == 0xBC {
+				if b3 == 0x81 || b3 == 0x88 || b3 == 0x89 ||
+					b3 == 0x8C || b3 == 0x8E ||
+					b3 == 0x9A || b3 == 0x9B || b3 == 0x9F {
+					break
+				}
+			}
+		}
+
+		// Allow Unicode letters and other characters
 		tagEnd++
 	}

--- a/plugin/markdown/parser/tag_test.go
+++ b/plugin/markdown/parser/tag_test.go
@@ -88,7 +88,7 @@ func TestTagParser(t *testing.T) {
 			name:        "special characters",
 			input:       "#tag@special",
 			expectedTag: "tag",
-			shouldParse: true, // Stops at @
+			shouldParse: true,
 		},
 		{
 			name:        "mixed case",
@@ -126,6 +126,48 @@ func TestTagParser(t *testing.T) {
 			expectedTag: "work-log/2024/q1",
 			shouldParse: true,
 		},
+		{
+			name:        "Chinese characters",
+			input:       "#测试",
+			expectedTag: "测试",
+			shouldParse: true,
+		},
+		{
+			name:        "Chinese tag followed by space",
+			input:       "#测试 some text",
+			expectedTag: "测试",
+			shouldParse: true,
+		},
+		{
+			name:        "Chinese tag followed by punctuation",
+			input:       "#测试。",
+			expectedTag: "测试",
+			shouldParse: true,
+		},
+		{
+			name:        "mixed Chinese and ASCII",
+			input:       "#测试test123",
+			expectedTag: "测试test123",
+			shouldParse: true,
+		},
+		{
+			name:        "Japanese characters",
+			input:       "#テスト",
+			expectedTag: "テスト",
+			shouldParse: true,
+		},
+		{
+			name:        "Korean characters",
+			input:       "#테스트",
+			expectedTag: "테스트",
+			shouldParse: true,
+		},
+		{
+			name:        "emoji",
+			input:       "#test🚀",
+			expectedTag: "test🚀",
+			shouldParse: true,
+		},
 	}

 	for _, tt := range tests {
--- a/web/src/utils/remark-plugins/remark-tag.ts
+++ b/web/src/utils/remark-plugins/remark-tag.ts
@@ -21,9 +21,28 @@ import { visit } from "unist-util-visit";

 /**
 * Check if character is valid for tag content
+ * Follows Twitter/social media standard: Unicode letters/digits, underscore, hyphen, slash
+ * Stops at whitespace and punctuation
 */
 function isTagChar(char: string): boolean {
-  return /[a-zA-Z0-9_\-/]/.test(char);
+  // Allow: letters (Unicode), digits (Unicode), underscore, hyphen, forward slash
+  // Stop at: whitespace, punctuation
+
+  // Stop at whitespace
+  if (/\s/.test(char)) {
+    return false;
+  }
+
+  // Stop at common punctuation (ASCII and Unicode)
+  // U+3000-U+303F: CJK punctuation
+  // U+FF00-U+FF65: Fullwidth punctuation subset
+  const punctuation = /[.,;:!?()[\]{}<>"'`|\\@&*+=^%$~#\u3000-\u303F\uFF00-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65]/;
+  if (punctuation.test(char)) {
+    return false;
+  }
+
+  // Allow everything else (Unicode letters, digits, and allowed symbols like - _ /)
+  return true;
 }

 /**