From 64e9d82d675ff38330d16e2cb458d50ec17b9227 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 19 Nov 2025 22:06:11 +0800 Subject: [PATCH] fix(parser): support Unicode characters in tags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #5264 Chinese, Japanese, Korean, and other Unicode characters are now properly recognized in hashtags, following the standard hashtag parsing conventions used by Twitter, Instagram, and GitHub. Changes: - Updated tag parser to allow Unicode letters and digits - Tags stop at whitespace and punctuation (both ASCII and CJK) - Allow dash, underscore, forward slash in tags - Added comprehensive tests for CJK characters and emoji Examples: - #测试 → recognized as tag '测试' - #日本語 → recognized as tag '日本語' - #한국어 → recognized as tag '한국어' - #测试。→ recognized as tag '测试' (stops at punctuation) - #work/测试/项目 → hierarchical tag with Unicode --- plugin/markdown/markdown_test.go | 36 +++++++++++++ plugin/markdown/parser/tag.go | 60 +++++++++++++++++++--- plugin/markdown/parser/tag_test.go | 44 +++++++++++++++- web/src/utils/remark-plugins/remark-tag.ts | 21 +++++++- 4 files changed, 153 insertions(+), 8 deletions(-) diff --git a/plugin/markdown/markdown_test.go b/plugin/markdown/markdown_test.go index 628d96c4a..21a9f08cf 100644 --- a/plugin/markdown/markdown_test.go +++ b/plugin/markdown/markdown_test.go @@ -261,6 +261,42 @@ func TestExtractTags(t *testing.T) { withExt: false, expected: []string{}, }, + { + name: "Chinese tag", + content: "Text with #测试", + withExt: true, + expected: []string{"测试"}, + }, + { + name: "Chinese tag followed by punctuation", + content: "Text #测试。 More text", + withExt: true, + expected: []string{"测试"}, + }, + { + name: "mixed Chinese and ASCII tag", + content: "#测试test123 content", + withExt: true, + expected: []string{"测试test123"}, + }, + { + name: "Japanese tag", + content: "#日本語 content", + withExt: true, + expected: []string{"日本語"}, + }, + { + name: "Korean tag", + 
content: "#한국어 content", + withExt: true, + expected: []string{"한국어"}, + }, + { + name: "hierarchical tag with Chinese", + content: "#work/测试/项目", + withExt: true, + expected: []string{"work/测试/项目"}, + }, } for _, tt := range tests { diff --git a/plugin/markdown/parser/tag.go b/plugin/markdown/parser/tag.go index 26804e266..038252159 100644 --- a/plugin/markdown/parser/tag.go +++ b/plugin/markdown/parser/tag.go @@ -45,20 +45,68 @@ func (*tagParser) Parse(_ gast.Node, block text.Reader, _ parser.Context) gast.N } // Scan tag characters - // Valid: alphanumeric, dash, underscore, forward slash + // Tags include Unicode letters, digits, underscore, hyphen, forward slash + // Stop at: whitespace, punctuation (except - _ /) + // This follows the Twitter/social media standard for hashtag parsing tagEnd := 1 // Start after # for tagEnd < len(line) { c := line[tagEnd] - isValid := (c >= 'a' && c <= 'z') || - (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || - c == '-' || c == '_' || c == '/' + // ASCII fast path for common characters + if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || c == '_' || c == '-' || c == '/' { + tagEnd++ + continue + } + + // Stop at whitespace + if c == ' ' || c == '\t' || c == '\n' || c == '\r' { + break + } - if !isValid { + // Stop at common ASCII punctuation + if c == '.' || c == ',' || c == ';' || c == ':' || + c == '!' || c == '?' 
|| c == '(' || c == ')' || + c == '[' || c == ']' || c == '{' || c == '}' || + c == '<' || c == '>' || c == '"' || c == '\'' || + c == '`' || c == '|' || c == '\\' || c == '@' || + c == '&' || c == '*' || c == '+' || c == '=' || + c == '^' || c == '%' || c == '$' || c == '~' || c == '#' { break } + // For UTF-8 multibyte sequences, check for Unicode punctuation + // U+3000 (IDEOGRAPHIC SPACE) - treat as space + // U+3001-U+303F - CJK punctuation + // U+FF00-U+FFEF - Halfwidth and Fullwidth Forms (common punctuation only) + if c >= 0x80 && tagEnd+2 < len(line) { + b1, b2, b3 := line[tagEnd], line[tagEnd+1], line[tagEnd+2] + + // U+3000 IDEOGRAPHIC SPACE (E3 80 80) + if b1 == 0xE3 && b2 == 0x80 && b3 == 0x80 { + break + } + + // U+3001-U+303F CJK punctuation (E3 80 81 to E3 80 BF) + if b1 == 0xE3 && b2 == 0x80 && b3 >= 0x81 && b3 <= 0xBF { + break + } + + // Common fullwidth punctuation: ！？，．；：（） + // U+FF01 ！ (EF BC 81), U+FF1F ？ (EF BC 9F) + // U+FF0C ， (EF BC 8C), U+FF0E ． (EF BC 8E) + // U+FF1A ： (EF BC 9A), U+FF1B ； (EF BC 9B) + // U+FF08 （ (EF BC 88), U+FF09 ） (EF BC 89) + if b1 == 0xEF && b2 == 0xBC { + if b3 == 0x81 || b3 == 0x88 || b3 == 0x89 || + b3 == 0x8C || b3 == 0x8E || + b3 == 0x9A || b3 == 0x9B || b3 == 0x9F { + break + } + } + } + + // Allow Unicode letters and other characters tagEnd++ } diff --git a/plugin/markdown/parser/tag_test.go b/plugin/markdown/parser/tag_test.go index d238b591f..6150c4487 100644 --- a/plugin/markdown/parser/tag_test.go +++ b/plugin/markdown/parser/tag_test.go @@ -88,7 +88,7 @@ func TestTagParser(t *testing.T) { name: "special characters", input: "#tag@special", expectedTag: "tag", - shouldParse: true, // Stops at @ + shouldParse: true, }, { name: "mixed case", @@ -126,6 +126,48 @@ func TestTagParser(t *testing.T) { expectedTag: "work-log/2024/q1", shouldParse: true, }, + { + name: "Chinese characters", + input: "#测试", + expectedTag: "测试", + shouldParse: true, + }, + { + name: "Chinese tag followed by space", + input: "#测试 some text", + expectedTag: "测试", +
shouldParse: true, + }, + { + name: "Chinese tag followed by punctuation", + input: "#测试。", + expectedTag: "测试", + shouldParse: true, + }, + { + name: "mixed Chinese and ASCII", + input: "#测试test123", + expectedTag: "测试test123", + shouldParse: true, + }, + { + name: "Japanese characters", + input: "#テスト", + expectedTag: "テスト", + shouldParse: true, + }, + { + name: "Korean characters", + input: "#테스트", + expectedTag: "테스트", + shouldParse: true, + }, + { + name: "emoji", + input: "#test🚀", + expectedTag: "test🚀", + shouldParse: true, + }, } for _, tt := range tests { diff --git a/web/src/utils/remark-plugins/remark-tag.ts b/web/src/utils/remark-plugins/remark-tag.ts index ea3cd81fd..c2bfaf855 100644 --- a/web/src/utils/remark-plugins/remark-tag.ts +++ b/web/src/utils/remark-plugins/remark-tag.ts @@ -21,9 +21,28 @@ import { visit } from "unist-util-visit"; /** * Check if character is valid for tag content + * Follows Twitter/social media standard: Unicode letters/digits, underscore, hyphen, slash + * Stops at whitespace and punctuation */ function isTagChar(char: string): boolean { - return /[a-zA-Z0-9_\-/]/.test(char); + // Allow: letters (Unicode), digits (Unicode), underscore, hyphen, forward slash + // Stop at: whitespace, punctuation + + // Stop at whitespace + if (/\s/.test(char)) { + return false; + } + + // Stop at common punctuation (ASCII and Unicode) + // U+3000-U+303F: CJK punctuation + // U+FF00-U+FF65: Fullwidth punctuation subset + const punctuation = /[.,;:!?()[\]{}<>"'`|\\@&*+=^%$~#\u3000-\u303F\uFF00-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65]/; + if (punctuation.test(char)) { + return false; + } + + // Allow everything else (Unicode letters, digits, and allowed symbols like - _ /) + return true; } /**