fix(parser): support Unicode characters in tags

Fixes #5264

Chinese, Japanese, Korean, and other Unicode characters are now
properly recognized in hashtags, following the standard hashtag
parsing conventions used by Twitter, Instagram, and GitHub.

Changes:
- Updated tag parser to allow Unicode letters and digits
- Tags stop at whitespace and punctuation (both ASCII and CJK)
- Allow dash, underscore, forward slash in tags
- Added comprehensive tests for CJK characters and emoji

Examples:
- #测试 → recognized as tag '测试'
- #日本語 → recognized as tag '日本語'
- #한국어 → recognized as tag '한국어'
- #测试。 → recognized as tag '测试' (stops at punctuation)
- #work/测试/项目 → hierarchical tag with Unicode
pull/5263/merge
Steven 4 days ago
parent 4de8712cb0
commit 64e9d82d67

@ -261,6 +261,42 @@ func TestExtractTags(t *testing.T) {
withExt: false,
expected: []string{},
},
{
name: "Chinese tag",
content: "Text with #测试",
withExt: true,
expected: []string{"测试"},
},
{
name: "Chinese tag followed by punctuation",
content: "Text #测试。 More text",
withExt: true,
expected: []string{"测试"},
},
{
name: "mixed Chinese and ASCII tag",
content: "#测试test123 content",
withExt: true,
expected: []string{"测试test123"},
},
{
name: "Japanese tag",
content: "#日本語 content",
withExt: true,
expected: []string{"日本語"},
},
{
name: "Korean tag",
content: "#한국어 content",
withExt: true,
expected: []string{"한국어"},
},
{
name: "hierarchical tag with Chinese",
content: "#work/测试/项目",
withExt: true,
expected: []string{"work/测试/项目"},
},
}
for _, tt := range tests {

@ -45,20 +45,68 @@ func (*tagParser) Parse(_ gast.Node, block text.Reader, _ parser.Context) gast.N
}
// Scan tag characters
// Valid: alphanumeric, dash, underscore, forward slash
// Tags include Unicode letters, digits, underscore, hyphen, forward slash
// Stop at: whitespace, punctuation (except - _ /)
// This follows the Twitter/social media standard for hashtag parsing
tagEnd := 1 // Start after #
for tagEnd < len(line) {
c := line[tagEnd]
isValid := (c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') ||
c == '-' || c == '_' || c == '/'
// ASCII fast path for common characters
if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') || c == '_' || c == '-' || c == '/' {
tagEnd++
continue
}
// Stop at whitespace
if c == ' ' || c == '\t' || c == '\n' || c == '\r' {
break
}
// Stop at common ASCII punctuation
if c == '.' || c == ',' || c == ';' || c == ':' ||
c == '!' || c == '?' || c == '(' || c == ')' ||
c == '[' || c == ']' || c == '{' || c == '}' ||
c == '<' || c == '>' || c == '"' || c == '\'' ||
c == '`' || c == '|' || c == '\\' || c == '@' ||
c == '&' || c == '*' || c == '+' || c == '=' ||
c == '^' || c == '%' || c == '$' || c == '~' || c == '#' {
break
}
// For UTF-8 multibyte sequences, check for Unicode punctuation
// U+3000 (IDEOGRAPHIC SPACE) - treat as space
// U+3001-U+303F - CJK punctuation
// U+FF00-U+FFEF - Fullwidth punctuation
if c >= 0x80 && tagEnd+2 < len(line) {
b1, b2, b3 := line[tagEnd], line[tagEnd+1], line[tagEnd+2]
if !isValid {
// U+3000 IDEOGRAPHIC SPACE (E3 80 80)
if b1 == 0xE3 && b2 == 0x80 && b3 == 0x80 {
break
}
// U+3001-U+303F CJK punctuation (E3 80 81 to E3 80 BF)
if b1 == 0xE3 && b2 == 0x80 && b3 >= 0x81 && b3 <= 0xBF {
break
}
// Common fullwidth punctuation: !?,。;:()
// U+FF01 (EF BC 81), U+FF1F (EF BC 9F)
// U+FF0C (EF BC 8C), U+FF0E 。 (EF BC 8E)
// U+FF1A (EF BC 9A), U+FF1B (EF BC 9B)
// U+FF08 (EF BC 88), U+FF09 (EF BC 89)
if b1 == 0xEF && b2 == 0xBC {
if b3 == 0x81 || b3 == 0x88 || b3 == 0x89 ||
b3 == 0x8C || b3 == 0x8E ||
b3 == 0x9A || b3 == 0x9B || b3 == 0x9F {
break
}
}
}
// Allow Unicode letters and other characters
tagEnd++
}

@ -88,7 +88,7 @@ func TestTagParser(t *testing.T) {
name: "special characters",
input: "#tag@special",
expectedTag: "tag",
shouldParse: true, // Stops at @
shouldParse: true,
},
{
name: "mixed case",
@ -126,6 +126,48 @@ func TestTagParser(t *testing.T) {
expectedTag: "work-log/2024/q1",
shouldParse: true,
},
{
name: "Chinese characters",
input: "#测试",
expectedTag: "测试",
shouldParse: true,
},
{
name: "Chinese tag followed by space",
input: "#测试 some text",
expectedTag: "测试",
shouldParse: true,
},
{
name: "Chinese tag followed by punctuation",
input: "#测试。",
expectedTag: "测试",
shouldParse: true,
},
{
name: "mixed Chinese and ASCII",
input: "#测试test123",
expectedTag: "测试test123",
shouldParse: true,
},
{
name: "Japanese characters",
input: "#テスト",
expectedTag: "テスト",
shouldParse: true,
},
{
name: "Korean characters",
input: "#테스트",
expectedTag: "테스트",
shouldParse: true,
},
{
name: "emoji",
input: "#test🚀",
expectedTag: "test🚀",
shouldParse: true,
},
}
for _, tt := range tests {

@ -21,9 +21,28 @@ import { visit } from "unist-util-visit";
// Hoisted so the patterns are compiled once rather than on every character
// tested. Neither uses the `g`/`y` flag, so `.test()` carries no state
// between calls.
const TAG_WHITESPACE_PATTERN = /\s/;
// ASCII punctuation (everything except `-`, `_`, `/`), plus:
//   U+3000-U+303F  CJK symbols and punctuation (includes IDEOGRAPHIC SPACE)
//   U+FF00-U+FF0F, U+FF1A-U+FF20, U+FF3B-U+FF40, U+FF5B-U+FF65
//                  fullwidth/halfwidth punctuation; the gaps leave fullwidth
//                  digits and fullwidth Latin letters allowed.
const TAG_PUNCTUATION_PATTERN =
  /[.,;:!?()[\]{}<>"'`|\\@&*+=^%$~#\u3000-\u303F\uFF00-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65]/;

/**
 * Check whether a character is valid inside a tag.
 *
 * Uses a deny-list: anything that is not whitespace and not listed
 * punctuation is accepted. That admits Unicode letters and digits (CJK,
 * Hangul, kana, ...), as well as ASCII letters, digits, `-`, `_` and `/`.
 *
 * @param char - The character to test (a single UTF-16 code unit or a full
 *   code point; both halves of a surrogate pair are accepted).
 * @returns `true` if the character may be part of a tag, `false` if the tag
 *   must stop before it.
 */
function isTagChar(char: string): boolean {
  // An empty string is not a character; without this guard the deny-list
  // below would accept it.
  if (char === "") {
    return false;
  }

  // Stop at whitespace.
  if (TAG_WHITESPACE_PATTERN.test(char)) {
    return false;
  }

  // Stop at ASCII, CJK and fullwidth punctuation; allow everything else.
  return !TAG_PUNCTUATION_PATTERN.test(char);
}
/**

Loading…
Cancel
Save