From fe7e5385e814121771d28098854e561366a1c25d Mon Sep 17 00:00:00 2001 From: ggurdin <46800240+ggurdin@users.noreply.github.com> Date: Wed, 13 Aug 2025 10:50:40 -0400 Subject: [PATCH] fix: group adjacent punctuation tokens with content tokens to prevent line breaks, added token positions cache (#3713) --- lib/pages/chat/events/html_message.dart | 23 +++-- .../token_position_model.dart | 87 ++++++++++++++++++- .../widgets/stt_transcript_tokens.dart | 3 +- 3 files changed, 104 insertions(+), 9 deletions(-) diff --git a/lib/pages/chat/events/html_message.dart b/lib/pages/chat/events/html_message.dart index ef945d711..393a1fa46 100644 --- a/lib/pages/chat/events/html_message.dart +++ b/lib/pages/chat/events/html_message.dart @@ -14,6 +14,7 @@ import 'package:fluffychat/pages/chat/chat.dart'; import 'package:fluffychat/pangea/events/event_wrappers/pangea_message_event.dart'; import 'package:fluffychat/pangea/events/models/pangea_token_model.dart'; import 'package:fluffychat/pangea/message_token_text/message_token_button.dart'; +import 'package:fluffychat/pangea/message_token_text/token_position_model.dart'; import 'package:fluffychat/pangea/toolbar/enums/reading_assistance_mode_enum.dart'; import 'package:fluffychat/pangea/toolbar/utils/token_rendering_util.dart'; import 'package:fluffychat/pangea/toolbar/widgets/message_selection_overlay.dart'; @@ -157,7 +158,7 @@ class HtmlMessage extends StatelessWidget { pangeaMessageEvent?.messageDisplayRepresentation?.tokens ?.where( (t) => - !["PUNCT", "SYM"].contains(t.pos) && + !["SYM"].contains(t.pos) && !t.lemma.text.contains(RegExp(r'[0-9]')) && t.lemma.text.length <= 50, ) @@ -209,17 +210,25 @@ class HtmlMessage extends StatelessWidget { } int position = 0; - for (final PangeaToken token in tokens ?? []) { - final String tokenText = token.text.content; + final tokenPositions = tokens != null + ? TokensUtil.getAdjacentTokenPositions(event.eventId, tokens!) + : []; + + for (final TokenPosition tokenPosition in tokenPositions) { + final String tokenSpanText = tokens! + .sublist(tokenPosition.startIndex, tokenPosition.endIndex + 1) + .map((t) => t.text.content) + .join(); + final substringIndex = result.indexWhere( (string) => - string.contains(tokenText) && + string.contains(tokenSpanText) && !(string.startsWith('<') && string.endsWith('>')), position, ); if (substringIndex == -1) continue; - int tokenIndex = result[substringIndex].indexOf(tokenText); + int tokenIndex = result[substringIndex].indexOf(tokenSpanText); if (tokenIndex == -1) continue; final beforeSubstring = result[substringIndex].substring(0, tokenIndex); @@ -227,7 +236,7 @@ class HtmlMessage extends StatelessWidget { tokenIndex = beforeSubstring.characters.length; } - final int tokenLength = tokenText.characters.length; + final int tokenLength = tokenSpanText.characters.length; final before = result[substringIndex].characters.take(tokenIndex).toString(); final after = result[substringIndex] @@ -237,7 +246,7 @@ class HtmlMessage extends StatelessWidget { result.replaceRange(substringIndex, substringIndex + 1, [ if (before.isNotEmpty) before, - '$tokenText', + '$tokenSpanText', if (after.isNotEmpty) after, ]); diff --git a/lib/pangea/message_token_text/token_position_model.dart b/lib/pangea/message_token_text/token_position_model.dart index 5e8c9b57b..cd5a47355 100644 --- a/lib/pangea/message_token_text/token_position_model.dart +++ b/lib/pangea/message_token_text/token_position_model.dart @@ -13,7 +13,82 @@ class TokenPosition { } class TokensUtil { - static List getTokenPositions( + /// A cache of calculated adjacent token positions + static final Map _tokenPositionCache = {}; + + static const Duration _cacheDuration = Duration(minutes: 1); + + static List? _getCachedTokenPositions(String eventID) { + final cacheItem = _tokenPositionCache[eventID]; + if (cacheItem == null) return null; + if (cacheItem.timestamp.isBefore(DateTime.now().subtract(_cacheDuration))) { + _tokenPositionCache.remove(eventID); + return null; + } + + return cacheItem.positions; + } + + static void _setCachedTokenPositions( + String eventID, + List positions, + ) { + _tokenPositionCache[eventID] = _TokenPositionCacheItem( + positions, + DateTime.now(), + ); + } + + /// Given a list of tokens, returns a list of positions for tokens and adjacent punctuation + /// This list may include gaps in the actual message for non-token elements, + /// so should not be used to fully reconstruct the original message. + static List getAdjacentTokenPositions( + String eventID, + List tokens, + ) { + final cached = _getCachedTokenPositions(eventID); + if (cached != null) { + return cached; + } + + final List positions = []; + for (int i = 0; i < tokens.length; i++) { + final PangeaToken token = tokens[i]; + + PangeaToken? currentToken = token; + PangeaToken? nextToken = i < tokens.length - 1 ? tokens[i + 1] : null; + + final isPunct = token.pos == 'PUNCT'; + final nextIsPunct = nextToken?.pos == 'PUNCT'; + + final int startIndex = i; + if (isPunct || nextIsPunct) { + while (nextToken != null && currentToken?.end == nextToken.start) { + i++; + currentToken = nextToken; + nextToken = i < tokens.length - 1 ? tokens[i + 1] : null; + } + } + + final adjacentTokens = tokens.sublist(startIndex, i + 1); + if (adjacentTokens.every((t) => t.pos == 'PUNCT')) { + continue; + } + + final position = TokenPosition( + token: adjacentTokens.firstWhere((t) => t.pos != 'PUNCT'), + startIndex: startIndex, + endIndex: i, + ); + positions.add(position); + } + + _setCachedTokenPositions(eventID, positions); + return positions; + } + + /// Given a list of tokens, reconstructs an original message, including gaps for non-token elements. + static List getGlobalTokenPositions( List tokens, ) { final List tokenPositions = []; @@ -83,3 +158,13 @@ class TokensUtil { return tokenPositions; } } + +class _TokenPositionCacheItem { + final List positions; + final DateTime timestamp; + + _TokenPositionCacheItem( + this.positions, + this.timestamp, + ); +} diff --git a/lib/pangea/toolbar/widgets/stt_transcript_tokens.dart b/lib/pangea/toolbar/widgets/stt_transcript_tokens.dart index fcea78ddf..e63bcb7df 100644 --- a/lib/pangea/toolbar/widgets/stt_transcript_tokens.dart +++ b/lib/pangea/toolbar/widgets/stt_transcript_tokens.dart @@ -37,7 +37,8 @@ class SttTranscriptTokens extends StatelessWidget { textScaler: TextScaler.noScaling, text: TextSpan( style: style ?? DefaultTextStyle.of(context).style, - children: TokensUtil.getTokenPositions(tokens).map((tokenPosition) { + children: + TokensUtil.getGlobalTokenPositions(tokens).map((tokenPosition) { final text = messageCharacters .skip(tokenPosition.startIndex) .take(tokenPosition.endIndex - tokenPosition.startIndex)