fix: group adjacent punctuation tokens with content tokens to prevent line breaks, added token positions cache (#3713)

pull/2245/head
ggurdin 3 months ago committed by GitHub
parent fd617f296f
commit fe7e5385e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -14,6 +14,7 @@ import 'package:fluffychat/pages/chat/chat.dart';
import 'package:fluffychat/pangea/events/event_wrappers/pangea_message_event.dart'; import 'package:fluffychat/pangea/events/event_wrappers/pangea_message_event.dart';
import 'package:fluffychat/pangea/events/models/pangea_token_model.dart'; import 'package:fluffychat/pangea/events/models/pangea_token_model.dart';
import 'package:fluffychat/pangea/message_token_text/message_token_button.dart'; import 'package:fluffychat/pangea/message_token_text/message_token_button.dart';
import 'package:fluffychat/pangea/message_token_text/token_position_model.dart';
import 'package:fluffychat/pangea/toolbar/enums/reading_assistance_mode_enum.dart'; import 'package:fluffychat/pangea/toolbar/enums/reading_assistance_mode_enum.dart';
import 'package:fluffychat/pangea/toolbar/utils/token_rendering_util.dart'; import 'package:fluffychat/pangea/toolbar/utils/token_rendering_util.dart';
import 'package:fluffychat/pangea/toolbar/widgets/message_selection_overlay.dart'; import 'package:fluffychat/pangea/toolbar/widgets/message_selection_overlay.dart';
@ -157,7 +158,7 @@ class HtmlMessage extends StatelessWidget {
pangeaMessageEvent?.messageDisplayRepresentation?.tokens pangeaMessageEvent?.messageDisplayRepresentation?.tokens
?.where( ?.where(
(t) => (t) =>
!["PUNCT", "SYM"].contains(t.pos) && !["SYM"].contains(t.pos) &&
!t.lemma.text.contains(RegExp(r'[0-9]')) && !t.lemma.text.contains(RegExp(r'[0-9]')) &&
t.lemma.text.length <= 50, t.lemma.text.length <= 50,
) )
@ -209,17 +210,25 @@ class HtmlMessage extends StatelessWidget {
} }
int position = 0; int position = 0;
for (final PangeaToken token in tokens ?? []) { final tokenPositions = tokens != null
final String tokenText = token.text.content; ? TokensUtil.getAdjacentTokenPositions(event.eventId, tokens!)
: [];
for (final TokenPosition tokenPosition in tokenPositions) {
final String tokenSpanText = tokens!
.sublist(tokenPosition.startIndex, tokenPosition.endIndex + 1)
.map((t) => t.text.content)
.join();
final substringIndex = result.indexWhere( final substringIndex = result.indexWhere(
(string) => (string) =>
string.contains(tokenText) && string.contains(tokenSpanText) &&
!(string.startsWith('<') && string.endsWith('>')), !(string.startsWith('<') && string.endsWith('>')),
position, position,
); );
if (substringIndex == -1) continue; if (substringIndex == -1) continue;
int tokenIndex = result[substringIndex].indexOf(tokenText); int tokenIndex = result[substringIndex].indexOf(tokenSpanText);
if (tokenIndex == -1) continue; if (tokenIndex == -1) continue;
final beforeSubstring = result[substringIndex].substring(0, tokenIndex); final beforeSubstring = result[substringIndex].substring(0, tokenIndex);
@ -227,7 +236,7 @@ class HtmlMessage extends StatelessWidget {
tokenIndex = beforeSubstring.characters.length; tokenIndex = beforeSubstring.characters.length;
} }
final int tokenLength = tokenText.characters.length; final int tokenLength = tokenSpanText.characters.length;
final before = final before =
result[substringIndex].characters.take(tokenIndex).toString(); result[substringIndex].characters.take(tokenIndex).toString();
final after = result[substringIndex] final after = result[substringIndex]
@ -237,7 +246,7 @@ class HtmlMessage extends StatelessWidget {
result.replaceRange(substringIndex, substringIndex + 1, [ result.replaceRange(substringIndex, substringIndex + 1, [
if (before.isNotEmpty) before, if (before.isNotEmpty) before,
'<token offset="${token.text.offset}" length="${token.text.length}">$tokenText</token>', '<token offset="${tokenPosition.token!.text.offset}" length="${tokenPosition.token!.text.length}">$tokenSpanText</token>',
if (after.isNotEmpty) after, if (after.isNotEmpty) after,
]); ]);

@ -13,7 +13,82 @@ class TokenPosition {
} }
class TokensUtil { class TokensUtil {
static List<TokenPosition> getTokenPositions( /// A cache of calculated adjacent token positions
/// Keyed by event ID; each entry also records when it was stored.
static final Map<String, _TokenPositionCacheItem> _tokenPositionCache = {};
/// How long a cached entry is considered valid after it was stored.
static const Duration _cacheDuration = Duration(minutes: 1);
/// Returns the cached positions for [eventID], or `null` when there is no
/// entry or the entry is older than [_cacheDuration].
///
/// An expired entry is removed from the cache as part of the lookup.
static List<TokenPosition>? _getCachedTokenPositions(String eventID) {
  final entry = _tokenPositionCache[eventID];
  if (entry == null) return null;

  final oldestValid = DateTime.now().subtract(_cacheDuration);
  if (!entry.timestamp.isBefore(oldestValid)) {
    return entry.positions;
  }

  // Entry has outlived the cache window: drop it and report a miss.
  _tokenPositionCache.remove(eventID);
  return null;
}
/// Stores [positions] for [eventID], stamped with the current time.
///
/// Expired entries are purged before the new one is written. Lookups only
/// evict the single entry they hit, so without this sweep the entries of
/// events that are never requested again would stay in the static map for
/// as long as the app runs.
static void _setCachedTokenPositions(
  String eventID,
  List<TokenPosition> positions,
) {
  final now = DateTime.now();
  // Same expiry rule the lookup applies: stored before (now - duration).
  final oldestValid = now.subtract(_cacheDuration);
  _tokenPositionCache.removeWhere(
    (_, item) => item.timestamp.isBefore(oldestValid),
  );
  _tokenPositionCache[eventID] = _TokenPositionCacheItem(positions, now);
}
/// Groups each content token with the punctuation directly attached to it
/// and returns one [TokenPosition] per group.
///
/// A group is started when a token is punctuation or is immediately
/// followed by punctuation; it then extends across every following token
/// whose `start` equals the previous token's `end`. Groups made up only of
/// punctuation are dropped. For each remaining group, `token` is its first
/// non-punctuation token and `startIndex` / `endIndex` are inclusive indexes
/// into [tokens].
///
/// The returned list may leave gaps for non-token parts of the message, so
/// it should not be used to fully reconstruct the original message.
///
/// Results are cached per [eventID]; a cached list is returned as-is.
static List<TokenPosition> getAdjacentTokenPositions(
  String eventID,
  List<PangeaToken> tokens,
) {
  final cachedPositions = _getCachedTokenPositions(eventID);
  if (cachedPositions != null) return cachedPositions;

  bool isPunctuation(PangeaToken t) => t.pos == 'PUNCT';

  final result = <TokenPosition>[];
  int index = 0;
  while (index < tokens.length) {
    final groupStart = index;
    int groupEnd = index;

    final hasNext = index + 1 < tokens.length;
    final touchesPunctuation = isPunctuation(tokens[index]) ||
        (hasNext && isPunctuation(tokens[index + 1]));

    if (touchesPunctuation) {
      // Absorb every following token that starts exactly where the
      // previous one ends (i.e. no gap between them).
      while (groupEnd + 1 < tokens.length &&
          tokens[groupEnd].end == tokens[groupEnd + 1].start) {
        groupEnd++;
      }
    }

    // Advance past the whole group before any early exit below.
    index = groupEnd + 1;

    final group = tokens.sublist(groupStart, groupEnd + 1);
    final contentTokens = group.where((t) => !isPunctuation(t));
    if (contentTokens.isEmpty) continue;

    result.add(
      TokenPosition(
        token: contentTokens.first,
        startIndex: groupStart,
        endIndex: groupEnd,
      ),
    );
  }

  _setCachedTokenPositions(eventID, result);
  return result;
}
/// Given a list of tokens, returns positions that span the original message, including gaps for non-token elements.
static List<TokenPosition> getGlobalTokenPositions(
List<PangeaToken> tokens, List<PangeaToken> tokens,
) { ) {
final List<TokenPosition> tokenPositions = []; final List<TokenPosition> tokenPositions = [];
@ -83,3 +158,13 @@ class TokensUtil {
return tokenPositions; return tokenPositions;
} }
} }
/// A cache entry pairing computed token positions with the time they were
/// stored.
class _TokenPositionCacheItem {
  /// When [positions] was stored; used to decide whether the entry expired.
  final DateTime timestamp;

  /// The token positions held by this entry.
  final List<TokenPosition> positions;

  _TokenPositionCacheItem(this.positions, this.timestamp);
}

@ -37,7 +37,8 @@ class SttTranscriptTokens extends StatelessWidget {
textScaler: TextScaler.noScaling, textScaler: TextScaler.noScaling,
text: TextSpan( text: TextSpan(
style: style ?? DefaultTextStyle.of(context).style, style: style ?? DefaultTextStyle.of(context).style,
children: TokensUtil.getTokenPositions(tokens).map((tokenPosition) { children:
TokensUtil.getGlobalTokenPositions(tokens).map((tokenPosition) {
final text = messageCharacters final text = messageCharacters
.skip(tokenPosition.startIndex) .skip(tokenPosition.startIndex)
.take(tokenPosition.endIndex - tokenPosition.startIndex) .take(tokenPosition.endIndex - tokenPosition.startIndex)

Loading…
Cancel
Save