fix: group adjacent punctuation tokens with content tokens to prevent line breaks, added token positions cache (#3713)

pull/2245/head
ggurdin 3 months ago committed by GitHub
parent fd617f296f
commit fe7e5385e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -14,6 +14,7 @@ import 'package:fluffychat/pages/chat/chat.dart';
import 'package:fluffychat/pangea/events/event_wrappers/pangea_message_event.dart';
import 'package:fluffychat/pangea/events/models/pangea_token_model.dart';
import 'package:fluffychat/pangea/message_token_text/message_token_button.dart';
import 'package:fluffychat/pangea/message_token_text/token_position_model.dart';
import 'package:fluffychat/pangea/toolbar/enums/reading_assistance_mode_enum.dart';
import 'package:fluffychat/pangea/toolbar/utils/token_rendering_util.dart';
import 'package:fluffychat/pangea/toolbar/widgets/message_selection_overlay.dart';
@ -157,7 +158,7 @@ class HtmlMessage extends StatelessWidget {
pangeaMessageEvent?.messageDisplayRepresentation?.tokens
?.where(
(t) =>
!["PUNCT", "SYM"].contains(t.pos) &&
!["SYM"].contains(t.pos) &&
!t.lemma.text.contains(RegExp(r'[0-9]')) &&
t.lemma.text.length <= 50,
)
@ -209,17 +210,25 @@ class HtmlMessage extends StatelessWidget {
}
int position = 0;
for (final PangeaToken token in tokens ?? []) {
final String tokenText = token.text.content;
final tokenPositions = tokens != null
? TokensUtil.getAdjacentTokenPositions(event.eventId, tokens!)
: [];
for (final TokenPosition tokenPosition in tokenPositions) {
final String tokenSpanText = tokens!
.sublist(tokenPosition.startIndex, tokenPosition.endIndex + 1)
.map((t) => t.text.content)
.join();
final substringIndex = result.indexWhere(
(string) =>
string.contains(tokenText) &&
string.contains(tokenSpanText) &&
!(string.startsWith('<') && string.endsWith('>')),
position,
);
if (substringIndex == -1) continue;
int tokenIndex = result[substringIndex].indexOf(tokenText);
int tokenIndex = result[substringIndex].indexOf(tokenSpanText);
if (tokenIndex == -1) continue;
final beforeSubstring = result[substringIndex].substring(0, tokenIndex);
@ -227,7 +236,7 @@ class HtmlMessage extends StatelessWidget {
tokenIndex = beforeSubstring.characters.length;
}
final int tokenLength = tokenText.characters.length;
final int tokenLength = tokenSpanText.characters.length;
final before =
result[substringIndex].characters.take(tokenIndex).toString();
final after = result[substringIndex]
@ -237,7 +246,7 @@ class HtmlMessage extends StatelessWidget {
result.replaceRange(substringIndex, substringIndex + 1, [
if (before.isNotEmpty) before,
'<token offset="${token.text.offset}" length="${token.text.length}">$tokenText</token>',
'<token offset="${tokenPosition.token!.text.offset}" length="${tokenPosition.token!.text.length}">$tokenSpanText</token>',
if (after.isNotEmpty) after,
]);

@ -13,7 +13,82 @@ class TokenPosition {
}
class TokensUtil {
static List<TokenPosition> getTokenPositions(
/// A cache of calculated adjacent token positions, keyed by event ID.
static final Map<String, _TokenPositionCacheItem> _tokenPositionCache = {};
/// How old a cache entry may be before a lookup treats it as expired.
static const Duration _cacheDuration = Duration(minutes: 1);
/// Looks up the cached token positions for [eventID].
///
/// Returns `null` when nothing is cached for the event, or when the cached
/// entry is older than [_cacheDuration]. An expired entry is removed from the
/// cache as a side effect of the lookup.
static List<TokenPosition>? _getCachedTokenPositions(String eventID) {
  final entry = _tokenPositionCache[eventID];
  if (entry == null) {
    return null;
  }

  // Anything stamped before this instant has outlived the cache duration.
  final expiryCutoff = DateTime.now().subtract(_cacheDuration);
  final isFresh = !entry.timestamp.isBefore(expiryCutoff);
  if (isFresh) {
    return entry.positions;
  }

  _tokenPositionCache.remove(eventID);
  return null;
}
/// Stores [positions] in the cache under [eventID], stamped with the current
/// time.
///
/// Before inserting, every entry older than [_cacheDuration] is dropped.
/// Lookups only evict the entry for the event ID being requested, so without
/// this sweep the entries for events that are never requested again would
/// stay in the static map for the lifetime of the app.
static void _setCachedTokenPositions(
  String eventID,
  List<TokenPosition> positions,
) {
  final now = DateTime.now();
  final expiryCutoff = now.subtract(_cacheDuration);

  // Same expiry rule as the lookup: stamped strictly before the cutoff.
  _tokenPositionCache.removeWhere(
    (_, item) => item.timestamp.isBefore(expiryCutoff),
  );

  _tokenPositionCache[eventID] = _TokenPositionCacheItem(positions, now);
}
/// Returns positions for the non-punctuation tokens in [tokens], each grouped
/// with the tokens that directly touch it when punctuation is involved.
///
/// When a token is `PUNCT`, or the token after it in the list is `PUNCT`, the
/// span is extended forward for as long as each token's `end` equals the next
/// token's `start`. Spans made up only of `PUNCT` tokens are skipped. In each
/// returned [TokenPosition], `startIndex` and `endIndex` are inclusive indices
/// into [tokens], and `token` is the first non-`PUNCT` token of the span.
///
/// This list may include gaps in the actual message for non-token elements,
/// so should not be used to fully reconstruct the original message.
///
/// Results are cached per [eventID]. A cached result is only reused when all
/// of its indices fit inside [tokens]; otherwise it is recomputed, so callers
/// that slice [tokens] with the returned indices cannot go out of range.
static List<TokenPosition> getAdjacentTokenPositions(
  String eventID,
  List<PangeaToken> tokens,
) {
  final cached = _getCachedTokenPositions(eventID);
  // The cache key is the event ID alone, so the cached positions may have
  // been computed from a different (e.g. longer or unfiltered) token list.
  if (cached != null && cached.every((p) => p.endIndex < tokens.length)) {
    return cached;
  }

  final List<TokenPosition> positions = [];
  for (int i = 0; i < tokens.length; i++) {
    final int startIndex = i;
    final PangeaToken token = tokens[i];
    PangeaToken? currentToken = token;
    PangeaToken? nextToken = i < tokens.length - 1 ? tokens[i + 1] : null;

    final isPunct = token.pos == 'PUNCT';
    final nextIsPunct = nextToken?.pos == 'PUNCT';
    if (isPunct || nextIsPunct) {
      // Advance the outer loop index past every token that starts exactly
      // where the previous one ends, so they are emitted as a single span.
      // NOTE(review): this keeps extending through non-PUNCT tokens too, as
      // long as they are contiguous — confirm that is intended for text
      // written without spaces between words.
      while (nextToken != null && currentToken?.end == nextToken.start) {
        i++;
        currentToken = nextToken;
        nextToken = i < tokens.length - 1 ? tokens[i + 1] : null;
      }
    }

    // A span needs at least one non-punctuation token to anchor it.
    final adjacentTokens = tokens.sublist(startIndex, i + 1);
    final anchorIndex = adjacentTokens.indexWhere((t) => t.pos != 'PUNCT');
    if (anchorIndex == -1) {
      continue;
    }

    positions.add(
      TokenPosition(
        token: adjacentTokens[anchorIndex],
        startIndex: startIndex,
        endIndex: i,
      ),
    );
  }

  _setCachedTokenPositions(eventID, positions);
  return positions;
}
/// Given a list of tokens, reconstructs an original message, including gaps for non-token elements.
static List<TokenPosition> getGlobalTokenPositions(
List<PangeaToken> tokens,
) {
final List<TokenPosition> tokenPositions = [];
@ -83,3 +158,13 @@ class TokensUtil {
return tokenPositions;
}
}
/// One cache entry: a list of token positions and the time it was stored.
class _TokenPositionCacheItem {
  /// The token positions held by this entry.
  final List<TokenPosition> positions;

  /// The time associated with this entry, used to decide when it expires.
  final DateTime timestamp;

  _TokenPositionCacheItem(this.positions, this.timestamp);
}

@ -37,7 +37,8 @@ class SttTranscriptTokens extends StatelessWidget {
textScaler: TextScaler.noScaling,
text: TextSpan(
style: style ?? DefaultTextStyle.of(context).style,
children: TokensUtil.getTokenPositions(tokens).map((tokenPosition) {
children:
TokensUtil.getGlobalTokenPositions(tokens).map((tokenPosition) {
final text = messageCharacters
.skip(tokenPosition.startIndex)
.take(tokenPosition.endIndex - tokenPosition.startIndex)

Loading…
Cancel
Save