Skip to content

Commit 30854d7

Browse files
committed
fix: avoid quadratic repeated punctuation merges
1 parent a522688 commit 30854d7

File tree

2 files changed

+26
-12
lines changed

2 files changed

+26
-12
lines changed

src/analysis.ts

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -352,12 +352,14 @@ function splitTrailingForwardStickyCluster(text: string): { head: string, tail:
352352
}
353353
}
354354

355-
/**
 * Reports whether `segment` is a non-empty run made up solely of `ch`.
 *
 * The string is walked with its iterator, so the comparison is per code
 * point rather than per UTF-16 code unit. Every call scans the whole
 * segment, so the cost grows linearly with `segment.length`.
 *
 * @param segment - Text to inspect.
 * @param ch - The single character every element of `segment` must equal.
 * @returns `true` when `segment` is non-empty and consists only of `ch`.
 */
function isRepeatedSingleCharRun(segment: string, ch: string): boolean {
  // An empty segment is never treated as a run.
  if (segment.length === 0) return false
  for (const part of segment) {
    if (part !== ch) return false
  }
  return true
}
355+
/**
 * Returns the character a piece may contribute to a repeated
 * single-character run, or `null` when the piece does not qualify.
 *
 * A piece qualifies only when all of the following hold:
 * - its kind is `'text'`,
 * - it is not word-like,
 * - its text is exactly one UTF-16 code unit long,
 * - its text is neither `-` nor `—`.
 *
 * @param text - The piece's text.
 * @param isWordLike - Whether the piece is flagged as word-like.
 * @param kind - The piece's break kind.
 * @returns `text` itself when the piece qualifies, otherwise `null`.
 */
function getRepeatableSingleCharRunChar(
  text: string,
  isWordLike: boolean,
  kind: SegmentBreakKind,
): string | null {
  if (kind !== 'text' || isWordLike) return null
  // `length` counts UTF-16 code units, so surrogate-pair characters never qualify.
  if (text.length !== 1) return null
  // Hyphen and em dash are explicitly excluded from run coalescing.
  if (text === '-' || text === '—') return null
  return text
}
362364

363365
function endsWithArabicNoSpacePunctuation(segment: string): boolean {
@@ -883,10 +885,14 @@ function buildMergedSegmentation(
883885
const mergedWordLike: boolean[] = []
884886
const mergedKinds: SegmentBreakKind[] = []
885887
const mergedStarts: number[] = []
888+
// Track repeatable single-char punctuation runs structurally so identical
889+
// merges stay O(1) instead of re-scanning the accumulated segment each time.
890+
const mergedSingleCharRunChars: (string | null)[] = []
886891

887892
for (const s of wordSegmenter.segment(normalized)) {
888893
for (const piece of splitSegmentByBreakKind(s.segment, s.isWordLike ?? false, s.index, whiteSpaceProfile)) {
889894
const isText = piece.kind === 'text'
895+
const repeatableSingleCharRunChar = getRepeatableSingleCharRunChar(piece.text, piece.isWordLike, piece.kind)
890896

891897
// First-pass keeps: no-space script-specific joins and punctuation glue
892898
// that depend on the immediately preceding text run.
@@ -901,6 +907,7 @@ function buildMergedSegmentation(
901907
) {
902908
mergedTexts[mergedLen - 1] += piece.text
903909
mergedWordLike[mergedLen - 1] = mergedWordLike[mergedLen - 1]! || piece.isWordLike
910+
mergedSingleCharRunChars[mergedLen - 1] = null
904911
} else if (
905912
isText &&
906913
mergedLen > 0 &&
@@ -910,6 +917,7 @@ function buildMergedSegmentation(
910917
) {
911918
mergedTexts[mergedLen - 1] += piece.text
912919
mergedWordLike[mergedLen - 1] = mergedWordLike[mergedLen - 1]! || piece.isWordLike
920+
mergedSingleCharRunChars[mergedLen - 1] = null
913921
} else if (
914922
isText &&
915923
mergedLen > 0 &&
@@ -918,6 +926,7 @@ function buildMergedSegmentation(
918926
) {
919927
mergedTexts[mergedLen - 1] += piece.text
920928
mergedWordLike[mergedLen - 1] = mergedWordLike[mergedLen - 1]! || piece.isWordLike
929+
mergedSingleCharRunChars[mergedLen - 1] = null
921930
} else if (
922931
isText &&
923932
mergedLen > 0 &&
@@ -928,15 +937,12 @@ function buildMergedSegmentation(
928937
) {
929938
mergedTexts[mergedLen - 1] += piece.text
930939
mergedWordLike[mergedLen - 1] = true
940+
mergedSingleCharRunChars[mergedLen - 1] = null
931941
} else if (
932-
isText &&
933-
!piece.isWordLike &&
942+
repeatableSingleCharRunChar !== null &&
934943
mergedLen > 0 &&
935944
mergedKinds[mergedLen - 1] === 'text' &&
936-
piece.text.length === 1 &&
937-
piece.text !== '-' &&
938-
piece.text !== '—' &&
939-
isRepeatedSingleCharRun(mergedTexts[mergedLen - 1]!, piece.text)
945+
mergedSingleCharRunChars[mergedLen - 1] === repeatableSingleCharRunChar
940946
) {
941947
mergedTexts[mergedLen - 1] += piece.text
942948
} else if (
@@ -950,11 +956,13 @@ function buildMergedSegmentation(
950956
)
951957
) {
952958
mergedTexts[mergedLen - 1] += piece.text
959+
mergedSingleCharRunChars[mergedLen - 1] = null
953960
} else {
954961
mergedTexts[mergedLen] = piece.text
955962
mergedWordLike[mergedLen] = piece.isWordLike
956963
mergedKinds[mergedLen] = piece.kind
957964
mergedStarts[mergedLen] = piece.start
965+
mergedSingleCharRunChars[mergedLen] = repeatableSingleCharRunChar
958966
mergedLen++
959967
}
960968
}

src/layout.test.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,12 @@ describe('prepare invariants', () => {
530530
expect(prepared.segments).toEqual(['===', ' ', 'heading', ' ', '==='])
531531
})
532532

533+
test('keeps long repeated punctuation runs coalesced', () => {
  // A long run of one punctuation character must come out as a single
  // segment equal to the whole input.
  const text = '('.repeat(256)
  const prepared = prepareWithSegments(text, FONT)
  expect(prepared.segments).toEqual([text])
})
538+
533539
test('applies CJK and Hangul punctuation attachment rules', () => {
534540
expect(prepareWithSegments('中文,测试。', FONT).segments).toEqual(['中', '文,', '测', '试。'])
535541
expect(prepareWithSegments('테스트입니다.', FONT).segments.at(-1)).toBe('다.')

0 commit comments

Comments
 (0)