From dd179767f6228548b2e141cb19f1211b1dc6edf6 Mon Sep 17 00:00:00 2001 From: Vjacheslav Trushkin Date: Fri, 9 Dec 2022 23:09:11 +0200 Subject: [PATCH] fix: sort emoji sequences by length to avoid lazy matching --- packages/utils/src/emoji/regex/base.ts | 18 +++- .../tests/emoji-regex-item-creation-test.ts | 25 ++++- .../utils/tests/emoji-regex-numbers-test.ts | 13 ++- .../tests/emoji-regex-similar-items-test.ts | 16 ++- packages/utils/tests/emoji-regex-test.ts | 102 ++++++++++++++++++ packages/utils/tests/emoji-tree-test.ts | 2 +- 6 files changed, 165 insertions(+), 11 deletions(-) create mode 100644 packages/utils/tests/emoji-regex-test.ts diff --git a/packages/utils/src/emoji/regex/base.ts b/packages/utils/src/emoji/regex/base.ts index 3cc54d3..46f3384 100644 --- a/packages/utils/src/emoji/regex/base.ts +++ b/packages/utils/src/emoji/regex/base.ts @@ -17,6 +17,9 @@ interface BaseEmojiItemRegex { // True if regex can be treated as a group (does not require wrapping in `(?:` + `)`) group: boolean; + + // Number of characters, minimum value + length: number; } interface EmojiItemRegexWithNumbers { @@ -196,6 +199,7 @@ export function createUTF16EmojiRegexItem( type: 'utf16', regex: '', numbers, + length: 1, group: true, }; updateUTF16EmojiRegexItem(result); @@ -243,6 +247,7 @@ export function createSequenceEmojiRegexItem( type: 'sequence', items, regex: '', + length: items.reduce((length, item) => item.length + length, 0), group: false, }; @@ -311,13 +316,23 @@ export function createSetEmojiRegexItem( }); // Sort items to guarantee same results regardless of order - sets.sort((a, b) => a.regex.localeCompare(b.regex)); + sets.sort((a, b) => { + if (a.length === b.length) { + return a.regex.localeCompare(b.regex); + } + return b.length - a.length; + }); // Create item const result: SetEmojiItemRegex = { type: 'set', sets, regex: '', + length: sets.reduce( + (length, item) => + length ? Math.min(length, item.length) : item.length, + 0 + ), group: false, }; if (numbers) { @@ -361,6 +376,7 @@ export function createOptionalEmojiRegexItem( type: 'optional', item, regex: '', + length: item.length, group: true, }; updateOptionalEmojiRegexItem(result); diff --git a/packages/utils/tests/emoji-regex-item-creation-test.ts b/packages/utils/tests/emoji-regex-item-creation-test.ts index 7947439..4428902 100644 --- a/packages/utils/tests/emoji-regex-item-creation-test.ts +++ b/packages/utils/tests/emoji-regex-item-creation-test.ts @@ -12,6 +12,7 @@ describe('Creating chunks of regex', () => { type: 'utf16', regex: '\\u2763', numbers: [0x2763], + length: 1, group: true, }); @@ -20,6 +21,7 @@ describe('Creating chunks of regex', () => { type: 'utf16', regex: '[\\u2762-\\u2764]', numbers: [0x2762, 0x2763, 0x2764], + length: 1, group: true, }); @@ -28,6 +30,7 @@ describe('Creating chunks of regex', () => { type: 'utf16', regex: '[\\u2760\\u2764\\uFE0F]', numbers: [0x2760, 0x2764, 0xfe0f], + length: 1, group: true, }); @@ -44,6 +47,7 @@ describe('Creating chunks of regex', () => { 0x2000, 0x2001, 0x2100, 0x2101, 0x2102, 0x2760, 0x2761, 0x2762, 0x2763, 0x2765, 0xfe0e, 0xfe0f, 0xfe0f, ], + length: 1, group: true, }); }); @@ -58,6 +62,7 @@ describe('Creating chunks of regex', () => { regex: '[\\u2000\\u2001]', numbers: [0x2000, 0x2001], items: [num1], + length: 1, group: true, }); @@ -66,6 +71,7 @@ describe('Creating chunks of regex', () => { type: 'sequence', regex: '[\\u2000\\u2001][\\u2000\\u2100]', items: [num1, num2], + length: 2, group: false, }); }); @@ -80,6 +86,7 @@ describe('Creating chunks of regex', () => { regex: '[\\u2000\\u2001]', numbers: [0x2000, 0x2001], sets: [num1], + length: 1, group: true, }); @@ -89,6 +96,7 @@ describe('Creating chunks of regex', () => { regex: '[\\u2000\\u2001]|[\\u2000\\u2100]', numbers: [0x2000, 0x2001, 0x2000, 0x2100], sets: [num1, num2], + length: 1, group: false, }); }); @@ -102,6 +110,7 @@ describe('Creating chunks of regex', () => { type: 'optional', regex: '\\uFE0F?', item: num1, + length: 1, group: true, }); @@ -110,6 +119,7 @@ describe('Creating chunks of regex', () => { type: 'optional', regex: '[\\uFE0E\\uFE0F]?', item: num2, + length: 1, group: true, }); }); @@ -126,6 +136,7 @@ describe('Creating chunks of regex', () => { type: 'sequence', regex: '\\uFE0F?', items: [fe0f], + length: 1, group: true, }); @@ -134,6 +145,7 @@ describe('Creating chunks of regex', () => { type: 'sequence', regex: '[\\u2000\\u2001]\\uFE0F?', items: [num1, fe0f], + length: 2, group: false, }); @@ -142,6 +154,7 @@ describe('Creating chunks of regex', () => { type: 'sequence', regex: '[\\u2000\\u2001]\\uFE0F?[\\u2000\\u2100]', items: [num1, fe0f, num2], + length: 3, group: false, }); @@ -150,6 +163,7 @@ describe('Creating chunks of regex', () => { type: 'sequence', regex: '[\\u2000\\u2100][\\u2000\\u2001]\\uFE0F?', items: [num2, num1, fe0f], + length: 3, group: false, }); }); @@ -167,6 +181,7 @@ describe('Creating chunks of regex', () => { type: 'sequence', regex: '\\uD83D\\uDC9A', items: [utf32a1, utf32a2], + length: 2, group: false, }); utf32a.numbers = [0x1f49a]; @@ -176,6 +191,7 @@ describe('Creating chunks of regex', () => { type: 'optional', regex: '(?:\\uD83D\\uDC9A)?', item: utf32a, + length: 2, group: true, }); @@ -183,17 +199,19 @@ describe('Creating chunks of regex', () => { const set = createSetEmojiRegexItem([num1, utf32a]); expect(set).toEqual({ type: 'set', - regex: '[\\u1234-\\u1237]|\\uD83D\\uDC9A', - sets: [num1, utf32a], + regex: '\\uD83D\\uDC9A|[\\u1234-\\u1237]', + sets: [utf32a, num1], numbers: [0x1234, 0x1235, 0x1236, 0x1237, 0x1f49a], + length: 1, group: false, }); // Make it optional expect(createOptionalEmojiRegexItem(set)).toEqual({ type: 'optional', - regex: '(?:[\\u1234-\\u1237]|\\uD83D\\uDC9A)?', + regex: '(?:\\uD83D\\uDC9A|[\\u1234-\\u1237])?', item: set, + length: 1, group: true, }); @@ -206,6 +224,7 @@ describe('Creating chunks of regex', () => { type: 'sequence', regex: '\\u2000(?:\\u2100|\\u2101)', items: [utf16a, set1], + length: 2, group: false, }); }); diff --git a/packages/utils/tests/emoji-regex-numbers-test.ts b/packages/utils/tests/emoji-regex-numbers-test.ts index 095a5fe..ccca563 100644 --- a/packages/utils/tests/emoji-regex-numbers-test.ts +++ b/packages/utils/tests/emoji-regex-numbers-test.ts @@ -16,6 +16,7 @@ describe('Creating chunks of regex for numbers', () => { type: 'utf16', regex: '\\u2763', numbers: [0x2763], + length: 1, group: true, }); @@ -25,6 +26,7 @@ describe('Creating chunks of regex for numbers', () => { type: 'utf16', regex: '[\\u2761\\u2763-\\u2765]', numbers: [0x2761, 0x2763, 0x2764, 0x2765], + length: 1, group: true, }); @@ -37,16 +39,19 @@ describe('Creating chunks of regex for numbers', () => { type: 'utf16', regex: '\\uD83D', numbers: [0xd83d], + length: 1, group: true, }, { type: 'utf16', regex: '\\uDC9A', numbers: [0xdc9a], + length: 1, group: true, }, ], numbers: [0x1f49a], + length: 2, group: false, }); @@ -59,6 +64,7 @@ describe('Creating chunks of regex for numbers', () => { type: 'sequence', regex: '[\\uD83D\\uD83E][\\uDC9A-\\uDC9C]', numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89c], + length: 2, group: false, }); @@ -71,6 +77,7 @@ describe('Creating chunks of regex for numbers', () => { type: 'set', regex: '\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]', numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89e], + length: 2, group: false, }); @@ -82,11 +89,12 @@ describe('Creating chunks of regex for numbers', () => { delete (items3 as unknown as Record).sets; expect(items3).toEqual({ type: 'set', - regex: '[\\u2763-\\u2765]|\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]', + regex: '\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]|[\\u2763-\\u2765]', numbers: [ 0x2763, 0x2764, 0x2765, 0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89e, ], + length: 1, group: false, }); }); @@ -106,6 +114,7 @@ describe('Creating chunks of regex for numbers', () => { createUTF16EmojiRegexItem([0xd83d]), createUTF16EmojiRegexItem([0xdc9a]), ], + length: 2, group: false, }); @@ -120,6 +129,7 @@ describe('Creating chunks of regex for numbers', () => { createUTF16EmojiRegexItem([0xfe0f]) ), ], + length: 3, group: false, }); @@ -132,6 +142,7 @@ describe('Creating chunks of regex for numbers', () => { createUTF16EmojiRegexItem([0xdc9a]), createUTF16EmojiRegexItem([0xfe0f]), ], + length: 3, group: false, } ); diff --git a/packages/utils/tests/emoji-regex-similar-items-test.ts b/packages/utils/tests/emoji-regex-similar-items-test.ts index b6c878f..1b5b96e 100644 --- a/packages/utils/tests/emoji-regex-similar-items-test.ts +++ b/packages/utils/tests/emoji-regex-similar-items-test.ts @@ -91,6 +91,7 @@ describe('Similar chunks of regex', () => { ]), items[2], ], + length: 1, group: false, }); }); @@ -163,6 +164,7 @@ describe('Similar chunks of regex', () => { ]), ]), ], + length: 4, group: false, }); }); @@ -226,6 +228,7 @@ describe('Similar chunks of regex', () => { ]), items[2], ], + length: 1, group: false, }); }); @@ -277,6 +280,7 @@ describe('Similar chunks of regex', () => { ), ]), ], + length: 4, group: false, }); }); @@ -340,6 +344,7 @@ describe('Similar chunks of regex', () => { ]), items[2], ], + length: 1, group: false, }); }); @@ -427,16 +432,17 @@ describe('Similar chunks of regex', () => { expect(set).toEqual({ type: 'set', regex: - // last 2 items (set items are sorted alphabetically), // 6 numbers from common chunks, grouped mix - '\\uD83D\\uDE4F|\\uD83D\\uDE4F\\uD83C\\uDFFB|\\uD83E\\uDEF1\\uD83C\\uDFFB\\u200D\\uD83E(?:' + + // last 2 items (set items are sorted by length, then alphabetically), + '\\uD83E\\uDEF1\\uD83C\\uDFFB\\u200D\\uD83E(?:' + slicedSet.regex + - ')', + ')|\\uD83D\\uDE4F\\uD83C\\uDFFB|\\uD83D\\uDE4F', sets: [ - items[4], - items[5], createSequenceEmojiRegexItem([...slicedSequence, slicedSet]), + items[5], + items[4], ], + length: 2, group: false, }); }); diff --git a/packages/utils/tests/emoji-regex-test.ts b/packages/utils/tests/emoji-regex-test.ts new file mode 100644 index 0000000..2f1301f --- /dev/null +++ b/packages/utils/tests/emoji-regex-test.ts @@ -0,0 +1,102 @@ +import { createOptimisedRegex } from '../lib/emoji/regex/create'; + +describe('Emoji regex matching', () => { + it('Simple regex', () => { + const regexValue = createOptimisedRegex(['1F600', '1F603', '1F604']); + + const matches = ` +E1.0 grinning face: ${String.fromCodePoint(0x1f600)} +E0.6 grinning face with big eyes: ${String.fromCodePoint(0x1f603)} +E1.0 grinning face: ${String.fromCodePoint(0x1f600)} +`.match(new RegExp(regexValue, 'g')); + + expect(matches?.length).toBe(3); + expect(matches?.[0]).toBe(String.fromCodePoint(0x1f600)); + expect(matches?.[2]).toBe(String.fromCodePoint(0x1f600)); + expect(matches?.[1]).toBe(String.fromCodePoint(0x1f603)); + }); + + it('Sequences', () => { + const regexValue = createOptimisedRegex([ + // Emoji with optional variation + '263A FE0F', + // Sequence and single emojis after it + // Add multiple variations to test ranges + '1F62E 200D 1F4A7', + '1F62E 200D 1F4A8', + '1F62E 200D 1F4A9', + '1F62E', + '1F62D', + '1F62F', + '1F4A8', + ]); + + const matches = ` +E0.6 dashing away: ${String.fromCodePoint(0x1f4a8)} +E13.1 face exhaling: ${ + String.fromCodePoint(0x1f62e) + + String.fromCodePoint(0x200d) + + String.fromCodePoint(0x1f4a8) + } +E1.0 face with open mouth: ${String.fromCodePoint(0x1f62e)} +E0.6 smiling face: ${ + String.fromCodePoint(0x263a) + String.fromCodePoint(0xfe0f) + } (icon) +E0.6 smiling face: ${String.fromCodePoint(0x263a)} (text) +`.match(new RegExp(regexValue, 'g')); + + expect(matches?.length).toBe(5); + expect(matches?.[0]).toBe(String.fromCodePoint(0x1f4a8)); + expect(matches?.[1]).toBe( + String.fromCodePoint(0x1f62e) + + String.fromCodePoint(0x200d) + + String.fromCodePoint(0x1f4a8) + ); + expect(matches?.[2]).toBe(String.fromCodePoint(0x1f62e)); + expect(matches?.[3]).toBe( + String.fromCodePoint(0x263a) + String.fromCodePoint(0xfe0f) + ); + expect(matches?.[4]).toBe(String.fromCodePoint(0x263a)); + }); + + it('Skin tones', () => { + const list = [ + '1f44b', + '1f44b-1f3fb', + '1f44b-1f3fc', + '1f44b-1f3fd', + '1f44b-1f3fe', + '1f44b-1f3ff', + '1f91a', + '1f91a-1f3fb', + '1f91a-1f3fc', + '1f91a-1f3fd', + '1f91a-1f3fe', + '1f91a-1f3ff', + '1f590-fe0f', + '1f590', + '1f590-1f3fb', + '1f590-1f3fc', + '1f590-1f3fd', + '1f590-1f3fe', + '1f590-1f3ff', + '1f3fb', + '1f3fc', + '1f3fd', + '1f3fe', + '1f3ff', + ]; + const regexValue = createOptimisedRegex(list); + + const matches = ` +E1.0 waving hand: medium skin tone: ${ + String.fromCodePoint(0x1f44b) + String.fromCodePoint(0x1f3fd) + } +`.match(new RegExp(regexValue, 'g')); + + expect(matches?.length).toBe(1); + expect(matches?.[0]).toBe( + String.fromCodePoint(0x1f44b) + String.fromCodePoint(0x1f3fd) + ); + }); +}); diff --git a/packages/utils/tests/emoji-tree-test.ts b/packages/utils/tests/emoji-tree-test.ts index f4e6799..413db74 100644 --- a/packages/utils/tests/emoji-tree-test.ts +++ b/packages/utils/tests/emoji-tree-test.ts @@ -42,7 +42,7 @@ describe('Emoji regex tree', () => { ]); expect(parseEmojiTree(tree).regex).toEqual( - '\\uD83C(?:(?:\\uDFF3|\\uDFF4\\u200D\\u2620)\\uFE0F?|[\\uDFC1\\uDFF3])' + '\\uD83C(?:(?:\\uDFF4\\u200D\\u2620|\\uDFF3)\\uFE0F?|[\\uDFC1\\uDFF3])' ); });