2
0
mirror of https://github.com/iconify/iconify.git synced 2025-01-22 14:48:24 +00:00

fix: sort emoji sequences by length to avoid lazy matching

This commit is contained in:
Vjacheslav Trushkin 2022-12-09 23:09:11 +02:00
parent 875e9707e6
commit dd179767f6
6 changed files with 165 additions and 11 deletions

View File

@ -17,6 +17,9 @@ interface BaseEmojiItemRegex {
// True if regex can be treated as a group (does not require wrapping in `(?:` + `)`)
group: boolean;
// Number of characters, minimum value
length: number;
}
interface EmojiItemRegexWithNumbers {
@ -196,6 +199,7 @@ export function createUTF16EmojiRegexItem(
type: 'utf16',
regex: '',
numbers,
length: 1,
group: true,
};
updateUTF16EmojiRegexItem(result);
@ -243,6 +247,7 @@ export function createSequenceEmojiRegexItem(
type: 'sequence',
items,
regex: '',
length: items.reduce((length, item) => item.length + length, 0),
group: false,
};
@ -311,13 +316,23 @@ export function createSetEmojiRegexItem(
});
// Sort items to guarantee same results regardless of order
sets.sort((a, b) => a.regex.localeCompare(b.regex));
sets.sort((a, b) => {
if (a.length === b.length) {
return a.regex.localeCompare(b.regex);
}
return b.length - a.length;
});
// Create item
const result: SetEmojiItemRegex = {
type: 'set',
sets,
regex: '',
length: sets.reduce(
(length, item) =>
length ? Math.min(length, item.length) : item.length,
0
),
group: false,
};
if (numbers) {
@ -361,6 +376,7 @@ export function createOptionalEmojiRegexItem(
type: 'optional',
item,
regex: '',
length: item.length,
group: true,
};
updateOptionalEmojiRegexItem(result);

View File

@ -12,6 +12,7 @@ describe('Creating chunks of regex', () => {
type: 'utf16',
regex: '\\u2763',
numbers: [0x2763],
length: 1,
group: true,
});
@ -20,6 +21,7 @@ describe('Creating chunks of regex', () => {
type: 'utf16',
regex: '[\\u2762-\\u2764]',
numbers: [0x2762, 0x2763, 0x2764],
length: 1,
group: true,
});
@ -28,6 +30,7 @@ describe('Creating chunks of regex', () => {
type: 'utf16',
regex: '[\\u2760\\u2764\\uFE0F]',
numbers: [0x2760, 0x2764, 0xfe0f],
length: 1,
group: true,
});
@ -44,6 +47,7 @@ describe('Creating chunks of regex', () => {
0x2000, 0x2001, 0x2100, 0x2101, 0x2102, 0x2760, 0x2761, 0x2762,
0x2763, 0x2765, 0xfe0e, 0xfe0f, 0xfe0f,
],
length: 1,
group: true,
});
});
@ -58,6 +62,7 @@ describe('Creating chunks of regex', () => {
regex: '[\\u2000\\u2001]',
numbers: [0x2000, 0x2001],
items: [num1],
length: 1,
group: true,
});
@ -66,6 +71,7 @@ describe('Creating chunks of regex', () => {
type: 'sequence',
regex: '[\\u2000\\u2001][\\u2000\\u2100]',
items: [num1, num2],
length: 2,
group: false,
});
});
@ -80,6 +86,7 @@ describe('Creating chunks of regex', () => {
regex: '[\\u2000\\u2001]',
numbers: [0x2000, 0x2001],
sets: [num1],
length: 1,
group: true,
});
@ -89,6 +96,7 @@ describe('Creating chunks of regex', () => {
regex: '[\\u2000\\u2001]|[\\u2000\\u2100]',
numbers: [0x2000, 0x2001, 0x2000, 0x2100],
sets: [num1, num2],
length: 1,
group: false,
});
});
@ -102,6 +110,7 @@ describe('Creating chunks of regex', () => {
type: 'optional',
regex: '\\uFE0F?',
item: num1,
length: 1,
group: true,
});
@ -110,6 +119,7 @@ describe('Creating chunks of regex', () => {
type: 'optional',
regex: '[\\uFE0E\\uFE0F]?',
item: num2,
length: 1,
group: true,
});
});
@ -126,6 +136,7 @@ describe('Creating chunks of regex', () => {
type: 'sequence',
regex: '\\uFE0F?',
items: [fe0f],
length: 1,
group: true,
});
@ -134,6 +145,7 @@ describe('Creating chunks of regex', () => {
type: 'sequence',
regex: '[\\u2000\\u2001]\\uFE0F?',
items: [num1, fe0f],
length: 2,
group: false,
});
@ -142,6 +154,7 @@ describe('Creating chunks of regex', () => {
type: 'sequence',
regex: '[\\u2000\\u2001]\\uFE0F?[\\u2000\\u2100]',
items: [num1, fe0f, num2],
length: 3,
group: false,
});
@ -150,6 +163,7 @@ describe('Creating chunks of regex', () => {
type: 'sequence',
regex: '[\\u2000\\u2100][\\u2000\\u2001]\\uFE0F?',
items: [num2, num1, fe0f],
length: 3,
group: false,
});
});
@ -167,6 +181,7 @@ describe('Creating chunks of regex', () => {
type: 'sequence',
regex: '\\uD83D\\uDC9A',
items: [utf32a1, utf32a2],
length: 2,
group: false,
});
utf32a.numbers = [0x1f49a];
@ -176,6 +191,7 @@ describe('Creating chunks of regex', () => {
type: 'optional',
regex: '(?:\\uD83D\\uDC9A)?',
item: utf32a,
length: 2,
group: true,
});
@ -183,17 +199,19 @@ describe('Creating chunks of regex', () => {
const set = createSetEmojiRegexItem([num1, utf32a]);
expect(set).toEqual({
type: 'set',
regex: '[\\u1234-\\u1237]|\\uD83D\\uDC9A',
sets: [num1, utf32a],
regex: '\\uD83D\\uDC9A|[\\u1234-\\u1237]',
sets: [utf32a, num1],
numbers: [0x1234, 0x1235, 0x1236, 0x1237, 0x1f49a],
length: 1,
group: false,
});
// Make it optional
expect(createOptionalEmojiRegexItem(set)).toEqual({
type: 'optional',
regex: '(?:[\\u1234-\\u1237]|\\uD83D\\uDC9A)?',
regex: '(?:\\uD83D\\uDC9A|[\\u1234-\\u1237])?',
item: set,
length: 1,
group: true,
});
@ -206,6 +224,7 @@ describe('Creating chunks of regex', () => {
type: 'sequence',
regex: '\\u2000(?:\\u2100|\\u2101)',
items: [utf16a, set1],
length: 2,
group: false,
});
});

View File

@ -16,6 +16,7 @@ describe('Creating chunks of regex for numbers', () => {
type: 'utf16',
regex: '\\u2763',
numbers: [0x2763],
length: 1,
group: true,
});
@ -25,6 +26,7 @@ describe('Creating chunks of regex for numbers', () => {
type: 'utf16',
regex: '[\\u2761\\u2763-\\u2765]',
numbers: [0x2761, 0x2763, 0x2764, 0x2765],
length: 1,
group: true,
});
@ -37,16 +39,19 @@ describe('Creating chunks of regex for numbers', () => {
type: 'utf16',
regex: '\\uD83D',
numbers: [0xd83d],
length: 1,
group: true,
},
{
type: 'utf16',
regex: '\\uDC9A',
numbers: [0xdc9a],
length: 1,
group: true,
},
],
numbers: [0x1f49a],
length: 2,
group: false,
});
@ -59,6 +64,7 @@ describe('Creating chunks of regex for numbers', () => {
type: 'sequence',
regex: '[\\uD83D\\uD83E][\\uDC9A-\\uDC9C]',
numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89c],
length: 2,
group: false,
});
@ -71,6 +77,7 @@ describe('Creating chunks of regex for numbers', () => {
type: 'set',
regex: '\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]',
numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89e],
length: 2,
group: false,
});
@ -82,11 +89,12 @@ describe('Creating chunks of regex for numbers', () => {
delete (items3 as unknown as Record<string, unknown>).sets;
expect(items3).toEqual({
type: 'set',
regex: '[\\u2763-\\u2765]|\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]',
regex: '\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]|[\\u2763-\\u2765]',
numbers: [
0x2763, 0x2764, 0x2765, 0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a,
0x1f89b, 0x1f89e,
],
length: 1,
group: false,
});
});
@ -106,6 +114,7 @@ describe('Creating chunks of regex for numbers', () => {
createUTF16EmojiRegexItem([0xd83d]),
createUTF16EmojiRegexItem([0xdc9a]),
],
length: 2,
group: false,
});
@ -120,6 +129,7 @@ describe('Creating chunks of regex for numbers', () => {
createUTF16EmojiRegexItem([0xfe0f])
),
],
length: 3,
group: false,
});
@ -132,6 +142,7 @@ describe('Creating chunks of regex for numbers', () => {
createUTF16EmojiRegexItem([0xdc9a]),
createUTF16EmojiRegexItem([0xfe0f]),
],
length: 3,
group: false,
}
);

View File

@ -91,6 +91,7 @@ describe('Similar chunks of regex', () => {
]),
items[2],
],
length: 1,
group: false,
});
});
@ -163,6 +164,7 @@ describe('Similar chunks of regex', () => {
]),
]),
],
length: 4,
group: false,
});
});
@ -226,6 +228,7 @@ describe('Similar chunks of regex', () => {
]),
items[2],
],
length: 1,
group: false,
});
});
@ -277,6 +280,7 @@ describe('Similar chunks of regex', () => {
),
]),
],
length: 4,
group: false,
});
});
@ -340,6 +344,7 @@ describe('Similar chunks of regex', () => {
]),
items[2],
],
length: 1,
group: false,
});
});
@ -427,16 +432,17 @@ describe('Similar chunks of regex', () => {
expect(set).toEqual({
type: 'set',
regex:
// last 2 items (set items are sorted alphabetically),
// 6 numbers from common chunks, grouped mix
'\\uD83D\\uDE4F|\\uD83D\\uDE4F\\uD83C\\uDFFB|\\uD83E\\uDEF1\\uD83C\\uDFFB\\u200D\\uD83E(?:' +
// last 2 items (set items are sorted by length, then alphabetically),
'\\uD83E\\uDEF1\\uD83C\\uDFFB\\u200D\\uD83E(?:' +
slicedSet.regex +
')',
')|\\uD83D\\uDE4F\\uD83C\\uDFFB|\\uD83D\\uDE4F',
sets: [
items[4],
items[5],
createSequenceEmojiRegexItem([...slicedSequence, slicedSet]),
items[5],
items[4],
],
length: 2,
group: false,
});
});

View File

@ -0,0 +1,102 @@
import { createOptimisedRegex } from '../lib/emoji/regex/create';
describe('Emoji regex matching', () => {
	/**
	 * Converts one or more code points to the string they encode
	 */
	const emoji = (...codePoints: number[]): string =>
		String.fromCodePoint(...codePoints);

	/**
	 * Joins rows with line breaks, adding one line break before the first
	 * row and one after the last row
	 */
	const sample = (...rows: string[]): string =>
		['', ...rows, ''].join('\n');

	/**
	 * Checks that the number of matches and each match are as expected
	 */
	const verify = (
		found: RegExpMatchArray | null,
		expected: string[]
	): void => {
		expect(found?.length).toBe(expected.length);
		expected.forEach((value, index) => {
			expect(found?.[index]).toBe(value);
		});
	};

	it('Simple regex', () => {
		const pattern = createOptimisedRegex(['1F600', '1F603', '1F604']);

		const grinning = emoji(0x1f600);
		const bigEyes = emoji(0x1f603);

		const content = sample(
			`E1.0 grinning face: ${grinning}`,
			`E0.6 grinning face with big eyes: ${bigEyes}`,
			`E1.0 grinning face: ${grinning}`
		);
		const found = content.match(new RegExp(pattern, 'g'));

		// Matches are expected in the same order as they appear in text
		verify(found, [grinning, bigEyes, grinning]);
	});

	it('Sequences', () => {
		const pattern = createOptimisedRegex([
			// Emoji that has optional variation selector
			'263A FE0F',
			// Sequences, followed by single emojis that sequences start with.
			// Several sequences are listed to get ranges in regex
			'1F62E 200D 1F4A7',
			'1F62E 200D 1F4A8',
			'1F62E 200D 1F4A9',
			'1F62E',
			'1F62D',
			'1F62F',
			'1F4A8',
		]);

		const dashingAway = emoji(0x1f4a8);
		const faceExhaling = emoji(0x1f62e, 0x200d, 0x1f4a8);
		const openMouth = emoji(0x1f62e);
		const smilingIcon = emoji(0x263a, 0xfe0f);
		const smilingText = emoji(0x263a);

		const content = sample(
			`E0.6 dashing away: ${dashingAway}`,
			`E13.1 face exhaling: ${faceExhaling}`,
			`E1.0 face with open mouth: ${openMouth}`,
			`E0.6 smiling face: ${smilingIcon} (icon)`,
			`E0.6 smiling face: ${smilingText} (text)`
		);
		const found = content.match(new RegExp(pattern, 'g'));

		// Full sequence must be one match, not split into shorter emojis
		verify(found, [
			dashingAway,
			faceExhaling,
			openMouth,
			smilingIcon,
			smilingText,
		]);
	});

	it('Skin tones', () => {
		// List includes base emojis, emojis with skin tones and
		// skin tone modifiers as stand-alone emojis
		const list = [
			'1f44b',
			'1f44b-1f3fb',
			'1f44b-1f3fc',
			'1f44b-1f3fd',
			'1f44b-1f3fe',
			'1f44b-1f3ff',
			'1f91a',
			'1f91a-1f3fb',
			'1f91a-1f3fc',
			'1f91a-1f3fd',
			'1f91a-1f3fe',
			'1f91a-1f3ff',
			'1f590-fe0f',
			'1f590',
			'1f590-1f3fb',
			'1f590-1f3fc',
			'1f590-1f3fd',
			'1f590-1f3fe',
			'1f590-1f3ff',
			'1f3fb',
			'1f3fc',
			'1f3fd',
			'1f3fe',
			'1f3ff',
		];
		const pattern = createOptimisedRegex(list);

		const wavingHand = emoji(0x1f44b, 0x1f3fd);

		const content = sample(
			`E1.0 waving hand: medium skin tone: ${wavingHand}`
		);
		const found = content.match(new RegExp(pattern, 'g'));

		// Emoji and its skin tone must be returned as one match
		verify(found, [wavingHand]);
	});
});

View File

@ -42,7 +42,7 @@ describe('Emoji regex tree', () => {
]);
expect(parseEmojiTree(tree).regex).toEqual(
'\\uD83C(?:(?:\\uDFF3|\\uDFF4\\u200D\\u2620)\\uFE0F?|[\\uDFC1\\uDFF3])'
'\\uD83C(?:(?:\\uDFF4\\u200D\\u2620|\\uDFF3)\\uFE0F?|[\\uDFC1\\uDFF3])'
);
});