mirror of
https://github.com/iconify/iconify.git
synced 2025-01-22 14:48:24 +00:00
fix: sort emoji sequences by length to avoid lazy matching
This commit is contained in:
parent
875e9707e6
commit
dd179767f6
@ -17,6 +17,9 @@ interface BaseEmojiItemRegex {
|
||||
|
||||
// True if regex can be treated as a group (does not require wrapping in `(?:` + `)`)
|
||||
group: boolean;
|
||||
|
||||
// Length of regex match in UTF-16 characters (smallest value if alternatives have different lengths)
|
||||
length: number;
|
||||
}
|
||||
|
||||
interface EmojiItemRegexWithNumbers {
|
||||
@ -196,6 +199,7 @@ export function createUTF16EmojiRegexItem(
|
||||
type: 'utf16',
|
||||
regex: '',
|
||||
numbers,
|
||||
length: 1,
|
||||
group: true,
|
||||
};
|
||||
updateUTF16EmojiRegexItem(result);
|
||||
@ -243,6 +247,7 @@ export function createSequenceEmojiRegexItem(
|
||||
type: 'sequence',
|
||||
items,
|
||||
regex: '',
|
||||
length: items.reduce((length, item) => item.length + length, 0),
|
||||
group: false,
|
||||
};
|
||||
|
||||
@ -311,13 +316,23 @@ export function createSetEmojiRegexItem(
|
||||
});
|
||||
|
||||
// Sort items: longest first, so longer sequences are matched before shorter ones,
// then alphabetically to guarantee same results regardless of order
|
||||
sets.sort((a, b) => a.regex.localeCompare(b.regex));
|
||||
sets.sort((a, b) => {
|
||||
if (a.length === b.length) {
|
||||
return a.regex.localeCompare(b.regex);
|
||||
}
|
||||
return b.length - a.length;
|
||||
});
|
||||
|
||||
// Create item
|
||||
const result: SetEmojiItemRegex = {
|
||||
type: 'set',
|
||||
sets,
|
||||
regex: '',
|
||||
length: sets.reduce(
|
||||
(length, item) =>
|
||||
length ? Math.min(length, item.length) : item.length,
|
||||
0
|
||||
),
|
||||
group: false,
|
||||
};
|
||||
if (numbers) {
|
||||
@ -361,6 +376,7 @@ export function createOptionalEmojiRegexItem(
|
||||
type: 'optional',
|
||||
item,
|
||||
regex: '',
|
||||
length: item.length,
|
||||
group: true,
|
||||
};
|
||||
updateOptionalEmojiRegexItem(result);
|
||||
|
@ -12,6 +12,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'utf16',
|
||||
regex: '\\u2763',
|
||||
numbers: [0x2763],
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
|
||||
@ -20,6 +21,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'utf16',
|
||||
regex: '[\\u2762-\\u2764]',
|
||||
numbers: [0x2762, 0x2763, 0x2764],
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
|
||||
@ -28,6 +30,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'utf16',
|
||||
regex: '[\\u2760\\u2764\\uFE0F]',
|
||||
numbers: [0x2760, 0x2764, 0xfe0f],
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
|
||||
@ -44,6 +47,7 @@ describe('Creating chunks of regex', () => {
|
||||
0x2000, 0x2001, 0x2100, 0x2101, 0x2102, 0x2760, 0x2761, 0x2762,
|
||||
0x2763, 0x2765, 0xfe0e, 0xfe0f, 0xfe0f,
|
||||
],
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
});
|
||||
@ -58,6 +62,7 @@ describe('Creating chunks of regex', () => {
|
||||
regex: '[\\u2000\\u2001]',
|
||||
numbers: [0x2000, 0x2001],
|
||||
items: [num1],
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
|
||||
@ -66,6 +71,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'sequence',
|
||||
regex: '[\\u2000\\u2001][\\u2000\\u2100]',
|
||||
items: [num1, num2],
|
||||
length: 2,
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
@ -80,6 +86,7 @@ describe('Creating chunks of regex', () => {
|
||||
regex: '[\\u2000\\u2001]',
|
||||
numbers: [0x2000, 0x2001],
|
||||
sets: [num1],
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
|
||||
@ -89,6 +96,7 @@ describe('Creating chunks of regex', () => {
|
||||
regex: '[\\u2000\\u2001]|[\\u2000\\u2100]',
|
||||
numbers: [0x2000, 0x2001, 0x2000, 0x2100],
|
||||
sets: [num1, num2],
|
||||
length: 1,
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
@ -102,6 +110,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'optional',
|
||||
regex: '\\uFE0F?',
|
||||
item: num1,
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
|
||||
@ -110,6 +119,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'optional',
|
||||
regex: '[\\uFE0E\\uFE0F]?',
|
||||
item: num2,
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
});
|
||||
@ -126,6 +136,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'sequence',
|
||||
regex: '\\uFE0F?',
|
||||
items: [fe0f],
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
|
||||
@ -134,6 +145,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'sequence',
|
||||
regex: '[\\u2000\\u2001]\\uFE0F?',
|
||||
items: [num1, fe0f],
|
||||
length: 2,
|
||||
group: false,
|
||||
});
|
||||
|
||||
@ -142,6 +154,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'sequence',
|
||||
regex: '[\\u2000\\u2001]\\uFE0F?[\\u2000\\u2100]',
|
||||
items: [num1, fe0f, num2],
|
||||
length: 3,
|
||||
group: false,
|
||||
});
|
||||
|
||||
@ -150,6 +163,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'sequence',
|
||||
regex: '[\\u2000\\u2100][\\u2000\\u2001]\\uFE0F?',
|
||||
items: [num2, num1, fe0f],
|
||||
length: 3,
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
@ -167,6 +181,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'sequence',
|
||||
regex: '\\uD83D\\uDC9A',
|
||||
items: [utf32a1, utf32a2],
|
||||
length: 2,
|
||||
group: false,
|
||||
});
|
||||
utf32a.numbers = [0x1f49a];
|
||||
@ -176,6 +191,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'optional',
|
||||
regex: '(?:\\uD83D\\uDC9A)?',
|
||||
item: utf32a,
|
||||
length: 2,
|
||||
group: true,
|
||||
});
|
||||
|
||||
@ -183,17 +199,19 @@ describe('Creating chunks of regex', () => {
|
||||
const set = createSetEmojiRegexItem([num1, utf32a]);
|
||||
expect(set).toEqual({
|
||||
type: 'set',
|
||||
regex: '[\\u1234-\\u1237]|\\uD83D\\uDC9A',
|
||||
sets: [num1, utf32a],
|
||||
regex: '\\uD83D\\uDC9A|[\\u1234-\\u1237]',
|
||||
sets: [utf32a, num1],
|
||||
numbers: [0x1234, 0x1235, 0x1236, 0x1237, 0x1f49a],
|
||||
length: 1,
|
||||
group: false,
|
||||
});
|
||||
|
||||
// Make it optional
|
||||
expect(createOptionalEmojiRegexItem(set)).toEqual({
|
||||
type: 'optional',
|
||||
regex: '(?:[\\u1234-\\u1237]|\\uD83D\\uDC9A)?',
|
||||
regex: '(?:\\uD83D\\uDC9A|[\\u1234-\\u1237])?',
|
||||
item: set,
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
|
||||
@ -206,6 +224,7 @@ describe('Creating chunks of regex', () => {
|
||||
type: 'sequence',
|
||||
regex: '\\u2000(?:\\u2100|\\u2101)',
|
||||
items: [utf16a, set1],
|
||||
length: 2,
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
@ -16,6 +16,7 @@ describe('Creating chunks of regex for numbers', () => {
|
||||
type: 'utf16',
|
||||
regex: '\\u2763',
|
||||
numbers: [0x2763],
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
|
||||
@ -25,6 +26,7 @@ describe('Creating chunks of regex for numbers', () => {
|
||||
type: 'utf16',
|
||||
regex: '[\\u2761\\u2763-\\u2765]',
|
||||
numbers: [0x2761, 0x2763, 0x2764, 0x2765],
|
||||
length: 1,
|
||||
group: true,
|
||||
});
|
||||
|
||||
@ -37,16 +39,19 @@ describe('Creating chunks of regex for numbers', () => {
|
||||
type: 'utf16',
|
||||
regex: '\\uD83D',
|
||||
numbers: [0xd83d],
|
||||
length: 1,
|
||||
group: true,
|
||||
},
|
||||
{
|
||||
type: 'utf16',
|
||||
regex: '\\uDC9A',
|
||||
numbers: [0xdc9a],
|
||||
length: 1,
|
||||
group: true,
|
||||
},
|
||||
],
|
||||
numbers: [0x1f49a],
|
||||
length: 2,
|
||||
group: false,
|
||||
});
|
||||
|
||||
@ -59,6 +64,7 @@ describe('Creating chunks of regex for numbers', () => {
|
||||
type: 'sequence',
|
||||
regex: '[\\uD83D\\uD83E][\\uDC9A-\\uDC9C]',
|
||||
numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89c],
|
||||
length: 2,
|
||||
group: false,
|
||||
});
|
||||
|
||||
@ -71,6 +77,7 @@ describe('Creating chunks of regex for numbers', () => {
|
||||
type: 'set',
|
||||
regex: '\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]',
|
||||
numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89e],
|
||||
length: 2,
|
||||
group: false,
|
||||
});
|
||||
|
||||
@ -82,11 +89,12 @@ describe('Creating chunks of regex for numbers', () => {
|
||||
delete (items3 as unknown as Record<string, unknown>).sets;
|
||||
expect(items3).toEqual({
|
||||
type: 'set',
|
||||
regex: '[\\u2763-\\u2765]|\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]',
|
||||
regex: '\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]|[\\u2763-\\u2765]',
|
||||
numbers: [
|
||||
0x2763, 0x2764, 0x2765, 0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a,
|
||||
0x1f89b, 0x1f89e,
|
||||
],
|
||||
length: 1,
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
@ -106,6 +114,7 @@ describe('Creating chunks of regex for numbers', () => {
|
||||
createUTF16EmojiRegexItem([0xd83d]),
|
||||
createUTF16EmojiRegexItem([0xdc9a]),
|
||||
],
|
||||
length: 2,
|
||||
group: false,
|
||||
});
|
||||
|
||||
@ -120,6 +129,7 @@ describe('Creating chunks of regex for numbers', () => {
|
||||
createUTF16EmojiRegexItem([0xfe0f])
|
||||
),
|
||||
],
|
||||
length: 3,
|
||||
group: false,
|
||||
});
|
||||
|
||||
@ -132,6 +142,7 @@ describe('Creating chunks of regex for numbers', () => {
|
||||
createUTF16EmojiRegexItem([0xdc9a]),
|
||||
createUTF16EmojiRegexItem([0xfe0f]),
|
||||
],
|
||||
length: 3,
|
||||
group: false,
|
||||
}
|
||||
);
|
||||
|
@ -91,6 +91,7 @@ describe('Similar chunks of regex', () => {
|
||||
]),
|
||||
items[2],
|
||||
],
|
||||
length: 1,
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
@ -163,6 +164,7 @@ describe('Similar chunks of regex', () => {
|
||||
]),
|
||||
]),
|
||||
],
|
||||
length: 4,
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
@ -226,6 +228,7 @@ describe('Similar chunks of regex', () => {
|
||||
]),
|
||||
items[2],
|
||||
],
|
||||
length: 1,
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
@ -277,6 +280,7 @@ describe('Similar chunks of regex', () => {
|
||||
),
|
||||
]),
|
||||
],
|
||||
length: 4,
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
@ -340,6 +344,7 @@ describe('Similar chunks of regex', () => {
|
||||
]),
|
||||
items[2],
|
||||
],
|
||||
length: 1,
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
@ -427,16 +432,17 @@ describe('Similar chunks of regex', () => {
|
||||
expect(set).toEqual({
|
||||
type: 'set',
|
||||
regex:
|
||||
// last 2 items (set items are sorted alphabetically),
|
||||
// 6 numbers from common chunks, grouped mix
|
||||
'\\uD83D\\uDE4F|\\uD83D\\uDE4F\\uD83C\\uDFFB|\\uD83E\\uDEF1\\uD83C\\uDFFB\\u200D\\uD83E(?:' +
|
||||
// last 2 items (set items are sorted by length, then alphabetically),
|
||||
'\\uD83E\\uDEF1\\uD83C\\uDFFB\\u200D\\uD83E(?:' +
|
||||
slicedSet.regex +
|
||||
')',
|
||||
')|\\uD83D\\uDE4F\\uD83C\\uDFFB|\\uD83D\\uDE4F',
|
||||
sets: [
|
||||
items[4],
|
||||
items[5],
|
||||
createSequenceEmojiRegexItem([...slicedSequence, slicedSet]),
|
||||
items[5],
|
||||
items[4],
|
||||
],
|
||||
length: 2,
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
102
packages/utils/tests/emoji-regex-test.ts
Normal file
102
packages/utils/tests/emoji-regex-test.ts
Normal file
@ -0,0 +1,102 @@
|
||||
import { createOptimisedRegex } from '../lib/emoji/regex/create';
|
||||
|
||||
/**
 * Checks regular expressions generated by createOptimisedRegex() against
 * sample text: each test builds a regex from a list of emoji sequences,
 * runs it with the global flag and compares matches with emojis in text.
 */
describe('Emoji regex matching', () => {
	/**
	 * Converts code points to string: one emoji or one emoji sequence
	 */
	const emoji = (...codePoints: number[]): string =>
		String.fromCodePoint(...codePoints);

	it('Simple regex', () => {
		const regexValue = createOptimisedRegex(['1F600', '1F603', '1F604']);

		// Same emoji is used twice to make sure every occurrence is matched
		const matches = `
E1.0 grinning face: ${emoji(0x1f600)}
E0.6 grinning face with big eyes: ${emoji(0x1f603)}
E1.0 grinning face: ${emoji(0x1f600)}
`.match(new RegExp(regexValue, 'g'));

		expect(matches?.length).toBe(3);
		expect(matches?.[0]).toBe(emoji(0x1f600));
		expect(matches?.[1]).toBe(emoji(0x1f603));
		expect(matches?.[2]).toBe(emoji(0x1f600));
	});

	it('Sequences', () => {
		const regexValue = createOptimisedRegex([
			// Emoji with optional variation
			'263A FE0F',
			// Sequence and single emojis after it
			// Add multiple variations to test ranges
			'1F62E 200D 1F4A7',
			'1F62E 200D 1F4A8',
			'1F62E 200D 1F4A9',
			'1F62E',
			'1F62D',
			'1F62F',
			'1F4A8',
		]);

		// '1F62E' and '1F4A8' are valid emojis on their own and are also
		// parts of '1F62E 200D 1F4A8': sequence must be matched as one item
		const matches = `
E0.6 dashing away: ${emoji(0x1f4a8)}
E13.1 face exhaling: ${emoji(0x1f62e, 0x200d, 0x1f4a8)}
E1.0 face with open mouth: ${emoji(0x1f62e)}
E0.6 smiling face: ${emoji(0x263a, 0xfe0f)} (icon)
E0.6 smiling face: ${emoji(0x263a)} (text)
`.match(new RegExp(regexValue, 'g'));

		expect(matches?.length).toBe(5);
		expect(matches?.[0]).toBe(emoji(0x1f4a8));
		expect(matches?.[1]).toBe(emoji(0x1f62e, 0x200d, 0x1f4a8));
		expect(matches?.[2]).toBe(emoji(0x1f62e));
		expect(matches?.[3]).toBe(emoji(0x263a, 0xfe0f));
		expect(matches?.[4]).toBe(emoji(0x263a));
	});

	it('Skin tones', () => {
		// List contains emojis without skin tone, same emojis with skin
		// tones and skin tone modifiers as separate emojis
		const list = [
			'1f44b',
			'1f44b-1f3fb',
			'1f44b-1f3fc',
			'1f44b-1f3fd',
			'1f44b-1f3fe',
			'1f44b-1f3ff',
			'1f91a',
			'1f91a-1f3fb',
			'1f91a-1f3fc',
			'1f91a-1f3fd',
			'1f91a-1f3fe',
			'1f91a-1f3ff',
			'1f590-fe0f',
			'1f590',
			'1f590-1f3fb',
			'1f590-1f3fc',
			'1f590-1f3fd',
			'1f590-1f3fe',
			'1f590-1f3ff',
			'1f3fb',
			'1f3fc',
			'1f3fd',
			'1f3fe',
			'1f3ff',
		];
		const regexValue = createOptimisedRegex(list);

		// Emoji with skin tone must be one match, not emoji followed
		// by separate skin tone modifier
		const matches = `
E1.0 waving hand: medium skin tone: ${emoji(0x1f44b, 0x1f3fd)}
`.match(new RegExp(regexValue, 'g'));

		expect(matches?.length).toBe(1);
		expect(matches?.[0]).toBe(emoji(0x1f44b, 0x1f3fd));
	});
});
|
@ -42,7 +42,7 @@ describe('Emoji regex tree', () => {
|
||||
]);
|
||||
|
||||
expect(parseEmojiTree(tree).regex).toEqual(
|
||||
'\\uD83C(?:(?:\\uDFF3|\\uDFF4\\u200D\\u2620)\\uFE0F?|[\\uDFC1\\uDFF3])'
|
||||
'\\uD83C(?:(?:\\uDFF4\\u200D\\u2620|\\uDFF3)\\uFE0F?|[\\uDFC1\\uDFF3])'
|
||||
);
|
||||
});
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user