2
0
mirror of https://github.com/iconify/iconify.git synced 2025-01-22 14:48:24 +00:00

feat: functions to optimise emoji regex

This commit is contained in:
Vjacheslav Trushkin 2022-12-09 19:26:05 +02:00
parent ac61bc049a
commit 78ce0d0c4c
10 changed files with 2331 additions and 8 deletions

View File

@ -102,6 +102,26 @@
"import": "./lib/customisations/rotate.mjs",
"types": "./lib/customisations/rotate.d.ts"
},
"./lib/emoji/regex/base": {
"require": "./lib/emoji/regex/base.cjs",
"import": "./lib/emoji/regex/base.mjs",
"types": "./lib/emoji/regex/base.d.ts"
},
"./lib/emoji/regex/numbers": {
"require": "./lib/emoji/regex/numbers.cjs",
"import": "./lib/emoji/regex/numbers.mjs",
"types": "./lib/emoji/regex/numbers.d.ts"
},
"./lib/emoji/regex/similar": {
"require": "./lib/emoji/regex/similar.cjs",
"import": "./lib/emoji/regex/similar.mjs",
"types": "./lib/emoji/regex/similar.d.ts"
},
"./lib/emoji/regex/tree": {
"require": "./lib/emoji/regex/tree.cjs",
"import": "./lib/emoji/regex/tree.mjs",
"types": "./lib/emoji/regex/tree.d.ts"
},
"./lib/emoji/cleanup": {
"require": "./lib/emoji/cleanup.cjs",
"import": "./lib/emoji/cleanup.mjs",

View File

@ -3,7 +3,7 @@ import {
convertEmojiSequenceToUTF32,
} from './convert';
interface UnicodeOptions {
export interface UnicodeFormattingOptions {
// Prefix before each character '\\u'
prefix: string;
@ -23,7 +23,7 @@ interface UnicodeOptions {
throwOnError: boolean;
}
const defaultUnicodeOptions: UnicodeOptions = {
const defaultUnicodeOptions: UnicodeFormattingOptions = {
prefix: '',
separator: '',
case: 'lower',
@ -35,7 +35,10 @@ const defaultUnicodeOptions: UnicodeOptions = {
/**
* Convert number to string
*/
function convert(sequence: number[], options: UnicodeOptions): string {
function convert(
sequence: number[],
options: UnicodeFormattingOptions
): string {
const prefix = options.prefix;
const func = options.case === 'upper' ? 'toUpperCase' : 'toLowerCase';
@ -60,7 +63,7 @@ function convert(sequence: number[], options: UnicodeOptions): string {
*/
export function getEmojiUnicodeString(
code: number,
options: Partial<UnicodeOptions> = {}
options: Partial<UnicodeFormattingOptions> = {}
): string {
return convert([code], {
...defaultUnicodeOptions,
@ -68,7 +71,7 @@ export function getEmojiUnicodeString(
});
}
const defaultSequenceOptions: UnicodeOptions = {
const defaultSequenceOptions: UnicodeFormattingOptions = {
...defaultUnicodeOptions,
separator: '-',
};
@ -78,7 +81,7 @@ const defaultSequenceOptions: UnicodeOptions = {
*/
export function getEmojiSequenceString(
sequence: number[],
options: Partial<UnicodeOptions> = {}
options: Partial<UnicodeFormattingOptions> = {}
): string {
return convert(sequence, {
...defaultSequenceOptions,
@ -86,7 +89,7 @@ export function getEmojiSequenceString(
});
}
const regexOptions: UnicodeOptions = {
const regexOptions: UnicodeFormattingOptions = {
prefix: '\\u',
separator: '',
case: 'upper',
@ -108,7 +111,7 @@ export function emojiSequenceToRegex(
});
}
const keywordOptions: UnicodeOptions = {
const keywordOptions: UnicodeFormattingOptions = {
prefix: '',
separator: '-',
case: 'lower',

View File

@ -0,0 +1,423 @@
import { getEmojiUnicodeString, UnicodeFormattingOptions } from '../format';
/**
 * Base shape shared by every regex item
 */
interface BaseEmojiItemRegex {
// Regex type:
// 'utf16' -> utf16 number(s)
// 'sequence' -> sequence, not wrapped in `(?:` + `)`
// requires wrapping, unless marked as wrapped
// 'set' -> list of alternatives joined with `|`, not wrapped in `(?:` + `)`
// requires wrapping, unless marked as a group
// 'optional' -> another item followed by `?`, always a group
type: 'utf16' | 'sequence' | 'set' | 'optional';
// Regex
regex: string;
// True if regex can be treated as a group (does not require wrapping in `(?:` + `)`)
group: boolean;
}
// Mixin for item types that can carry the list of numbers they represent
interface EmojiItemRegexWithNumbers {
// Numbers in regex, set if regex represents set of numbers. Allows
// creation of number ranges when combining multiple regex items
// Cannot be empty array
numbers?: number[];
}
// Numbers: one or more UTF-16 numbers. Regex is either one escaped
// character or a `[...]` character class
export interface UTF16EmojiItemRegex
extends BaseEmojiItemRegex,
Required<EmojiItemRegexWithNumbers> {
type: 'utf16';
// Always grouped
group: true;
// `numbers` is required
}
// Sequence: items that must match one after another
// Allowed child items: everything except another sequence
type SequenceEmojiItemRegexItem =
| UTF16EmojiItemRegex
| SetEmojiItemRegex
| OptionalEmojiItemRegex;
export interface SequenceEmojiItemRegex
extends BaseEmojiItemRegex,
EmojiItemRegexWithNumbers {
type: 'sequence';
// Items in sequence. Any type except another sequence
items: SequenceEmojiItemRegexItem[];
}
// Set: list of alternatives, any one of which can match
// Allowed child items: everything except another set
export type SetEmojiItemRegexItem =
| UTF16EmojiItemRegex
| SequenceEmojiItemRegex
| OptionalEmojiItemRegex;
export interface SetEmojiItemRegex
extends BaseEmojiItemRegex,
EmojiItemRegexWithNumbers {
type: 'set';
// Items in set. Any type except another set
sets: SetEmojiItemRegexItem[];
}
// Optional: wrapper that makes another item optional
// Allowed wrapped items: everything except another optional item
type OptionalEmojiItemRegexItem =
| UTF16EmojiItemRegex
| SequenceEmojiItemRegex
| SetEmojiItemRegex;
export interface OptionalEmojiItemRegex extends BaseEmojiItemRegex {
type: 'optional';
// Wrapped item. Any type except another optional item
item: OptionalEmojiItemRegexItem;
// Always grouped
group: true;
}
// Any regex item. Discriminated by the `type` property
export type EmojiItemRegex =
| UTF16EmojiItemRegex
| SequenceEmojiItemRegex
| SetEmojiItemRegex
| OptionalEmojiItemRegex;
/**
 * Options for converting number to string
 *
 * Passed to getEmojiUnicodeString() by toString(), so each number is
 * rendered with a `\u` prefix and without separator.
 *
 * NOTE(review): `case`, `format` and `add0` are interpreted by
 * getEmojiUnicodeString(), which is not visible in this file - confirm
 * their exact meaning there before changing them.
 */
const numberToStringOptions: Partial<UnicodeFormattingOptions> = {
prefix: '\\u',
separator: '',
case: 'upper',
format: 'utf-16',
add0: true,
};
/**
 * Render one number as a string usable inside a regular expression
 *
 * Thin wrapper around getEmojiUnicodeString() that always applies
 * `numberToStringOptions`.
 */
function toString(number: number): string {
	const formatted = getEmojiUnicodeString(number, numberToStringOptions);
	return formatted;
}
/**
 * Exhaustiveness helper for `switch` statements over union types
 *
 * Accepts only values of type `never`, so the compiler reports an error
 * when a union member is not handled. Does nothing at run time.
 */
// eslint-disable-next-line @typescript-eslint/no-unused-vars
function assertNever(v: never): void {
	// Intentionally empty: exists only for the compile-time check
}
/**
 * Wrap regex in a non-capturing group: `(?:` + regex + `)`
 */
export function wrapRegexInGroup(regex: string): string {
	return `(?:${regex})`;
}
/**
 * Rebuild regex of UTF-16 item from its `numbers`, return the new regex
 *
 * One number is rendered as a single character. Several numbers are sorted
 * in place (so `item.numbers` ends up in ascending order, duplicates are
 * kept in the list) and rendered as a character class, where runs of 3 or
 * more consecutive numbers are collapsed to `first-last`.
 *
 * Throws if list of numbers is empty.
 */
export function updateUTF16EmojiRegexItem(item: UTF16EmojiItemRegex): string {
	const codes = item.numbers;

	if (codes.length === 1) {
		// Single number: character class is not needed
		item.regex = toString(codes[0]);
		return item.regex;
	}

	// Several numbers: sort the list stored in item
	codes.sort((a, b) => a - b);

	const parts: string[] = [];

	// Serialise one run of consecutive numbers
	const flush = (run: number[]): void => {
		const first = run[0];
		const final = run[run.length - 1];
		if (final > first + 1) {
			// 3 or more numbers: use range
			parts.push(toString(first) + '-' + toString(final));
			return;
		}
		run.forEach((code) => {
			parts.push(toString(code));
		});
	};

	let run: number[] = [];
	for (const code of codes) {
		if (run.length) {
			const previous = run[run.length - 1];
			if (previous === code) {
				// Duplicate: ignore
				continue;
			}
			if (previous === code - 1) {
				// Extends current run
				run.push(code);
				continue;
			}

			// Gap: current run is complete
			flush(run);
		}
		run = [code];
	}
	if (run.length) {
		flush(run);
	}

	if (!parts.length) {
		throw new Error('Unexpected empty range');
	}

	item.regex = '[' + parts.join('') + ']';
	return item.regex;
}
/**
 * Create UTF-16 item for list of numbers
 *
 * The `numbers` array is stored in the item as is (not copied) and is
 * processed by updateUTF16EmojiRegexItem() to generate regex.
 */
export function createUTF16EmojiRegexItem(
	numbers: number[]
): UTF16EmojiItemRegex {
	const item: UTF16EmojiItemRegex = {
		type: 'utf16',
		regex: '',
		numbers,
		group: true,
	};

	// Fill in regex
	updateUTF16EmojiRegexItem(item);

	return item;
}
/**
 * Rebuild regex of sequence from its child items, return the new regex
 *
 * Child regexes are concatenated. Sets that are not marked as group are
 * wrapped in `(?:` + `)` first. The `group` property of the sequence is
 * not touched.
 */
export function updateSequenceEmojiRegexItem(
	item: SequenceEmojiItemRegex
): string {
	const chunks: string[] = [];

	for (const child of item.items) {
		const needsWrapping = child.type === 'set' && !child.group;
		chunks.push(
			needsWrapping ? wrapRegexInGroup(child.regex) : child.regex
		);
	}

	item.regex = chunks.join('');
	return item.regex;
}
/**
 * Create sequence item from list of items
 *
 * Nested sequences are flattened, so the result never contains a
 * sequence as a child. When the list has only one entry, `group` and
 * (unless that entry is optional) `numbers` are inherited from it.
 * Explicitly passed `numbers` always win.
 *
 * Throws if the resulting sequence has no items.
 */
export function createSequenceEmojiRegexItem(
	sequence: EmojiItemRegex[],
	numbers?: number[]
): SequenceEmojiItemRegex {
	// Flatten nested sequences
	const items: SequenceEmojiItemRegexItem[] = [];
	for (const entry of sequence) {
		if (entry.type === 'sequence') {
			entry.items.forEach((child) => {
				items.push(child);
			});
		} else {
			items.push(entry);
		}
	}

	if (!items.length) {
		throw new Error('Empty sequence');
	}

	const result: SequenceEmojiItemRegex = {
		type: 'sequence',
		items,
		regex: '',
		group: false,
	};

	// Sequence that wraps one entry behaves like that entry
	if (sequence.length === 1) {
		const only = sequence[0];
		result.group = only.group;
		if (only.type !== 'optional' && only.numbers) {
			result.numbers = only.numbers;
		}
	}

	// Custom numbers override inherited numbers
	if (numbers) {
		result.numbers = numbers;
	}

	updateSequenceEmojiRegexItem(result);
	return result;
}
/**
 * Rebuild regex and `group` of set from its child items, return the new regex
 *
 * Set with one child copies regex and `group` of that child. Otherwise
 * child regexes are joined with `|` and set is marked as not grouped.
 */
export function updateSetEmojiRegexItem(item: SetEmojiItemRegex): string {
	const children = item.sets;

	if (children.length === 1) {
		// Single alternative: behaves like the child
		const only = children[0];
		item.group = only.group;
		item.regex = only.regex;
	} else {
		// List of alternatives
		item.group = false;
		item.regex = children.map(({ regex }) => regex).join('|');
	}

	return item.regex;
}
/**
 * Create set item from list of items
 *
 * Nested sets are flattened, then alternatives are sorted by regex to make
 * result independent of the order of input. `numbers` is set only when
 * every entry is a non-optional item that has numbers; it is a concatenation
 * of those lists, duplicates are not removed.
 */
export function createSetEmojiRegexItem(
	set: EmojiItemRegex[]
): SetEmojiItemRegex {
	const sets: SetEmojiItemRegexItem[] = [];

	// Becomes null as soon as one entry without numbers is found
	let numbers: number[] | null = [];

	for (const entry of set) {
		// Flatten nested sets
		if (entry.type === 'set') {
			entry.sets.forEach((child) => {
				sets.push(child);
			});
		} else {
			sets.push(entry);
		}

		// Collect numbers
		if (numbers !== null) {
			numbers =
				entry.type !== 'optional' && entry.numbers
					? numbers.concat(entry.numbers)
					: null;
		}
	}

	// Sort items to guarantee same results regardless of order
	sets.sort((a, b) => a.regex.localeCompare(b.regex));

	const result: SetEmojiItemRegex = {
		type: 'set',
		sets,
		regex: '',
		group: false,
	};
	if (numbers) {
		result.numbers = numbers;
	}
	if (set.length === 1) {
		result.group = set[0].group;
	}

	// Generates regex, also resets `group` based on number of alternatives
	updateSetEmojiRegexItem(result);
	return result;
}
/**
 * Rebuild regex of optional item from wrapped item, return the new regex
 *
 * Wrapped regex is followed by `?`. If wrapped item is not a group, it is
 * wrapped in `(?:` + `)` first.
 */
export function updateOptionalEmojiRegexItem(
	item: OptionalEmojiItemRegex
): string {
	const { group, regex } = item.item;
	const target = group ? regex : wrapRegexInGroup(regex);

	item.regex = target + '?';
	return item.regex;
}
/**
 * Make item optional
 *
 * Item that is already optional is returned as is, without creating
 * a new wrapper.
 */
export function createOptionalEmojiRegexItem(
	item: EmojiItemRegex
): OptionalEmojiItemRegex {
	if (item.type !== 'optional') {
		const wrapper: OptionalEmojiItemRegex = {
			type: 'optional',
			item,
			regex: '',
			group: true,
		};
		updateOptionalEmojiRegexItem(wrapper);
		return wrapper;
	}

	// Nothing to do
	return item;
}
/**
 * Clone item
 *
 * Always copies the item object and its `numbers` list. With
 * `shallow = true` lists of child items are copied, but child items
 * themselves are shared with the original (wrapped item of optional
 * item is shared too). Otherwise child items are cloned recursively.
 */
export function cloneEmojiRegexItem<T extends BaseEmojiItemRegex>(
	item: T,
	shallow = false
): T {
	const copy = {
		...item,
	} as unknown as EmojiItemRegex;

	// Numbers are never shared between clone and original
	if (copy.type !== 'optional' && copy.numbers) {
		copy.numbers = copy.numbers.slice(0);
	}

	// Child items
	if (copy.type === 'sequence') {
		copy.items = shallow
			? copy.items.slice(0)
			: copy.items.map((child) => cloneEmojiRegexItem(child, false));
	} else if (copy.type === 'set') {
		copy.sets = shallow
			? copy.sets.slice(0)
			: copy.sets.map((child) => cloneEmojiRegexItem(child, false));
	} else if (copy.type === 'optional') {
		if (!shallow) {
			copy.item = cloneEmojiRegexItem(copy.item, false);
		}
	} else if (copy.type !== 'utf16') {
		// 'utf16' has no child items; anything else is a missing case
		assertNever(copy);
	}

	return copy as unknown as T;
}

View File

@ -0,0 +1,246 @@
import { splitUTF32Number } from '../convert';
import {
createOptionalEmojiRegexItem,
createSequenceEmojiRegexItem,
createSetEmojiRegexItem,
createUTF16EmojiRegexItem,
EmojiItemRegex,
OptionalEmojiItemRegex,
SequenceEmojiItemRegex,
SetEmojiItemRegex,
UTF16EmojiItemRegex,
} from './base';
import { vs16Emoji } from '../data';
/**
 * Create regex item that matches any number from the list
 *
 * Numbers for which splitUTF32Number() returns nothing are merged into one
 * UTF-16 item. Other numbers are split in two chunks and grouped: first
 * by identical first chunk, then by identical regex for second chunks, so
 * each group becomes a sequence of two UTF-16 items with `numbers` set
 * to the numbers it represents.
 *
 * The list passed by caller is not modified: it is copied before sorting.
 * Duplicate numbers are ignored.
 *
 * @param numbers List of numbers, in any order
 * @returns Single item if everything fits in one regex, set otherwise
 */
export function createEmojiRegexItemForNumbers(
	numbers: number[]
): UTF16EmojiItemRegex | SequenceEmojiItemRegex | SetEmojiItemRegex {
	// Numbers grouped by first chunk
	interface UTF32FirstNumber {
		second: number[];
		numbers: number[];
	}

	// Sort a copy, to avoid changing array that belongs to caller
	const sorted = [...numbers].sort((a, b) => a - b);

	// Separate UTF-16 and UTF-32. Map keeps insertion order, so groups
	// are processed in ascending order of first chunk
	const utf32 = new Map<number, UTF32FirstNumber>();
	const utf16: number[] = [];

	let lastNumber: number | undefined;
	for (let i = 0; i < sorted.length; i++) {
		const number = sorted[i];
		if (number === lastNumber) {
			// Duplicate: list is sorted, so duplicates are adjacent
			continue;
		}
		lastNumber = number;

		const split = splitUTF32Number(number);
		if (!split) {
			utf16.push(number);
			continue;
		}

		const [first, second] = split;
		const group = utf32.get(first);
		if (group) {
			group.second.push(second);
			group.numbers.push(number);
		} else {
			utf32.set(first, {
				second: [second],
				numbers: [number],
			});
		}
	}

	const results: (UTF16EmojiItemRegex | SequenceEmojiItemRegex)[] = [];

	// Merge UTF-16
	if (utf16.length) {
		results.push(createUTF16EmojiRegexItem(utf16));
	}

	// Merge UTF-32
	if (utf32.size) {
		// First chunks that share the same regex for second chunks
		interface UTF32Item {
			second: UTF16EmojiItemRegex;
			first: number[];
			numbers: number[];
		}
		const utf32Set = new Map<string, UTF32Item>();

		utf32.forEach((group, first) => {
			const secondRegex = createUTF16EmojiRegexItem(group.second);

			const match = utf32Set.get(secondRegex.regex);
			if (match) {
				// Found multiple items with the same last set
				match.first.push(first);
				group.numbers.forEach((number) => {
					match.numbers.push(number);
				});
			} else {
				utf32Set.set(secondRegex.regex, {
					second: secondRegex,
					first: [first],
					numbers: [...group.numbers],
				});
			}
		});

		// Create regex for each set, add numbers list for reference
		utf32Set.forEach((item) => {
			const firstRegex = createUTF16EmojiRegexItem(item.first);
			results.push(
				createSequenceEmojiRegexItem(
					[firstRegex, item.second],
					item.numbers
				)
			);
		});
	}

	return results.length === 1 ? results[0] : createSetEmojiRegexItem(results);
}
/**
 * Create regex item that matches numbers one after another
 *
 * Each number is converted to one UTF-16 item, or to two items if
 * splitUTF32Number() splits it. When `optionalVariations` is enabled,
 * item for `vs16Emoji` is made optional.
 *
 * Returns the item itself if there is only one, sequence otherwise.
 * Sequence created from one number has `numbers` set to a copy of input.
 */
export function createRegexForNumbersSequence(
	numbers: number[],
	optionalVariations = true
): SequenceEmojiItemRegex | UTF16EmojiItemRegex | OptionalEmojiItemRegex {
	const items: (UTF16EmojiItemRegex | OptionalEmojiItemRegex)[] = [];

	numbers.forEach((num) => {
		const split = splitUTF32Number(num);
		if (split) {
			// UTF-32 number: two chunks
			items.push(
				createUTF16EmojiRegexItem([split[0]]),
				createUTF16EmojiRegexItem([split[1]])
			);
			return;
		}

		// UTF-16 number
		const item = createUTF16EmojiRegexItem([num]);
		const isOptional = optionalVariations && num === vs16Emoji;
		items.push(isOptional ? createOptionalEmojiRegexItem(item) : item);
	});

	if (items.length === 1) {
		// Nothing to combine
		return items[0];
	}

	const result = createSequenceEmojiRegexItem(items);
	if (numbers.length === 1 && items[0].type === 'utf16') {
		// Sequence represents one number that was split in chunks
		result.numbers = [...numbers];
	}
	return result;
}
/**
 * Attempt to optimise numbers in a set
 *
 * Items that represent plain numbers (have `numbers`, directly or inside
 * optional wrapper) are merged: all mandatory numbers in one item, all
 * optional numbers in another. Mandatory numbers that are also optional
 * are dropped from mandatory list. Other items are kept as is.
 *
 * Returns the same set if there are less than 2 items to merge. Result
 * is not a set if only one item remains after merging.
 */
export function optimiseNumbersSet(set: SetEmojiItemRegex): EmojiItemRegex {
	const mandatoryItems: EmojiItemRegex[] = [];
	let mandatoryNumbers: number[] = [];

	const optionalItems: EmojiItemRegex[] = [];
	let optionalNumbers: number[] = [];

	// Items that cannot be merged
	const remaining: EmojiItemRegex[] = [];

	for (const item of set.sets) {
		const isOptional = item.type === 'optional';
		const source = item.type === 'optional' ? item.item : item;
		const itemNumbers = source.numbers;

		if (!itemNumbers) {
			remaining.push(item);
		} else if (isOptional) {
			optionalItems.push(item);
			optionalNumbers = optionalNumbers.concat(itemNumbers);
		} else {
			mandatoryItems.push(item);
			mandatoryNumbers = mandatoryNumbers.concat(itemNumbers);
		}
	}

	// Check if there is something to optimise
	if (mandatoryItems.length + optionalItems.length < 2) {
		return set;
	}

	// Optional match already covers the same number as mandatory match
	const optionalLookup = new Set(optionalNumbers);
	const uniqueMandatory = mandatoryNumbers.filter(
		(number) => !optionalLookup.has(number)
	);
	const foundMatches = uniqueMandatory.length !== mandatoryNumbers.length;

	// Mandatory numbers
	if (mandatoryItems.length) {
		if (!foundMatches && mandatoryItems.length === 1) {
			// 1 unchanged item
			remaining.push(mandatoryItems[0]);
		} else if (uniqueMandatory.length) {
			// Merge items
			remaining.push(createEmojiRegexItemForNumbers(uniqueMandatory));
		}
	}

	// Optional numbers
	if (optionalItems.length === 1) {
		remaining.push(optionalItems[0]);
	} else if (optionalItems.length > 1) {
		remaining.push(
			createOptionalEmojiRegexItem(
				createEmojiRegexItemForNumbers(optionalNumbers)
			)
		);
	}

	return remaining.length === 1
		? remaining[0]
		: createSetEmojiRegexItem(remaining);
}

View File

@ -0,0 +1,372 @@
import {
cloneEmojiRegexItem,
createOptionalEmojiRegexItem,
createSequenceEmojiRegexItem,
createSetEmojiRegexItem,
EmojiItemRegex,
SetEmojiItemRegex,
} from './base';
import { optimiseNumbersSet } from './numbers';
// Side of sequence where common chunk is located
type SlicePosition = 'start' | 'end';
// Position where sequence is split in common and different parts,
// 'full' if the whole item is the common part
type SliceValue = number | 'full';
/**
 * Slice of sequence: identifies one item and where its common part ends
 * (for 'start' slices) or starts (for 'end' slices)
 */
interface SimilarRegexItemSlice {
// Index of item in sequences list
index: number;
// Start (for 'end' slices) or end (for 'start' slices) of slice
// 'full' if nothing to slice
slice: SliceValue;
}
/**
 * Similar sequence: list of items that share the same regex at one side
 */
interface SimilarRegexItemSequence {
// Where common part is found
// Common chunks can exist only at start or end of sequence, not in middle
type: SlicePosition;
// Slices, one entry per matching item. Each entry contains index of item
// in items list and start (for 'end' slices) or end (for 'start' slices)
// of slice, 'full' for full items
slices: SimilarRegexItemSlice[];
}
/**
 * Result of findSimilarRegexItemSequences()
 */
interface SimilarRegexItemSequenceResult {
// Replacement score: how many characters will be saved by merging items
score: number;
// Sequences that match it. All sequences in list have the same score
sequences: SimilarRegexItemSequence[];
}
/**
 * Exhaustiveness helper for `switch` statements over union types
 *
 * Accepts only values of type `never`, so the compiler reports an error
 * when a union member is not handled. Does nothing at run time.
 */
// eslint-disable-next-line @typescript-eslint/no-unused-vars
function assertNever(v: never): void {
	// Intentionally empty: exists only for the compile-time check
}
/**
 * Find similar item sequences
 *
 * Looks for regex chunks shared by several items, either at the start or
 * at the end of item. Score of chunk is length of its regex multiplied by
 * number of items that can be removed by merging (one item always remains).
 *
 * Returns sequence(s) with highest score. Only one of results should be
 * applied to items. If there are multiple sequences, clone items list,
 * attempt to apply each sequence, run further optimisations on each fork
 * and see which one returns better result.
 *
 * Returns undefined if no common sequences found.
 * Throws if list contains a set.
 */
export function findSimilarRegexItemSequences(
	items: EmojiItemRegex[]
): SimilarRegexItemSequenceResult | undefined {
	interface MapItem {
		score: number;
		slices: SimilarRegexItemSlice[];
	}
	type ChunksMap = Record<string, MapItem>;

	// Chunks found at start and at end of items. Key = regex of chunk
	const startRegex = Object.create(null) as ChunksMap;
	const endRegex = Object.create(null) as ChunksMap;

	const register = (
		target: ChunksMap,
		index: number,
		regex: string,
		slice: SliceValue
	): void => {
		const entry: SimilarRegexItemSlice = { index, slice };
		const existing = target[regex];
		if (existing) {
			// Each additional match saves one copy of regex
			existing.score += regex.length;
			existing.slices.push(entry);
		} else {
			// First match scores 0: one item will remain after replacement
			target[regex] = {
				score: 0,
				slices: [entry],
			};
		}
	};

	// Collect all possible chunks
	items.forEach((baseItem, index) => {
		switch (baseItem.type) {
			case 'set':
				throw new Error('Unexpected set within a set');

			case 'optional':
			case 'utf16':
			case 'sequence': {
				// Whole item
				register(startRegex, index, baseItem.regex, 'full');
				register(endRegex, index, baseItem.regex, 'full');

				if (baseItem.type === 'sequence') {
					// Every possible split of sequence in 2 parts
					const children = baseItem.items;
					for (let cut = 1; cut < children.length; cut++) {
						const head = createSequenceEmojiRegexItem(
							children.slice(0, cut)
						);
						register(startRegex, index, head.regex, cut);

						const tail = createSequenceEmojiRegexItem(
							children.slice(cut)
						);
						register(endRegex, index, tail.regex, cut);
					}
				}
				break;
			}

			default:
				assertNever(baseItem);
		}
	});

	// Pick chunks with highest score
	let best: SimilarRegexItemSequenceResult | undefined;
	const consider = (target: ChunksMap, type: SlicePosition): void => {
		Object.keys(target).forEach((regex) => {
			const { score, slices } = target[regex];
			if (!score) {
				// Found only once
				return;
			}

			const candidate: SimilarRegexItemSequence = { type, slices };
			if (!best || best.score < score) {
				// New highest score
				best = {
					score,
					sequences: [candidate],
				};
			} else if (best.score === score) {
				// Same score
				best.sequences.push(candidate);
			}
		});
	};
	consider(startRegex, 'start');
	consider(endRegex, 'end');

	return best;
}
/**
 * Merge similar sequences
 *
 * Replaces items listed in `merge.slices` with one sequence that contains
 * the common chunk and a set of different chunks. The set is added after
 * the common chunk for 'start' merges, before it for 'end' merges. If at
 * least one of items is a full match, the set is made optional.
 *
 * Accepts callback to run optimisation on created subset. Callback is not
 * called when subset contains only 1 item.
 *
 * @param items Items of parent set
 * @param merge One of sequences returned by findSimilarRegexItemSequences()
 * @param optimise Optional callback to optimise set of different chunks
 * @returns New list: merged item followed by items that were not merged
 * @throws If slices do not match items
 */
export function mergeSimilarRegexItemSequences(
	items: EmojiItemRegex[],
	merge: SimilarRegexItemSequence,
	optimise?: (set: SetEmojiItemRegex) => EmojiItemRegex
): EmojiItemRegex[] {
	const { type, slices } = merge;

	// Get common chunks
	const indexes: Set<number> = new Set();
	let hasFullSequence = false;
	let longestMatch = 0;
	let longestMatchIndex = -1;
	const differentSequences: EmojiItemRegex[][] = [];

	for (let i = 0; i < slices.length; i++) {
		const { index, slice } = slices[i];
		const item = items[index];

		// Number of items in common chunk
		let length: number;
		if (slice === 'full') {
			// Full match
			hasFullSequence = true;
			length = item.type === 'sequence' ? item.items.length : 1;
		} else {
			if (item.type !== 'sequence') {
				throw new Error(
					`Unexpected partial match for type "${item.type}"`
				);
			}

			// For 'start' slices value is the end of common chunk, which
			// equals its length. For 'end' slices value is the start of
			// common chunk, so length is whatever follows it.
			length = type === 'start' ? slice : item.items.length - slice;

			// Copy remaining chunks
			differentSequences.push(
				type === 'start'
					? item.items.slice(slice)
					: item.items.slice(0, slice)
			);
		}

		if (length > longestMatch) {
			longestMatchIndex = index;
			longestMatch = length;
		}
		indexes.add(index);
	}

	// Found common chunk
	if (longestMatch < 1 || longestMatchIndex < 0) {
		throw new Error('Cannot find common sequence');
	}

	// Get longest common item as sequence
	const commonItem = items[longestMatchIndex];
	let sequence: EmojiItemRegex[];
	if (commonItem.type !== 'sequence') {
		// Full match
		if (longestMatch !== 1) {
			throw new Error(
				'Something went wrong. Cannot have long match in non-sequence'
			);
		}
		sequence = [commonItem];
	} else {
		// Sequence: take common chunk from the correct side. For 'end'
		// merges it is the last `longestMatch` items, which also works
		// when common item is a full match.
		sequence =
			type === 'start'
				? commonItem.items.slice(0, longestMatch)
				: commonItem.items.slice(
						commonItem.items.length - longestMatch
				  );
	}

	// Merge other chunks
	const setItems: EmojiItemRegex[] = [];
	for (let i = 0; i < differentSequences.length; i++) {
		const list = differentSequences[i];
		if (list.length === 1) {
			// 1 item
			setItems.push(list[0]);
		} else {
			// create sequence
			setItems.push(createSequenceEmojiRegexItem(list));
		}
	}

	// Create set, optimise it, make it optional
	const set = createSetEmojiRegexItem(setItems);
	let mergedChunk: EmojiItemRegex =
		set.sets.length === 1
			? // Do not run callback if only 1 item
			  set.sets[0]
			: optimise
			? // Run callback to optimise it
			  optimise(set)
			: // Use set as is
			  set;
	if (hasFullSequence) {
		// Wrap in optional
		mergedChunk = createOptionalEmojiRegexItem(mergedChunk);
	}

	// Add set to sequence
	sequence[type === 'start' ? 'push' : 'unshift'](mergedChunk);

	// Create result by combining merged item and remaining items
	const results: EmojiItemRegex[] = [
		createSequenceEmojiRegexItem(sequence),
		...items.filter((_item, index) => !indexes.has(index)),
	];

	return results;
}
/**
 * Merge similar items
 *
 * First merges plain numbers, then repeatedly looks for common chunks and
 * merges items that share them, until nothing can be merged or result is
 * no longer a set. When several merges have the same score, each one is
 * applied to a copy of items and the one that generates the shortest
 * regex is used (first one wins if length is the same).
 */
export function mergeSimilarItemsInSet(set: SetEmojiItemRegex): EmojiItemRegex {
	// Check for numbers
	const optimised = optimiseNumbersSet(set);
	if (optimised.type !== 'set') {
		return optimised;
	}

	let current: SetEmojiItemRegex = optimised;
	for (;;) {
		// Attempt to find common stuff
		const merges = findSimilarRegexItemSequences(current.sets);
		if (!merges) {
			// Nothing left to merge
			return current;
		}

		// Try all merges, find the best one. List with one merge is
		// handled by the same code: that merge is always the best one
		let shortest: EmojiItemRegex | undefined;
		for (const candidate of merges.sequences) {
			const merged = mergeSimilarRegexItemSequences(
				current.sets.map((item) => cloneEmojiRegexItem(item, true)),
				candidate,
				mergeSimilarItemsInSet
			);

			const mergedItem =
				merged.length === 1
					? merged[0]
					: createSetEmojiRegexItem(merged);
			if (!shortest || mergedItem.regex.length < shortest.regex.length) {
				shortest = mergedItem;
			}
		}

		if (!shortest) {
			throw new Error('Empty sequences list');
		}
		if (shortest.type !== 'set') {
			// No longer a set
			return shortest;
		}

		// New set
		current = shortest;
	}
}

View File

@ -0,0 +1,182 @@
import {
createOptionalEmojiRegexItem,
createSequenceEmojiRegexItem,
createSetEmojiRegexItem,
createUTF16EmojiRegexItem,
EmojiItemRegex,
} from './base';
import { splitEmojiSequences } from '../cleanup';
import { convertEmojiSequenceToUTF32 } from '../convert';
import { createRegexForNumbersSequence } from './numbers';
import { joinerEmoji } from '../data';
import { mergeSimilarItemsInSet } from './similar';
/**
 * Tree item: one chunk of emoji sequence with chunks that can follow it
 */
interface TreeItem {
// Regex
regex: EmojiItemRegex;
// True if end of sequence. If children are set, it means children are optional
end?: true;
// Child elements, separated with 0x200d
children?: TreeItem[];
}
/**
 * Create tree
 *
 * Each sequence is converted with convertEmojiSequenceToUTF32(), split in
 * chunks with splitEmojiSequences(), then chunks are added to tree one
 * level per chunk. Chunks with identical regex share the same tree item.
 * Item for the last chunk of sequence is marked with `end`.
 */
export function createEmojisTree(sequences: number[][]): TreeItem[] {
	const root: TreeItem[] = [];

	sequences.forEach((sequence) => {
		// Convert to UTF-32 and split
		const chunks = splitEmojiSequences(
			convertEmojiSequenceToUTF32(sequence)
		);
		const lastPosition = chunks.length - 1;

		// Walk down the tree, starting from root
		let level = root;
		chunks.forEach((chunk, position) => {
			const regex = createRegexForNumbersSequence(chunk);

			// Reuse item with the same regex, create new one if missing
			let node: TreeItem | undefined = level.find(
				(item) => item.regex.regex === regex.regex
			);
			if (!node) {
				node = {
					regex,
				};
				level.push(node);
			}

			if (position === lastPosition) {
				// End of sequence
				node.end = true;
				return;
			}

			// Continue with children
			if (!node.children) {
				node.children = [];
			}
			level = node.children;
		});
	});

	return root;
}
/**
 * Parse tree
 *
 * Converts tree to one regex item. Tree is parsed from leaves to root: on
 * each level items that have identical regex for children (and the same
 * `end` state) are merged, so children are added to regex only once.
 * Item and its children are separated by `joinerEmoji`; children of item
 * that is also the end of sequence are optional.
 */
export function parseEmojiTree(items: TreeItem[]): EmojiItemRegex {
	interface ParsedTreeItem {
		// Regex
		regex: EmojiItemRegex;
		// True if end of sequence. If children are set, it means children are optional
		end: boolean;
		// Regex for merged child elements
		children?: EmojiItemRegex;
	}

	/**
	 * Merge items of one level in one regex item
	 */
	function mergeParsedChildren(items: ParsedTreeItem[]): EmojiItemRegex {
		const parsedItems: EmojiItemRegex[] = [];

		// Items with children, grouped by regex of children.
		// Separate maps for different values of 'end'
		type TreeItemsMap = Record<string, Required<ParsedTreeItem>[]>;
		const mapWithoutEnd = Object.create(null) as TreeItemsMap;
		const mapWithEnd = Object.create(null) as TreeItemsMap;

		items.forEach((item) => {
			if (!item.children) {
				// Nothing to parse
				parsedItems.push(item.regex);
				return;
			}

			const fullItem = item as Required<ParsedTreeItem>;
			const target = fullItem.end ? mapWithEnd : mapWithoutEnd;
			const key = fullItem.children.regex;
			const list = target[key];
			if (list) {
				list.push(fullItem);
			} else {
				target[key] = [fullItem];
			}
		});

		// Parse all groups
		for (const source of [mapWithEnd, mapWithoutEnd]) {
			Object.keys(source).forEach((key) => {
				const group = source[key];
				const firstItem = group[0];

				// Common chunk: joiner + children
				const joined: EmojiItemRegex[] = [
					createUTF16EmojiRegexItem([joinerEmoji]),
					firstItem.children,
				];
				const childSequence: EmojiItemRegex[] = firstItem.end
					? // Sequence can end before joiner: make it optional
					  [
							createOptionalEmojiRegexItem(
								createSequenceEmojiRegexItem(joined)
							),
					  ]
					: joined;

				// Different chunk: regex of items
				const mergedRegex: EmojiItemRegex =
					group.length === 1
						? // No matches
						  firstItem.regex
						: // Merge items
						  mergeSimilarItemsInSet(
								createSetEmojiRegexItem(
									group.map((entry) => entry.regex)
								)
						  );

				// Merge
				parsedItems.push(
					createSequenceEmojiRegexItem([
						mergedRegex,
						...childSequence,
					])
				);
			});
		}

		// Merge sequences
		return parsedItems.length === 1
			? parsedItems[0]
			: mergeSimilarItemsInSet(createSetEmojiRegexItem(parsedItems));
	}

	/**
	 * Parse item and, recursively, its children
	 */
	function parseItemChildren(item: TreeItem): ParsedTreeItem {
		const result: ParsedTreeItem = {
			regex: item.regex,
			end: !!item.end,
		};

		const children = item.children;
		if (children) {
			result.children = mergeParsedChildren(
				children.map(parseItemChildren)
			);
		}
		return result;
	}

	// Parse all items
	return mergeParsedChildren(items.map(parseItemChildren));
}

View File

@ -0,0 +1,212 @@
import {
createOptionalEmojiRegexItem,
createSequenceEmojiRegexItem,
createSetEmojiRegexItem,
createUTF16EmojiRegexItem,
} from '../lib/emoji/regex/base';
// Tests for functions that create regex items: UTF-16 numbers, sequences,
// sets, optional items and their combinations. Each test checks the full
// generated item, including `regex`, `group` and `numbers`.
describe('Creating chunks of regex', () => {
it('UTF-16 numbers', () => {
// Number
expect(createUTF16EmojiRegexItem([0x2763])).toEqual({
type: 'utf16',
regex: '\\u2763',
numbers: [0x2763],
group: true,
});
// Range: 3 consecutive numbers are collapsed, numbers list is sorted
expect(createUTF16EmojiRegexItem([0x2762, 0x2764, 0x2763])).toEqual({
type: 'utf16',
regex: '[\\u2762-\\u2764]',
numbers: [0x2762, 0x2763, 0x2764],
group: true,
});
// Separate numbers
expect(createUTF16EmojiRegexItem([0x2760, 0x2764, 0xfe0f])).toEqual({
type: 'utf16',
regex: '[\\u2760\\u2764\\uFE0F]',
numbers: [0x2760, 0x2764, 0xfe0f],
group: true,
});
// Ranges + numbers, duplicate item
// Duplicate is removed from regex, but stays in numbers list.
// 2 consecutive numbers are not collapsed to range
expect(
createUTF16EmojiRegexItem([
0x2760, 0x2762, 0x2761, 0x2765, 0x2763, 0xfe0f, 0xfe0f, 0xfe0e,
0x2000, 0x2001, 0x2100, 0x2102, 0x2101,
])
).toEqual({
type: 'utf16',
regex: '[\\u2000\\u2001\\u2100-\\u2102\\u2760-\\u2763\\u2765\\uFE0E\\uFE0F]',
numbers: [
0x2000, 0x2001, 0x2100, 0x2101, 0x2102, 0x2760, 0x2761, 0x2762,
0x2763, 0x2765, 0xfe0e, 0xfe0f, 0xfe0f,
],
group: true,
});
});
it('Sequence from numbers', () => {
const num1 = createUTF16EmojiRegexItem([0x2000, 0x2001]);
const num2 = createUTF16EmojiRegexItem([0x2000, 0x2100]);
// 1 item: sequence inherits group and numbers from that item
expect(createSequenceEmojiRegexItem([num1])).toEqual({
type: 'sequence',
regex: '[\\u2000\\u2001]',
numbers: [0x2000, 0x2001],
items: [num1],
group: true,
});
// 2 numbers
expect(createSequenceEmojiRegexItem([num1, num2])).toEqual({
type: 'sequence',
regex: '[\\u2000\\u2001][\\u2000\\u2100]',
items: [num1, num2],
group: false,
});
});
it('Sets from numbers', () => {
const num1 = createUTF16EmojiRegexItem([0x2000, 0x2001]);
const num2 = createUTF16EmojiRegexItem([0x2000, 0x2100]);
// 1 item
expect(createSetEmojiRegexItem([num1])).toEqual({
type: 'set',
regex: '[\\u2000\\u2001]',
numbers: [0x2000, 0x2001],
sets: [num1],
group: true,
});
// 2 numbers: numbers lists are concatenated, duplicates are kept
expect(createSetEmojiRegexItem([num1, num2])).toEqual({
type: 'set',
regex: '[\\u2000\\u2001]|[\\u2000\\u2100]',
numbers: [0x2000, 0x2001, 0x2000, 0x2100],
sets: [num1, num2],
group: false,
});
});
it('Optional numbers', () => {
const num1 = createUTF16EmojiRegexItem([0xfe0f]);
const num2 = createUTF16EmojiRegexItem([0xfe0e, 0xfe0f]);
// simple item
expect(createOptionalEmojiRegexItem(num1)).toEqual({
type: 'optional',
regex: '\\uFE0F?',
item: num1,
group: true,
});
// 2 numbers
expect(createOptionalEmojiRegexItem(num2)).toEqual({
type: 'optional',
regex: '[\\uFE0E\\uFE0F]?',
item: num2,
group: true,
});
});
it('Sequence', () => {
const num1 = createUTF16EmojiRegexItem([0x2000, 0x2001]);
const num2 = createUTF16EmojiRegexItem([0x2000, 0x2100]);
const fe0f = createOptionalEmojiRegexItem(
createUTF16EmojiRegexItem([0xfe0f])
);
// optional item: numbers are not inherited from optional items
expect(createSequenceEmojiRegexItem([fe0f])).toEqual({
type: 'sequence',
regex: '\\uFE0F?',
items: [fe0f],
group: true,
});
const seq1 = createSequenceEmojiRegexItem([num1, fe0f]);
expect(seq1).toEqual({
type: 'sequence',
regex: '[\\u2000\\u2001]\\uFE0F?',
items: [num1, fe0f],
group: false,
});
// number + optional item + number
expect(createSequenceEmojiRegexItem([num1, fe0f, num2])).toEqual({
type: 'sequence',
regex: '[\\u2000\\u2001]\\uFE0F?[\\u2000\\u2100]',
items: [num1, fe0f, num2],
group: false,
});
// number + nested sequence: nested sequence is flattened
expect(createSequenceEmojiRegexItem([num2, seq1])).toEqual({
type: 'sequence',
regex: '[\\u2000\\u2100][\\u2000\\u2001]\\uFE0F?',
items: [num2, num1, fe0f],
group: false,
});
});
it('Mix', () => {
const num1 = createUTF16EmojiRegexItem([
0x1234, 0x1235, 0x1236, 0x1237,
]);
// UTF-32
const utf32a1 = createUTF16EmojiRegexItem([0xd83d]);
const utf32a2 = createUTF16EmojiRegexItem([0xdc9a]);
const utf32a = createSequenceEmojiRegexItem([utf32a1, utf32a2]);
expect(utf32a).toEqual({
type: 'sequence',
regex: '\\uD83D\\uDC9A',
items: [utf32a1, utf32a2],
group: false,
});
// Mark sequence as representation of one number, used by set below
utf32a.numbers = [0x1f49a];
// Make it optional: sequence is not a group, so it is wrapped
expect(createOptionalEmojiRegexItem(utf32a)).toEqual({
type: 'optional',
regex: '(?:\\uD83D\\uDC9A)?',
item: utf32a,
group: true,
});
// Set of numbers
const set = createSetEmojiRegexItem([num1, utf32a]);
expect(set).toEqual({
type: 'set',
regex: '[\\u1234-\\u1237]|\\uD83D\\uDC9A',
sets: [num1, utf32a],
numbers: [0x1234, 0x1235, 0x1236, 0x1237, 0x1f49a],
group: false,
});
// Make it optional
expect(createOptionalEmojiRegexItem(set)).toEqual({
type: 'optional',
regex: '(?:[\\u1234-\\u1237]|\\uD83D\\uDC9A)?',
item: set,
group: true,
});
// Sequence with set: set is not a group, so it is wrapped
const utf16a = createUTF16EmojiRegexItem([0x2000]);
const utf16b = createUTF16EmojiRegexItem([0x2100]);
const utf16c = createUTF16EmojiRegexItem([0x2101]);
const set1 = createSetEmojiRegexItem([utf16b, utf16c]);
expect(createSequenceEmojiRegexItem([utf16a, set1])).toEqual({
type: 'sequence',
regex: '\\u2000(?:\\u2100|\\u2101)',
items: [utf16a, set1],
group: false,
});
});
});

View File

@ -0,0 +1,198 @@
import {
createOptionalEmojiRegexItem,
createSetEmojiRegexItem,
createUTF16EmojiRegexItem,
} from '../lib/emoji/regex/base';
import {
createEmojiRegexItemForNumbers,
createRegexForNumbersSequence,
optimiseNumbersSet,
} from '../lib/emoji/regex/numbers';
describe('Creating chunks of regex for numbers', () => {
it('Numbers', () => {
	// UTF-16: one number
	const singleUTF16 = createEmojiRegexItemForNumbers([0x2763]);
	expect(singleUTF16).toEqual({
		type: 'utf16',
		regex: '\\u2763',
		numbers: [0x2763],
		group: true,
	});

	// UTF-16: unsorted input; expected numbers are sorted and the
	// consecutive run is written as a range
	const unsortedUTF16 = createEmojiRegexItemForNumbers([
		0x2761, 0x2765, 0x2764, 0x2763,
	]);
	expect(unsortedUTF16).toEqual({
		type: 'utf16',
		regex: '[\\u2761\\u2763-\\u2765]',
		numbers: [0x2761, 0x2763, 0x2764, 0x2765],
		group: true,
	});

	// UTF-32: one number becomes a sequence of two surrogates
	const singleUTF32 = createEmojiRegexItemForNumbers([0x1f49a]);
	expect(singleUTF32).toEqual({
		type: 'sequence',
		regex: '\\uD83D\\uDC9A',
		items: [
			{
				type: 'utf16',
				regex: '\\uD83D',
				numbers: [0xd83d],
				group: true,
			},
			{
				type: 'utf16',
				regex: '\\uDC9A',
				numbers: [0xdc9a],
				group: true,
			},
		],
		numbers: [0x1f49a],
		group: false,
	});

	// Both first surrogates share the same second-surrogate range
	const sameRanges = createEmojiRegexItemForNumbers([
		0x1f49a, 0x1f49c, 0x1f49b, 0x1f89a, 0x1f89b, 0x1f89c,
	]);
	// Child items are dropped so only the summary fields are compared
	delete (sameRanges as unknown as Record<string, unknown>).items;
	expect(sameRanges).toEqual({
		type: 'sequence',
		regex: '[\\uD83D\\uD83E][\\uDC9A-\\uDC9C]',
		numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89c],
		group: false,
	});

	// Second-surrogate ranges differ, so a set of alternatives is expected
	const differentRanges = createEmojiRegexItemForNumbers([
		0x1f49a, 0x1f49c, 0x1f49b, 0x1f89a, 0x1f89b, 0x1f89e,
	]);
	// Child sets are dropped so only the summary fields are compared
	delete (differentRanges as unknown as Record<string, unknown>).sets;
	expect(differentRanges).toEqual({
		type: 'set',
		regex: '\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]',
		numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89e],
		group: false,
	});

	// UTF-16 and UTF-32 numbers in one call
	const mixed = createEmojiRegexItemForNumbers([
		0x2763, 0x2765, 0x1f49a, 0x1f49c, 0x1f49b, 0x1f89a, 0x1f89b, 0x1f89e,
		0x2764,
	]);
	delete (mixed as unknown as Record<string, unknown>).sets;
	expect(mixed).toEqual({
		type: 'set',
		regex: '[\\u2763-\\u2765]|\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]',
		numbers: [
			0x2763, 0x2764, 0x2765, 0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a,
			0x1f89b, 0x1f89e,
		],
		group: false,
	});
});
it('Numbers sequence', () => {
	// A lone UTF-16 number is returned as a plain UTF-16 item, not a sequence
	const loneUTF16 = createRegexForNumbersSequence([0x2763]);
	expect(loneUTF16).toEqual(createUTF16EmojiRegexItem([0x2763]));

	// A lone UTF-32 number becomes a sequence of its two surrogates
	const loneUTF32 = createRegexForNumbersSequence([0x1f49a]);
	expect(loneUTF32).toEqual({
		type: 'sequence',
		regex: '\\uD83D\\uDC9A',
		numbers: [0x1f49a],
		items: [
			createUTF16EmojiRegexItem([0xd83d]),
			createUTF16EmojiRegexItem([0xdc9a]),
		],
		group: false,
	});

	// With default arguments the trailing FE0F is expected to be optional
	const optionalVariation = createRegexForNumbersSequence([0x1f49a, 0xfe0f]);
	expect(optionalVariation).toEqual({
		type: 'sequence',
		regex: '\\uD83D\\uDC9A\\uFE0F?',
		items: [
			createUTF16EmojiRegexItem([0xd83d]),
			createUTF16EmojiRegexItem([0xdc9a]),
			createOptionalEmojiRegexItem(createUTF16EmojiRegexItem([0xfe0f])),
		],
		group: false,
	});

	// Passing `false` as second argument: FE0F is expected to stay mandatory
	const mandatoryVariation = createRegexForNumbersSequence(
		[0x1f49a, 0xfe0f],
		false
	);
	expect(mandatoryVariation).toEqual({
		type: 'sequence',
		regex: '\\uD83D\\uDC9A\\uFE0F',
		items: [
			createUTF16EmojiRegexItem([0xd83d]),
			createUTF16EmojiRegexItem([0xdc9a]),
			createUTF16EmojiRegexItem([0xfe0f]),
		],
		group: false,
	});

	// FE0F on its own: expected to be a bare optional item
	const variationOnly = createRegexForNumbersSequence([0xfe0f]);
	expect(variationOnly).toEqual(
		createOptionalEmojiRegexItem(createUTF16EmojiRegexItem([0xfe0f]))
	);
});
it('Optimising set', () => {
	// Case 1: mandatory and optional numbers do not overlap
	const distinctInput = createSetEmojiRegexItem([
		// Mandatory
		createUTF16EmojiRegexItem([0x2000]),
		createUTF16EmojiRegexItem([0x2001]),
		createEmojiRegexItemForNumbers([0x1f932]),
		// Optional
		createOptionalEmojiRegexItem(createUTF16EmojiRegexItem([0x2100])),
		createOptionalEmojiRegexItem(createEmojiRegexItemForNumbers([0x1f91d])),
	]);
	const distinctResult = optimiseNumbersSet(distinctInput);
	// Expected: one optional group and one mandatory group
	const distinctExpected = createSetEmojiRegexItem([
		createOptionalEmojiRegexItem(
			createEmojiRegexItemForNumbers([0x1f91d, 0x2100])
		),
		createEmojiRegexItemForNumbers([0x2000, 0x2001, 0x1f932]),
	]);
	expect(distinctResult).toEqual(distinctExpected);

	// Case 2: 0x2001 appears both as mandatory and as optional
	const overlappingInput = createSetEmojiRegexItem([
		// Mandatory
		createUTF16EmojiRegexItem([0x2000]),
		createUTF16EmojiRegexItem([0x2001]),
		createEmojiRegexItemForNumbers([0x1f932]),
		// Optional
		createOptionalEmojiRegexItem(
			createUTF16EmojiRegexItem([0x2001, 0x2002])
		),
		createOptionalEmojiRegexItem(createEmojiRegexItemForNumbers([0x1f91d])),
	]);
	const overlappingResult = optimiseNumbersSet(overlappingInput);
	// Expected: the duplicate is kept only in the optional group
	const overlappingExpected = createSetEmojiRegexItem([
		createOptionalEmojiRegexItem(
			createEmojiRegexItemForNumbers([0x1f91d, 0x2001, 0x2002])
		),
		createEmojiRegexItemForNumbers([0x2000, 0x1f932]),
	]);
	expect(overlappingResult).toEqual(overlappingExpected);
});
});

View File

@ -0,0 +1,443 @@
/* eslint-disable @typescript-eslint/no-non-null-assertion */
import { splitUTF32Number } from '../lib/emoji/convert';
import {
createOptionalEmojiRegexItem,
createSequenceEmojiRegexItem,
createSetEmojiRegexItem,
createUTF16EmojiRegexItem,
SequenceEmojiItemRegex,
} from '../lib/emoji/regex/base';
import {
createEmojiRegexItemForNumbers,
createRegexForNumbersSequence,
} from '../lib/emoji/regex/numbers';
import {
findSimilarRegexItemSequences,
mergeSimilarItemsInSet,
mergeSimilarRegexItemSequences,
} from '../lib/emoji/regex/similar';
describe('Similar chunks of regex', () => {
it('Nothing in common', () => {
	// A single item has nothing to be compared with
	const singleItem = [createRegexForNumbersSequence([0x1234, 0x2345])];
	expect(findSimilarRegexItemSequences(singleItem)).toBeUndefined();

	// Same number, once mandatory and once optional
	const mandatoryAndOptional = [
		createEmojiRegexItemForNumbers([0x1234]),
		createOptionalEmojiRegexItem(createEmojiRegexItemForNumbers([0x1234])),
	];
	expect(findSimilarRegexItemSequences(mandatoryAndOptional)).toBeUndefined();

	// Shared number sits in the middle of the sequence, not at either end
	const matchInMiddle = [
		createEmojiRegexItemForNumbers([0x1234]),
		createRegexForNumbersSequence([0x1230, 0x1234, 0x1235]),
	];
	expect(findSimilarRegexItemSequences(matchInMiddle)).toBeUndefined();
});
it('Simple match', () => {
	const items = [
		createEmojiRegexItemForNumbers([0x1234]),
		createRegexForNumbersSequence([0x1234, 0x1235]),
		createRegexForNumbersSequence([0xfe0f]),
	];

	// Items 0 and 1 share their first element; item 2 is unrelated
	const expectedMerge = {
		score: 6,
		sequences: [
			{
				type: 'start',
				slices: [
					{ index: 0, slice: 'full' },
					{ index: 1, slice: 1 },
				],
			},
		],
	};
	const merge = findSimilarRegexItemSequences(items);
	expect(merge).toEqual(expectedMerge);

	const sequence = merge?.sequences[0];
	if (!sequence) {
		throw new Error('Unexpected undefined sequence');
	}

	// Apply the merge and wrap the resulting items in a set
	const mergedItems = mergeSimilarRegexItemSequences(items, sequence);
	const set = createSetEmojiRegexItem(mergedItems);

	// Expected: shared start followed by the optional remainder, plus item 2
	const expectedMerged = createSequenceEmojiRegexItem([
		items[0],
		createOptionalEmojiRegexItem(createUTF16EmojiRegexItem([0x1235])),
	]);
	expect(set).toEqual({
		type: 'set',
		regex: '\\u1234\\u1235?|\\uFE0F?',
		sets: [expectedMerged, items[2]],
		group: false,
	});
});
it('Range of numbers', () => {
	// Same first emoji, five consecutive second emojis
	const secondNumbers = [0x1f3fb, 0x1f3fc, 0x1f3fd, 0x1f3fe, 0x1f3ff];
	const items = secondNumbers.map((second) =>
		createRegexForNumbersSequence([0x1f91d, second])
	);

	// Every item is expected to share its first 3 UTF-16 units
	const merge = findSimilarRegexItemSequences(items);
	expect(merge).toEqual({
		score: 72,
		sequences: [
			{
				type: 'start',
				slices: items.map((_item, index) => ({ index, slice: 3 })),
			},
		],
	});

	const sequence = merge?.sequences[0];
	if (!sequence) {
		throw new Error('Unexpected undefined sequence');
	}

	// Apply the merge, passing the set optimiser for the differing tails
	const mergedItems = mergeSimilarRegexItemSequences(
		items,
		sequence,
		mergeSimilarItemsInSet
	);
	const set = createSetEmojiRegexItem(mergedItems);

	// Shared prefix taken from the first item
	const commonChunk = (items[0] as SequenceEmojiItemRegex).items.slice(0, 3);
	const expectedSequence = createSequenceEmojiRegexItem([
		...commonChunk,
		createUTF16EmojiRegexItem([0xdffb, 0xdffc, 0xdffd, 0xdffe, 0xdfff]),
	]);
	expect(set).toEqual({
		type: 'set',
		regex: '\\uD83E\\uDD1D\\uD83C[\\uDFFB-\\uDFFF]',
		sets: [expectedSequence],
		group: false,
	});
});
it('Multiple matches', () => {
	const items = [
		createEmojiRegexItemForNumbers([0x1234]),
		createRegexForNumbersSequence([0x1234, 0x1235]),
		createEmojiRegexItemForNumbers([0x1235]),
	];

	// Item 1 shares its start with item 0 and its end with item 2
	const startMatch = {
		type: 'start',
		slices: [
			{ index: 0, slice: 'full' },
			{ index: 1, slice: 1 },
		],
	};
	const endMatch = {
		type: 'end',
		slices: [
			{ index: 1, slice: 1 },
			{ index: 2, slice: 'full' },
		],
	};
	const merge = findSimilarRegexItemSequences(items);
	expect(merge).toEqual({
		score: 6,
		sequences: [startMatch, endMatch],
	});

	const sequence = merge?.sequences[0];
	if (!sequence) {
		throw new Error('Unexpected undefined sequence');
	}

	// Only the first ('start') match is applied
	const mergedItems = mergeSimilarRegexItemSequences(items, sequence);
	const set = createSetEmojiRegexItem(mergedItems);

	const expectedMerged = createSequenceEmojiRegexItem([
		items[0],
		createOptionalEmojiRegexItem(items[2]),
	]);
	expect(set).toEqual({
		type: 'set',
		regex: '\\u1234\\u1235?|\\u1235',
		sets: [expectedMerged, items[2]],
		group: false,
	});
});
it('Extra number', () => {
	// Second item is the first item plus one more UTF-32 number
	const items = [
		createRegexForNumbersSequence([0x1f64f]),
		createRegexForNumbersSequence([0x1f64f, 0x1f3fb]),
	];

	// The whole first item matches the first 2 UTF-16 units of the second
	const expectedMerge = {
		score: 12,
		sequences: [
			{
				type: 'start',
				slices: [
					{ index: 0, slice: 'full' },
					{ index: 1, slice: 2 },
				],
			},
		],
	};
	const merge = findSimilarRegexItemSequences(items);
	expect(merge).toEqual(expectedMerge);

	const sequence = merge?.sequences[0];
	if (!sequence) {
		throw new Error('Unexpected undefined sequence');
	}

	// Apply the merge
	const mergedItems = mergeSimilarRegexItemSequences(items, sequence);
	const set = createSetEmojiRegexItem(mergedItems);

	// Expected: first emoji followed by the optional surrogate pair of 0x1f3fb
	const optionalTail = createOptionalEmojiRegexItem(
		createRegexForNumbersSequence(splitUTF32Number(0x1f3fb)!)
	);
	const expectedMerged = createSequenceEmojiRegexItem([
		...items[0].items,
		optionalTail,
	]);
	expect(set).toEqual({
		type: 'set',
		regex: '\\uD83D\\uDE4F(?:\\uD83C\\uDFFB)?',
		sets: [expectedMerged],
		group: false,
	});
});
// NOTE(review): apart from its title, this case repeats the earlier
// 'Multiple matches' case verbatim. The title is made unique so the two can
// be told apart in reports; consider removing this case or changing it to
// apply the second ('end') match instead.
it('Multiple matches (repeated)', () => {
	const items = [
		createEmojiRegexItemForNumbers([0x1234]),
		createRegexForNumbersSequence([0x1234, 0x1235]),
		createEmojiRegexItemForNumbers([0x1235]),
	];

	// Item 1 shares its start with item 0 and its end with item 2
	const merge = findSimilarRegexItemSequences(items);
	expect(merge).toEqual({
		score: 6,
		sequences: [
			{
				type: 'start',
				slices: [
					{
						index: 0,
						slice: 'full',
					},
					{
						index: 1,
						slice: 1,
					},
				],
			},
			{
				type: 'end',
				slices: [
					{
						index: 1,
						slice: 1,
					},
					{
						index: 2,
						slice: 'full',
					},
				],
			},
		],
	});

	const sequence = merge?.sequences[0];
	if (!sequence) {
		throw new Error('Unexpected undefined sequence');
	}

	// Apply first merge only
	const set = createSetEmojiRegexItem(
		mergeSimilarRegexItemSequences(items, sequence)
	);
	expect(set).toEqual({
		type: 'set',
		regex: '\\u1234\\u1235?|\\u1235',
		sets: [
			createSequenceEmojiRegexItem([
				items[0],
				createOptionalEmojiRegexItem(items[2]),
			]),
			items[2],
		],
		group: false,
	});
});
it('Complex sequence', () => {
	const items = [
		// Four sequences that differ only in their last 2 numbers
		createRegexForNumbersSequence([
			0x1faf1, 0x1f3fb, 0x200d, 0x1faf2, 0x1f3fc,
		]),
		createRegexForNumbersSequence([
			0x1faf1, 0x1f3fb, 0x200d, 0x1faf1, 0x1f3fd,
		]),
		createRegexForNumbersSequence([
			0x1faf1, 0x1f3fb, 0x200d, 0x1faf1, 0x1f3fc,
		]),
		createRegexForNumbersSequence([
			0x1faf1, 0x1f3fb, 0x200d, 0x1faf2, 0x1f3fd,
		]),
		// Emoji with and without an extra number
		createRegexForNumbersSequence([0x1f64f]),
		createRegexForNumbersSequence([0x1f64f, 0x1f3fb]),
	];

	// The best match covers the first 6 UTF-16 units of the first four items
	const longIndexes = [0, 1, 2, 3];
	const merge = findSimilarRegexItemSequences(items);
	expect(merge).toEqual({
		score: 108,
		sequences: [
			{
				type: 'start',
				slices: longIndexes.map((index) => ({ index, slice: 6 })),
			},
		],
	});

	const sequence = merge?.sequences[0];
	if (!sequence) {
		throw new Error('Unexpected undefined sequence');
	}

	// Apply first merge only
	const mergedItems = mergeSimilarRegexItemSequences(items, sequence);
	const set = createSetEmojiRegexItem(mergedItems);

	// Shared prefix, taken from the first item
	const slicedSequence = (items[0] as SequenceEmojiItemRegex).items.slice(
		0,
		6
	);
	// Remainders of the first four items, after the shared prefix
	const remainders = items
		.slice(0, 4)
		.map((item) =>
			createSequenceEmojiRegexItem(
				(item as SequenceEmojiItemRegex).items.slice(6)
			)
		);
	const slicedSet = createSetEmojiRegexItem(remainders);

	// Check the alternation on its own, so the full regex below stays readable
	expect(slicedSet.regex).toBe(
		'\\uDEF1\\uD83C\\uDFFC|\\uDEF1\\uD83C\\uDFFD|\\uDEF2\\uD83C\\uDFFC|\\uDEF2\\uD83C\\uDFFD'
	);

	// Last 2 items come first (set items are sorted alphabetically), then the
	// 6 shared units followed by the grouped alternation
	const untouchedAndPrefix =
		'\\uD83D\\uDE4F|\\uD83D\\uDE4F\\uD83C\\uDFFB|\\uD83E\\uDEF1\\uD83C\\uDFFB\\u200D\\uD83E(?:';
	expect(set).toEqual({
		type: 'set',
		regex: untouchedAndPrefix + slicedSet.regex + ')',
		sets: [
			items[4],
			items[5],
			createSequenceEmojiRegexItem([...slicedSequence, slicedSet]),
		],
		group: false,
	});
});
});

View File

@ -0,0 +1,224 @@
/* eslint-disable @typescript-eslint/no-non-null-assertion */
import { getEmojiSequenceFromString } from '../lib/emoji/cleanup';
import { createRegexForNumbersSequence } from '../lib/emoji/regex/numbers';
import { createEmojisTree, parseEmojiTree } from '../lib/emoji/regex/tree';
describe('Emoji regex tree', () => {
it('Creating simple tree', () => {
	// Emoji sequences written as space separated hexadecimal strings
	const sources = [
		'1F3C1',
		'1F3F3',
		'1F3F3 FE0F',
		'1F3F4 200D 2620 FE0F',
		'1F3F4 200D 2620',
	];
	const numbers = sources.map((source) => getEmojiSequenceFromString(source));

	const tree = createEmojisTree(numbers);

	// Children of the 1F3F4 node; 200D itself does not appear in any node
	const flagChildren = [
		{
			regex: createRegexForNumbersSequence([0x2620, 0xfe0f]),
			end: true,
		},
		{
			regex: createRegexForNumbersSequence([0x2620]),
			end: true,
		},
	];
	expect(tree).toEqual([
		{
			regex: createRegexForNumbersSequence([0x1f3c1]),
			end: true,
		},
		{
			regex: createRegexForNumbersSequence([0x1f3f3]),
			end: true,
		},
		{
			regex: createRegexForNumbersSequence([0x1f3f3, 0xfe0f]),
			end: true,
		},
		{
			regex: createRegexForNumbersSequence([0x1f3f4]),
			children: flagChildren,
		},
	]);

	// Whole tree collapsed into one regular expression
	const parsed = parseEmojiTree(tree);
	expect(parsed.regex).toEqual(
		'\\uD83C(?:(?:\\uDFF3|\\uDFF4\\u200D\\u2620)\\uFE0F?|[\\uDFC1\\uDFF3])'
	);
});
it('Creating complex tree', () => {
	// Two roots (1FAF1 + 1F3FB / 1F3FC) with four joined variations each,
	// plus the first root on its own
	const sources = [
		'1FAF1 1F3FB 200D 1FAF2 1F3FC',
		'1FAF1 1F3FB 200D 1FAF2 1F3FD',
		'1FAF1 1F3FB 200D 1FAF2 1F3FE',
		'1FAF1 1F3FB 200D 1FAF2 1F3FF',
		'1FAF1 1F3FC 200D 1FAF2 1F3FB',
		'1FAF1 1F3FC 200D 1FAF2 1F3FD',
		'1FAF1 1F3FC 200D 1FAF2 1F3FE',
		'1FAF1 1F3FC 200D 1FAF2 1F3FF',
		'1FAF1 1F3FB',
	];
	const numbers = sources.map((source) => getEmojiSequenceFromString(source));

	const tree = createEmojisTree(numbers);

	// Builds the expected leaf nodes: 1FAF2 followed by each given number
	const leaves = (lastNumbers: number[]) =>
		lastNumbers.map((last) => ({
			regex: createRegexForNumbersSequence([0x1faf2, last]),
			end: true,
		}));

	expect(tree).toEqual([
		{
			// Also listed on its own in the input, so it is marked as an end
			regex: createRegexForNumbersSequence([0x1faf1, 0x1f3fb]),
			end: true,
			children: leaves([0x1f3fc, 0x1f3fd, 0x1f3fe, 0x1f3ff]),
		},
		{
			regex: createRegexForNumbersSequence([0x1faf1, 0x1f3fc]),
			children: leaves([0x1f3fb, 0x1f3fd, 0x1f3fe, 0x1f3ff]),
		},
	]);

	const parsed = parseEmojiTree(tree);
	expect(parsed.regex).toEqual(
		'\\uD83E\\uDEF1\\uD83C' +
			// depth: 1
			'(?:\\uDFFB' +
			// depth: 2
			'(?:\\u200D\\uD83E\\uDEF2\\uD83C' +
			// depth: 3
			'[\\uDFFC-\\uDFFF]' +
			// depth: 2
			')?' +
			// depth: 1
			'|\\uDFFC\\u200D\\uD83E\\uDEF2\\uD83C' +
			// depth: 2
			'[\\uDFFB\\uDFFD-\\uDFFF]' +
			// depth: 1
			')'
	);
});
it('Creating complex optimisable tree', () => {
	// Two roots (1FAF1 + 1F3FB / 1F3FC) with the same four joined variations,
	// plus both roots on their own
	const sources = [
		'1FAF1 1F3FB 200D 1FAF2 1F3FC',
		'1FAF1 1F3FB 200D 1FAF2 1F3FD',
		'1FAF1 1F3FB 200D 1FAF2 1F3FE',
		'1FAF1 1F3FB 200D 1FAF2 1F3FF',
		'1FAF1 1F3FC 200D 1FAF2 1F3FC',
		'1FAF1 1F3FC 200D 1FAF2 1F3FD',
		'1FAF1 1F3FC 200D 1FAF2 1F3FE',
		'1FAF1 1F3FC 200D 1FAF2 1F3FF',
		'1FAF1 1F3FB',
		'1FAF1 1F3FC',
	];
	const numbers = sources.map((source) => getEmojiSequenceFromString(source));

	const tree = createEmojisTree(numbers);

	// Builds the expected leaf nodes; a fresh list is created for each root
	const leaves = () =>
		[0x1f3fc, 0x1f3fd, 0x1f3fe, 0x1f3ff].map((last) => ({
			regex: createRegexForNumbersSequence([0x1faf2, last]),
			end: true,
		}));

	expect(tree).toEqual([
		{
			regex: createRegexForNumbersSequence([0x1faf1, 0x1f3fb]),
			end: true,
			children: leaves(),
		},
		{
			regex: createRegexForNumbersSequence([0x1faf1, 0x1f3fc]),
			end: true,
			children: leaves(),
		},
	]);

	// NOTE(review): the regex expectation below is disabled in the original;
	// the reason is not visible here — confirm before enabling it.
	// expect(parseEmojiTree(tree).regex).toEqual(
	// '\\uD83E\\uDEF1\\uD83C(?:\\uDFFB|\\uDFFC)(?:\\u200D\\uD83E\\uDEF2\\uD83C[\\uDFFC-\\uDFFF])?'
	// );
});
});