mirror of
https://github.com/iconify/iconify.git
synced 2024-12-13 14:13:06 +00:00
feat: functions to optimise emoji regex
This commit is contained in:
parent
ac61bc049a
commit
78ce0d0c4c
@ -102,6 +102,26 @@
|
||||
"import": "./lib/customisations/rotate.mjs",
|
||||
"types": "./lib/customisations/rotate.d.ts"
|
||||
},
|
||||
"./lib/emoji/regex/base": {
|
||||
"require": "./lib/emoji/regex/base.cjs",
|
||||
"import": "./lib/emoji/regex/base.mjs",
|
||||
"types": "./lib/emoji/regex/base.d.ts"
|
||||
},
|
||||
"./lib/emoji/regex/numbers": {
|
||||
"require": "./lib/emoji/regex/numbers.cjs",
|
||||
"import": "./lib/emoji/regex/numbers.mjs",
|
||||
"types": "./lib/emoji/regex/numbers.d.ts"
|
||||
},
|
||||
"./lib/emoji/regex/similar": {
|
||||
"require": "./lib/emoji/regex/similar.cjs",
|
||||
"import": "./lib/emoji/regex/similar.mjs",
|
||||
"types": "./lib/emoji/regex/similar.d.ts"
|
||||
},
|
||||
"./lib/emoji/regex/tree": {
|
||||
"require": "./lib/emoji/regex/tree.cjs",
|
||||
"import": "./lib/emoji/regex/tree.mjs",
|
||||
"types": "./lib/emoji/regex/tree.d.ts"
|
||||
},
|
||||
"./lib/emoji/cleanup": {
|
||||
"require": "./lib/emoji/cleanup.cjs",
|
||||
"import": "./lib/emoji/cleanup.mjs",
|
||||
|
@ -3,7 +3,7 @@ import {
|
||||
convertEmojiSequenceToUTF32,
|
||||
} from './convert';
|
||||
|
||||
interface UnicodeOptions {
|
||||
export interface UnicodeFormattingOptions {
|
||||
// Prefix before each character '\\u'
|
||||
prefix: string;
|
||||
|
||||
@ -23,7 +23,7 @@ interface UnicodeOptions {
|
||||
throwOnError: boolean;
|
||||
}
|
||||
|
||||
const defaultUnicodeOptions: UnicodeOptions = {
|
||||
const defaultUnicodeOptions: UnicodeFormattingOptions = {
|
||||
prefix: '',
|
||||
separator: '',
|
||||
case: 'lower',
|
||||
@ -35,7 +35,10 @@ const defaultUnicodeOptions: UnicodeOptions = {
|
||||
/**
|
||||
* Convert number to string
|
||||
*/
|
||||
function convert(sequence: number[], options: UnicodeOptions): string {
|
||||
function convert(
|
||||
sequence: number[],
|
||||
options: UnicodeFormattingOptions
|
||||
): string {
|
||||
const prefix = options.prefix;
|
||||
const func = options.case === 'upper' ? 'toUpperCase' : 'toLowerCase';
|
||||
|
||||
@ -60,7 +63,7 @@ function convert(sequence: number[], options: UnicodeOptions): string {
|
||||
*/
|
||||
export function getEmojiUnicodeString(
|
||||
code: number,
|
||||
options: Partial<UnicodeOptions> = {}
|
||||
options: Partial<UnicodeFormattingOptions> = {}
|
||||
): string {
|
||||
return convert([code], {
|
||||
...defaultUnicodeOptions,
|
||||
@ -68,7 +71,7 @@ export function getEmojiUnicodeString(
|
||||
});
|
||||
}
|
||||
|
||||
const defaultSequenceOptions: UnicodeOptions = {
|
||||
const defaultSequenceOptions: UnicodeFormattingOptions = {
|
||||
...defaultUnicodeOptions,
|
||||
separator: '-',
|
||||
};
|
||||
@ -78,7 +81,7 @@ const defaultSequenceOptions: UnicodeOptions = {
|
||||
*/
|
||||
export function getEmojiSequenceString(
|
||||
sequence: number[],
|
||||
options: Partial<UnicodeOptions> = {}
|
||||
options: Partial<UnicodeFormattingOptions> = {}
|
||||
): string {
|
||||
return convert(sequence, {
|
||||
...defaultSequenceOptions,
|
||||
@ -86,7 +89,7 @@ export function getEmojiSequenceString(
|
||||
});
|
||||
}
|
||||
|
||||
const regexOptions: UnicodeOptions = {
|
||||
const regexOptions: UnicodeFormattingOptions = {
|
||||
prefix: '\\u',
|
||||
separator: '',
|
||||
case: 'upper',
|
||||
@ -108,7 +111,7 @@ export function emojiSequenceToRegex(
|
||||
});
|
||||
}
|
||||
|
||||
const keywordOptions: UnicodeOptions = {
|
||||
const keywordOptions: UnicodeFormattingOptions = {
|
||||
prefix: '',
|
||||
separator: '-',
|
||||
case: 'lower',
|
||||
|
423
packages/utils/src/emoji/regex/base.ts
Normal file
423
packages/utils/src/emoji/regex/base.ts
Normal file
@ -0,0 +1,423 @@
|
||||
import { getEmojiUnicodeString, UnicodeFormattingOptions } from '../format';
|
||||
|
||||
/**
 * Properties shared by all regex items
 */
interface BaseEmojiItemRegex {
	// Kind of item:
	//  'utf16' -> one or more UTF-16 numbers
	//  'sequence' -> items following each other, not wrapped in `(?:` + `)`
	//      requires wrapping, unless marked as group
	//  'set' -> list of alternatives joined with `|`, not wrapped in `(?:` + `)`
	//      requires wrapping, unless marked as group
	//  'optional' -> another item followed by `?`
	type: 'utf16' | 'sequence' | 'set' | 'optional';

	// Generated regular expression
	regex: string;

	// True if `regex` can be used as a group as is, false if it must be
	// wrapped in `(?:` + `)` first
	group: boolean;
}
|
||||
|
||||
/**
 * Mixin for items that can represent a plain list of numbers
 */
interface EmojiItemRegexWithNumbers {
	// List of numbers matched by regex. Present only if regex represents
	// a set of numbers, which allows building number ranges when several
	// regex items are combined. Must not be an empty array
	numbers?: number[];
}
|
||||
|
||||
// Item for UTF-16 numbers
export interface UTF16EmojiItemRegex
	extends BaseEmojiItemRegex,
		Required<EmojiItemRegexWithNumbers> {
	type: 'utf16';

	// Never needs wrapping
	group: true;

	// `numbers` is mandatory for this type, see Required<> above
}
|
||||
|
||||
// Item for sequence of other items
type SequenceEmojiItemRegexItem =
	| UTF16EmojiItemRegex
	| SetEmojiItemRegex
	| OptionalEmojiItemRegex;

export interface SequenceEmojiItemRegex
	extends BaseEmojiItemRegex,
		EmojiItemRegexWithNumbers {
	type: 'sequence';

	// Child items, in order. Can be any type, but not a nested sequence
	items: SequenceEmojiItemRegexItem[];
}
|
||||
|
||||
// Item for list of alternatives
export type SetEmojiItemRegexItem =
	| UTF16EmojiItemRegex
	| SequenceEmojiItemRegex
	| OptionalEmojiItemRegex;

export interface SetEmojiItemRegex
	extends BaseEmojiItemRegex,
		EmojiItemRegexWithNumbers {
	type: 'set';

	// Alternatives. Can be any type, but not a nested set
	sets: SetEmojiItemRegexItem[];
}
|
||||
|
||||
// Item that makes another item optional
type OptionalEmojiItemRegexItem =
	| UTF16EmojiItemRegex
	| SequenceEmojiItemRegex
	| SetEmojiItemRegex;

export interface OptionalEmojiItemRegex extends BaseEmojiItemRegex {
	type: 'optional';

	// Item that is made optional. Can be any type, but not a nested optional item
	item: OptionalEmojiItemRegexItem;

	// Never needs wrapping
	group: true;
}
|
||||
|
||||
// Any regex item, discriminated by `type`
export type EmojiItemRegex =
	| UTF16EmojiItemRegex
	| SequenceEmojiItemRegex
	| SetEmojiItemRegex
	| OptionalEmojiItemRegex;
|
||||
|
||||
/**
 * Formatting options passed to getEmojiUnicodeString() when a number
 * is converted to its regex representation
 */
const numberToStringOptions: Partial<UnicodeFormattingOptions> = {
	format: 'utf-16',
	prefix: '\\u',
	separator: '',
	case: 'upper',
	add0: true,
};
|
||||
|
||||
/**
 * Format one number as string, using `numberToStringOptions`
 */
function toString(number: number): string {
	const formatted = getEmojiUnicodeString(number, numberToStringOptions);
	return formatted;
}
|
||||
|
||||
/**
 * Exhaustiveness helper for TypeScript: accepts only values narrowed
 * to `never`. Does nothing at run time
 */
// eslint-disable-next-line @typescript-eslint/no-unused-vars
function assertNever(v: never): void {
	// Intentionally empty: exists only for compile-time check
}
|
||||
|
||||
/**
 * Surround regex with non-capturing group: `(?:` + regex + `)`
 */
export function wrapRegexInGroup(regex: string): string {
	return `(?:${regex})`;
}
|
||||
|
||||
/**
 * Regenerate regex of UTF-16 item from its numbers
 *
 * Sorts `item.numbers` in place, stores new regex in `item.regex` and
 * returns it. Throws if list of numbers is empty
 */
export function updateUTF16EmojiRegexItem(item: UTF16EmojiItemRegex): string {
	const list = item.numbers;

	// Single number: no character class needed
	if (list.length === 1) {
		const single = toString(list[0]);
		item.regex = single;
		return single;
	}

	// Several numbers: sort, then split into runs of consecutive numbers
	list.sort((a, b) => a - b);

	const runs: number[][] = [];
	for (const value of list) {
		const run = runs.length ? runs[runs.length - 1] : undefined;
		if (run) {
			const previous = run[run.length - 1];
			if (previous === value) {
				// Same number again: ignore
				continue;
			}
			if (previous === value - 1) {
				// Continues current run
				run.push(value);
				continue;
			}
		}

		// Start new run
		runs.push([value]);
	}

	// Convert runs to strings
	const chars: string[] = [];
	runs.forEach((run) => {
		const start = run[0];
		const last = run[run.length - 1];
		if (last > start + 1) {
			// 3 or more numbers: use range
			chars.push(toString(start) + '-' + toString(last));
			return;
		}

		// 1 or 2 numbers: list them
		run.forEach((value) => {
			chars.push(toString(value));
		});
	});

	if (chars.length === 0) {
		throw new Error('Unexpected empty range');
	}

	const regex = '[' + chars.join('') + ']';
	item.regex = regex;
	return regex;
}
|
||||
|
||||
/**
 * Build UTF-16 item for list of numbers
 *
 * The array is stored in item as is (not copied), regex is generated
 * by updateUTF16EmojiRegexItem()
 */
export function createUTF16EmojiRegexItem(
	numbers: number[]
): UTF16EmojiItemRegex {
	const item: UTF16EmojiItemRegex = {
		type: 'utf16',
		regex: '',
		numbers,
		group: true,
	};

	// Fill `regex`
	updateUTF16EmojiRegexItem(item);

	return item;
}
|
||||
|
||||
/**
 * Regenerate regex of sequence from its child items
 *
 * Only `regex` is updated, `group` is left untouched. Returns new regex
 */
export function updateSequenceEmojiRegexItem(
	item: SequenceEmojiItemRegex
): string {
	const regex = item.items.reduce((joined, child) => {
		// Sets that are not marked as group must be wrapped
		const needsWrapping = child.type === 'set' && !child.group;
		const chunk = needsWrapping
			? wrapRegexInGroup(child.regex)
			: child.regex;
		return joined + chunk;
	}, '');

	item.regex = regex;
	return regex;
}
|
||||
|
||||
/**
 * Build sequence item from list of items
 *
 * Nested sequences are flattened. If `numbers` is passed, it is stored
 * in result. Throws if resulting list of items is empty
 */
export function createSequenceEmojiRegexItem(
	sequence: EmojiItemRegex[],
	numbers?: number[]
): SequenceEmojiItemRegex {
	// Flatten nested sequences
	const items: SequenceEmojiItemRegexItem[] = [];
	sequence.forEach((entry) => {
		if (entry.type === 'sequence') {
			items.push(...entry.items);
			return;
		}
		items.push(entry);
	});

	if (items.length === 0) {
		throw new Error('Empty sequence');
	}

	const result: SequenceEmojiItemRegex = {
		type: 'sequence',
		items,
		regex: '',
		group: false,
	};

	// Sequence made of one item inherits `group` and `numbers` from it
	if (sequence.length === 1) {
		const only = sequence[0];
		result.group = only.group;
		if (only.type !== 'optional' && only.numbers) {
			result.numbers = only.numbers;
		}
	}

	// Explicit numbers take precedence
	if (numbers) {
		result.numbers = numbers;
	}

	// Generate regex
	updateSequenceEmojiRegexItem(result);
	return result;
}
|
||||
|
||||
/**
 * Regenerate `regex` and `group` of set from its child items
 *
 * Returns new regex
 */
export function updateSetEmojiRegexItem(item: SetEmojiItemRegex): string {
	const children = item.sets;

	if (children.length === 1) {
		// Only one alternative: behaves exactly like that item
		const only = children[0];
		item.group = only.group;
		item.regex = only.regex;
		return item.regex;
	}

	// Several alternatives: join with `|`, result requires wrapping
	item.group = false;
	const regex = children.map((child) => child.regex).join('|');
	item.regex = regex;
	return regex;
}
|
||||
|
||||
/**
 * Build set item from list of items
 *
 * Nested sets are flattened, alternatives are sorted by regex.
 * `numbers` is set only if every passed item has numbers
 */
export function createSetEmojiRegexItem(
	set: EmojiItemRegex[]
): SetEmojiItemRegex {
	const sets: SetEmojiItemRegexItem[] = [];

	// Numbers from all items; abandoned as soon as one item has no numbers
	const collected: number[] = [];
	let allHaveNumbers = true;

	set.forEach((entry) => {
		// Flatten nested sets
		if (entry.type === 'set') {
			sets.push(...entry.sets);
		} else {
			sets.push(entry);
		}

		if (!allHaveNumbers) {
			return;
		}
		if (entry.type !== 'optional' && entry.numbers) {
			collected.push(...entry.numbers);
		} else {
			allHaveNumbers = false;
		}
	});

	// Sorting makes result independent of order of input
	sets.sort((a, b) => a.regex.localeCompare(b.regex));

	const result: SetEmojiItemRegex = {
		type: 'set',
		sets,
		regex: '',
		group: false,
	};
	if (allHaveNumbers) {
		result.numbers = collected;
	}

	if (set.length === 1) {
		result.group = set[0].group;
	}

	// Generate regex and group
	updateSetEmojiRegexItem(result);
	return result;
}
|
||||
|
||||
/**
 * Regenerate regex of optional item: regex of wrapped item, wrapped in
 * group if needed, followed by `?`. Returns new regex
 */
export function updateOptionalEmojiRegexItem(
	item: OptionalEmojiItemRegex
): string {
	const { group, regex } = item.item;
	const base = group ? regex : wrapRegexInGroup(regex);
	item.regex = base + '?';
	return item.regex;
}
|
||||
|
||||
/**
 * Make item optional
 *
 * Item that is already optional is returned as is
 */
export function createOptionalEmojiRegexItem(
	item: EmojiItemRegex
): OptionalEmojiItemRegex {
	if (item.type === 'optional') {
		// Nothing to wrap
		return item;
	}

	const wrapper: OptionalEmojiItemRegex = {
		type: 'optional',
		item,
		regex: '',
		group: true,
	};

	// Generate regex
	updateOptionalEmojiRegexItem(wrapper);
	return wrapper;
}
|
||||
|
||||
/**
 * Copy item
 *
 * `numbers` and lists of child items are always copied. Child items
 * are cloned too (deep), unless `shallow` is true
 */
export function cloneEmojiRegexItem<T extends BaseEmojiItemRegex>(
	item: T,
	shallow = false
): T {
	const copy = { ...item } as unknown as EmojiItemRegex;

	// Optional items do not have numbers
	if (copy.type !== 'optional' && copy.numbers) {
		copy.numbers = copy.numbers.slice();
	}

	// Copy child items
	switch (copy.type) {
		case 'utf16':
			// No child items
			break;

		case 'sequence':
			copy.items = shallow
				? copy.items.slice()
				: copy.items.map((child) => cloneEmojiRegexItem(child, false));
			break;

		case 'set':
			copy.sets = shallow
				? copy.sets.slice()
				: copy.sets.map((child) => cloneEmojiRegexItem(child, false));
			break;

		case 'optional':
			// Shallow copy shares wrapped item with original
			if (!shallow) {
				copy.item = cloneEmojiRegexItem(copy.item, false);
			}
			break;

		default:
			assertNever(copy);
	}

	return copy as unknown as T;
}
|
246
packages/utils/src/emoji/regex/numbers.ts
Normal file
246
packages/utils/src/emoji/regex/numbers.ts
Normal file
@ -0,0 +1,246 @@
|
||||
import { splitUTF32Number } from '../convert';
|
||||
import {
|
||||
createOptionalEmojiRegexItem,
|
||||
createSequenceEmojiRegexItem,
|
||||
createSetEmojiRegexItem,
|
||||
createUTF16EmojiRegexItem,
|
||||
EmojiItemRegex,
|
||||
OptionalEmojiItemRegex,
|
||||
SequenceEmojiItemRegex,
|
||||
SetEmojiItemRegex,
|
||||
UTF16EmojiItemRegex,
|
||||
} from './base';
|
||||
import { vs16Emoji } from '../data';
|
||||
|
||||
/**
 * Create regex item for set of numbers
 *
 * Numbers for which splitUTF32Number() returns no result are merged into
 * one UTF-16 item. Other numbers are split in 2 chunks and grouped: first
 * by identical first chunk, then by identical regex of second chunks, each
 * group becoming a sequence of 2 UTF-16 items.
 *
 * The passed array is not modified: a sorted copy without duplicates is
 * used instead.
 *
 * Returns the only generated item, or a set of generated items
 */
export function createEmojiRegexItemForNumbers(
	numbers: number[]
): UTF16EmojiItemRegex | SequenceEmojiItemRegex | SetEmojiItemRegex {
	// Separate UTF-16 and UTF-32
	interface UTF32FirstNumber {
		second: number[];
		numbers: number[];
	}
	// Key is first chunk. Map keeps insertion order
	const utf32 = new Map<number, UTF32FirstNumber>();
	const utf16: number[] = [];

	// Work on a copy, so caller's array is not reordered
	const sorted = Array.from(new Set(numbers)).sort((a, b) => a - b);

	for (const number of sorted) {
		const split = splitUTF32Number(number);
		if (!split) {
			utf16.push(number);
			continue;
		}

		const [first, second] = split;
		const item = utf32.get(first);
		if (item) {
			item.second.push(second);
			item.numbers.push(number);
		} else {
			utf32.set(first, {
				second: [second],
				numbers: [number],
			});
		}
	}

	const results: (UTF16EmojiItemRegex | SequenceEmojiItemRegex)[] = [];

	// Merge UTF-16
	if (utf16.length) {
		results.push(createUTF16EmojiRegexItem(utf16));
	}

	// Merge UTF-32: join first chunks that share the same set of second chunks
	interface UTF32Item {
		second: UTF16EmojiItemRegex;
		first: number[];
		numbers: number[];
	}
	// Key is regex of second chunks
	const utf32Set = new Map<string, UTF32Item>();

	utf32.forEach((item, first) => {
		const secondRegex = createUTF16EmojiRegexItem(item.second);
		const listItem = utf32Set.get(secondRegex.regex);
		if (listItem) {
			// Found multiple items with the same last set
			listItem.first.push(first);
			listItem.numbers.push(...item.numbers);
		} else {
			utf32Set.set(secondRegex.regex, {
				second: secondRegex,
				first: [first],
				numbers: [...item.numbers],
			});
		}
	});

	// Create regex for each group, add numbers list for reference
	utf32Set.forEach((item) => {
		results.push(
			createSequenceEmojiRegexItem(
				[createUTF16EmojiRegexItem(item.first), item.second],
				item.numbers
			)
		);
	});

	return results.length === 1 ? results[0] : createSetEmojiRegexItem(results);
}
|
||||
|
||||
/**
 * Build regex item that matches numbers one after another
 *
 * Numbers that splitUTF32Number() splits are added as 2 UTF-16 items.
 * If `optionalVariations` is enabled, `vs16Emoji` is made optional
 */
export function createRegexForNumbersSequence(
	numbers: number[],
	optionalVariations = true
): SequenceEmojiItemRegex | UTF16EmojiItemRegex | OptionalEmojiItemRegex {
	const parts: (UTF16EmojiItemRegex | OptionalEmojiItemRegex)[] = [];

	for (const value of numbers) {
		const pair = splitUTF32Number(value);
		if (pair) {
			// 2 chunks
			parts.push(createUTF16EmojiRegexItem([pair[0]]));
			parts.push(createUTF16EmojiRegexItem([pair[1]]));
			continue;
		}

		// 1 chunk
		const single = createUTF16EmojiRegexItem([value]);
		const makeOptional = optionalVariations && value === vs16Emoji;
		parts.push(makeOptional ? createOptionalEmojiRegexItem(single) : single);
	}

	// Single item does not need a sequence
	if (parts.length === 1) {
		return parts[0];
	}

	const result = createSequenceEmojiRegexItem(parts);

	// One number that was split in several chunks: keep number for reference
	if (numbers.length === 1 && parts[0].type === 'utf16') {
		result.numbers = [...numbers];
	}
	return result;
}
|
||||
|
||||
/**
 * Try to merge items of set that represent plain numbers
 *
 * Items with numbers are merged into one item, optional items that wrap
 * items with numbers are merged into one optional item. Numbers that are
 * matched by optional items are removed from mandatory numbers.
 *
 * Returns the same set if there is nothing to merge
 */
export function optimiseNumbersSet(set: SetEmojiItemRegex): EmojiItemRegex {
	// Items that cannot be merged
	const remaining: EmojiItemRegex[] = [];

	// Items that can be merged and their numbers
	const mandatoryItems: EmojiItemRegex[] = [];
	const mandatoryNumbers: number[] = [];
	const optionalItems: EmojiItemRegex[] = [];
	const optionalNumbers: number[] = [];

	for (const entry of set.sets) {
		const isOptional = entry.type === 'optional';
		const values = entry.type === 'optional' ? entry.item.numbers : entry.numbers;
		if (!values) {
			remaining.push(entry);
			continue;
		}

		if (isOptional) {
			optionalItems.push(entry);
			optionalNumbers.push(...values);
		} else {
			mandatoryItems.push(entry);
			mandatoryNumbers.push(...values);
		}
	}

	// At least 2 items are needed for merge
	if (mandatoryItems.length + optionalItems.length < 2) {
		return set;
	}

	// Drop mandatory numbers that are also optional
	const optionalLookup = new Set(optionalNumbers);
	const uniqueMandatory = mandatoryNumbers.filter(
		(value) => !optionalLookup.has(value)
	);
	const removedSomething = uniqueMandatory.length !== mandatoryNumbers.length;

	// Mandatory numbers
	if (mandatoryItems.length) {
		if (!removedSomething && mandatoryItems.length === 1) {
			// Single item that was not changed: reuse it
			remaining.push(mandatoryItems[0]);
		} else if (uniqueMandatory.length) {
			remaining.push(createEmojiRegexItemForNumbers(uniqueMandatory));
		}
	}

	// Optional numbers
	if (optionalItems.length === 1) {
		remaining.push(optionalItems[0]);
	} else if (optionalItems.length > 1) {
		remaining.push(
			createOptionalEmojiRegexItem(
				createEmojiRegexItemForNumbers(optionalNumbers)
			)
		);
	}

	if (remaining.length === 1) {
		return remaining[0];
	}
	return createSetEmojiRegexItem(remaining);
}
|
372
packages/utils/src/emoji/regex/similar.ts
Normal file
372
packages/utils/src/emoji/regex/similar.ts
Normal file
@ -0,0 +1,372 @@
|
||||
import {
|
||||
cloneEmojiRegexItem,
|
||||
createOptionalEmojiRegexItem,
|
||||
createSequenceEmojiRegexItem,
|
||||
createSetEmojiRegexItem,
|
||||
EmojiItemRegex,
|
||||
SetEmojiItemRegex,
|
||||
} from './base';
|
||||
import { optimiseNumbersSet } from './numbers';
|
||||
|
||||
// Side of sequence where common chunk is located
type SlicePosition = 'start' | 'end';

// Position of slice, 'full' if whole item matches
type SliceValue = 'full' | number;
|
||||
|
||||
/**
 * Slice of one item
 */
interface SimilarRegexItemSlice {
	// Position of item in list of items
	index: number;

	// For 'start' slices: index where common chunk ends
	// For 'end' slices: index where common chunk starts
	// 'full' if whole item is the common chunk
	slice: SliceValue;
}
|
||||
|
||||
/**
 * Items that share a common chunk
 */
interface SimilarRegexItemSequence {
	// Side where common chunk is located: it can be only at start
	// or at end of sequence, never in the middle
	type: SlicePosition;

	// One entry per item that contains the common chunk
	slices: SimilarRegexItemSlice[];
}
|
||||
|
||||
/**
 * Result of findSimilarRegexItemSequences()
 */
interface SimilarRegexItemSequenceResult {
	// Number of characters that merging items would save
	score: number;

	// All sequences that have this score
	sequences: SimilarRegexItemSequence[];
}
|
||||
|
||||
/**
 * Exhaustiveness helper for TypeScript: accepts only values narrowed
 * to `never`. Does nothing at run time
 */
// eslint-disable-next-line @typescript-eslint/no-unused-vars
function assertNever(v: never): void {
	// Intentionally empty: exists only for compile-time check
}
|
||||
|
||||
/**
 * Find items that start or end with the same chunk
 *
 * Result contains all sequences that share the highest score. Only one
 * of them should be applied to items. If there are several, clone list
 * of items, apply each sequence to its own clone, optimise every fork
 * further and pick the one with better result.
 *
 * Returns undefined if items have nothing in common.
 * Throws if one of items is a set
 */
export function findSimilarRegexItemSequences(
	items: EmojiItemRegex[]
): SimilarRegexItemSequenceResult | undefined {
	interface MapItem {
		score: number;
		slices: SimilarRegexItemSlice[];
	}

	// Chunks found at start and at end of items, key is regex of chunk
	const startRegex = Object.create(null) as Record<string, MapItem>;
	const endRegex = Object.create(null) as Record<string, MapItem>;

	const register = (
		target: Record<string, MapItem>,
		index: number,
		regex: string,
		slice: SliceValue
	) => {
		const entry: SimilarRegexItemSlice = { index, slice };
		const known = target[regex];
		if (known) {
			// Each extra occurrence saves length of regex
			known.score += regex.length;
			known.slices.push(entry);
			return;
		}

		// First occurrence saves nothing: one copy remains after merge
		target[regex] = {
			score: 0,
			slices: [entry],
		};
	};

	// Collect all chunks
	for (let index = 0; index < items.length; index++) {
		const baseItem = items[index];

		if (baseItem.type === 'set') {
			throw new Error('Unexpected set within a set');
		}

		if (
			baseItem.type === 'optional' ||
			baseItem.type === 'utf16' ||
			baseItem.type === 'sequence'
		) {
			// Whole item
			register(startRegex, index, baseItem.regex, 'full');
			register(endRegex, index, baseItem.regex, 'full');

			if (baseItem.type === 'sequence') {
				// All possible cuts of sequence
				const parts = baseItem.items;
				for (let cut = 1; cut < parts.length; cut++) {
					const head = createSequenceEmojiRegexItem(parts.slice(0, cut));
					register(startRegex, index, head.regex, cut);

					const tail = createSequenceEmojiRegexItem(parts.slice(cut));
					register(endRegex, index, tail.regex, cut);
				}
			}
			continue;
		}

		assertNever(baseItem);
	}

	// Pick chunks with highest score
	let best: SimilarRegexItemSequenceResult | undefined;
	const groups: [Record<string, MapItem>, SlicePosition][] = [
		[startRegex, 'start'],
		[endRegex, 'end'],
	];
	for (const [target, type] of groups) {
		for (const key of Object.keys(target)) {
			const { score, slices } = target[key];
			if (!score) {
				// Found only once
				continue;
			}

			const sequence: SimilarRegexItemSequence = { type, slices };
			if (!best || best.score < score) {
				// New highest score
				best = {
					score,
					sequences: [sequence],
				};
			} else if (best.score === score) {
				// Same score
				best.sequences.push(sequence);
			}
		}
	}

	return best;
}
|
||||
|
||||
/**
 * Merge similar sequences
 *
 * Items listed in `merge.slices` are replaced with one sequence that
 * contains the common chunk and a set of the differing chunks. The set
 * is made optional if at least one item matched in full. The new
 * sequence is the first item of result, followed by the untouched items.
 *
 * Accepts callback to run optimisation on created subset. Callback is
 * called only if subset contains more than one item.
 *
 * Throws if slices do not describe a valid common chunk
 */
export function mergeSimilarRegexItemSequences(
	items: EmojiItemRegex[],
	merge: SimilarRegexItemSequence,
	optimise?: (set: SetEmojiItemRegex) => EmojiItemRegex
): EmojiItemRegex[] {
	const { type, slices } = merge;

	// Get common chunks
	const indexes: Set<number> = new Set();
	let hasFullSequence = false;
	let longestMatch = 0;
	let longestMatchIndex = -1;
	const differentSequences: EmojiItemRegex[][] = [];

	for (let i = 0; i < slices.length; i++) {
		const { index, slice } = slices[i];
		const item = items[index];

		// Number of items in common chunk
		let length: number;
		if (slice === 'full') {
			// Full match
			hasFullSequence = true;
			length = item.type === 'sequence' ? item.items.length : 1;
		} else {
			if (item.type !== 'sequence') {
				throw new Error(
					`Unexpected partial match for type "${item.type}"`
				);
			}

			// For 'start' slices value is where common chunk ends, so it is
			// also its length. For 'end' slices value is where common chunk
			// starts, so length is whatever follows it
			length = type === 'start' ? slice : item.items.length - slice;

			// Copy remaining chunks
			differentSequences.push(
				type === 'start'
					? item.items.slice(slice)
					: item.items.slice(0, slice)
			);
		}

		if (length > longestMatch) {
			longestMatchIndex = index;
			longestMatch = length;
		}

		indexes.add(index);
	}

	// Found common chunk
	if (longestMatch < 1 || longestMatchIndex < 0) {
		throw new Error('Cannot find common sequence');
	}

	// Get longest common item as sequence
	const commonItem = items[longestMatchIndex];
	let sequence: EmojiItemRegex[];
	if (commonItem.type !== 'sequence') {
		// Full match
		if (longestMatch !== 1) {
			throw new Error(
				'Something went wrong. Cannot have long match in non-sequence'
			);
		}
		sequence = [commonItem];
	} else {
		// Sequence: take `longestMatch` items from matching side.
		// `longestMatch` is at least 1 here, so negative slice is safe
		sequence =
			type === 'start'
				? commonItem.items.slice(0, longestMatch)
				: commonItem.items.slice(-longestMatch);
	}

	// Merge other chunks
	const setItems: EmojiItemRegex[] = differentSequences.map((list) =>
		list.length === 1 ? list[0] : createSequenceEmojiRegexItem(list)
	);

	// Nothing to add if all items matched in full (duplicate items)
	if (setItems.length) {
		// Create set, optimise it, make it optional
		const set = createSetEmojiRegexItem(setItems);
		let mergedChunk: EmojiItemRegex =
			set.sets.length === 1
				? // Do not run callback if only 1 item
				  set.sets[0]
				: optimise
				? // Run callback to optimise it
				  optimise(set)
				: // Use set as is
				  set;
		if (hasFullSequence) {
			// Wrap in optional
			mergedChunk = createOptionalEmojiRegexItem(mergedChunk);
		}

		// Add set to sequence
		sequence[type === 'start' ? 'push' : 'unshift'](mergedChunk);
	}

	// Create result by combining merged item and remaining items
	const results: EmojiItemRegex[] = [
		createSequenceEmojiRegexItem(sequence),
		...items.filter((item, index) => !indexes.has(index)),
	];
	return results;
}
|
||||
|
||||
/**
 * Optimise set by merging items that have something in common
 *
 * Numbers are merged first, then common chunks are merged repeatedly,
 * until nothing is left to merge or result is no longer a set. If
 * several merges have the same score, the one that generates the
 * shortest regex is used
 */
export function mergeSimilarItemsInSet(set: SetEmojiItemRegex): EmojiItemRegex {
	// Merge numbers
	const withoutNumbers = optimiseNumbersSet(set);
	if (withoutNumbers.type !== 'set') {
		return withoutNumbers;
	}

	let current: SetEmojiItemRegex = withoutNumbers;
	for (;;) {
		const found = findSimilarRegexItemSequences(current.sets);
		if (!found) {
			// Nothing in common
			return current;
		}

		// Apply each merge to a copy of items, keep the shortest result
		let best: EmojiItemRegex | undefined;
		for (const candidate of found.sequences) {
			const merged = mergeSimilarRegexItemSequences(
				current.sets.map((item) => cloneEmojiRegexItem(item, true)),
				candidate,
				mergeSimilarItemsInSet
			);

			const option =
				merged.length === 1 ? merged[0] : createSetEmojiRegexItem(merged);
			if (!best || option.regex.length < best.regex.length) {
				best = option;
			}
		}

		if (!best) {
			throw new Error('Empty sequences list');
		}
		if (best.type !== 'set') {
			// No longer a set
			return best;
		}

		// Try again with new set
		current = best;
	}
}
|
182
packages/utils/src/emoji/regex/tree.ts
Normal file
182
packages/utils/src/emoji/regex/tree.ts
Normal file
@ -0,0 +1,182 @@
|
||||
import {
|
||||
createOptionalEmojiRegexItem,
|
||||
createSequenceEmojiRegexItem,
|
||||
createSetEmojiRegexItem,
|
||||
createUTF16EmojiRegexItem,
|
||||
EmojiItemRegex,
|
||||
} from './base';
|
||||
import { splitEmojiSequences } from '../cleanup';
|
||||
import { convertEmojiSequenceToUTF32 } from '../convert';
|
||||
import { createRegexForNumbersSequence } from './numbers';
|
||||
import { joinerEmoji } from '../data';
|
||||
import { mergeSimilarItemsInSet } from './similar';
|
||||
|
||||
/**
 * Node of emoji sequences tree
 */
interface TreeItem {
	/** Regex for this node */
	regex: EmojiItemRegex;

	/**
	 * Set to true if a sequence ends at this node.
	 * If `children` is also set, it means children are optional
	 */
	end?: true;

	/** Child nodes, separated from this node with 0x200d */
	children?: TreeItem[];
}
|
||||
|
||||
/**
 * Create tree from list of emoji sequences
 *
 * Each sequence is converted to UTF-32 and split into chunks with
 * `splitEmojiSequences()`. Every chunk becomes one tree level. Nodes on the
 * same level are matched by comparing their regex strings, so sequences that
 * start with the same chunks share nodes.
 *
 * @param sequences List of emoji sequences, each one is a list of numbers
 * @returns Top level nodes of the tree
 */
export function createEmojisTree(sequences: number[][]): TreeItem[] {
	const root: TreeItem[] = [];

	sequences.forEach((sequence) => {
		const chunks = splitEmojiSequences(
			convertEmojiSequenceToUTF32(sequence)
		);
		const lastIndex = chunks.length - 1;

		// Walk down the tree, one level per chunk
		let level = root;
		chunks.forEach((chunk, index) => {
			const regex = createRegexForNumbersSequence(chunk);

			// Reuse node with identical regex or append new one
			let node = level.find(
				(entry) => entry.regex.regex === regex.regex
			);
			if (!node) {
				node = { regex };
				level.push(node);
			}

			if (index === lastIndex) {
				// Sequence ends here
				node.end = true;
				return;
			}

			// Descend, creating list of children on first visit
			if (!node.children) {
				node.children = [];
			}
			level = node.children;
		});
	});

	return root;
}
|
||||
|
||||
/**
 * Convert tree to one regex item
 *
 * Children of every node are merged first (depth first). Then nodes of the same
 * level are combined: nodes without children are used as is, nodes with
 * children are grouped by `end` flag and by regex string of their merged
 * children, so nodes that are followed by identical children share one tail.
 *
 * @param items Top level nodes of the tree
 * @returns Regex item that represents the whole tree
 */
export function parseEmojiTree(items: TreeItem[]): EmojiItemRegex {
	/** Node with children already merged into one regex item */
	interface ParsedTreeItem {
		/** Regex for this node */
		regex: EmojiItemRegex;

		/** True if end of sequence. If children are set, it means children are optional */
		end: boolean;

		/** Regex for merged child elements */
		children?: EmojiItemRegex;
	}

	type ParsedParent = Required<ParsedTreeItem>;
	type ParentGroups = Record<string, ParsedParent[]>;

	/**
	 * Merge nodes of one level into one regex item
	 */
	const mergeParsedChildren = (nodes: ParsedTreeItem[]): EmojiItemRegex => {
		const alternatives: EmojiItemRegex[] = [];

		// Parents grouped by regex string of their merged children
		const terminalGroups = Object.create(null) as ParentGroups;
		const continuingGroups = Object.create(null) as ParentGroups;

		nodes.forEach((node) => {
			const merged = node.children;
			if (!merged) {
				// No children, use regex as is
				alternatives.push(node.regex);
				return;
			}

			const groups = node.end ? terminalGroups : continuingGroups;
			const key = merged.regex;
			const group = groups[key] || (groups[key] = []);
			group.push(node as ParsedParent);
		});

		// Groups where sequence can end at parent are handled first
		for (const groups of [terminalGroups, continuingGroups]) {
			for (const key of Object.keys(groups)) {
				const group = groups[key];
				const first = group[0];

				// Joiner + children, shared by all parents in group
				const joined: EmojiItemRegex[] = [
					createUTF16EmojiRegexItem([joinerEmoji]),
					first.children,
				];

				// Tail is optional if sequence can end at parent
				const tail: EmojiItemRegex[] = first.end
					? [
							createOptionalEmojiRegexItem(
								createSequenceEmojiRegexItem(joined)
							),
					  ]
					: joined;

				// Parents: one regex or merged set of all regexes in group
				const head: EmojiItemRegex =
					group.length === 1
						? first.regex
						: mergeSimilarItemsInSet(
								createSetEmojiRegexItem(
									group.map((parent) => parent.regex)
								)
						  );

				alternatives.push(
					createSequenceEmojiRegexItem([head, ...tail])
				);
			}
		}

		// Single alternative does not need a set
		return alternatives.length === 1
			? alternatives[0]
			: mergeSimilarItemsInSet(createSetEmojiRegexItem(alternatives));
	};

	/**
	 * Parse node: merge its children, if it has any
	 */
	const parseItemChildren = (node: TreeItem): ParsedTreeItem => {
		const parsed: ParsedTreeItem = {
			regex: node.regex,
			end: Boolean(node.end),
		};

		const childNodes = node.children;
		if (childNodes) {
			parsed.children = mergeParsedChildren(
				childNodes.map(parseItemChildren)
			);
		}
		return parsed;
	};

	// Parse all top level nodes, then merge them
	return mergeParsedChildren(items.map(parseItemChildren));
}
|
212
packages/utils/tests/emoji-regex-item-creation-test.ts
Normal file
212
packages/utils/tests/emoji-regex-item-creation-test.ts
Normal file
@ -0,0 +1,212 @@
|
||||
import {
|
||||
createOptionalEmojiRegexItem,
|
||||
createSequenceEmojiRegexItem,
|
||||
createSetEmojiRegexItem,
|
||||
createUTF16EmojiRegexItem,
|
||||
} from '../lib/emoji/regex/base';
|
||||
|
||||
describe('Creating chunks of regex', () => {
|
||||
it('UTF-16 numbers', () => {
|
||||
// Number
|
||||
expect(createUTF16EmojiRegexItem([0x2763])).toEqual({
|
||||
type: 'utf16',
|
||||
regex: '\\u2763',
|
||||
numbers: [0x2763],
|
||||
group: true,
|
||||
});
|
||||
|
||||
// Range
|
||||
expect(createUTF16EmojiRegexItem([0x2762, 0x2764, 0x2763])).toEqual({
|
||||
type: 'utf16',
|
||||
regex: '[\\u2762-\\u2764]',
|
||||
numbers: [0x2762, 0x2763, 0x2764],
|
||||
group: true,
|
||||
});
|
||||
|
||||
// Separate numbers
|
||||
expect(createUTF16EmojiRegexItem([0x2760, 0x2764, 0xfe0f])).toEqual({
|
||||
type: 'utf16',
|
||||
regex: '[\\u2760\\u2764\\uFE0F]',
|
||||
numbers: [0x2760, 0x2764, 0xfe0f],
|
||||
group: true,
|
||||
});
|
||||
|
||||
// Ranges + numbers, duplicate item
|
||||
expect(
|
||||
createUTF16EmojiRegexItem([
|
||||
0x2760, 0x2762, 0x2761, 0x2765, 0x2763, 0xfe0f, 0xfe0f, 0xfe0e,
|
||||
0x2000, 0x2001, 0x2100, 0x2102, 0x2101,
|
||||
])
|
||||
).toEqual({
|
||||
type: 'utf16',
|
||||
regex: '[\\u2000\\u2001\\u2100-\\u2102\\u2760-\\u2763\\u2765\\uFE0E\\uFE0F]',
|
||||
numbers: [
|
||||
0x2000, 0x2001, 0x2100, 0x2101, 0x2102, 0x2760, 0x2761, 0x2762,
|
||||
0x2763, 0x2765, 0xfe0e, 0xfe0f, 0xfe0f,
|
||||
],
|
||||
group: true,
|
||||
});
|
||||
});
|
||||
|
||||
it('Sequence from numbers', () => {
|
||||
const num1 = createUTF16EmojiRegexItem([0x2000, 0x2001]);
|
||||
const num2 = createUTF16EmojiRegexItem([0x2000, 0x2100]);
|
||||
|
||||
// 1 item
|
||||
expect(createSequenceEmojiRegexItem([num1])).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '[\\u2000\\u2001]',
|
||||
numbers: [0x2000, 0x2001],
|
||||
items: [num1],
|
||||
group: true,
|
||||
});
|
||||
|
||||
// 2 numbers
|
||||
expect(createSequenceEmojiRegexItem([num1, num2])).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '[\\u2000\\u2001][\\u2000\\u2100]',
|
||||
items: [num1, num2],
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('Sets from numbers', () => {
|
||||
const num1 = createUTF16EmojiRegexItem([0x2000, 0x2001]);
|
||||
const num2 = createUTF16EmojiRegexItem([0x2000, 0x2100]);
|
||||
|
||||
// 1 item
|
||||
expect(createSetEmojiRegexItem([num1])).toEqual({
|
||||
type: 'set',
|
||||
regex: '[\\u2000\\u2001]',
|
||||
numbers: [0x2000, 0x2001],
|
||||
sets: [num1],
|
||||
group: true,
|
||||
});
|
||||
|
||||
// 2 numbers
|
||||
expect(createSetEmojiRegexItem([num1, num2])).toEqual({
|
||||
type: 'set',
|
||||
regex: '[\\u2000\\u2001]|[\\u2000\\u2100]',
|
||||
numbers: [0x2000, 0x2001, 0x2000, 0x2100],
|
||||
sets: [num1, num2],
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('Optional numbers', () => {
|
||||
const num1 = createUTF16EmojiRegexItem([0xfe0f]);
|
||||
const num2 = createUTF16EmojiRegexItem([0xfe0e, 0xfe0f]);
|
||||
|
||||
// simple item
|
||||
expect(createOptionalEmojiRegexItem(num1)).toEqual({
|
||||
type: 'optional',
|
||||
regex: '\\uFE0F?',
|
||||
item: num1,
|
||||
group: true,
|
||||
});
|
||||
|
||||
// 2 numbers
|
||||
expect(createOptionalEmojiRegexItem(num2)).toEqual({
|
||||
type: 'optional',
|
||||
regex: '[\\uFE0E\\uFE0F]?',
|
||||
item: num2,
|
||||
group: true,
|
||||
});
|
||||
});
|
||||
|
||||
it('Sequence', () => {
|
||||
const num1 = createUTF16EmojiRegexItem([0x2000, 0x2001]);
|
||||
const num2 = createUTF16EmojiRegexItem([0x2000, 0x2100]);
|
||||
const fe0f = createOptionalEmojiRegexItem(
|
||||
createUTF16EmojiRegexItem([0xfe0f])
|
||||
);
|
||||
|
||||
// optional item
|
||||
expect(createSequenceEmojiRegexItem([fe0f])).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '\\uFE0F?',
|
||||
items: [fe0f],
|
||||
group: true,
|
||||
});
|
||||
|
||||
const seq1 = createSequenceEmojiRegexItem([num1, fe0f]);
|
||||
expect(seq1).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '[\\u2000\\u2001]\\uFE0F?',
|
||||
items: [num1, fe0f],
|
||||
group: false,
|
||||
});
|
||||
|
||||
// number + optional item + number
|
||||
expect(createSequenceEmojiRegexItem([num1, fe0f, num2])).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '[\\u2000\\u2001]\\uFE0F?[\\u2000\\u2100]',
|
||||
items: [num1, fe0f, num2],
|
||||
group: false,
|
||||
});
|
||||
|
||||
// number + nested sequence
|
||||
expect(createSequenceEmojiRegexItem([num2, seq1])).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '[\\u2000\\u2100][\\u2000\\u2001]\\uFE0F?',
|
||||
items: [num2, num1, fe0f],
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('Mix', () => {
|
||||
const num1 = createUTF16EmojiRegexItem([
|
||||
0x1234, 0x1235, 0x1236, 0x1237,
|
||||
]);
|
||||
|
||||
// UTF-32
|
||||
const utf32a1 = createUTF16EmojiRegexItem([0xd83d]);
|
||||
const utf32a2 = createUTF16EmojiRegexItem([0xdc9a]);
|
||||
const utf32a = createSequenceEmojiRegexItem([utf32a1, utf32a2]);
|
||||
expect(utf32a).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '\\uD83D\\uDC9A',
|
||||
items: [utf32a1, utf32a2],
|
||||
group: false,
|
||||
});
|
||||
utf32a.numbers = [0x1f49a];
|
||||
|
||||
// Make it optional
|
||||
expect(createOptionalEmojiRegexItem(utf32a)).toEqual({
|
||||
type: 'optional',
|
||||
regex: '(?:\\uD83D\\uDC9A)?',
|
||||
item: utf32a,
|
||||
group: true,
|
||||
});
|
||||
|
||||
// Set of numbers
|
||||
const set = createSetEmojiRegexItem([num1, utf32a]);
|
||||
expect(set).toEqual({
|
||||
type: 'set',
|
||||
regex: '[\\u1234-\\u1237]|\\uD83D\\uDC9A',
|
||||
sets: [num1, utf32a],
|
||||
numbers: [0x1234, 0x1235, 0x1236, 0x1237, 0x1f49a],
|
||||
group: false,
|
||||
});
|
||||
|
||||
// Make it optional
|
||||
expect(createOptionalEmojiRegexItem(set)).toEqual({
|
||||
type: 'optional',
|
||||
regex: '(?:[\\u1234-\\u1237]|\\uD83D\\uDC9A)?',
|
||||
item: set,
|
||||
group: true,
|
||||
});
|
||||
|
||||
// Sequence with set
|
||||
const utf16a = createUTF16EmojiRegexItem([0x2000]);
|
||||
const utf16b = createUTF16EmojiRegexItem([0x2100]);
|
||||
const utf16c = createUTF16EmojiRegexItem([0x2101]);
|
||||
const set1 = createSetEmojiRegexItem([utf16b, utf16c]);
|
||||
expect(createSequenceEmojiRegexItem([utf16a, set1])).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '\\u2000(?:\\u2100|\\u2101)',
|
||||
items: [utf16a, set1],
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
});
|
198
packages/utils/tests/emoji-regex-numbers-test.ts
Normal file
198
packages/utils/tests/emoji-regex-numbers-test.ts
Normal file
@ -0,0 +1,198 @@
|
||||
import {
|
||||
createOptionalEmojiRegexItem,
|
||||
createSetEmojiRegexItem,
|
||||
createUTF16EmojiRegexItem,
|
||||
} from '../lib/emoji/regex/base';
|
||||
import {
|
||||
createEmojiRegexItemForNumbers,
|
||||
createRegexForNumbersSequence,
|
||||
optimiseNumbersSet,
|
||||
} from '../lib/emoji/regex/numbers';
|
||||
|
||||
describe('Creating chunks of regex for numbers', () => {
|
||||
it('Numbers', () => {
|
||||
// UTF-16
|
||||
expect(createEmojiRegexItemForNumbers([0x2763])).toEqual({
|
||||
type: 'utf16',
|
||||
regex: '\\u2763',
|
||||
numbers: [0x2763],
|
||||
group: true,
|
||||
});
|
||||
|
||||
expect(
|
||||
createEmojiRegexItemForNumbers([0x2761, 0x2765, 0x2764, 0x2763])
|
||||
).toEqual({
|
||||
type: 'utf16',
|
||||
regex: '[\\u2761\\u2763-\\u2765]',
|
||||
numbers: [0x2761, 0x2763, 0x2764, 0x2765],
|
||||
group: true,
|
||||
});
|
||||
|
||||
// UTF-32
|
||||
expect(createEmojiRegexItemForNumbers([0x1f49a])).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '\\uD83D\\uDC9A',
|
||||
items: [
|
||||
{
|
||||
type: 'utf16',
|
||||
regex: '\\uD83D',
|
||||
numbers: [0xd83d],
|
||||
group: true,
|
||||
},
|
||||
{
|
||||
type: 'utf16',
|
||||
regex: '\\uDC9A',
|
||||
numbers: [0xdc9a],
|
||||
group: true,
|
||||
},
|
||||
],
|
||||
numbers: [0x1f49a],
|
||||
group: false,
|
||||
});
|
||||
|
||||
// Similar ranges
|
||||
const items1 = createEmojiRegexItemForNumbers([
|
||||
0x1f49a, 0x1f49c, 0x1f49b, 0x1f89a, 0x1f89b, 0x1f89c,
|
||||
]);
|
||||
delete (items1 as unknown as Record<string, unknown>).items;
|
||||
expect(items1).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '[\\uD83D\\uD83E][\\uDC9A-\\uDC9C]',
|
||||
numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89c],
|
||||
group: false,
|
||||
});
|
||||
|
||||
// Mismatched ranges
|
||||
const items2 = createEmojiRegexItemForNumbers([
|
||||
0x1f49a, 0x1f49c, 0x1f49b, 0x1f89a, 0x1f89b, 0x1f89e,
|
||||
]);
|
||||
delete (items2 as unknown as Record<string, unknown>).sets;
|
||||
expect(items2).toEqual({
|
||||
type: 'set',
|
||||
regex: '\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]',
|
||||
numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89e],
|
||||
group: false,
|
||||
});
|
||||
|
||||
// Mix
|
||||
const items3 = createEmojiRegexItemForNumbers([
|
||||
0x2763, 0x2765, 0x1f49a, 0x1f49c, 0x1f49b, 0x1f89a, 0x1f89b,
|
||||
0x1f89e, 0x2764,
|
||||
]);
|
||||
delete (items3 as unknown as Record<string, unknown>).sets;
|
||||
expect(items3).toEqual({
|
||||
type: 'set',
|
||||
regex: '[\\u2763-\\u2765]|\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]',
|
||||
numbers: [
|
||||
0x2763, 0x2764, 0x2765, 0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a,
|
||||
0x1f89b, 0x1f89e,
|
||||
],
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('Numbers sequence', () => {
|
||||
// UTF-16: cannot be sequence
|
||||
expect(createRegexForNumbersSequence([0x2763])).toEqual(
|
||||
createUTF16EmojiRegexItem([0x2763])
|
||||
);
|
||||
|
||||
// UTF-32
|
||||
expect(createRegexForNumbersSequence([0x1f49a])).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '\\uD83D\\uDC9A',
|
||||
numbers: [0x1f49a],
|
||||
items: [
|
||||
createUTF16EmojiRegexItem([0xd83d]),
|
||||
createUTF16EmojiRegexItem([0xdc9a]),
|
||||
],
|
||||
group: false,
|
||||
});
|
||||
|
||||
// Variation
|
||||
expect(createRegexForNumbersSequence([0x1f49a, 0xfe0f])).toEqual({
|
||||
type: 'sequence',
|
||||
regex: '\\uD83D\\uDC9A\\uFE0F?',
|
||||
items: [
|
||||
createUTF16EmojiRegexItem([0xd83d]),
|
||||
createUTF16EmojiRegexItem([0xdc9a]),
|
||||
createOptionalEmojiRegexItem(
|
||||
createUTF16EmojiRegexItem([0xfe0f])
|
||||
),
|
||||
],
|
||||
group: false,
|
||||
});
|
||||
|
||||
expect(createRegexForNumbersSequence([0x1f49a, 0xfe0f], false)).toEqual(
|
||||
{
|
||||
type: 'sequence',
|
||||
regex: '\\uD83D\\uDC9A\\uFE0F',
|
||||
items: [
|
||||
createUTF16EmojiRegexItem([0xd83d]),
|
||||
createUTF16EmojiRegexItem([0xdc9a]),
|
||||
createUTF16EmojiRegexItem([0xfe0f]),
|
||||
],
|
||||
group: false,
|
||||
}
|
||||
);
|
||||
|
||||
// Variation only
|
||||
expect(createRegexForNumbersSequence([0xfe0f])).toEqual(
|
||||
createOptionalEmojiRegexItem(createUTF16EmojiRegexItem([0xfe0f]))
|
||||
);
|
||||
});
|
||||
|
||||
it('Optimising set', () => {
|
||||
// Mix of numbers
|
||||
expect(
|
||||
optimiseNumbersSet(
|
||||
createSetEmojiRegexItem([
|
||||
// Mandatory
|
||||
createUTF16EmojiRegexItem([0x2000]),
|
||||
createUTF16EmojiRegexItem([0x2001]),
|
||||
createEmojiRegexItemForNumbers([0x1f932]),
|
||||
// Optional
|
||||
createOptionalEmojiRegexItem(
|
||||
createUTF16EmojiRegexItem([0x2100])
|
||||
),
|
||||
createOptionalEmojiRegexItem(
|
||||
createEmojiRegexItemForNumbers([0x1f91d])
|
||||
),
|
||||
])
|
||||
)
|
||||
).toEqual(
|
||||
createSetEmojiRegexItem([
|
||||
createOptionalEmojiRegexItem(
|
||||
createEmojiRegexItemForNumbers([0x1f91d, 0x2100])
|
||||
),
|
||||
createEmojiRegexItemForNumbers([0x2000, 0x2001, 0x1f932]),
|
||||
])
|
||||
);
|
||||
|
||||
// Duplicate optional and mandatory numbers
|
||||
expect(
|
||||
optimiseNumbersSet(
|
||||
createSetEmojiRegexItem([
|
||||
// Mandatory
|
||||
createUTF16EmojiRegexItem([0x2000]),
|
||||
createUTF16EmojiRegexItem([0x2001]),
|
||||
createEmojiRegexItemForNumbers([0x1f932]),
|
||||
// Optional
|
||||
createOptionalEmojiRegexItem(
|
||||
createUTF16EmojiRegexItem([0x2001, 0x2002])
|
||||
),
|
||||
createOptionalEmojiRegexItem(
|
||||
createEmojiRegexItemForNumbers([0x1f91d])
|
||||
),
|
||||
])
|
||||
)
|
||||
).toEqual(
|
||||
createSetEmojiRegexItem([
|
||||
createOptionalEmojiRegexItem(
|
||||
createEmojiRegexItemForNumbers([0x1f91d, 0x2001, 0x2002])
|
||||
),
|
||||
createEmojiRegexItemForNumbers([0x2000, 0x1f932]),
|
||||
])
|
||||
);
|
||||
});
|
||||
});
|
443
packages/utils/tests/emoji-regex-similar-items-test.ts
Normal file
443
packages/utils/tests/emoji-regex-similar-items-test.ts
Normal file
@ -0,0 +1,443 @@
|
||||
/* eslint-disable @typescript-eslint/no-non-null-assertion */
|
||||
import { splitUTF32Number } from '../lib/emoji/convert';
|
||||
import {
|
||||
createOptionalEmojiRegexItem,
|
||||
createSequenceEmojiRegexItem,
|
||||
createSetEmojiRegexItem,
|
||||
createUTF16EmojiRegexItem,
|
||||
SequenceEmojiItemRegex,
|
||||
} from '../lib/emoji/regex/base';
|
||||
import {
|
||||
createEmojiRegexItemForNumbers,
|
||||
createRegexForNumbersSequence,
|
||||
} from '../lib/emoji/regex/numbers';
|
||||
import {
|
||||
findSimilarRegexItemSequences,
|
||||
mergeSimilarItemsInSet,
|
||||
mergeSimilarRegexItemSequences,
|
||||
} from '../lib/emoji/regex/similar';
|
||||
|
||||
describe('Similar chunks of regex', () => {
|
||||
it('Nothing in common', () => {
|
||||
// Nothing in common
|
||||
expect(
|
||||
findSimilarRegexItemSequences([
|
||||
createRegexForNumbersSequence([0x1234, 0x2345]),
|
||||
])
|
||||
).toBeUndefined();
|
||||
|
||||
expect(
|
||||
findSimilarRegexItemSequences([
|
||||
createEmojiRegexItemForNumbers([0x1234]),
|
||||
createOptionalEmojiRegexItem(
|
||||
createEmojiRegexItemForNumbers([0x1234])
|
||||
),
|
||||
])
|
||||
).toBeUndefined();
|
||||
|
||||
expect(
|
||||
findSimilarRegexItemSequences([
|
||||
createEmojiRegexItemForNumbers([0x1234]),
|
||||
// Match is in middle of sequence
|
||||
createRegexForNumbersSequence([0x1230, 0x1234, 0x1235]),
|
||||
])
|
||||
).toBeUndefined();
|
||||
});
|
||||
|
||||
it('Simple match', () => {
|
||||
const items = [
|
||||
createEmojiRegexItemForNumbers([0x1234]),
|
||||
createRegexForNumbersSequence([0x1234, 0x1235]),
|
||||
createRegexForNumbersSequence([0xfe0f]),
|
||||
];
|
||||
const merge = findSimilarRegexItemSequences(items);
|
||||
expect(merge).toEqual({
|
||||
score: 6,
|
||||
sequences: [
|
||||
{
|
||||
type: 'start',
|
||||
slices: [
|
||||
{
|
||||
index: 0,
|
||||
slice: 'full',
|
||||
},
|
||||
{
|
||||
index: 1,
|
||||
slice: 1,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
const sequence = merge?.sequences[0];
|
||||
if (!sequence) {
|
||||
throw new Error('Unexpected undefined sequence');
|
||||
}
|
||||
|
||||
// Apply
|
||||
const set = createSetEmojiRegexItem(
|
||||
mergeSimilarRegexItemSequences(items, sequence)
|
||||
);
|
||||
|
||||
expect(set).toEqual({
|
||||
type: 'set',
|
||||
regex: '\\u1234\\u1235?|\\uFE0F?',
|
||||
sets: [
|
||||
createSequenceEmojiRegexItem([
|
||||
items[0],
|
||||
createOptionalEmojiRegexItem(
|
||||
createUTF16EmojiRegexItem([0x1235])
|
||||
),
|
||||
]),
|
||||
items[2],
|
||||
],
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('Range of numbers', () => {
|
||||
const items = [
|
||||
createRegexForNumbersSequence([0x1f91d, 0x1f3fb]),
|
||||
createRegexForNumbersSequence([0x1f91d, 0x1f3fc]),
|
||||
createRegexForNumbersSequence([0x1f91d, 0x1f3fd]),
|
||||
createRegexForNumbersSequence([0x1f91d, 0x1f3fe]),
|
||||
createRegexForNumbersSequence([0x1f91d, 0x1f3ff]),
|
||||
];
|
||||
const merge = findSimilarRegexItemSequences(items);
|
||||
expect(merge).toEqual({
|
||||
score: 72,
|
||||
sequences: [
|
||||
{
|
||||
type: 'start',
|
||||
slices: [
|
||||
{
|
||||
index: 0,
|
||||
slice: 3,
|
||||
},
|
||||
{
|
||||
index: 1,
|
||||
slice: 3,
|
||||
},
|
||||
{
|
||||
index: 2,
|
||||
slice: 3,
|
||||
},
|
||||
{
|
||||
index: 3,
|
||||
slice: 3,
|
||||
},
|
||||
{
|
||||
index: 4,
|
||||
slice: 3,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
const sequence = merge?.sequences[0];
|
||||
if (!sequence) {
|
||||
throw new Error('Unexpected undefined sequence');
|
||||
}
|
||||
|
||||
// Apply
|
||||
const set = createSetEmojiRegexItem(
|
||||
mergeSimilarRegexItemSequences(
|
||||
items,
|
||||
sequence,
|
||||
mergeSimilarItemsInSet
|
||||
)
|
||||
);
|
||||
|
||||
const commonChunk = (items[0] as SequenceEmojiItemRegex).items.slice(
|
||||
0,
|
||||
3
|
||||
);
|
||||
expect(set).toEqual({
|
||||
type: 'set',
|
||||
regex: '\\uD83E\\uDD1D\\uD83C[\\uDFFB-\\uDFFF]',
|
||||
sets: [
|
||||
createSequenceEmojiRegexItem([
|
||||
...commonChunk,
|
||||
createUTF16EmojiRegexItem([
|
||||
0xdffb, 0xdffc, 0xdffd, 0xdffe, 0xdfff,
|
||||
]),
|
||||
]),
|
||||
],
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('Multiple matches', () => {
|
||||
const items = [
|
||||
createEmojiRegexItemForNumbers([0x1234]),
|
||||
createRegexForNumbersSequence([0x1234, 0x1235]),
|
||||
createEmojiRegexItemForNumbers([0x1235]),
|
||||
];
|
||||
const merge = findSimilarRegexItemSequences(items);
|
||||
expect(merge).toEqual({
|
||||
score: 6,
|
||||
sequences: [
|
||||
{
|
||||
type: 'start',
|
||||
slices: [
|
||||
{
|
||||
index: 0,
|
||||
slice: 'full',
|
||||
},
|
||||
{
|
||||
index: 1,
|
||||
slice: 1,
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
type: 'end',
|
||||
slices: [
|
||||
{
|
||||
index: 1,
|
||||
slice: 1,
|
||||
},
|
||||
{
|
||||
index: 2,
|
||||
slice: 'full',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const sequence = merge?.sequences[0];
|
||||
if (!sequence) {
|
||||
throw new Error('Unexpected undefined sequence');
|
||||
}
|
||||
|
||||
// Apply first merge only
|
||||
const set = createSetEmojiRegexItem(
|
||||
mergeSimilarRegexItemSequences(items, sequence)
|
||||
);
|
||||
|
||||
expect(set).toEqual({
|
||||
type: 'set',
|
||||
regex: '\\u1234\\u1235?|\\u1235',
|
||||
sets: [
|
||||
createSequenceEmojiRegexItem([
|
||||
items[0],
|
||||
createOptionalEmojiRegexItem(items[2]),
|
||||
]),
|
||||
items[2],
|
||||
],
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('Extra number', () => {
|
||||
const items = [
|
||||
createRegexForNumbersSequence([0x1f64f]),
|
||||
createRegexForNumbersSequence([0x1f64f, 0x1f3fb]),
|
||||
];
|
||||
const merge = findSimilarRegexItemSequences(items);
|
||||
expect(merge).toEqual({
|
||||
score: 12,
|
||||
sequences: [
|
||||
{
|
||||
type: 'start',
|
||||
slices: [
|
||||
{
|
||||
index: 0,
|
||||
slice: 'full',
|
||||
},
|
||||
{
|
||||
index: 1,
|
||||
slice: 2,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const sequence = merge?.sequences[0];
|
||||
if (!sequence) {
|
||||
throw new Error('Unexpected undefined sequence');
|
||||
}
|
||||
|
||||
// Apply merge
|
||||
const set = createSetEmojiRegexItem(
|
||||
mergeSimilarRegexItemSequences(items, sequence)
|
||||
);
|
||||
expect(set).toEqual({
|
||||
type: 'set',
|
||||
regex: '\\uD83D\\uDE4F(?:\\uD83C\\uDFFB)?',
|
||||
sets: [
|
||||
createSequenceEmojiRegexItem([
|
||||
...items[0].items,
|
||||
createOptionalEmojiRegexItem(
|
||||
createRegexForNumbersSequence(
|
||||
splitUTF32Number(0x1f3fb)!
|
||||
)
|
||||
),
|
||||
]),
|
||||
],
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('Multiple matches', () => {
|
||||
const items = [
|
||||
createEmojiRegexItemForNumbers([0x1234]),
|
||||
createRegexForNumbersSequence([0x1234, 0x1235]),
|
||||
createEmojiRegexItemForNumbers([0x1235]),
|
||||
];
|
||||
const merge = findSimilarRegexItemSequences(items);
|
||||
expect(merge).toEqual({
|
||||
score: 6,
|
||||
sequences: [
|
||||
{
|
||||
type: 'start',
|
||||
slices: [
|
||||
{
|
||||
index: 0,
|
||||
slice: 'full',
|
||||
},
|
||||
{
|
||||
index: 1,
|
||||
slice: 1,
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
type: 'end',
|
||||
slices: [
|
||||
{
|
||||
index: 1,
|
||||
slice: 1,
|
||||
},
|
||||
{
|
||||
index: 2,
|
||||
slice: 'full',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const sequence = merge?.sequences[0];
|
||||
if (!sequence) {
|
||||
throw new Error('Unexpected undefined sequence');
|
||||
}
|
||||
|
||||
// Apply first merge only
|
||||
const set = createSetEmojiRegexItem(
|
||||
mergeSimilarRegexItemSequences(items, sequence)
|
||||
);
|
||||
|
||||
expect(set).toEqual({
|
||||
type: 'set',
|
||||
regex: '\\u1234\\u1235?|\\u1235',
|
||||
sets: [
|
||||
createSequenceEmojiRegexItem([
|
||||
items[0],
|
||||
createOptionalEmojiRegexItem(items[2]),
|
||||
]),
|
||||
items[2],
|
||||
],
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('Complex sequence', () => {
|
||||
const items = [
|
||||
// First 3 elements match, also last 2 elements create variations
|
||||
createRegexForNumbersSequence([
|
||||
0x1faf1, 0x1f3fb, 0x200d, 0x1faf2, 0x1f3fc,
|
||||
]),
|
||||
createRegexForNumbersSequence([
|
||||
0x1faf1, 0x1f3fb, 0x200d, 0x1faf1, 0x1f3fd,
|
||||
]),
|
||||
createRegexForNumbersSequence([
|
||||
0x1faf1, 0x1f3fb, 0x200d, 0x1faf1, 0x1f3fc,
|
||||
]),
|
||||
createRegexForNumbersSequence([
|
||||
0x1faf1, 0x1f3fb, 0x200d, 0x1faf2, 0x1f3fd,
|
||||
]),
|
||||
// Variation
|
||||
createRegexForNumbersSequence([0x1f64f]),
|
||||
createRegexForNumbersSequence([0x1f64f, 0x1f3fb]),
|
||||
];
|
||||
|
||||
const merge = findSimilarRegexItemSequences(items);
|
||||
expect(merge).toEqual({
|
||||
score: 108,
|
||||
sequences: [
|
||||
{
|
||||
type: 'start',
|
||||
slices: [
|
||||
{
|
||||
index: 0,
|
||||
slice: 6,
|
||||
},
|
||||
{
|
||||
index: 1,
|
||||
slice: 6,
|
||||
},
|
||||
{
|
||||
index: 2,
|
||||
slice: 6,
|
||||
},
|
||||
{
|
||||
index: 3,
|
||||
slice: 6,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const sequence = merge?.sequences[0];
|
||||
if (!sequence) {
|
||||
throw new Error('Unexpected undefined sequence');
|
||||
}
|
||||
|
||||
// Apply first merge only
|
||||
const set = createSetEmojiRegexItem(
|
||||
mergeSimilarRegexItemSequences(items, sequence)
|
||||
);
|
||||
|
||||
const slicedSequence = (items[0] as SequenceEmojiItemRegex).items.slice(
|
||||
0,
|
||||
6
|
||||
);
|
||||
const slicedSet = createSetEmojiRegexItem([
|
||||
createSequenceEmojiRegexItem(
|
||||
(items[0] as SequenceEmojiItemRegex).items.slice(6)
|
||||
),
|
||||
createSequenceEmojiRegexItem(
|
||||
(items[1] as SequenceEmojiItemRegex).items.slice(6)
|
||||
),
|
||||
createSequenceEmojiRegexItem(
|
||||
(items[2] as SequenceEmojiItemRegex).items.slice(6)
|
||||
),
|
||||
createSequenceEmojiRegexItem(
|
||||
(items[3] as SequenceEmojiItemRegex).items.slice(6)
|
||||
),
|
||||
]);
|
||||
expect(slicedSet.regex).toBe(
|
||||
// Test mix separately to see if it is correct instead of parsing whole regex
|
||||
'\\uDEF1\\uD83C\\uDFFC|\\uDEF1\\uD83C\\uDFFD|\\uDEF2\\uD83C\\uDFFC|\\uDEF2\\uD83C\\uDFFD'
|
||||
);
|
||||
expect(set).toEqual({
|
||||
type: 'set',
|
||||
regex:
|
||||
// last 2 items (set items are sorted alphabetically),
|
||||
// 6 numbers from common chunks, grouped mix
|
||||
'\\uD83D\\uDE4F|\\uD83D\\uDE4F\\uD83C\\uDFFB|\\uD83E\\uDEF1\\uD83C\\uDFFB\\u200D\\uD83E(?:' +
|
||||
slicedSet.regex +
|
||||
')',
|
||||
sets: [
|
||||
items[4],
|
||||
items[5],
|
||||
createSequenceEmojiRegexItem([...slicedSequence, slicedSet]),
|
||||
],
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
});
|
224
packages/utils/tests/emoji-tree-test.ts
Normal file
224
packages/utils/tests/emoji-tree-test.ts
Normal file
@ -0,0 +1,224 @@
|
||||
/* eslint-disable @typescript-eslint/no-non-null-assertion */
|
||||
import { getEmojiSequenceFromString } from '../lib/emoji/cleanup';
|
||||
import { createRegexForNumbersSequence } from '../lib/emoji/regex/numbers';
|
||||
import { createEmojisTree, parseEmojiTree } from '../lib/emoji/regex/tree';
|
||||
|
||||
/**
 * Tests for building a tree from emoji sequences and for converting
 * that tree to a regular expression.
 */
describe('Emoji regex tree', () => {
	/**
	 * Parse a list of space separated hexadecimal strings into sequences
	 */
	const toSequences = (codes: string[]) =>
		codes.map((code) => getEmojiSequenceFromString(code));

	/**
	 * Expected tree item without children that ends a sequence
	 */
	const leaf = (...sequence: number[]) => ({
		regex: createRegexForNumbersSequence(sequence),
		end: true,
	});

	/**
	 * Expected children: 0x1faf2 followed by each of the listed code points
	 */
	const secondPart = (lastNumbers: number[]) =>
		lastNumbers.map((last) => leaf(0x1faf2, last));

	it('Creating simple tree', () => {
		const tree = createEmojisTree(
			toSequences([
				'1F3C1',
				'1F3F3',
				'1F3F3 FE0F',
				'1F3F4 200D 2620 FE0F',
				'1F3F4 200D 2620',
			])
		);

		// Only the last item has children: it is not marked as the end of a sequence
		const expectedTree = [
			leaf(0x1f3c1),
			leaf(0x1f3f3),
			leaf(0x1f3f3, 0xfe0f),
			{
				regex: createRegexForNumbersSequence([0x1f3f4]),
				children: [leaf(0x2620, 0xfe0f), leaf(0x2620)],
			},
		];
		expect(tree).toEqual(expectedTree);

		const { regex } = parseEmojiTree(tree);
		expect(regex).toEqual(
			'\\uD83C(?:(?:\\uDFF3|\\uDFF4\\u200D\\u2620)\\uFE0F?|[\\uDFC1\\uDFF3])'
		);
	});

	it('Creating complex tree', () => {
		const tree = createEmojisTree(
			toSequences([
				'1FAF1 1F3FB 200D 1FAF2 1F3FC',
				'1FAF1 1F3FB 200D 1FAF2 1F3FD',
				'1FAF1 1F3FB 200D 1FAF2 1F3FE',
				'1FAF1 1F3FB 200D 1FAF2 1F3FF',
				'1FAF1 1F3FC 200D 1FAF2 1F3FB',
				'1FAF1 1F3FC 200D 1FAF2 1F3FD',
				'1FAF1 1F3FC 200D 1FAF2 1F3FE',
				'1FAF1 1F3FC 200D 1FAF2 1F3FF',
				'1FAF1 1F3FB',
			])
		);

		const expectedTree = [
			{
				// '1FAF1 1F3FB' is in the input list on its own, so this item is also an end
				...leaf(0x1faf1, 0x1f3fb),
				children: secondPart([0x1f3fc, 0x1f3fd, 0x1f3fe, 0x1f3ff]),
			},
			{
				// '1FAF1 1F3FC' is not in the input list on its own: no `end`
				regex: createRegexForNumbersSequence([0x1faf1, 0x1f3fc]),
				children: secondPart([0x1f3fb, 0x1f3fd, 0x1f3fe, 0x1f3ff]),
			},
		];
		expect(tree).toEqual(expectedTree);

		// Expected regex, split into parts by nesting depth
		const expectedRegex = [
			'\\uD83E\\uDEF1\\uD83C',
			// depth: 1
			'(?:\\uDFFB',
			// depth: 2
			'(?:\\u200D\\uD83E\\uDEF2\\uD83C',
			// depth: 3
			'[\\uDFFC-\\uDFFF]',
			// depth: 2
			')?',
			// depth: 1
			'|\\uDFFC\\u200D\\uD83E\\uDEF2\\uD83C',
			// depth: 2
			'[\\uDFFB\\uDFFD-\\uDFFF]',
			// depth: 1
			')',
		].join('');
		expect(parseEmojiTree(tree).regex).toEqual(expectedRegex);
	});

	it('Creating complex optimisable tree', () => {
		const tree = createEmojisTree(
			toSequences([
				'1FAF1 1F3FB 200D 1FAF2 1F3FC',
				'1FAF1 1F3FB 200D 1FAF2 1F3FD',
				'1FAF1 1F3FB 200D 1FAF2 1F3FE',
				'1FAF1 1F3FB 200D 1FAF2 1F3FF',
				'1FAF1 1F3FC 200D 1FAF2 1F3FC',
				'1FAF1 1F3FC 200D 1FAF2 1F3FD',
				'1FAF1 1F3FC 200D 1FAF2 1F3FE',
				'1FAF1 1F3FC 200D 1FAF2 1F3FF',
				'1FAF1 1F3FB',
				'1FAF1 1F3FC',
			])
		);

		// Both top level items are ends and have the same list of children
		const sharedTones = [0x1f3fc, 0x1f3fd, 0x1f3fe, 0x1f3ff];
		const expectedTree = [
			{
				...leaf(0x1faf1, 0x1f3fb),
				children: secondPart(sharedTones),
			},
			{
				...leaf(0x1faf1, 0x1f3fc),
				children: secondPart(sharedTones),
			},
		];
		expect(tree).toEqual(expectedTree);

		// NOTE(review): regex assertion below is disabled; presumably merging of
		// items with identical children is not implemented yet - confirm before enabling
		// expect(parseEmojiTree(tree).regex).toEqual(
		// 	'\\uD83E\\uDEF1\\uD83C(?:\\uDFFB|\\uDFFC)(?:\\u200D\\uD83E\\uDEF2\\uD83C[\\uDFFC-\\uDFFF])?'
		// );
	});
});
|
Loading…
Reference in New Issue
Block a user