diff --git a/packages/utils/package.json b/packages/utils/package.json index 43b4956..2e8abab 100644 --- a/packages/utils/package.json +++ b/packages/utils/package.json @@ -102,6 +102,26 @@ "import": "./lib/customisations/rotate.mjs", "types": "./lib/customisations/rotate.d.ts" }, + "./lib/emoji/regex/base": { + "require": "./lib/emoji/regex/base.cjs", + "import": "./lib/emoji/regex/base.mjs", + "types": "./lib/emoji/regex/base.d.ts" + }, + "./lib/emoji/regex/numbers": { + "require": "./lib/emoji/regex/numbers.cjs", + "import": "./lib/emoji/regex/numbers.mjs", + "types": "./lib/emoji/regex/numbers.d.ts" + }, + "./lib/emoji/regex/similar": { + "require": "./lib/emoji/regex/similar.cjs", + "import": "./lib/emoji/regex/similar.mjs", + "types": "./lib/emoji/regex/similar.d.ts" + }, + "./lib/emoji/regex/tree": { + "require": "./lib/emoji/regex/tree.cjs", + "import": "./lib/emoji/regex/tree.mjs", + "types": "./lib/emoji/regex/tree.d.ts" + }, "./lib/emoji/cleanup": { "require": "./lib/emoji/cleanup.cjs", "import": "./lib/emoji/cleanup.mjs", diff --git a/packages/utils/src/emoji/format.ts b/packages/utils/src/emoji/format.ts index 67ae800..225399f 100644 --- a/packages/utils/src/emoji/format.ts +++ b/packages/utils/src/emoji/format.ts @@ -3,7 +3,7 @@ import { convertEmojiSequenceToUTF32, } from './convert'; -interface UnicodeOptions { +export interface UnicodeFormattingOptions { // Prefix before each character '\\u' prefix: string; @@ -23,7 +23,7 @@ interface UnicodeOptions { throwOnError: boolean; } -const defaultUnicodeOptions: UnicodeOptions = { +const defaultUnicodeOptions: UnicodeFormattingOptions = { prefix: '', separator: '', case: 'lower', @@ -35,7 +35,10 @@ const defaultUnicodeOptions: UnicodeOptions = { /** * Convert number to string */ -function convert(sequence: number[], options: UnicodeOptions): string { +function convert( + sequence: number[], + options: UnicodeFormattingOptions +): string { const prefix = options.prefix; const func = options.case === 
'upper' ? 'toUpperCase' : 'toLowerCase'; @@ -60,7 +63,7 @@ function convert(sequence: number[], options: UnicodeOptions): string { */ export function getEmojiUnicodeString( code: number, - options: Partial = {} + options: Partial = {} ): string { return convert([code], { ...defaultUnicodeOptions, @@ -68,7 +71,7 @@ export function getEmojiUnicodeString( }); } -const defaultSequenceOptions: UnicodeOptions = { +const defaultSequenceOptions: UnicodeFormattingOptions = { ...defaultUnicodeOptions, separator: '-', }; @@ -78,7 +81,7 @@ const defaultSequenceOptions: UnicodeOptions = { */ export function getEmojiSequenceString( sequence: number[], - options: Partial = {} + options: Partial = {} ): string { return convert(sequence, { ...defaultSequenceOptions, @@ -86,7 +89,7 @@ export function getEmojiSequenceString( }); } -const regexOptions: UnicodeOptions = { +const regexOptions: UnicodeFormattingOptions = { prefix: '\\u', separator: '', case: 'upper', @@ -108,7 +111,7 @@ export function emojiSequenceToRegex( }); } -const keywordOptions: UnicodeOptions = { +const keywordOptions: UnicodeFormattingOptions = { prefix: '', separator: '-', case: 'lower', diff --git a/packages/utils/src/emoji/regex/base.ts b/packages/utils/src/emoji/regex/base.ts new file mode 100644 index 0000000..3cc54d3 --- /dev/null +++ b/packages/utils/src/emoji/regex/base.ts @@ -0,0 +1,423 @@ +import { getEmojiUnicodeString, UnicodeFormattingOptions } from '../format'; + +/** + * Regex in item + */ +interface BaseEmojiItemRegex { + // Regex type: + // 'utf16' -> utf16 number(s) + // 'sequence' -> sequence, not wrapped in `(?:` + `)` + // requires wrapping, unless marked as wrapped + // 'set' -> list of options, not wrapped in `(?:` + `)` + // requires wrapping + type: 'utf16' | 'sequence' | 'set' | 'optional'; + + // Regex + regex: string; + + // True if regex can be treated as a group (does not require wrapping in `(?:` + `)`) + group: boolean; +} + +interface EmojiItemRegexWithNumbers { + // Numbers 
in regex, set if regex represents set of numbers. Allows + // creation of number ranges when combining multiple regex items + // Cannot be empty array + numbers?: number[]; +} + +// Numbers +export interface UTF16EmojiItemRegex + extends BaseEmojiItemRegex, + Required { + type: 'utf16'; + + // Always grouped + group: true; + + // `numbers` is required +} + +// Sequence +type SequenceEmojiItemRegexItem = + | UTF16EmojiItemRegex + | SetEmojiItemRegex + | OptionalEmojiItemRegex; +export interface SequenceEmojiItemRegex + extends BaseEmojiItemRegex, + EmojiItemRegexWithNumbers { + type: 'sequence'; + + // Items in sequence. Any type except another sequence + items: SequenceEmojiItemRegexItem[]; +} + +// Set +export type SetEmojiItemRegexItem = + | UTF16EmojiItemRegex + | SequenceEmojiItemRegex + | OptionalEmojiItemRegex; +export interface SetEmojiItemRegex + extends BaseEmojiItemRegex, + EmojiItemRegexWithNumbers { + type: 'set'; + + // Items in set. Any type except another set + sets: SetEmojiItemRegexItem[]; +} + +// Optional +type OptionalEmojiItemRegexItem = + | UTF16EmojiItemRegex + | SequenceEmojiItemRegex + | SetEmojiItemRegex; +export interface OptionalEmojiItemRegex extends BaseEmojiItemRegex { + type: 'optional'; + + // Wrapped item. 
Any type except another optional item + item: OptionalEmojiItemRegexItem; + + // Always grouped + group: true; +} + +export type EmojiItemRegex = + | UTF16EmojiItemRegex + | SequenceEmojiItemRegex + | SetEmojiItemRegex + | OptionalEmojiItemRegex; + +/** + * Options for converting number to string + */ +const numberToStringOptions: Partial = { + prefix: '\\u', + separator: '', + case: 'upper', + format: 'utf-16', + add0: true, +}; + +/** + * Convert number to string + */ +function toString(number: number): string { + return getEmojiUnicodeString(number, numberToStringOptions); +} + +/** + * Typescript stuff + */ +// eslint-disable-next-line @typescript-eslint/no-unused-vars +function assertNever(v: never) { + // Empty function that should never be called +} + +/** + * Wrap regex in group + */ +export function wrapRegexInGroup(regex: string): string { + return '(?:' + regex + ')'; +} + +/** + * Update UTF16 item, return regex + */ +export function updateUTF16EmojiRegexItem(item: UTF16EmojiItemRegex): string { + const numbers = item.numbers; + if (numbers.length === 1) { + // 1 number + const num = numbers[0]; + return (item.regex = toString(num)); + } + + // Multiple numbers + numbers.sort((a, b) => a - b); + const chars: string[] = []; + interface Range { + start: number; + last: number; + numbers: number[]; + } + let range: Range | null = null; + const addRange = () => { + if (range) { + const { start, last, numbers } = range; + range = null; + if (last > start + 1) { + // More than 2 items + chars.push(toString(start) + '-' + toString(last)); + } else { + for (let i = 0; i < numbers.length; i++) { + chars.push(toString(numbers[i])); + } + } + } + }; + + for (let i = 0; i < numbers.length; i++) { + const num = numbers[i]; + if (range) { + if (range.last === num) { + // Duplicate + continue; + } + if (range.last === num - 1) { + // Add to existing range + range.numbers.push(num); + range.last = num; + continue; + } + } + + // Not in range: start new one + 
addRange(); + range = { + start: num, + last: num, + numbers: [num], + }; + } + addRange(); + + if (!chars.length) { + throw new Error('Unexpected empty range'); + } + return (item.regex = '[' + chars.join('') + ']'); +} + +/** + * Create UTF-16 regex + */ +export function createUTF16EmojiRegexItem( + numbers: number[] +): UTF16EmojiItemRegex { + const result: UTF16EmojiItemRegex = { + type: 'utf16', + regex: '', + numbers, + group: true, + }; + updateUTF16EmojiRegexItem(result); + return result; +} + +/** + * Update sequence regex. Does not update group + */ +export function updateSequenceEmojiRegexItem( + item: SequenceEmojiItemRegex +): string { + return (item.regex = item.items + .map((childItem) => { + if (!childItem.group && childItem.type === 'set') { + return wrapRegexInGroup(childItem.regex); + } + return childItem.regex; + }) + .join('')); +} + +/** + * Create sequence regex + */ +export function createSequenceEmojiRegexItem( + sequence: EmojiItemRegex[], + numbers?: number[] +): SequenceEmojiItemRegex { + // Merge items + let items: SequenceEmojiItemRegexItem[] = []; + sequence.forEach((item) => { + if (item.type === 'sequence') { + items = items.concat(item.items); + } else { + items.push(item); + } + }); + + // Generate item + if (!items.length) { + throw new Error('Empty sequence'); + } + const result: SequenceEmojiItemRegex = { + type: 'sequence', + items, + regex: '', + group: false, + }; + + if (sequence.length === 1) { + const firstItem = sequence[0]; + result.group = firstItem.group; + if (firstItem.type !== 'optional') { + const numbers = firstItem.numbers; + if (numbers) { + result.numbers = numbers; + } + } + } + + if (numbers) { + result.numbers = numbers; + } + + // Update regex + updateSequenceEmojiRegexItem(result); + return result; +} + +/** + * Update set regex and group + */ +export function updateSetEmojiRegexItem(item: SetEmojiItemRegex): string { + if (item.sets.length === 1) { + // 1 item + const firstItem = item.sets[0]; + 
item.group = firstItem.group; + return (item.regex = firstItem.regex); + } + + // Multiple items + item.group = false; + return (item.regex = item.sets + .map((childItem) => childItem.regex) + .join('|')); +} + +/** + * Create set regex + */ +export function createSetEmojiRegexItem( + set: EmojiItemRegex[] +): SetEmojiItemRegex { + let sets: SetEmojiItemRegexItem[] = []; + let numbers: number[] | null = []; + + set.forEach((item) => { + if (item.type === 'set') { + sets = sets.concat(item.sets); + } else { + sets.push(item); + } + + // Copy numbers + if (numbers) { + if (item.type === 'optional' || !item.numbers) { + numbers = null; + } else { + numbers = [...numbers, ...item.numbers]; + } + } + }); + + // Sort items to guarantee same results regardless of order + sets.sort((a, b) => a.regex.localeCompare(b.regex)); + + // Create item + const result: SetEmojiItemRegex = { + type: 'set', + sets, + regex: '', + group: false, + }; + if (numbers) { + result.numbers = numbers; + } + + if (set.length === 1) { + const firstItem = set[0]; + result.group = firstItem.group; + } + + updateSetEmojiRegexItem(result); + return result; +} + +/** + * Update optional regex + */ +export function updateOptionalEmojiRegexItem( + item: OptionalEmojiItemRegex +): string { + const childItem = item.item; + const regex = + (childItem.group + ? 
childItem.regex + : wrapRegexInGroup(childItem.regex)) + '?'; + return (item.regex = regex); +} + +/** + * Create optional item + */ +export function createOptionalEmojiRegexItem( + item: EmojiItemRegex +): OptionalEmojiItemRegex { + if (item.type === 'optional') { + return item; + } + + const result: OptionalEmojiItemRegex = { + type: 'optional', + item, + regex: '', + group: true, + }; + updateOptionalEmojiRegexItem(result); + return result; +} + +/** + * Clone item + */ +export function cloneEmojiRegexItem( + item: T, + shallow = false +): T { + const result = { + ...item, + } as unknown as EmojiItemRegex; + + // Copy numbers + if (result.type !== 'optional' && result.numbers) { + result.numbers = [...result.numbers]; + } + + // Clone lists + switch (result.type) { + case 'utf16': + // Nothing to do + break; + + case 'sequence': + if (shallow) { + result.items = [...result.items]; + } else { + result.items = result.items.map((item) => + cloneEmojiRegexItem(item, false) + ); + } + break; + + case 'set': + if (shallow) { + result.sets = [...result.sets]; + } else { + result.sets = result.sets.map((item) => + cloneEmojiRegexItem(item, false) + ); + } + break; + + case 'optional': + if (!shallow) { + result.item = cloneEmojiRegexItem(result.item, false); + } + break; + + default: + assertNever(result); + } + + return result as unknown as T; +} diff --git a/packages/utils/src/emoji/regex/numbers.ts b/packages/utils/src/emoji/regex/numbers.ts new file mode 100644 index 0000000..4bd5280 --- /dev/null +++ b/packages/utils/src/emoji/regex/numbers.ts @@ -0,0 +1,246 @@ +import { splitUTF32Number } from '../convert'; +import { + createOptionalEmojiRegexItem, + createSequenceEmojiRegexItem, + createSetEmojiRegexItem, + createUTF16EmojiRegexItem, + EmojiItemRegex, + OptionalEmojiItemRegex, + SequenceEmojiItemRegex, + SetEmojiItemRegex, + UTF16EmojiItemRegex, +} from './base'; +import { vs16Emoji } from '../data'; + +/** + * Create regex item for set of numbers + */ +export 
function createEmojiRegexItemForNumbers( + numbers: number[] +): UTF16EmojiItemRegex | SequenceEmojiItemRegex | SetEmojiItemRegex { + // Separate UTF-16 and UTF-32 + interface UTF32FirstNumber { + first: number; + second: number[]; + numbers: number[]; + } + const utf32: UTF32FirstNumber[] = []; + const utf16: number[] = []; + + numbers.sort((a, b) => a - b); + + let lastNumber: number | undefined; + for (let i = 0; i < numbers.length; i++) { + const number = numbers[i]; + if (number === lastNumber) { + continue; + } + lastNumber = number; + + const split = splitUTF32Number(number); + if (!split) { + utf16.push(number); + continue; + } + + const [first, second] = split; + const item = utf32.find((item) => item.first === first); + if (item) { + item.second.push(second); + item.numbers.push(number); + } else { + utf32.push({ + first, + second: [second], + numbers: [number], + }); + } + } + + const results: (UTF16EmojiItemRegex | SequenceEmojiItemRegex)[] = []; + + // Merge UTF-16 + if (utf16.length) { + results.push(createUTF16EmojiRegexItem(utf16)); + } + + // Merge UTF-32 + if (utf32.length) { + // Create map of first and second chunks, joining by common second chunks + interface UTF32Item { + second: UTF16EmojiItemRegex; + first: number[]; + numbers: number[]; + } + const utf32Set: UTF32Item[] = []; + + for (let i = 0; i < utf32.length; i++) { + const item = utf32[i]; + const secondRegex = createUTF16EmojiRegexItem(item.second); + + // Find matching elements + const listItem = utf32Set.find( + (item) => item.second.regex === secondRegex.regex + ); + if (listItem) { + // Found multiple items with the same last set + listItem.first.push(item.first); + listItem.numbers = [...listItem.numbers, ...item.numbers]; + } else { + utf32Set.push({ + second: secondRegex, + first: [item.first], + numbers: [...item.numbers], + }); + } + } + + // Create regex for each set + for (let i = 0; i < utf32Set.length; i++) { + const item = utf32Set[i]; + const firstRegex = 
createUTF16EmojiRegexItem(item.first); + const secondRegex = item.second; + + // Generate regex, add numbers list for reference + results.push( + createSequenceEmojiRegexItem( + [firstRegex, secondRegex], + item.numbers + ) + ); + } + } + + return results.length === 1 ? results[0] : createSetEmojiRegexItem(results); +} + +/** + * Create sequence of numbers + */ +export function createRegexForNumbersSequence( + numbers: number[], + optionalVariations = true +): SequenceEmojiItemRegex | UTF16EmojiItemRegex | OptionalEmojiItemRegex { + const items: (UTF16EmojiItemRegex | OptionalEmojiItemRegex)[] = []; + for (let i = 0; i < numbers.length; i++) { + const num = numbers[i]; + const split = splitUTF32Number(num); + if (!split) { + // UTF-16 number + const item = createUTF16EmojiRegexItem([num]); + if (optionalVariations && num === vs16Emoji) { + items.push(createOptionalEmojiRegexItem(item)); + } else { + items.push(item); + } + } else { + // UTF-32 number + items.push(createUTF16EmojiRegexItem([split[0]])); + items.push(createUTF16EmojiRegexItem([split[1]])); + } + } + + if (items.length === 1) { + // Only 1 item + return items[0]; + } + + const result = createSequenceEmojiRegexItem(items); + if (numbers.length === 1 && items[0].type === 'utf16') { + // Copy numbers if utf-16 or utf-32 sequence + result.numbers = [...numbers]; + } + return result; +} + +/** + * Attempt to optimise numbers in a set + */ +export function optimiseNumbersSet(set: SetEmojiItemRegex): EmojiItemRegex { + interface Match { + numbers: number[]; + items: EmojiItemRegex[]; + } + const mandatoryMatches: Match = { + numbers: [], + items: [], + }; + const optionalMatches: Match = { + numbers: [], + items: [], + }; + + const filteredItems: EmojiItemRegex[] = set.sets.filter((item) => { + if (item.type === 'optional') { + const parentItem = item.item; + if (parentItem.numbers) { + optionalMatches.items.push(item); + optionalMatches.numbers = optionalMatches.numbers.concat( + parentItem.numbers + ); + 
return false; + } + return true; + } + + if (item.numbers) { + mandatoryMatches.items.push(item); + mandatoryMatches.numbers = mandatoryMatches.numbers.concat( + item.numbers + ); + return false; + } + return true; + }); + + // Check if there is something to optimise + if (mandatoryMatches.items.length + optionalMatches.items.length < 2) { + return set; + } + + // Remove duplicate numbers + const optionalNumbers = new Set(optionalMatches.numbers); + let foundMatches = false; + mandatoryMatches.numbers = mandatoryMatches.numbers.filter((number) => { + if (optionalNumbers.has(number)) { + foundMatches = true; + return false; + } + return true; + }); + + // Check mandatory numbers + if (mandatoryMatches.items.length) { + if (!foundMatches && mandatoryMatches.items.length === 1) { + // 1 unchanged item + filteredItems.push(mandatoryMatches.items[0]); + } else if (mandatoryMatches.numbers.length) { + // Merge items + filteredItems.push( + createEmojiRegexItemForNumbers(mandatoryMatches.numbers) + ); + } + } + + // Check optional numbers + switch (optionalMatches.items.length) { + case 0: + break; + + case 1: + filteredItems.push(optionalMatches.items[0]); + break; + + default: + filteredItems.push( + createOptionalEmojiRegexItem( + createEmojiRegexItemForNumbers(optionalMatches.numbers) + ) + ); + } + + // Return regex + return filteredItems.length === 1 + ? 
filteredItems[0] + : createSetEmojiRegexItem(filteredItems); +} diff --git a/packages/utils/src/emoji/regex/similar.ts b/packages/utils/src/emoji/regex/similar.ts new file mode 100644 index 0000000..1f7df92 --- /dev/null +++ b/packages/utils/src/emoji/regex/similar.ts @@ -0,0 +1,372 @@ +import { + cloneEmojiRegexItem, + createOptionalEmojiRegexItem, + createSequenceEmojiRegexItem, + createSetEmojiRegexItem, + EmojiItemRegex, + SetEmojiItemRegex, +} from './base'; +import { optimiseNumbersSet } from './numbers'; + +type SlicePosition = 'start' | 'end'; +type SliceValue = number | 'full'; + +/** + * Slice of sequence + */ +interface SimilarRegexItemSlice { + // Index of item in sequences list + index: number; + + // Start (for 'end' slices) or end (for 'start' slices) of slice + // 'full' if nothing to slice + slice: SliceValue; +} + +/** + * Similar sequence + */ +interface SimilarRegexItemSequence { + // Where common part is found + // Common chunks can exist only at start or end of sequence, not in middle + type: SlicePosition; + + // Slices. Key is index in items list, value is start (for 'end' slices) + // or end (for 'start' slices) of slice, 'full' for full items + slices: SimilarRegexItemSlice[]; +} + +/** + * Result of findSimilarRegexItemSequences() + */ +interface SimilarRegexItemSequenceResult { + // Replacement score: how many characters will be saved by merging items + score: number; + + // Sequences that match it + sequences: SimilarRegexItemSequence[]; +} + +/** + * Typescript stuff + */ +// eslint-disable-next-line @typescript-eslint/no-unused-vars +function assertNever(v: never) { + // Empty function that should never be called +} + +/** + * Find similar item sequences + * + * Returns sequence(s) with highest score. Only one of results should be + * applied to items. If there are multiple sequences, clone items list, + * attempt to apply each sequence, run further optimisations on each fork + * and see which one returns better result. 
+ * + * Returns undefined if no common sequences found + */ +export function findSimilarRegexItemSequences( + items: EmojiItemRegex[] +): SimilarRegexItemSequenceResult | undefined { + interface MapItem { + score: number; + slices: SimilarRegexItemSlice[]; + } + + // Regex at start and end of sequences + // Key = regex combination + const startRegex = Object.create(null) as Record; + const endRegex = Object.create(null) as Record; + + const addMapItem = ( + target: Record, + index: number, + regex: string, + slice: SliceValue + ) => { + if (!target[regex]) { + // New item + target[regex] = { + // Start with 0. One item will remain after replacement + score: 0, + slices: [ + { + index, + slice, + }, + ], + }; + return; + } + + // Existing item + const item = target[regex]; + item.score += regex.length; + item.slices.push({ + index, + slice, + }); + }; + + // Create list of all possible sequences + for (let index = 0; index < items.length; index++) { + const baseItem = items[index]; + switch (baseItem.type) { + case 'optional': + case 'utf16': { + // Nothing to split + addMapItem(startRegex, index, baseItem.regex, 'full'); + addMapItem(endRegex, index, baseItem.regex, 'full'); + break; + } + + case 'sequence': { + // Add as full item + addMapItem(startRegex, index, baseItem.regex, 'full'); + addMapItem(endRegex, index, baseItem.regex, 'full'); + + // Add chunks + const sequence = baseItem.items; + for (let i = 1; i < sequence.length; i++) { + const startSequence = createSequenceEmojiRegexItem( + sequence.slice(0, i) + ); + addMapItem(startRegex, index, startSequence.regex, i); + + const endSequence = createSequenceEmojiRegexItem( + sequence.slice(i) + ); + addMapItem(endRegex, index, endSequence.regex, i); + } + + break; + } + + case 'set': + throw new Error('Unexpected set within a set'); + + default: + assertNever(baseItem); + } + } + + // Create list of usable matches + let result: SimilarRegexItemSequenceResult | undefined; + + const checkResults = ( + target: 
Record, + type: SlicePosition + ) => { + for (const regex in target) { + const item = target[regex]; + if (!item.score) { + continue; + } + if (!result || result.score < item.score) { + // New highest score + result = { + score: item.score, + sequences: [ + { + type, + slices: item.slices, + }, + ], + }; + continue; + } + if (result.score === item.score) { + // Same score + result.sequences.push({ + type, + slices: item.slices, + }); + } + } + }; + checkResults(startRegex, 'start'); + checkResults(endRegex, 'end'); + return result; +} + +/** + * Merge similar sequences: returns merged item followed by unaffected items + * + * Accepts callback to run optimisation on created subset + */ +export function mergeSimilarRegexItemSequences( + items: EmojiItemRegex[], + merge: SimilarRegexItemSequence, + optimise?: (set: SetEmojiItemRegex) => EmojiItemRegex +): EmojiItemRegex[] { + const { type, slices } = merge; + + // Get common chunks + const indexes: Set = new Set(); + let hasFullSequence = false; + let longestMatch = 0; + let longestMatchIndex = -1; + const differentSequences: EmojiItemRegex[][] = []; + + for (let i = 0; i < slices.length; i++) { + const { index, slice } = slices[i]; + const item = items[index]; + + let length: number; + if (slice === 'full') { + // Full match + hasFullSequence = true; + if (item.type === 'sequence') { + length = item.items.length; + } else { + length = 1; + } + } else { + if (item.type !== 'sequence') { + throw new Error( + `Unexpected partial match for type "${item.type}"` + ); + } + // Count common items from the side where chunks match + length = type === 'start' ? slice : item.items.length - slice; + + // Copy remaining chunks + differentSequences.push( + type === 'start' + ? 
item.items.slice(slice) + : item.items.slice(0, slice) + ); + } + + if (length > longestMatch) { + longestMatchIndex = index; + longestMatch = length; + } + + indexes.add(index); + } + + // Found common chunk + if (longestMatch < 1 || longestMatchIndex < 0) { + throw new Error('Cannot find common sequence'); + } + + // Get longest common item as sequence + const commonItem = items[longestMatchIndex]; + let sequence: EmojiItemRegex[]; + if (commonItem.type !== 'sequence') { + // Full match + if (longestMatch !== 1) { + throw new Error( + 'Something went wrong. Cannot have long match in non-sequence' + ); + } + sequence = [commonItem]; + } else { + // Sequence: take first or last `longestMatch` items (always >= 1 here) + sequence = + type === 'start' + ? commonItem.items.slice(0, longestMatch) + : commonItem.items.slice(-longestMatch); + } + + // Merge other chunks + const setItems: EmojiItemRegex[] = []; + for (let i = 0; i < differentSequences.length; i++) { + const list = differentSequences[i]; + if (list.length === 1) { + // 1 item + setItems.push(list[0]); + } else { + // create sequence + setItems.push(createSequenceEmojiRegexItem(list)); + } + } + + // Create set, optimise it, make it optional + const set = createSetEmojiRegexItem(setItems); + let mergedChunk: EmojiItemRegex = + set.sets.length === 1 + ? // Do not run callback if only 1 item + set.sets[0] + : optimise + ? // Run callback to optimise it + optimise(set) + : // Use set as is + set; + if (hasFullSequence) { + // Wrap in optional + mergedChunk = createOptionalEmojiRegexItem(mergedChunk); + } + + // Add set to sequence + sequence[type === 'start' ? 
'push' : 'unshift'](mergedChunk); + + // Create result by combining merged item and remaining items + const results: EmojiItemRegex[] = [ + createSequenceEmojiRegexItem(sequence), + ...items.filter((item, index) => !indexes.has(index)), + ]; + return results; +} + +/** + * Merge similar items + */ +export function mergeSimilarItemsInSet(set: SetEmojiItemRegex): EmojiItemRegex { + // Check for numbers + const updatedSet = optimiseNumbersSet(set); + if (updatedSet.type !== 'set') { + return updatedSet; + } + set = updatedSet; + + // Attempt to find common stuff + let merges: SimilarRegexItemSequenceResult | undefined; + while ((merges = findSimilarRegexItemSequences(set.sets))) { + const sequences = merges.sequences; + if (sequences.length === 1) { + // Only 1 sequence + const merged = mergeSimilarRegexItemSequences( + set.sets.map((item) => cloneEmojiRegexItem(item, true)), + sequences[0], + mergeSimilarItemsInSet + ); + if (merged.length === 1) { + // No longer a set + return merged[0]; + } + + // New set + set = createSetEmojiRegexItem(merged); + continue; + } + + // Multiple merges + let newItem: EmojiItemRegex | undefined; + for (let i = 0; i < sequences.length; i++) { + const merged = mergeSimilarRegexItemSequences( + set.sets.map((item) => cloneEmojiRegexItem(item, true)), + sequences[i], + mergeSimilarItemsInSet + ); + + const mergedItem = + merged.length === 1 + ? 
merged[0] + : createSetEmojiRegexItem(merged); + if (!newItem || mergedItem.regex.length < newItem.regex.length) { + newItem = mergedItem; + } + } + if (!newItem) { + throw new Error('Empty sequences list'); + } + if (newItem.type !== 'set') { + return newItem; + } + set = newItem; + } + + return set; +} diff --git a/packages/utils/src/emoji/regex/tree.ts b/packages/utils/src/emoji/regex/tree.ts new file mode 100644 index 0000000..9daa36a --- /dev/null +++ b/packages/utils/src/emoji/regex/tree.ts @@ -0,0 +1,182 @@ +import { + createOptionalEmojiRegexItem, + createSequenceEmojiRegexItem, + createSetEmojiRegexItem, + createUTF16EmojiRegexItem, + EmojiItemRegex, +} from './base'; +import { splitEmojiSequences } from '../cleanup'; +import { convertEmojiSequenceToUTF32 } from '../convert'; +import { createRegexForNumbersSequence } from './numbers'; +import { joinerEmoji } from '../data'; +import { mergeSimilarItemsInSet } from './similar'; + +/** + * Tree item + */ +interface TreeItem { + // Regex + regex: EmojiItemRegex; + + // True if end of sequence. If children are set, it means children are optional + end?: true; + + // Child elements, separated with 0x200d + children?: TreeItem[]; +} + +/** + * Create tree + */ +export function createEmojisTree(sequences: number[][]): TreeItem[] { + const root: TreeItem[] = []; + + for (let i = 0; i < sequences.length; i++) { + // Convert to UTF-32 and split + const split = splitEmojiSequences( + convertEmojiSequenceToUTF32(sequences[i]) + ); + + // Get items + let parent = root; + for (let j = 0; j < split.length; j++) { + const regex = createRegexForNumbersSequence(split[j]); + + // Find item + let item: TreeItem; + const match = parent.find( + (item) => item.regex.regex === regex.regex + ); + if (!match) { + // Create new item + item = { + regex, + }; + parent.push(item); + } else { + item = match; + } + + // End? 
+ if (j === split.length - 1) { + item.end = true; + break; + } + + // Parse children + parent = item.children || (item.children = []); + } + } + + return root; +} + +/** + * Parse tree + */ +export function parseEmojiTree(items: TreeItem[]): EmojiItemRegex { + interface ParsedTreeItem { + // Regex + regex: EmojiItemRegex; + + // True if end of sequence. If children are set, it means children are optional + end: boolean; + + // Regex for merged child elements + children?: EmojiItemRegex; + } + + function mergeParsedChildren(items: ParsedTreeItem[]): EmojiItemRegex { + const parsedItems: EmojiItemRegex[] = []; + + // Find items with same 'end' and 'children' + type TreeItemsMap = Record[]>; + const mapWithoutEnd = Object.create(null) as TreeItemsMap; + const mapWithEnd = Object.create(null) as TreeItemsMap; + for (let i = 0; i < items.length; i++) { + const item = items[i]; + const children = item.children; + if (children) { + const fullItem = item as Required; + const target = item.end ? 
mapWithEnd : mapWithoutEnd; + const regex = children.regex; + if (!target[regex]) { + target[regex] = [fullItem]; + } else { + target[regex].push(fullItem); + } + } else { + // Nothing to parse + parsedItems.push(item.regex); + } + } + + // Parse all sets + [mapWithEnd, mapWithoutEnd].forEach((source) => { + for (const regex in source) { + const items = source[regex]; + const firstItem = items[0]; + + // Merge common chunk + joiner + let childSequence: EmojiItemRegex[] = [ + createUTF16EmojiRegexItem([joinerEmoji]), + firstItem.children, + ]; + if (firstItem.end) { + // Make it optional + childSequence = [ + createOptionalEmojiRegexItem( + createSequenceEmojiRegexItem(childSequence) + ), + ]; + } + + // Get remaining chunk + let mergedRegex: EmojiItemRegex; + if (items.length === 1) { + // No matches + mergedRegex = firstItem.regex; + } else { + // Merge items + mergedRegex = mergeSimilarItemsInSet( + createSetEmojiRegexItem(items.map((item) => item.regex)) + ); + } + + // Merge + const sequence = createSequenceEmojiRegexItem([ + mergedRegex, + ...childSequence, + ]); + parsedItems.push(sequence); + } + }); + + // Merge sequences + if (parsedItems.length === 1) { + return parsedItems[0]; + } + return mergeSimilarItemsInSet(createSetEmojiRegexItem(parsedItems)); + } + + function parseItemChildren(item: TreeItem): ParsedTreeItem { + const result: ParsedTreeItem = { + regex: item.regex, + end: !!item.end, + }; + + // Parse child elements + const children = item.children; + if (!children) { + return result; + } + + const parsedChildren = children.map(parseItemChildren); + result.children = mergeParsedChildren(parsedChildren); + return result; + } + + // Parse all items + const parsed = items.map(parseItemChildren); + return mergeParsedChildren(parsed); +} diff --git a/packages/utils/tests/emoji-regex-item-creation-test.ts b/packages/utils/tests/emoji-regex-item-creation-test.ts new file mode 100644 index 0000000..7947439 --- /dev/null +++ 
b/packages/utils/tests/emoji-regex-item-creation-test.ts
@@ -0,0 +1,212 @@
+import {
+	createOptionalEmojiRegexItem,
+	createSequenceEmojiRegexItem,
+	createSetEmojiRegexItem,
+	createUTF16EmojiRegexItem,
+} from '../lib/emoji/regex/base';
+
+describe('Creating chunks of regex', () => {
+	it('UTF-16 numbers', () => {
+		// Number
+		expect(createUTF16EmojiRegexItem([0x2763])).toEqual({
+			type: 'utf16',
+			regex: '\\u2763',
+			numbers: [0x2763],
+			group: true,
+		});
+
+		// Range
+		expect(createUTF16EmojiRegexItem([0x2762, 0x2764, 0x2763])).toEqual({
+			type: 'utf16',
+			regex: '[\\u2762-\\u2764]',
+			numbers: [0x2762, 0x2763, 0x2764],
+			group: true,
+		});
+
+		// Separate numbers
+		expect(createUTF16EmojiRegexItem([0x2760, 0x2764, 0xfe0f])).toEqual({
+			type: 'utf16',
+			regex: '[\\u2760\\u2764\\uFE0F]',
+			numbers: [0x2760, 0x2764, 0xfe0f],
+			group: true,
+		});
+
+		// Ranges + numbers, duplicate item
+		expect(
+			createUTF16EmojiRegexItem([
+				0x2760, 0x2762, 0x2761, 0x2765, 0x2763, 0xfe0f, 0xfe0f, 0xfe0e,
+				0x2000, 0x2001, 0x2100, 0x2102, 0x2101,
+			])
+		).toEqual({
+			type: 'utf16',
+			regex: '[\\u2000\\u2001\\u2100-\\u2102\\u2760-\\u2763\\u2765\\uFE0E\\uFE0F]',
+			numbers: [
+				0x2000, 0x2001, 0x2100, 0x2101, 0x2102, 0x2760, 0x2761, 0x2762,
+				0x2763, 0x2765, 0xfe0e, 0xfe0f, 0xfe0f,
+			],
+			group: true,
+		});
+	});
+
+	it('Sequence from numbers', () => {
+		const num1 = createUTF16EmojiRegexItem([0x2000, 0x2001]);
+		const num2 = createUTF16EmojiRegexItem([0x2000, 0x2100]);
+
+		// 1 item
+		expect(createSequenceEmojiRegexItem([num1])).toEqual({
+			type: 'sequence',
+			regex: '[\\u2000\\u2001]',
+			numbers: [0x2000, 0x2001],
+			items: [num1],
+			group: true,
+		});
+
+		// 2 numbers
+		expect(createSequenceEmojiRegexItem([num1, num2])).toEqual({
+			type: 'sequence',
+			regex: '[\\u2000\\u2001][\\u2000\\u2100]',
+			items: [num1, num2],
+			group: false,
+		});
+	});
+
+	it('Sets from numbers', () => {
+		const num1 = createUTF16EmojiRegexItem([0x2000, 0x2001]);
+		const num2 = createUTF16EmojiRegexItem([0x2000, 0x2100]);
+
+		// 1 item
+		expect(createSetEmojiRegexItem([num1])).toEqual({
+			type: 'set',
+			regex: '[\\u2000\\u2001]',
+			numbers: [0x2000, 0x2001],
+			sets: [num1],
+			group: true,
+		});
+
+		// 2 numbers
+		expect(createSetEmojiRegexItem([num1, num2])).toEqual({
+			type: 'set',
+			regex: '[\\u2000\\u2001]|[\\u2000\\u2100]',
+			numbers: [0x2000, 0x2001, 0x2000, 0x2100],
+			sets: [num1, num2],
+			group: false,
+		});
+	});
+
+	it('Optional numbers', () => {
+		const num1 = createUTF16EmojiRegexItem([0xfe0f]);
+		const num2 = createUTF16EmojiRegexItem([0xfe0e, 0xfe0f]);
+
+		// simple item
+		expect(createOptionalEmojiRegexItem(num1)).toEqual({
+			type: 'optional',
+			regex: '\\uFE0F?',
+			item: num1,
+			group: true,
+		});
+
+		// 2 numbers
+		expect(createOptionalEmojiRegexItem(num2)).toEqual({
+			type: 'optional',
+			regex: '[\\uFE0E\\uFE0F]?',
+			item: num2,
+			group: true,
+		});
+	});
+
+	it('Sequence', () => {
+		const num1 = createUTF16EmojiRegexItem([0x2000, 0x2001]);
+		const num2 = createUTF16EmojiRegexItem([0x2000, 0x2100]);
+		const fe0f = createOptionalEmojiRegexItem(
+			createUTF16EmojiRegexItem([0xfe0f])
+		);
+
+		// optional item
+		expect(createSequenceEmojiRegexItem([fe0f])).toEqual({
+			type: 'sequence',
+			regex: '\\uFE0F?',
+			items: [fe0f],
+			group: true,
+		});
+
+		const seq1 = createSequenceEmojiRegexItem([num1, fe0f]);
+		expect(seq1).toEqual({
+			type: 'sequence',
+			regex: '[\\u2000\\u2001]\\uFE0F?',
+			items: [num1, fe0f],
+			group: false,
+		});
+
+		// number + optional item + number
+		expect(createSequenceEmojiRegexItem([num1, fe0f, num2])).toEqual({
+			type: 'sequence',
+			regex: '[\\u2000\\u2001]\\uFE0F?[\\u2000\\u2100]',
+			items: [num1, fe0f, num2],
+			group: false,
+		});
+
+		// number + nested sequence
+		expect(createSequenceEmojiRegexItem([num2, seq1])).toEqual({
+			type: 'sequence',
+			regex: '[\\u2000\\u2100][\\u2000\\u2001]\\uFE0F?',
+			items: [num2, num1, fe0f],
+			group: false,
+		});
+	});
+
+	it('Mix', () => {
+		const num1 = createUTF16EmojiRegexItem([
+			0x1234, 0x1235, 0x1236, 0x1237,
+		]);
+
+		// UTF-32
+		const utf32a1 = createUTF16EmojiRegexItem([0xd83d]);
+		const utf32a2 = createUTF16EmojiRegexItem([0xdc9a]);
+		const utf32a = createSequenceEmojiRegexItem([utf32a1, utf32a2]);
+		expect(utf32a).toEqual({
+			type: 'sequence',
+			regex: '\\uD83D\\uDC9A',
+			items: [utf32a1, utf32a2],
+			group: false,
+		});
+		utf32a.numbers = [0x1f49a];
+
+		// Make it optional
+		expect(createOptionalEmojiRegexItem(utf32a)).toEqual({
+			type: 'optional',
+			regex: '(?:\\uD83D\\uDC9A)?',
+			item: utf32a,
+			group: true,
+		});
+
+		// Set of numbers
+		const set = createSetEmojiRegexItem([num1, utf32a]);
+		expect(set).toEqual({
+			type: 'set',
+			regex: '[\\u1234-\\u1237]|\\uD83D\\uDC9A',
+			sets: [num1, utf32a],
+			numbers: [0x1234, 0x1235, 0x1236, 0x1237, 0x1f49a],
+			group: false,
+		});
+
+		// Make it optional
+		expect(createOptionalEmojiRegexItem(set)).toEqual({
+			type: 'optional',
+			regex: '(?:[\\u1234-\\u1237]|\\uD83D\\uDC9A)?',
+			item: set,
+			group: true,
+		});
+
+		// Sequence with set
+		const utf16a = createUTF16EmojiRegexItem([0x2000]);
+		const utf16b = createUTF16EmojiRegexItem([0x2100]);
+		const utf16c = createUTF16EmojiRegexItem([0x2101]);
+		const set1 = createSetEmojiRegexItem([utf16b, utf16c]);
+		expect(createSequenceEmojiRegexItem([utf16a, set1])).toEqual({
+			type: 'sequence',
+			regex: '\\u2000(?:\\u2100|\\u2101)',
+			items: [utf16a, set1],
+			group: false,
+		});
+	});
+});
diff --git a/packages/utils/tests/emoji-regex-numbers-test.ts b/packages/utils/tests/emoji-regex-numbers-test.ts
new file mode 100644
index 0000000..095a5fe
--- /dev/null
+++ b/packages/utils/tests/emoji-regex-numbers-test.ts
@@ -0,0 +1,201 @@
+import {
+	createOptionalEmojiRegexItem,
+	createSetEmojiRegexItem,
+	createUTF16EmojiRegexItem,
+} from '../lib/emoji/regex/base';
+import {
+	createEmojiRegexItemForNumbers,
+	createRegexForNumbersSequence,
+	optimiseNumbersSet,
+} from '../lib/emoji/regex/numbers';
+
+describe('Creating chunks of regex for numbers', () => {
+	it('Numbers', () => {
+		// UTF-16
+		expect(createEmojiRegexItemForNumbers([0x2763])).toEqual({
+			type: 'utf16',
+			regex: '\\u2763',
+			numbers: [0x2763],
+			group: true,
+		});
+
+		expect(
+			createEmojiRegexItemForNumbers([0x2761, 0x2765, 0x2764, 0x2763])
+		).toEqual({
+			type: 'utf16',
+			regex: '[\\u2761\\u2763-\\u2765]',
+			numbers: [0x2761, 0x2763, 0x2764, 0x2765],
+			group: true,
+		});
+
+		// UTF-32
+		expect(createEmojiRegexItemForNumbers([0x1f49a])).toEqual({
+			type: 'sequence',
+			regex: '\\uD83D\\uDC9A',
+			items: [
+				{
+					type: 'utf16',
+					regex: '\\uD83D',
+					numbers: [0xd83d],
+					group: true,
+				},
+				{
+					type: 'utf16',
+					regex: '\\uDC9A',
+					numbers: [0xdc9a],
+					group: true,
+				},
+			],
+			numbers: [0x1f49a],
+			group: false,
+		});
+
+		// Similar ranges
+		const items1 = createEmojiRegexItemForNumbers([
+			0x1f49a, 0x1f49c, 0x1f49b, 0x1f89a, 0x1f89b, 0x1f89c,
+		]);
+		// Drop nested items so only the summary fields are compared
+		delete (items1 as unknown as Record<string, unknown>).items;
+		expect(items1).toEqual({
+			type: 'sequence',
+			regex: '[\\uD83D\\uD83E][\\uDC9A-\\uDC9C]',
+			numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89c],
+			group: false,
+		});
+
+		// Mismatched ranges
+		const items2 = createEmojiRegexItemForNumbers([
+			0x1f49a, 0x1f49c, 0x1f49b, 0x1f89a, 0x1f89b, 0x1f89e,
+		]);
+		// Drop nested sets so only the summary fields are compared
+		delete (items2 as unknown as Record<string, unknown>).sets;
+		expect(items2).toEqual({
+			type: 'set',
+			regex: '\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]',
+			numbers: [0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a, 0x1f89b, 0x1f89e],
+			group: false,
+		});
+
+		// Mix
+		const items3 = createEmojiRegexItemForNumbers([
+			0x2763, 0x2765, 0x1f49a, 0x1f49c, 0x1f49b, 0x1f89a, 0x1f89b,
+			0x1f89e, 0x2764,
+		]);
+		// Drop nested sets so only the summary fields are compared
+		delete (items3 as unknown as Record<string, unknown>).sets;
+		expect(items3).toEqual({
+			type: 'set',
+			regex: '[\\u2763-\\u2765]|\\uD83D[\\uDC9A-\\uDC9C]|\\uD83E[\\uDC9A\\uDC9B\\uDC9E]',
+			numbers: [
+				0x2763, 0x2764, 0x2765, 0x1f49a, 0x1f49b, 0x1f49c, 0x1f89a,
+				0x1f89b, 0x1f89e,
+			],
+			group: false,
+		});
+	});
+
+	it('Numbers sequence', () => {
+		// UTF-16: cannot be sequence
+		expect(createRegexForNumbersSequence([0x2763])).toEqual(
+			createUTF16EmojiRegexItem([0x2763])
+		);
+
+		// UTF-32
+		expect(createRegexForNumbersSequence([0x1f49a])).toEqual({
+			type: 'sequence',
+			regex: '\\uD83D\\uDC9A',
+			numbers: [0x1f49a],
+			items: [
+				createUTF16EmojiRegexItem([0xd83d]),
+				createUTF16EmojiRegexItem([0xdc9a]),
+			],
+			group: false,
+		});
+
+		// Variation
+		expect(createRegexForNumbersSequence([0x1f49a, 0xfe0f])).toEqual({
+			type: 'sequence',
+			regex: '\\uD83D\\uDC9A\\uFE0F?',
+			items: [
+				createUTF16EmojiRegexItem([0xd83d]),
+				createUTF16EmojiRegexItem([0xdc9a]),
+				createOptionalEmojiRegexItem(
+					createUTF16EmojiRegexItem([0xfe0f])
+				),
+			],
+			group: false,
+		});
+
+		expect(createRegexForNumbersSequence([0x1f49a, 0xfe0f], false)).toEqual(
+			{
+				type: 'sequence',
+				regex: '\\uD83D\\uDC9A\\uFE0F',
+				items: [
+					createUTF16EmojiRegexItem([0xd83d]),
+					createUTF16EmojiRegexItem([0xdc9a]),
+					createUTF16EmojiRegexItem([0xfe0f]),
+				],
+				group: false,
+			}
+		);
+
+		// Variation only
+		expect(createRegexForNumbersSequence([0xfe0f])).toEqual(
+			createOptionalEmojiRegexItem(createUTF16EmojiRegexItem([0xfe0f]))
+		);
+	});
+
+	it('Optimising set', () => {
+		// Mix of numbers
+		expect(
+			optimiseNumbersSet(
+				createSetEmojiRegexItem([
+					// Mandatory
+					createUTF16EmojiRegexItem([0x2000]),
+					createUTF16EmojiRegexItem([0x2001]),
+					createEmojiRegexItemForNumbers([0x1f932]),
+					// Optional
+					createOptionalEmojiRegexItem(
+						createUTF16EmojiRegexItem([0x2100])
+					),
+					createOptionalEmojiRegexItem(
+						createEmojiRegexItemForNumbers([0x1f91d])
+					),
+				])
+			)
+		).toEqual(
+			createSetEmojiRegexItem([
+				createOptionalEmojiRegexItem(
+					createEmojiRegexItemForNumbers([0x1f91d, 0x2100])
+				),
+				createEmojiRegexItemForNumbers([0x2000, 0x2001, 0x1f932]),
+			])
+		);
+
+		// Duplicate optional and mandatory numbers
+		expect(
+			optimiseNumbersSet(
+				createSetEmojiRegexItem([
+					// Mandatory
+					createUTF16EmojiRegexItem([0x2000]),
+					createUTF16EmojiRegexItem([0x2001]),
+					createEmojiRegexItemForNumbers([0x1f932]),
+					// Optional
+					createOptionalEmojiRegexItem(
+						createUTF16EmojiRegexItem([0x2001, 0x2002])
+					),
+					createOptionalEmojiRegexItem(
+						createEmojiRegexItemForNumbers([0x1f91d])
+					),
+				])
+			)
+		).toEqual(
+			createSetEmojiRegexItem([
+				createOptionalEmojiRegexItem(
+					createEmojiRegexItemForNumbers([0x1f91d, 0x2001, 0x2002])
+				),
+				createEmojiRegexItemForNumbers([0x2000, 0x1f932]),
+			])
+		);
+	});
+});
diff --git a/packages/utils/tests/emoji-regex-similar-items-test.ts b/packages/utils/tests/emoji-regex-similar-items-test.ts
new file mode 100644
index 0000000..b6c878f
--- /dev/null
+++ b/packages/utils/tests/emoji-regex-similar-items-test.ts
@@ -0,0 +1,380 @@
+/* eslint-disable @typescript-eslint/no-non-null-assertion */
+import { splitUTF32Number } from '../lib/emoji/convert';
+import {
+	createOptionalEmojiRegexItem,
+	createSequenceEmojiRegexItem,
+	createSetEmojiRegexItem,
+	createUTF16EmojiRegexItem,
+	SequenceEmojiItemRegex,
+} from '../lib/emoji/regex/base';
+import {
+	createEmojiRegexItemForNumbers,
+	createRegexForNumbersSequence,
+} from '../lib/emoji/regex/numbers';
+import {
+	findSimilarRegexItemSequences,
+	mergeSimilarItemsInSet,
+	mergeSimilarRegexItemSequences,
+} from '../lib/emoji/regex/similar';
+
+describe('Similar chunks of regex', () => {
+	it('Nothing in common', () => {
+		// Nothing in common
+		expect(
+			findSimilarRegexItemSequences([
+				createRegexForNumbersSequence([0x1234, 0x2345]),
+			])
+		).toBeUndefined();
+
+		expect(
+			findSimilarRegexItemSequences([
+				createEmojiRegexItemForNumbers([0x1234]),
+				createOptionalEmojiRegexItem(
+					createEmojiRegexItemForNumbers([0x1234])
+				),
+			])
+		).toBeUndefined();
+
+		expect(
+			findSimilarRegexItemSequences([
+				createEmojiRegexItemForNumbers([0x1234]),
+				// Match is in middle of sequence
+				createRegexForNumbersSequence([0x1230, 0x1234, 0x1235]),
+			])
+		).toBeUndefined();
+	});
+
+	it('Simple match', () => {
+		const items = [
+			createEmojiRegexItemForNumbers([0x1234]),
+			createRegexForNumbersSequence([0x1234, 0x1235]),
+			createRegexForNumbersSequence([0xfe0f]),
+		];
+		const merge = findSimilarRegexItemSequences(items);
+		expect(merge).toEqual({
+			score: 6,
+			sequences: [
+				{
+					type: 'start',
+					slices: [
+						{
+							index: 0,
+							slice: 'full',
+						},
+						{
+							index: 1,
+							slice: 1,
+						},
+					],
+				},
+			],
+		});
+		const sequence = merge?.sequences[0];
+		if (!sequence) {
+			throw new Error('Unexpected undefined sequence');
+		}
+
+		// Apply
+		const set = createSetEmojiRegexItem(
+			mergeSimilarRegexItemSequences(items, sequence)
+		);
+
+		expect(set).toEqual({
+			type: 'set',
+			regex: '\\u1234\\u1235?|\\uFE0F?',
+			sets: [
+				createSequenceEmojiRegexItem([
+					items[0],
+					createOptionalEmojiRegexItem(
+						createUTF16EmojiRegexItem([0x1235])
+					),
+				]),
+				items[2],
+			],
+			group: false,
+		});
+	});
+
+	it('Range of numbers', () => {
+		const items = [
+			createRegexForNumbersSequence([0x1f91d, 0x1f3fb]),
+			createRegexForNumbersSequence([0x1f91d, 0x1f3fc]),
+			createRegexForNumbersSequence([0x1f91d, 0x1f3fd]),
+			createRegexForNumbersSequence([0x1f91d, 0x1f3fe]),
+			createRegexForNumbersSequence([0x1f91d, 0x1f3ff]),
+		];
+		const merge = findSimilarRegexItemSequences(items);
+		expect(merge).toEqual({
+			score: 72,
+			sequences: [
+				{
+					type: 'start',
+					slices: [
+						{
+							index: 0,
+							slice: 3,
+						},
+						{
+							index: 1,
+							slice: 3,
+						},
+						{
+							index: 2,
+							slice: 3,
+						},
+						{
+							index: 3,
+							slice: 3,
+						},
+						{
+							index: 4,
+							slice: 3,
+						},
+					],
+				},
+			],
+		});
+		const sequence = merge?.sequences[0];
+		if (!sequence) {
+			throw new Error('Unexpected undefined sequence');
+		}
+
+		// Apply
+		const set = createSetEmojiRegexItem(
+			mergeSimilarRegexItemSequences(
+				items,
+				sequence,
+				mergeSimilarItemsInSet
+			)
+		);
+
+		const commonChunk = (items[0] as SequenceEmojiItemRegex).items.slice(
+			0,
+			3
+		);
+		expect(set).toEqual({
+			type: 'set',
+			regex: '\\uD83E\\uDD1D\\uD83C[\\uDFFB-\\uDFFF]',
+			sets: [
+				createSequenceEmojiRegexItem([
+					...commonChunk,
+					createUTF16EmojiRegexItem([
+						0xdffb, 0xdffc, 0xdffd, 0xdffe, 0xdfff,
+					]),
+				]),
+			],
+			group: false,
+		});
+	});
+
+	it('Multiple matches', () => {
+		const items = [
+			createEmojiRegexItemForNumbers([0x1234]),
+			createRegexForNumbersSequence([0x1234, 0x1235]),
+			createEmojiRegexItemForNumbers([0x1235]),
+		];
+		const merge = findSimilarRegexItemSequences(items);
+		expect(merge).toEqual({
+			score: 6,
+			sequences: [
+				{
+					type: 'start',
+					slices: [
+						{
+							index: 0,
+							slice: 'full',
+						},
+						{
+							index: 1,
+							slice: 1,
+						},
+					],
+				},
+				{
+					type: 'end',
+					slices: [
+						{
+							index: 1,
+							slice: 1,
+						},
+						{
+							index: 2,
+							slice: 'full',
+						},
+					],
+				},
+			],
+		});
+
+		const sequence = merge?.sequences[0];
+		if (!sequence) {
+			throw new Error('Unexpected undefined sequence');
+		}
+
+		// Apply first merge only
+		const set = createSetEmojiRegexItem(
+			mergeSimilarRegexItemSequences(items, sequence)
+		);
+
+		expect(set).toEqual({
+			type: 'set',
+			regex: '\\u1234\\u1235?|\\u1235',
+			sets: [
+				createSequenceEmojiRegexItem([
+					items[0],
+					createOptionalEmojiRegexItem(items[2]),
+				]),
+				items[2],
+			],
+			group: false,
+		});
+	});
+
+	it('Extra number', () => {
+		const items = [
+			createRegexForNumbersSequence([0x1f64f]),
+			createRegexForNumbersSequence([0x1f64f, 0x1f3fb]),
+		];
+		const merge = findSimilarRegexItemSequences(items);
+		expect(merge).toEqual({
+			score: 12,
+			sequences: [
+				{
+					type: 'start',
+					slices: [
+						{
+							index: 0,
+							slice: 'full',
+						},
+						{
+							index: 1,
+							slice: 2,
+						},
+					],
+				},
+			],
+		});
+
+		const sequence = merge?.sequences[0];
+		if (!sequence) {
+			throw new Error('Unexpected undefined sequence');
+		}
+
+		// Apply merge
+		const set = createSetEmojiRegexItem(
+			mergeSimilarRegexItemSequences(items, sequence)
+		);
+		expect(set).toEqual({
+			type: 'set',
+			regex: '\\uD83D\\uDE4F(?:\\uD83C\\uDFFB)?',
+			sets: [
+				createSequenceEmojiRegexItem([
+					...items[0].items,
+					createOptionalEmojiRegexItem(
+						createRegexForNumbersSequence(
+							splitUTF32Number(0x1f3fb)!
+						)
+					),
+				]),
+			],
+			group: false,
+		});
+	});
+
+	it('Complex sequence', () => {
+		const items = [
+			// First 3 elements match, also last 2 elements create variations
+			createRegexForNumbersSequence([
+				0x1faf1, 0x1f3fb, 0x200d, 0x1faf2, 0x1f3fc,
+			]),
+			createRegexForNumbersSequence([
+				0x1faf1, 0x1f3fb, 0x200d, 0x1faf1, 0x1f3fd,
+			]),
+			createRegexForNumbersSequence([
+				0x1faf1, 0x1f3fb, 0x200d, 0x1faf1, 0x1f3fc,
+			]),
+			createRegexForNumbersSequence([
+				0x1faf1, 0x1f3fb, 0x200d, 0x1faf2, 0x1f3fd,
+			]),
+			// Variation
+			createRegexForNumbersSequence([0x1f64f]),
+			createRegexForNumbersSequence([0x1f64f, 0x1f3fb]),
+		];
+
+		const merge = findSimilarRegexItemSequences(items);
+		expect(merge).toEqual({
+			score: 108,
+			sequences: [
+				{
+					type: 'start',
+					slices: [
+						{
+							index: 0,
+							slice: 6,
+						},
+						{
+							index: 1,
+							slice: 6,
+						},
+						{
+							index: 2,
+							slice: 6,
+						},
+						{
+							index: 3,
+							slice: 6,
+						},
+					],
+				},
+			],
+		});
+
+		const sequence = merge?.sequences[0];
+		if (!sequence) {
+			throw new Error('Unexpected undefined sequence');
+		}
+
+		// Apply first merge only
+		const set = createSetEmojiRegexItem(
+			mergeSimilarRegexItemSequences(items, sequence)
+		);
+
+		const slicedSequence = (items[0] as SequenceEmojiItemRegex).items.slice(
+			0,
+			6
+		);
+		const slicedSet = createSetEmojiRegexItem([
+			createSequenceEmojiRegexItem(
+				(items[0] as SequenceEmojiItemRegex).items.slice(6)
+			),
+			createSequenceEmojiRegexItem(
+				(items[1] as SequenceEmojiItemRegex).items.slice(6)
+			),
+			createSequenceEmojiRegexItem(
+				(items[2] as SequenceEmojiItemRegex).items.slice(6)
+			),
+			createSequenceEmojiRegexItem(
+				(items[3] as SequenceEmojiItemRegex).items.slice(6)
+			),
+		]);
+		expect(slicedSet.regex).toBe(
+			// Test mix separately to see if it is correct instead of parsing whole regex
+			'\\uDEF1\\uD83C\\uDFFC|\\uDEF1\\uD83C\\uDFFD|\\uDEF2\\uD83C\\uDFFC|\\uDEF2\\uD83C\\uDFFD'
+		);
+		expect(set).toEqual({
+			type: 'set',
+			regex:
+				// last 2 items (set items are sorted alphabetically),
+				// 6 numbers from common chunks, grouped mix
+				'\\uD83D\\uDE4F|\\uD83D\\uDE4F\\uD83C\\uDFFB|\\uD83E\\uDEF1\\uD83C\\uDFFB\\u200D\\uD83E(?:' +
+				slicedSet.regex +
+				')',
+			sets: [
+				items[4],
+				items[5],
+				createSequenceEmojiRegexItem([...slicedSequence, slicedSet]),
+			],
+			group: false,
+		});
+	});
+});
diff --git a/packages/utils/tests/emoji-tree-test.ts b/packages/utils/tests/emoji-tree-test.ts
new file mode 100644
index 0000000..f4e6799
--- /dev/null
+++ b/packages/utils/tests/emoji-tree-test.ts
@@ -0,0 +1,225 @@
+/* eslint-disable @typescript-eslint/no-non-null-assertion */
+import { getEmojiSequenceFromString } from '../lib/emoji/cleanup';
+import { createRegexForNumbersSequence } from '../lib/emoji/regex/numbers';
+import { createEmojisTree, parseEmojiTree } from '../lib/emoji/regex/tree';
+
+describe('Emoji regex tree', () => {
+	it('Creating simple tree', () => {
+		const numbers = [
+			getEmojiSequenceFromString('1F3C1'),
+			getEmojiSequenceFromString('1F3F3'),
+			getEmojiSequenceFromString('1F3F3 FE0F'),
+			getEmojiSequenceFromString('1F3F4 200D 2620 FE0F'),
+			getEmojiSequenceFromString('1F3F4 200D 2620'),
+		];
+		const tree = createEmojisTree(numbers);
+		expect(tree).toEqual([
+			{
+				regex: createRegexForNumbersSequence([0x1f3c1]),
+				end: true,
+			},
+			{
+				regex: createRegexForNumbersSequence([0x1f3f3]),
+				end: true,
+			},
+			{
+				regex: createRegexForNumbersSequence([0x1f3f3, 0xfe0f]),
+				end: true,
+			},
+			{
+				regex: createRegexForNumbersSequence([0x1f3f4]),
+				children: [
+					{
+						regex: createRegexForNumbersSequence([0x2620, 0xfe0f]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([0x2620]),
+						end: true,
+					},
+				],
+			},
+		]);
+
+		expect(parseEmojiTree(tree).regex).toEqual(
+			'\\uD83C(?:(?:\\uDFF3|\\uDFF4\\u200D\\u2620)\\uFE0F?|[\\uDFC1\\uDFF3])'
+		);
+	});
+
+	it('Creating complex tree', () => {
+		const numbers = [
+			getEmojiSequenceFromString('1FAF1 1F3FB 200D 1FAF2 1F3FC'),
+			getEmojiSequenceFromString('1FAF1 1F3FB 200D 1FAF2 1F3FD'),
+			getEmojiSequenceFromString('1FAF1 1F3FB 200D 1FAF2 1F3FE'),
+			getEmojiSequenceFromString('1FAF1 1F3FB 200D 1FAF2 1F3FF'),
+			getEmojiSequenceFromString('1FAF1 1F3FC 200D 1FAF2 1F3FB'),
+			getEmojiSequenceFromString('1FAF1 1F3FC 200D 1FAF2 1F3FD'),
+			getEmojiSequenceFromString('1FAF1 1F3FC 200D 1FAF2 1F3FE'),
+			getEmojiSequenceFromString('1FAF1 1F3FC 200D 1FAF2 1F3FF'),
+			getEmojiSequenceFromString('1FAF1 1F3FB'),
+		];
+		const tree = createEmojisTree(numbers);
+		expect(tree).toEqual([
+			{
+				regex: createRegexForNumbersSequence([0x1faf1, 0x1f3fb]),
+				end: true,
+				children: [
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fc,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fd,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fe,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3ff,
+						]),
+						end: true,
+					},
+				],
+			},
+			{
+				regex: createRegexForNumbersSequence([0x1faf1, 0x1f3fc]),
+				children: [
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fb,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fd,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fe,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3ff,
+						]),
+						end: true,
+					},
+				],
+			},
+		]);
+
+		expect(parseEmojiTree(tree).regex).toEqual(
+			'\\uD83E\\uDEF1\\uD83C' +
+				// depth: 1
+				'(?:\\uDFFB' +
+				// depth: 2
+				'(?:\\u200D\\uD83E\\uDEF2\\uD83C' +
+				// depth: 3
+				'[\\uDFFC-\\uDFFF]' +
+				// depth: 2
+				')?' +
+				// depth: 1
+				'|\\uDFFC\\u200D\\uD83E\\uDEF2\\uD83C' +
+				// depth: 2
+				'[\\uDFFB\\uDFFD-\\uDFFF]' +
+				// depth: 1
+				')'
+		);
+	});
+
+	it('Creating complex optimisable tree', () => {
+		const numbers = [
+			getEmojiSequenceFromString('1FAF1 1F3FB 200D 1FAF2 1F3FC'),
+			getEmojiSequenceFromString('1FAF1 1F3FB 200D 1FAF2 1F3FD'),
+			getEmojiSequenceFromString('1FAF1 1F3FB 200D 1FAF2 1F3FE'),
+			getEmojiSequenceFromString('1FAF1 1F3FB 200D 1FAF2 1F3FF'),
+			getEmojiSequenceFromString('1FAF1 1F3FC 200D 1FAF2 1F3FC'),
+			getEmojiSequenceFromString('1FAF1 1F3FC 200D 1FAF2 1F3FD'),
+			getEmojiSequenceFromString('1FAF1 1F3FC 200D 1FAF2 1F3FE'),
+			getEmojiSequenceFromString('1FAF1 1F3FC 200D 1FAF2 1F3FF'),
+			getEmojiSequenceFromString('1FAF1 1F3FB'),
+			getEmojiSequenceFromString('1FAF1 1F3FC'),
+		];
+		const tree = createEmojisTree(numbers);
+		expect(tree).toEqual([
+			{
+				regex: createRegexForNumbersSequence([0x1faf1, 0x1f3fb]),
+				end: true,
+				children: [
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fc,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fd,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fe,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3ff,
+						]),
+						end: true,
+					},
+				],
+			},
+			{
+				regex: createRegexForNumbersSequence([0x1faf1, 0x1f3fc]),
+				end: true,
+				children: [
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fc,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fd,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3fe,
+						]),
+						end: true,
+					},
+					{
+						regex: createRegexForNumbersSequence([
+							0x1faf2, 0x1f3ff,
+						]),
+						end: true,
+					},
+				],
+			},
+		]);
+
+		// NOTE(review): regex check is disabled, only tree shape is verified; confirm and re-enable
+		// expect(parseEmojiTree(tree).regex).toEqual(
+		// 	'\\uD83E\\uDEF1\\uD83C(?:\\uDFFB|\\uDFFC)(?:\\u200D\\uD83E\\uDEF2\\uD83C[\\uDFFC-\\uDFFF])?'
+		// );
+	});
+});