From e443b2ae12839654d68775395cf8fab808a0dd2f Mon Sep 17 00:00:00 2001
From: Vjacheslav Trushkin
Date: Sun, 1 Jan 2023 17:40:07 +0200
Subject: [PATCH] fix(utils): bugged emoji regex

---
Review notes (text in this section is not part of the commit message):
 * Line breaks and indentation of this patch were restored; whitespace of
   context lines should be checked against the target tree before applying.
 * tests/emoji-regex-find-test.ts: fetchEmojiTestData() now awaits
   Response.text(). The previous `.text().toString()` stringified the pending
   promise, so "[object Promise]" was cached and the full emoji test was
   skipped whenever the fixture was not already cached. The return type is
   spelled out as Promise<string | undefined>.

 packages/utils/package.json                   |   2 +-
 packages/utils/src/emoji/regex/similar.ts     |   7 +-
 packages/utils/src/emoji/regex/tree.ts        |   4 +-
 packages/utils/tests/emoji-regex-find-test.ts | 133 +++++++++++++++++-
 .../tests/emoji-regex-similar-items-test.ts   |  40 ++++++
 packages/utils/tests/emoji-regex-test.ts      |  56 +++++---
 packages/utils/tests/emoji-tree-test.ts       |  53 +++++++
 7 files changed, 271 insertions(+), 24 deletions(-)

diff --git a/packages/utils/package.json b/packages/utils/package.json
index 3be5d99..f879edc 100644
--- a/packages/utils/package.json
+++ b/packages/utils/package.json
@@ -3,7 +3,7 @@
 	"type": "module",
 	"description": "Common functions for working with Iconify icon sets used by various packages.",
 	"author": "Vjacheslav Trushkin",
-	"version": "2.0.6",
+	"version": "2.0.7",
 	"license": "MIT",
 	"bugs": "https://github.com/iconify/iconify/issues",
 	"homepage": "https://iconify.design/",
diff --git a/packages/utils/src/emoji/regex/similar.ts b/packages/utils/src/emoji/regex/similar.ts
index 1f7df92..215c47f 100644
--- a/packages/utils/src/emoji/regex/similar.ts
+++ b/packages/utils/src/emoji/regex/similar.ts
@@ -222,13 +222,12 @@ export function mergeSimilarRegexItemSequences(
 					length = 1;
 				}
 			} else {
-				length = slice;
-
 				if (item.type !== 'sequence') {
 					throw new Error(
 						`Unexpected partial match for type "${item.type}"`
 					);
 				}
+				length = type === 'start' ? slice : item.items.length - slice;
 
 				// Copy remaining chunks
 				differentSequences.push(
@@ -267,7 +266,9 @@ export function mergeSimilarRegexItemSequences(
 			sequence =
 				type === 'start'
 					? commonItem.items.slice(0, longestMatch)
-					: commonItem.items.slice(longestMatch);
+					: commonItem.items.slice(
+							commonItem.items.length - longestMatch
+					  );
 		}
 
 		// Merge other chunks
diff --git a/packages/utils/src/emoji/regex/tree.ts b/packages/utils/src/emoji/regex/tree.ts
index 9daa36a..127e774 100644
--- a/packages/utils/src/emoji/regex/tree.ts
+++ b/packages/utils/src/emoji/regex/tree.ts
@@ -156,7 +156,9 @@ export function parseEmojiTree(items: TreeItem[]): EmojiItemRegex {
 		if (parsedItems.length === 1) {
 			return parsedItems[0];
 		}
-		return mergeSimilarItemsInSet(createSetEmojiRegexItem(parsedItems));
+		const set = createSetEmojiRegexItem(parsedItems);
+		const result = mergeSimilarItemsInSet(set);
+		return result;
 	}
 
 	function parseItemChildren(item: TreeItem): ParsedTreeItem {
diff --git a/packages/utils/tests/emoji-regex-find-test.ts b/packages/utils/tests/emoji-regex-find-test.ts
index 4509a40..314ef98 100644
--- a/packages/utils/tests/emoji-regex-find-test.ts
+++ b/packages/utils/tests/emoji-regex-find-test.ts
@@ -1,10 +1,58 @@
-import { createOptimisedRegex } from '../lib/emoji/regex/create';
+import { readFile, writeFile, unlink } from 'node:fs/promises';
+import { parseEmojiTestFile } from '../lib/emoji/test/parse';
+import { emojiVersion } from '../lib/emoji/data';
+import {
+	createOptimisedRegex,
+	createOptimisedRegexForEmojiSequences,
+} from '../lib/emoji/regex/create';
 import {
 	getEmojiMatchesInText,
 	sortEmojiMatchesInText,
 } from '../lib/emoji/replace/find';
+import { getQualifiedEmojiVariations } from '../lib/emoji/test/variations';
+import { getEmojiSequenceString } from '../lib/emoji/format';
 
 describe('Finding emojis in text', () => {
+	async function fetchEmojiTestData(): Promise<string | undefined> {
+		// Fetch emojis, cache it
+		const source = `tests/fixtures/download-emoji-${emojiVersion}.txt`;
+
+		let data: string | undefined;
+		try {
+			data = await readFile(source, 'utf8');
+		} catch {
+			//
+		}
+
+		if (!data) {
+			// `Response.text()` returns a promise: it must be awaited,
+			// otherwise `toString()` yields "[object Promise]" and that
+			// placeholder is cached instead of the test file
+			const response = await fetch(
+				`https://unicode.org/Public/emoji/${emojiVersion}/emoji-test.txt`
+			);
+			data = await response.text();
+			await writeFile(source, data, 'utf8');
+		}
+
+		// Test content, unlink cache on failure
+		if (data.indexOf(`# Version: ${emojiVersion}`) === -1) {
+			try {
+				await unlink(source);
+			} catch {
+				//
+			}
+			return;
+		}
+		return data;
+	}
+
+	let data: string | undefined;
+
+	beforeAll(async () => {
+		data = await fetchEmojiTestData();
+	});
+
 	it('Simple regex', () => {
 		const regexValue = createOptimisedRegex([
 			'1F600',
@@ -325,4 +373,87 @@ describe('Finding emojis in text', () => {
 			},
 		]);
 	});
+
+	it('Finding all test emojis', () => {
+		if (!data) {
+			console.warn('Test skipped: test data is not available');
+			return;
+		}
+
+		// Parse test data
+		const testData = parseEmojiTestFile(data);
+		const sequences = Object.values(testData).map(({ sequence }) => {
+			return {
+				sequence,
+			};
+		});
+
+		// Get all icons
+		const iconsList = getQualifiedEmojiVariations(sequences, testData);
+
+		// Get regex
+		const regexValue = createOptimisedRegexForEmojiSequences(
+			iconsList.map((item) => item.sequence)
+		);
+		const regex = new RegExp(regexValue, 'g');
+
+		sequences.forEach((sequence) => {
+			const text = sequence.sequence
+				.map((code) => String.fromCodePoint(code))
+				.join('');
+
+			// Test finding match
+			const result = getEmojiMatchesInText(regex, text);
+
+			// Must have only 1 item
+			if (result.length !== 1) {
+				console.log(
+					getEmojiSequenceString(sequence.sequence),
+					`(\\u${getEmojiSequenceString(sequence.sequence, {
+						format: 'utf-16',
+						separator: '\\u',
+						case: 'upper',
+					})})`,
+					text
+				);
+				result.forEach((match) => {
+					const sequence: number[] = [];
+					for (const codePoint of match.match) {
+						const num = codePoint.codePointAt(0) as number;
+						sequence.push(num);
+					}
+					console.log(
+						getEmojiSequenceString(sequence),
+						`(\\u${getEmojiSequenceString(sequence, {
+							format: 'utf-16',
+							separator: '\\u',
+							case: 'upper',
+						})})`
+					);
+				});
+				console.log(result);
+				expect(result.length).toBe(1);
+			}
+
+			const firstMatch = result[0];
+			const resultSequence = [];
+			for (const codePoint of firstMatch.match) {
+				const num = codePoint.codePointAt(0) as number;
+				resultSequence.push(num);
+			}
+
+			if (resultSequence.length !== sequence.sequence.length) {
+				console.log(
+					getEmojiSequenceString(sequence.sequence),
+					`(\\u${getEmojiSequenceString(sequence.sequence, {
+						format: 'utf-16',
+						separator: '\\u',
+						case: 'upper',
+					})})`,
+					result
+				);
+			}
+			expect(resultSequence).toEqual(sequence.sequence);
+		});
+	});
 });
diff --git a/packages/utils/tests/emoji-regex-similar-items-test.ts b/packages/utils/tests/emoji-regex-similar-items-test.ts
index 1b5b96e..d154a27 100644
--- a/packages/utils/tests/emoji-regex-similar-items-test.ts
+++ b/packages/utils/tests/emoji-regex-similar-items-test.ts
@@ -446,4 +446,44 @@ describe('Similar chunks of regex', () => {
 			group: false,
 		});
 	});
+
+	it('Same end match', () => {
+		const items = [
+			createRegexForNumbersSequence([128139, 8205, 129489, 127996]),
+			createRegexForNumbersSequence([129489, 127996]),
+		];
+
+		const merge = findSimilarRegexItemSequences(items);
+		expect(merge).toEqual({
+			score: 24,
+			sequences: [
+				{
+					type: 'end',
+					slices: [
+						{
+							index: 0,
+							slice: 3,
+						},
+						{
+							index: 1,
+							slice: 'full',
+						},
+					],
+				},
+			],
+		});
+
+		const sequence = merge?.sequences[0];
+		if (!sequence) {
+			throw new Error('Unexpected undefined sequence');
+		}
+
+		// Merge items
+		const merged = mergeSimilarRegexItemSequences(items, sequence);
+
+		expect(merged.length).toBe(1);
+		expect(merged[0].regex).toBe(
+			'(?:\\uD83D\\uDC8B\\u200D)?\\uD83E\\uDDD1\\uD83C\\uDFFC'
+		);
+	});
 });
diff --git a/packages/utils/tests/emoji-regex-test.ts b/packages/utils/tests/emoji-regex-test.ts
index 160a99a..b958d0c 100644
--- a/packages/utils/tests/emoji-regex-test.ts
+++ b/packages/utils/tests/emoji-regex-test.ts
@@ -1,6 +1,18 @@
-import { createOptimisedRegex } from '../lib/emoji/regex/create';
+import {
+	getEmojiSequenceFromString,
+	getSequenceFromEmojiStringOrKeyword,
+} from '../lib/emoji/cleanup';
+import {
+	createOptimisedRegex,
+	createOptimisedRegexForEmojiSequences,
+} from '../lib/emoji/regex/create';
 
 describe('Emoji regex matching', () => {
+	function code(value: string): string {
+		const sequence = getSequenceFromEmojiStringOrKeyword(value);
+		return sequence.map((code) => String.fromCodePoint(code)).join('');
+	}
+
 	it('Simple regex', () => {
 		const regexValue = createOptimisedRegex(['1F600', '1F603', '1F604']);
 
@@ -50,29 +62,17 @@ Tabby cat: :tabby_cat:
 
 		const matches = `
 E0.6 dashing away: ${String.fromCodePoint(0x1f4a8)}
-E13.1 face exhaling: ${
-			String.fromCodePoint(0x1f62e) +
-			String.fromCodePoint(0x200d) +
-			String.fromCodePoint(0x1f4a8)
-		}
+E13.1 face exhaling: ${code('1f62e-200d-1f4a8')}
 E1.0 face with open mouth: ${String.fromCodePoint(0x1f62e)}
-E0.6 smiling face: ${
-			String.fromCodePoint(0x263a) + String.fromCodePoint(0xfe0f)
-		} (icon)
+E0.6 smiling face: ${code('263a-fe0f')} (icon)
 E0.6 smiling face: ${String.fromCodePoint(0x263a)} (text)
 `.match(new RegExp(regexValue, 'g'));
 
 		expect(matches?.length).toBe(5);
-		expect(matches?.[0]).toBe(String.fromCodePoint(0x1f4a8));
-		expect(matches?.[1]).toBe(
-			String.fromCodePoint(0x1f62e) +
-				String.fromCodePoint(0x200d) +
-				String.fromCodePoint(0x1f4a8)
-		);
+		expect(matches?.[0]).toBe(code('1f4a8'));
+		expect(matches?.[1]).toBe(code('1f62e 200d 1f4a8'));
 		expect(matches?.[2]).toBe(String.fromCodePoint(0x1f62e));
-		expect(matches?.[3]).toBe(
-			String.fromCodePoint(0x263a) + String.fromCodePoint(0xfe0f)
-		);
+		expect(matches?.[3]).toBe(code('263a fe0f'));
 		expect(matches?.[4]).toBe(String.fromCodePoint(0x263a));
 	});
 
@@ -116,4 +116,24 @@ E1.0 waving hand: medium skin tone: ${
 			String.fromCodePoint(0x1f44b) + String.fromCodePoint(0x1f3fd)
 		);
 	});
+
+	it('Bugged mix of sequences', () => {
+		const fullList = [
+			'1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc',
+			'1f9d1-1f3fb-200d-2764-fe0f-200d-1f9d1-1f3fc',
+		];
+
+		const regexValue = createOptimisedRegexForEmojiSequences(
+			fullList.map((code) => getEmojiSequenceFromString(code))
+		);
+
+		const matches = code(
+			'1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc'
+		).match(new RegExp(regexValue, 'g'));
+
+		expect(matches?.length).toBe(1);
+		expect(matches?.[0]).toBe(
+			code('1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc')
+		);
+	});
 });
diff --git a/packages/utils/tests/emoji-tree-test.ts b/packages/utils/tests/emoji-tree-test.ts
index 413db74..0f37f8a 100644
--- a/packages/utils/tests/emoji-tree-test.ts
+++ b/packages/utils/tests/emoji-tree-test.ts
@@ -221,4 +221,57 @@ describe('Emoji regex tree', () => {
 		// 	'\\uD83E\\uDEF1\\uD83C(?:\\uDFFB|\\uDFFC)(?:\\u200D\\uD83E\\uDEF2\\uD83C[\\uDFFC-\\uDFFF])?'
 		// );
 	});
+
+	it('Multiple children with same last child', () => {
+		const numbers = [
+			getEmojiSequenceFromString(
+				'1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc'
+			),
+			getEmojiSequenceFromString(
+				'1f9d1-1f3fb-200d-2764-fe0f-200d-1f9d1-1f3fc'
+			),
+		];
+		const tree = createEmojisTree(numbers);
+		expect(tree).toEqual([
+			{
+				regex: createRegexForNumbersSequence([0x1f9d1, 0x1f3fb]),
+				children: [
+					{
+						regex: createRegexForNumbersSequence([0x2764, 0xfe0f]),
+						children: [
+							{
+								regex: createRegexForNumbersSequence([0x1f48b]),
+								children: [
+									{
+										regex: createRegexForNumbersSequence([
+											0x1f9d1, 0x1f3fc,
+										]),
+										end: true,
+									},
+								],
+							},
+							{
+								regex: createRegexForNumbersSequence([
+									0x1f9d1, 0x1f3fc,
+								]),
+								end: true,
+							},
+						],
+					},
+				],
+			},
+		]);
+
+		// 'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D-D83D-DC8B-200D-D83E-DDD1-D83C-DFFC' +
+		// 'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D-D83E-DDD1-D83C-DFFC' =
+		// 'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D' + 'D83D-DC8B-200D'? + 'D83E-DDD1-D83C-DFFC'
+		expect(parseEmojiTree(tree).regex).toEqual(
+			// First common chunk
+			'\\uD83E\\uDDD1\\uD83C\\uDFFB\\u200D\\u2764\\uFE0F?\\u200D' +
+				// Optional chunk
+				'(?:\\uD83D\\uDC8B\\u200D)?' +
+				// Last common chunk
+				'\\uD83E\\uDDD1\\uD83C\\uDFFC'
+		);
+	});
 });