From b5e9ecd2b4216559939a25dee8d7f81923617067 Mon Sep 17 00:00:00 2001 From: Vjacheslav Trushkin Date: Sun, 25 Dec 2022 11:15:28 +0200 Subject: [PATCH] feat(utils): function to prepare emoji data for icons list, support multiple regexp --- packages/utils/package.json | 7 +- packages/utils/src/emoji/parse.ts | 109 +++++++++ packages/utils/src/emoji/replace/find.ts | 87 ++++---- packages/utils/src/emoji/replace/replace.ts | 2 +- packages/utils/src/index.ts | 6 +- packages/utils/tests/emoji-parse-test.ts | 208 ++++++++++++++++++ packages/utils/tests/emoji-regex-find-test.ts | 124 +++++++++++ packages/utils/tests/emoji-testdata-test.ts | 8 +- 8 files changed, 505 insertions(+), 46 deletions(-) create mode 100644 packages/utils/src/emoji/parse.ts create mode 100644 packages/utils/tests/emoji-parse-test.ts diff --git a/packages/utils/package.json b/packages/utils/package.json index 7241848..a0aafe9 100644 --- a/packages/utils/package.json +++ b/packages/utils/package.json @@ -3,7 +3,7 @@ "type": "module", "description": "Common functions for working with Iconify icon sets used by various packages.", "author": "Vjacheslav Trushkin", - "version": "2.0.4", + "version": "2.0.5", "license": "MIT", "bugs": "https://github.com/iconify/iconify/issues", "homepage": "https://iconify.design/", @@ -192,6 +192,11 @@ "import": "./lib/emoji/format.mjs", "types": "./lib/emoji/format.d.ts" }, + "./lib/emoji/parse": { + "require": "./lib/emoji/parse.cjs", + "import": "./lib/emoji/parse.mjs", + "types": "./lib/emoji/parse.d.ts" + }, "./lib/icon-set/convert-info": { "require": "./lib/icon-set/convert-info.cjs", "import": "./lib/icon-set/convert-info.mjs", diff --git a/packages/utils/src/emoji/parse.ts b/packages/utils/src/emoji/parse.ts new file mode 100644 index 0000000..3ff6660 --- /dev/null +++ b/packages/utils/src/emoji/parse.ts @@ -0,0 +1,109 @@ +import type { IconifyJSON } from '@iconify/types'; +import { + getEmojiSequenceFromString, + getUnqualifiedEmojiSequence, +} from './cleanup'; +import { getEmojiSequenceKeyword } from './format'; +import { createOptimisedRegexForEmojiSequences } from './regex/create'; +import { findMissingEmojis } from './test/missing'; +import { parseEmojiTestFile } from './test/parse'; +import { combineSimilarEmojiTestData } from './test/similar'; +import { getEmojiTestDataTree } from './test/tree'; +import { getQualifiedEmojiVariations } from './test/variations'; + +/** + * Parsed icon + */ +export interface PreparedEmojiIcon { + // Icon name + icon: string; + + // Emoji sequence as string + sequence: string; +} + +/** + * Parse + */ +export interface PreparedEmojiResult { + // List of icons + icons: PreparedEmojiIcon[]; + + // Regular expression + regex: string; +} + +/** + * Prepare emoji for icons list + * + * Test data should be fetched from 'https://unicode.org/Public/emoji/15.0/emoji-test.txt' + * It is used to detect missing emojis and optimise regular expression + */ +export function prepareEmojiForIconsList( + icons: Record, + rawTestData?: string +): PreparedEmojiResult { + // Prepare test data + const testData = rawTestData ? parseEmojiTestFile(rawTestData) : void 0; + + // Convert icons to object + interface IconsListItem { + icon: string; + sequence: number[]; + } + let iconsList: IconsListItem[] = []; + for (const char in icons) { + const sequence = getEmojiSequenceFromString(char); + iconsList.push({ + icon: icons[char], + sequence, + }); + } + + // Get fully-qualified versions of emojis + iconsList = getQualifiedEmojiVariations(iconsList, testData); + + // Find and add missing emojis if test data is available + if (testData) { + iconsList = iconsList.concat( + findMissingEmojis( + iconsList, + getEmojiTestDataTree(combineSimilarEmojiTestData(testData)) + ) + ); + } + + // Prepare icons list + const preparedIcons: PreparedEmojiIcon[] = iconsList.map((item) => { + const sequence = getEmojiSequenceKeyword( + getUnqualifiedEmojiSequence(item.sequence) + ); + return { + icon: item.icon, + sequence, + }; + }); + + // Prepare regex + const regex = createOptimisedRegexForEmojiSequences( + iconsList.map((item) => item.sequence) + ); + + return { + regex, + icons: preparedIcons, + }; +} + +/** + * Prepare emoji for an icon set + * + * Test data should be fetched from 'https://unicode.org/Public/emoji/15.0/emoji-test.txt' + * It is used to detect missing emojis and optimise regular expression + */ +export function prepareEmojiForIconSet( + iconSet: IconifyJSON, + rawTestData?: string +): PreparedEmojiResult { + return prepareEmojiForIconsList(iconSet.chars || {}, rawTestData); +} diff --git a/packages/utils/src/emoji/replace/find.ts b/packages/utils/src/emoji/replace/find.ts index 9c72515..be7287b 100644 --- a/packages/utils/src/emoji/replace/find.ts +++ b/packages/utils/src/emoji/replace/find.ts @@ -21,6 +21,9 @@ export interface EmojiRegexMatch { // Icon name keyword: string; + + // Regex index, used if multiple regular expressions were provided + regexp: number; } /** @@ -45,54 +48,56 @@ interface PrevNextMatch extends PrevMatch { * Returns only one entry per match */ export function getEmojiMatchesInText( - regexp: string | RegExp, + regexp: string | RegExp | (string | RegExp)[], content: string ): EmojiRegexMatch[] { const results: EmojiRegexMatch[] = []; - const matches = content.match( - typeof regexp === 'string' ? createEmojiRegExp(regexp) : regexp - ); + const found: Set = new Set(); + (regexp instanceof Array ? regexp : [regexp]).forEach((regexp, index) => { + const matches = content.match( + typeof regexp === 'string' ? createEmojiRegExp(regexp) : regexp + ); - if (matches) { - // Sort matches by length to make sure longest matches get replaced first - matches.sort((a, b) => { - if (b.length === a.length) { - return a.localeCompare(b); - } - return b.length - a.length; - }); - - // Add all matches - let lastMatch: EmojiRegexMatch | undefined; - for (let i = 0; i < matches.length; i++) { - const match = matches[i]; - - if (lastMatch && lastMatch.match === match) { - continue; - } - - // Get sequence - const sequence: number[] = []; - for (const codePoint of match) { - const num = codePoint.codePointAt(0) as number; - if (num !== vs16Emoji) { - sequence.push(num); + if (matches) { + // Add all matches + for (let i = 0; i < matches.length; i++) { + const match = matches[i]; + if (found.has(match)) { + continue; } + found.add(match); + + // Get sequence + const sequence: number[] = []; + for (const codePoint of match) { + const num = codePoint.codePointAt(0) as number; + if (num !== vs16Emoji) { + sequence.push(num); + } + } + + // Add result + results.push({ + match, + sequence, + keyword: getEmojiSequenceKeyword( + convertEmojiSequenceToUTF32(sequence) + ), + regexp: index, + }); } - - // Get keyword - const keyword = getEmojiSequenceKeyword( - convertEmojiSequenceToUTF32(sequence) - ); - - lastMatch = { - match, - sequence, - keyword, - }; - results.push(lastMatch); } - } + }); + + // Sort matches by length to make sure longest matches get replaced first + results.sort((a, b) => { + const match1 = a.match; + const match2 = b.match; + if (match2.length === match1.length) { + return match1.localeCompare(match2); + } + return match2.length - match1.length; + }); return results; } diff --git a/packages/utils/src/emoji/replace/replace.ts b/packages/utils/src/emoji/replace/replace.ts index e1f426b..214c354 100644 --- a/packages/utils/src/emoji/replace/replace.ts +++ b/packages/utils/src/emoji/replace/replace.ts @@ -22,7 +22,7 @@ export type FindAndReplaceEmojisInTextCallback = ( * Returns null if nothing was replaced */ export function findAndReplaceEmojisInText( - regexp: string | RegExp, + regexp: string | RegExp | (string | RegExp)[], content: string, callback: FindAndReplaceEmojisInTextCallback ): string | null { diff --git a/packages/utils/src/index.ts b/packages/utils/src/index.ts index d354ef0..7101b89 100644 --- a/packages/utils/src/index.ts +++ b/packages/utils/src/index.ts @@ -104,11 +104,15 @@ export { } from './emoji/format'; export { parseEmojiTestFile } from './emoji/test/parse'; export { getQualifiedEmojiVariations } from './emoji/test/variations'; -// export { getEmojisSequencesToCopy } from './emoji/test/copy'; +export { findMissingEmojis } from './emoji/test/missing'; export { createOptimisedRegex, createOptimisedRegexForEmojiSequences, } from './emoji/regex/create'; +export { + prepareEmojiForIconsList, + prepareEmojiForIconSet, +} from './emoji/parse'; export { findAndReplaceEmojisInText } from './emoji/replace/replace'; // Misc diff --git a/packages/utils/tests/emoji-parse-test.ts b/packages/utils/tests/emoji-parse-test.ts new file mode 100644 index 0000000..5719c42 --- /dev/null +++ b/packages/utils/tests/emoji-parse-test.ts @@ -0,0 +1,208 @@ +import { readFile, writeFile, unlink } from 'node:fs/promises'; +import { emojiVersion } from '../lib/emoji/data'; +import { prepareEmojiForIconsList } from '../lib/emoji/parse'; + +describe('Testing unicode test data', () => { + async function fetchEmojiTestData(): Promise { + // Fetch emojis, cache it + const source = `tests/fixtures/download-emoji-${emojiVersion}.txt`; + + let data: string | undefined; + try { + data = await readFile(source, 'utf8'); + } catch { + // + } + + if (!data) { + data = ( + await fetch( + `https://unicode.org/Public/emoji/${emojiVersion}/emoji-test.txt` + ) + ) + .text() + .toString(); + await writeFile(source, data, 'utf8'); + } + + // Test content, unlink cache on failure + if (data.indexOf(`# Version: ${emojiVersion}`) === -1) { + try { + await unlink(source); + } catch { + // + } + return; + } + return data; + } + + let data: string | undefined; + + beforeAll(async () => { + data = await fetchEmojiTestData(); + }); + + it('Preparing icon set without test data', () => { + // One emoji + expect( + prepareEmojiForIconsList({ + '2615': 'hot-beverage', + }) + ).toEqual({ + icons: [ + { + icon: 'hot-beverage', + sequence: '2615', + }, + ], + regex: '\\u2615\\uFE0F?', + }); + + // Multiple emojis + expect( + prepareEmojiForIconsList({ + '2615': 'hot-beverage', + '1f1e6-1f1e8': 'flag-ascension-island', + '1f1e6-1f1e9': 'flag-andorra', + '1f1e6-1f1ea': 'flag-united-arab-emirates', + }) + ).toEqual({ + icons: [ + { + icon: 'hot-beverage', + sequence: '2615', + }, + { + icon: 'flag-ascension-island', + sequence: '1f1e6-1f1e8', + }, + { + icon: 'flag-andorra', + sequence: '1f1e6-1f1e9', + }, + { + icon: 'flag-united-arab-emirates', + sequence: '1f1e6-1f1ea', + }, + ], + regex: '\\uD83C\\uDDE6\\uD83C[\\uDDE8-\\uDDEA]|\\u2615\\uFE0F?', + }); + }); + + it('Preparing icon set with test data', () => { + if (!data) { + console.warn('Test skipped: test data is not available'); + return; + } + + // One emoji without variation + expect( + prepareEmojiForIconsList( + { + '2615': 'hot-beverage', + }, + data + ) + ).toEqual({ + icons: [ + { + icon: 'hot-beverage', + sequence: '2615', + }, + ], + regex: '\\u2615', + }); + + // One emoji with variation + expect( + prepareEmojiForIconsList( + { + // Upper case + '263A': 'smiling-face', + }, + data + ) + ).toEqual({ + icons: [ + { + icon: 'smiling-face', + // Lower case + sequence: '263a', + }, + ], + regex: '\\u263A\\uFE0F?', + }); + + // One emoji that has components in test data + expect( + prepareEmojiForIconsList( + { + '270b': 'raised-hand', + }, + data + ) + ).toEqual({ + icons: [ + { + icon: 'raised-hand', + sequence: '270b', + }, + { + icon: 'raised-hand', + sequence: '270b-1f3fb', + }, + { + icon: 'raised-hand', + sequence: '270b-1f3fc', + }, + { + icon: 'raised-hand', + sequence: '270b-1f3fd', + }, + { + icon: 'raised-hand', + sequence: '270b-1f3fe', + }, + { + icon: 'raised-hand', + sequence: '270b-1f3ff', + }, + ], + regex: '\\u270B(?:\\uD83C[\\uDFFB-\\uDFFF])?', + }); + + // Multiple emojis, all without variations + expect( + prepareEmojiForIconsList( + { + '2615': 'hot-beverage', + '1f1e6-1f1e8': 'flag-ascension-island', + '1f1e6-1f1e9': 'flag-andorra', + '1f1e6-1f1ea': 'flag-united-arab-emirates', + }, + data + ) + ).toEqual({ + icons: [ + { + icon: 'hot-beverage', + sequence: '2615', + }, + { + icon: 'flag-ascension-island', + sequence: '1f1e6-1f1e8', + }, + { + icon: 'flag-andorra', + sequence: '1f1e6-1f1e9', + }, + { + icon: 'flag-united-arab-emirates', + sequence: '1f1e6-1f1ea', + }, + ], + regex: '\\uD83C\\uDDE6\\uD83C[\\uDDE8-\\uDDEA]|\\u2615', + }); + }); +}); diff --git a/packages/utils/tests/emoji-regex-find-test.ts b/packages/utils/tests/emoji-regex-find-test.ts index 3abc3e7..106bd86 100644 --- a/packages/utils/tests/emoji-regex-find-test.ts +++ b/packages/utils/tests/emoji-regex-find-test.ts @@ -44,23 +44,27 @@ describe('Finding emojis in text', () => { match: '\u263A\uFE0F', sequence: [0x263a], keyword: '263a', + regexp: 0, }, { // Should be returned only once match: String.fromCodePoint(0x1f600), sequence: [0x1f600], keyword: '1f600', + regexp: 0, }, { match: String.fromCodePoint(0x1f603), sequence: [0x1f603], keyword: '1f603', + regexp: 0, }, { // Same as first, but without 'FE0F' match: '\u263A', sequence: [0x263a], keyword: '263a', + regexp: 0, }, ]); @@ -72,6 +76,7 @@ describe('Finding emojis in text', () => { match: emoji1, sequence: [0x1f600], keyword: '1f600', + regexp: 0, }, prev: text1, next: text2, @@ -81,6 +86,7 @@ describe('Finding emojis in text', () => { match: emoji2, sequence: [0x1f603], keyword: '1f603', + regexp: 0, }, prev: text2, next: text3, @@ -90,6 +96,7 @@ describe('Finding emojis in text', () => { match: emoji3, sequence: [0x1f600], keyword: '1f600', + regexp: 0, }, prev: text3, next: text4, @@ -99,6 +106,7 @@ describe('Finding emojis in text', () => { match: emoji4, sequence: [0x263a], keyword: '263a', + regexp: 0, }, prev: text4, next: text5, @@ -108,6 +116,122 @@ describe('Finding emojis in text', () => { match: emoji5, sequence: [0x263a], keyword: '263a', + regexp: 0, + }, + prev: text5, + next: text6, + }, + ]); + }); + + it('Multiple regex', () => { + const regex0 = createOptimisedRegex(['1F600', '1F603', '1F604']); + const regex1 = createOptimisedRegex(['263A FE0F']); + + const text1 = 'E1.0 grinning face: '; + const emoji1 = String.fromCodePoint(0x1f600); + const text2 = '\nE0.6 grinning face with big eyes: '; + const emoji2 = String.fromCodePoint(0x1f603); + const text3 = 'E1.0 grinning face: '; + const emoji3 = emoji1; + const text4 = 'E0.6 smiling face: '; + const emoji4 = '\u263A\uFE0F'; + const text5 = '(fully-qualified)\nE0.6 smiling face: '; + const emoji5 = '\u263A'; + const text6 = '(unqualified)'; + + const content = + text1 + + emoji1 + + text2 + + emoji2 + + text3 + + emoji3 + + text4 + + emoji4 + + text5 + + emoji5 + + text6; + const matches = getEmojiMatchesInText([regex0, regex1], content); + + expect(matches).toEqual([ + { + match: '\u263A\uFE0F', + sequence: [0x263a], + keyword: '263a', + regexp: 1, + }, + { + // Should be returned only once + match: String.fromCodePoint(0x1f600), + sequence: [0x1f600], + keyword: '1f600', + regexp: 0, + }, + { + match: String.fromCodePoint(0x1f603), + sequence: [0x1f603], + keyword: '1f603', + regexp: 0, + }, + { + // Same as first, but without 'FE0F' + match: '\u263A', + sequence: [0x263a], + keyword: '263a', + regexp: 1, + }, + ]); + + const sortedMatches = sortEmojiMatchesInText(content, matches); + expect(sortedMatches).toEqual([ + // Same order as in content + { + match: { + match: emoji1, + sequence: [0x1f600], + keyword: '1f600', + regexp: 0, + }, + prev: text1, + next: text2, + }, + { + match: { + match: emoji2, + sequence: [0x1f603], + keyword: '1f603', + regexp: 0, + }, + prev: text2, + next: text3, + }, + { + match: { + match: emoji3, + sequence: [0x1f600], + keyword: '1f600', + regexp: 0, + }, + prev: text3, + next: text4, + }, + { + match: { + match: emoji4, + sequence: [0x263a], + keyword: '263a', + regexp: 1, + }, + prev: text4, + next: text5, + }, + { + match: { + match: emoji5, + sequence: [0x263a], + keyword: '263a', + regexp: 1, }, prev: text5, next: text6, diff --git a/packages/utils/tests/emoji-testdata-test.ts b/packages/utils/tests/emoji-testdata-test.ts index b542800..72c403c 100644 --- a/packages/utils/tests/emoji-testdata-test.ts +++ b/packages/utils/tests/emoji-testdata-test.ts @@ -8,7 +8,7 @@ import { minUTF32, emojiVersion, } from '../lib/emoji/data'; -import { parseEmojiTestFile } from '../lib/emoji/test/parse'; +import { EmojiTestDataItem, parseEmojiTestFile } from '../lib/emoji/test/parse'; import { mapEmojiTestDataComponents, replaceEmojiComponentsInCombinedSequence, @@ -713,7 +713,11 @@ describe('Testing unicode test data', () => { const tree = getEmojiTestDataTree(splitTestData); // Use test data - const testList = []; + interface TestListItem extends EmojiTestDataItem { + // Add it for easier testing + sequenceKey: string; + } + const testList: TestListItem[] = []; for (const sequenceKey in testData) { testList.push({ ...testData[sequenceKey],