feat(utils): function to prepare emoji data for icons list, support multiple regexp

2025-01-06 07:20:40 +00:00 · 2022-12-25 11:15:28 +02:00 · 2022-12-25 11:15:28 +02:00 · b5e9ecd2b4
commit b5e9ecd2b4
parent e5dbd00cba
8 changed files with 505 additions and 46 deletions
--- a/packages/utils/package.json
+++ b/packages/utils/package.json
@ -3,7 +3,7 @@
 	"type": "module",
 	"description": "Common functions for working with Iconify icon sets used by various packages.",
 	"author": "Vjacheslav Trushkin",
-	"version": "2.0.4",
+	"version": "2.0.5",
 	"license": "MIT",
 	"bugs": "https://github.com/iconify/iconify/issues",
 	"homepage": "https://iconify.design/",
@ -192,6 +192,11 @@
 			"import": "./lib/emoji/format.mjs",
 			"types": "./lib/emoji/format.d.ts"
 		},
+		"./lib/emoji/parse": {
+			"require": "./lib/emoji/parse.cjs",
+			"import": "./lib/emoji/parse.mjs",
+			"types": "./lib/emoji/parse.d.ts"
+		},
 		"./lib/icon-set/convert-info": {
 			"require": "./lib/icon-set/convert-info.cjs",
 			"import": "./lib/icon-set/convert-info.mjs",
--- a/packages/utils/src/emoji/parse.ts
+++ b/packages/utils/src/emoji/parse.ts
@ -0,0 +1,109 @@
+import type { IconifyJSON } from '@iconify/types';
+import {
+	getEmojiSequenceFromString,
+	getUnqualifiedEmojiSequence,
+} from './cleanup';
+import { getEmojiSequenceKeyword } from './format';
+import { createOptimisedRegexForEmojiSequences } from './regex/create';
+import { findMissingEmojis } from './test/missing';
+import { parseEmojiTestFile } from './test/parse';
+import { combineSimilarEmojiTestData } from './test/similar';
+import { getEmojiTestDataTree } from './test/tree';
+import { getQualifiedEmojiVariations } from './test/variations';
+
+/**
+ * One entry of the prepared icons list: an icon name and the emoji sequence
+ * that icon represents
+ */
+export interface PreparedEmojiIcon {
+	// Icon name
+	icon: string;
+
+	// Emoji sequence as string. prepareEmojiForIconsList() generates it with
+	// getEmojiSequenceKeyword() from unqualified sequence, like '1f1e6-1f1e8'
+	sequence: string;
+}
+
+/**
+ * Result of preparing emojis: list of icons and regular expression generated
+ * for emoji sequences of those icons
+ */
+export interface PreparedEmojiResult {
+	// List of icons, one entry per emoji sequence
+	icons: PreparedEmojiIcon[];
+
+	// Regular expression as string (source of expression, not a RegExp instance),
+	// like '\\u2615\\uFE0F?'
+	regex: string;
+}
+
+/**
+ * Build icons list and regular expression from a map of emojis
+ *
+ * Keys of `icons` are emoji sequences as strings, values are icon names.
+ *
+ * Optional test data is content of 'https://unicode.org/Public/emoji/15.0/emoji-test.txt'
+ * If provided, it is used to detect missing emojis and optimise regular expression
+ */
+export function prepareEmojiForIconsList(
+	icons: Record<string, string>,
+	rawTestData?: string
+): PreparedEmojiResult {
+	// Icon name with emoji sequence as list of numbers
+	interface SequenceEntry {
+		icon: string;
+		sequence: number[];
+	}
+
+	// Parse test data, only if it was provided
+	const parsedTestData = rawTestData
+		? parseEmojiTestFile(rawTestData)
+		: undefined;
+
+	// Convert each key of map to sequence
+	const entries: SequenceEntry[] = [];
+	for (const key in icons) {
+		const sequence = getEmojiSequenceFromString(key);
+		entries.push({
+			icon: icons[key],
+			sequence,
+		});
+	}
+
+	// Fully-qualified versions of emojis
+	const qualified: SequenceEntry[] = getQualifiedEmojiVariations(
+		entries,
+		parsedTestData
+	);
+
+	// Append emojis that exist in test data, but are missing in list
+	let allEntries = qualified;
+	if (parsedTestData) {
+		const tree = getEmojiTestDataTree(
+			combineSimilarEmojiTestData(parsedTestData)
+		);
+		allEntries = qualified.concat(findMissingEmojis(qualified, tree));
+	}
+
+	// Icons list uses unqualified sequences, converted to keywords
+	const preparedIcons = allEntries.map(
+		({ icon, sequence }): PreparedEmojiIcon => ({
+			icon,
+			sequence: getEmojiSequenceKeyword(
+				getUnqualifiedEmojiSequence(sequence)
+			),
+		})
+	);
+
+	// Regular expression is generated from sequences as they are in list
+	const regex = createOptimisedRegexForEmojiSequences(
+		allEntries.map((entry) => entry.sequence)
+	);
+
+	return { regex, icons: preparedIcons };
+}
+
+/**
+ * Build icons list and regular expression from an icon set
+ *
+ * Uses `chars` property of icon set as map of emojis. Icon set without `chars`
+ * is handled as an empty map.
+ *
+ * Optional test data is content of 'https://unicode.org/Public/emoji/15.0/emoji-test.txt'
+ * If provided, it is used to detect missing emojis and optimise regular expression
+ */
+export function prepareEmojiForIconSet(
+	iconSet: IconifyJSON,
+	rawTestData?: string
+): PreparedEmojiResult {
+	const chars = iconSet.chars || {};
+	return prepareEmojiForIconsList(chars, rawTestData);
+}
--- a/packages/utils/src/emoji/replace/find.ts
+++ b/packages/utils/src/emoji/replace/find.ts
@ -21,6 +21,9 @@ export interface EmojiRegexMatch {

 	// Icon name
 	keyword: string;
+
+	// Regex index, used if multiple regular expressions were provided
+	regexp: number;
 }

 /**
@ -45,54 +48,56 @@ interface PrevNextMatch extends PrevMatch {
 * Returns only one entry per match
 */
 export function getEmojiMatchesInText(
-	regexp: string | RegExp,
+	regexp: string | RegExp | (string | RegExp)[],
 	content: string
 ): EmojiRegexMatch[] {
 	const results: EmojiRegexMatch[] = [];
-	const matches = content.match(
-		typeof regexp === 'string' ? createEmojiRegExp(regexp) : regexp
-	);
+	// Matched strings that were already added to results. Set is shared by all
+	// expressions, so each unique string is returned only once
+	const found: Set<string> = new Set();
+	// Single expression is wrapped in array, so its index in results is 0
+	(regexp instanceof Array ? regexp : [regexp]).forEach((regexp, index) => {
+		const matches = content.match(
+			typeof regexp === 'string' ? createEmojiRegExp(regexp) : regexp
+		);

-	if (matches) {
-		// Sort matches by length to make sure longest matches get replaced first
-		matches.sort((a, b) => {
-			if (b.length === a.length) {
-				return a.localeCompare(b);
-			}
-			return b.length - a.length;
-		});
-
-		// Add all matches
-		let lastMatch: EmojiRegexMatch | undefined;
-		for (let i = 0; i < matches.length; i++) {
-			const match = matches[i];
-
-			if (lastMatch && lastMatch.match === match) {
-				continue;
-			}
-
-			// Get sequence
-			const sequence: number[] = [];
-			for (const codePoint of match) {
-				const num = codePoint.codePointAt(0) as number;
-				if (num !== vs16Emoji) {
-					sequence.push(num);
+		if (matches) {
+			// Add all matches
+			for (let i = 0; i < matches.length; i++) {
+				const match = matches[i];
+				// Skip string that was already added, possibly by one of
+				// previous expressions: first expression that finds it wins
+				if (found.has(match)) {
+					continue;
 				}
+				found.add(match);
+
+				// Get sequence
+				const sequence: number[] = [];
+				for (const codePoint of match) {
+					const num = codePoint.codePointAt(0) as number;
+					if (num !== vs16Emoji) {
+						sequence.push(num);
+					}
+				}
+
+				// Add result
+				results.push({
+					match,
+					sequence,
+					keyword: getEmojiSequenceKeyword(
+						convertEmojiSequenceToUTF32(sequence)
+					),
+					regexp: index,
+				});
 			}
-
-			// Get keyword
-			const keyword = getEmojiSequenceKeyword(
-				convertEmojiSequenceToUTF32(sequence)
-			);
-
-			lastMatch = {
-				match,
-				sequence,
-				keyword,
-			};
-			results.push(lastMatch);
 		}
-	}
+	});
+
+	// Sort matches by length to make sure longest matches get replaced first
+	// Matches with identical length are sorted using localeCompare()
+	results.sort((a, b) => {
+		const match1 = a.match;
+		const match2 = b.match;
+		if (match2.length === match1.length) {
+			return match1.localeCompare(match2);
+		}
+		return match2.length - match1.length;
+	});

 	return results;
 }
--- a/packages/utils/src/emoji/replace/replace.ts
+++ b/packages/utils/src/emoji/replace/replace.ts
@ -22,7 +22,7 @@ export type FindAndReplaceEmojisInTextCallback = (
 * Returns null if nothing was replaced
 */
 export function findAndReplaceEmojisInText(
-	regexp: string | RegExp,
+	regexp: string | RegExp | (string | RegExp)[],
 	content: string,
 	callback: FindAndReplaceEmojisInTextCallback
 ): string | null {
--- a/packages/utils/src/index.ts
+++ b/packages/utils/src/index.ts
@ -104,11 +104,15 @@ export {
 } from './emoji/format';
 export { parseEmojiTestFile } from './emoji/test/parse';
 export { getQualifiedEmojiVariations } from './emoji/test/variations';
-// export { getEmojisSequencesToCopy } from './emoji/test/copy';
+export { findMissingEmojis } from './emoji/test/missing';
 export {
 	createOptimisedRegex,
 	createOptimisedRegexForEmojiSequences,
 } from './emoji/regex/create';
+export {
+	prepareEmojiForIconsList,
+	prepareEmojiForIconSet,
+} from './emoji/parse';
 export { findAndReplaceEmojisInText } from './emoji/replace/replace';

 // Misc
--- a/packages/utils/tests/emoji-parse-test.ts
+++ b/packages/utils/tests/emoji-parse-test.ts
@ -0,0 +1,208 @@
+import { readFile, writeFile, unlink } from 'node:fs/promises';
+import { emojiVersion } from '../lib/emoji/data';
+import { prepareEmojiForIconsList } from '../lib/emoji/parse';
+
+describe('Testing unicode test data', () => {
+	/**
+	 * Load emoji-test.txt for current emoji version
+	 *
+	 * Reads cached copy from tests/fixtures. If cached copy cannot be read or
+	 * is empty, downloads file from unicode.org and writes it to cache.
+	 *
+	 * Returns undefined if content does not contain expected version header.
+	 * Cached file is removed in that case, so it will be downloaded again.
+	 */
+	async function fetchEmojiTestData(): Promise<string | undefined> {
+		// Fetch emojis, cache it
+		const source = `tests/fixtures/download-emoji-${emojiVersion}.txt`;
+
+		let data: string | undefined;
+		try {
+			data = await readFile(source, 'utf8');
+		} catch {
+			// Cache is missing or cannot be read: download file below
+		}
+
+		if (!data) {
+			const response = await fetch(
+				`https://unicode.org/Public/emoji/${emojiVersion}/emoji-test.txt`
+			);
+
+			// text() returns a promise, it must be awaited: calling toString()
+			// on it results in '[object Promise]' instead of file content
+			data = await response.text();
+			await writeFile(source, data, 'utf8');
+		}
+
+		// Test content, unlink cache on failure
+		if (data.indexOf(`# Version: ${emojiVersion}`) === -1) {
+			try {
+				await unlink(source);
+			} catch {
+				// Nothing to remove
+			}
+			return;
+		}
+		return data;
+	}
+
+	let data: string | undefined;
+
+	beforeAll(async () => {
+		data = await fetchEmojiTestData();
+	});
+
+	it('Preparing icon set without test data', () => {
+		// One emoji
+		expect(
+			prepareEmojiForIconsList({
+				'2615': 'hot-beverage',
+			})
+		).toEqual({
+			icons: [
+				{
+					icon: 'hot-beverage',
+					sequence: '2615',
+				},
+			],
+			regex: '\\u2615\\uFE0F?',
+		});
+
+		// Multiple emojis
+		expect(
+			prepareEmojiForIconsList({
+				'2615': 'hot-beverage',
+				'1f1e6-1f1e8': 'flag-ascension-island',
+				'1f1e6-1f1e9': 'flag-andorra',
+				'1f1e6-1f1ea': 'flag-united-arab-emirates',
+			})
+		).toEqual({
+			icons: [
+				{
+					icon: 'hot-beverage',
+					sequence: '2615',
+				},
+				{
+					icon: 'flag-ascension-island',
+					sequence: '1f1e6-1f1e8',
+				},
+				{
+					icon: 'flag-andorra',
+					sequence: '1f1e6-1f1e9',
+				},
+				{
+					icon: 'flag-united-arab-emirates',
+					sequence: '1f1e6-1f1ea',
+				},
+			],
+			regex: '\\uD83C\\uDDE6\\uD83C[\\uDDE8-\\uDDEA]|\\u2615\\uFE0F?',
+		});
+	});
+
+	it('Preparing icon set with test data', () => {
+		if (!data) {
+			console.warn('Test skipped: test data is not available');
+			return;
+		}
+
+		// One emoji without variation
+		expect(
+			prepareEmojiForIconsList(
+				{
+					'2615': 'hot-beverage',
+				},
+				data
+			)
+		).toEqual({
+			icons: [
+				{
+					icon: 'hot-beverage',
+					sequence: '2615',
+				},
+			],
+			regex: '\\u2615',
+		});
+
+		// One emoji with variation
+		expect(
+			prepareEmojiForIconsList(
+				{
+					// Upper case
+					'263A': 'smiling-face',
+				},
+				data
+			)
+		).toEqual({
+			icons: [
+				{
+					icon: 'smiling-face',
+					// Lower case
+					sequence: '263a',
+				},
+			],
+			regex: '\\u263A\\uFE0F?',
+		});
+
+		// One emoji that has components in test data
+		expect(
+			prepareEmojiForIconsList(
+				{
+					'270b': 'raised-hand',
+				},
+				data
+			)
+		).toEqual({
+			icons: [
+				{
+					icon: 'raised-hand',
+					sequence: '270b',
+				},
+				{
+					icon: 'raised-hand',
+					sequence: '270b-1f3fb',
+				},
+				{
+					icon: 'raised-hand',
+					sequence: '270b-1f3fc',
+				},
+				{
+					icon: 'raised-hand',
+					sequence: '270b-1f3fd',
+				},
+				{
+					icon: 'raised-hand',
+					sequence: '270b-1f3fe',
+				},
+				{
+					icon: 'raised-hand',
+					sequence: '270b-1f3ff',
+				},
+			],
+			regex: '\\u270B(?:\\uD83C[\\uDFFB-\\uDFFF])?',
+		});
+
+		// Multiple emojis, all without variations
+		expect(
+			prepareEmojiForIconsList(
+				{
+					'2615': 'hot-beverage',
+					'1f1e6-1f1e8': 'flag-ascension-island',
+					'1f1e6-1f1e9': 'flag-andorra',
+					'1f1e6-1f1ea': 'flag-united-arab-emirates',
+				},
+				data
+			)
+		).toEqual({
+			icons: [
+				{
+					icon: 'hot-beverage',
+					sequence: '2615',
+				},
+				{
+					icon: 'flag-ascension-island',
+					sequence: '1f1e6-1f1e8',
+				},
+				{
+					icon: 'flag-andorra',
+					sequence: '1f1e6-1f1e9',
+				},
+				{
+					icon: 'flag-united-arab-emirates',
+					sequence: '1f1e6-1f1ea',
+				},
+			],
+			regex: '\\uD83C\\uDDE6\\uD83C[\\uDDE8-\\uDDEA]|\\u2615',
+		});
+	});
+});
--- a/packages/utils/tests/emoji-regex-find-test.ts
+++ b/packages/utils/tests/emoji-regex-find-test.ts
@ -44,23 +44,27 @@ describe('Finding emojis in text', () => {
 				match: '\u263A\uFE0F',
 				sequence: [0x263a],
 				keyword: '263a',
+				regexp: 0,
 			},
 			{
 				// Should be returned only once
 				match: String.fromCodePoint(0x1f600),
 				sequence: [0x1f600],
 				keyword: '1f600',
+				regexp: 0,
 			},
 			{
 				match: String.fromCodePoint(0x1f603),
 				sequence: [0x1f603],
 				keyword: '1f603',
+				regexp: 0,
 			},
 			{
 				// Same as first, but without 'FE0F'
 				match: '\u263A',
 				sequence: [0x263a],
 				keyword: '263a',
+				regexp: 0,
 			},
 		]);

@ -72,6 +76,7 @@ describe('Finding emojis in text', () => {
 					match: emoji1,
 					sequence: [0x1f600],
 					keyword: '1f600',
+					regexp: 0,
 				},
 				prev: text1,
 				next: text2,
@ -81,6 +86,7 @@ describe('Finding emojis in text', () => {
 					match: emoji2,
 					sequence: [0x1f603],
 					keyword: '1f603',
+					regexp: 0,
 				},
 				prev: text2,
 				next: text3,
@ -90,6 +96,7 @@ describe('Finding emojis in text', () => {
 					match: emoji3,
 					sequence: [0x1f600],
 					keyword: '1f600',
+					regexp: 0,
 				},
 				prev: text3,
 				next: text4,
@ -99,6 +106,7 @@ describe('Finding emojis in text', () => {
 					match: emoji4,
 					sequence: [0x263a],
 					keyword: '263a',
+					regexp: 0,
 				},
 				prev: text4,
 				next: text5,
@ -108,6 +116,122 @@ describe('Finding emojis in text', () => {
 					match: emoji5,
 					sequence: [0x263a],
 					keyword: '263a',
+					regexp: 0,
+				},
+				prev: text5,
+				next: text6,
+			},
+		]);
+	});
+
+	it('Multiple regex', () => {
+		const regex0 = createOptimisedRegex(['1F600', '1F603', '1F604']);
+		const regex1 = createOptimisedRegex(['263A FE0F']);
+
+		const text1 = 'E1.0 grinning face: ';
+		const emoji1 = String.fromCodePoint(0x1f600);
+		const text2 = '\nE0.6 grinning face with big eyes: ';
+		const emoji2 = String.fromCodePoint(0x1f603);
+		const text3 = 'E1.0 grinning face: ';
+		const emoji3 = emoji1;
+		const text4 = 'E0.6 smiling face: ';
+		const emoji4 = '\u263A\uFE0F';
+		const text5 = '(fully-qualified)\nE0.6 smiling face: ';
+		const emoji5 = '\u263A';
+		const text6 = '(unqualified)';
+
+		const content =
+			text1 +
+			emoji1 +
+			text2 +
+			emoji2 +
+			text3 +
+			emoji3 +
+			text4 +
+			emoji4 +
+			text5 +
+			emoji5 +
+			text6;
+		const matches = getEmojiMatchesInText([regex0, regex1], content);
+
+		expect(matches).toEqual([
+			{
+				match: '\u263A\uFE0F',
+				sequence: [0x263a],
+				keyword: '263a',
+				regexp: 1,
+			},
+			{
+				// Should be returned only once
+				match: String.fromCodePoint(0x1f600),
+				sequence: [0x1f600],
+				keyword: '1f600',
+				regexp: 0,
+			},
+			{
+				match: String.fromCodePoint(0x1f603),
+				sequence: [0x1f603],
+				keyword: '1f603',
+				regexp: 0,
+			},
+			{
+				// Same as first, but without 'FE0F'
+				match: '\u263A',
+				sequence: [0x263a],
+				keyword: '263a',
+				regexp: 1,
+			},
+		]);
+
+		const sortedMatches = sortEmojiMatchesInText(content, matches);
+		expect(sortedMatches).toEqual([
+			// Same order as in content
+			{
+				match: {
+					match: emoji1,
+					sequence: [0x1f600],
+					keyword: '1f600',
+					regexp: 0,
+				},
+				prev: text1,
+				next: text2,
+			},
+			{
+				match: {
+					match: emoji2,
+					sequence: [0x1f603],
+					keyword: '1f603',
+					regexp: 0,
+				},
+				prev: text2,
+				next: text3,
+			},
+			{
+				match: {
+					match: emoji3,
+					sequence: [0x1f600],
+					keyword: '1f600',
+					regexp: 0,
+				},
+				prev: text3,
+				next: text4,
+			},
+			{
+				match: {
+					match: emoji4,
+					sequence: [0x263a],
+					keyword: '263a',
+					regexp: 1,
+				},
+				prev: text4,
+				next: text5,
+			},
+			{
+				match: {
+					match: emoji5,
+					sequence: [0x263a],
+					keyword: '263a',
+					regexp: 1,
 				},
 				prev: text5,
 				next: text6,
--- a/packages/utils/tests/emoji-testdata-test.ts
+++ b/packages/utils/tests/emoji-testdata-test.ts
@ -8,7 +8,7 @@ import {
 	minUTF32,
 	emojiVersion,
 } from '../lib/emoji/data';
-import { parseEmojiTestFile } from '../lib/emoji/test/parse';
+import { EmojiTestDataItem, parseEmojiTestFile } from '../lib/emoji/test/parse';
 import {
 	mapEmojiTestDataComponents,
 	replaceEmojiComponentsInCombinedSequence,
@ -713,7 +713,11 @@ describe('Testing unicode test data', () => {
 		const tree = getEmojiTestDataTree(splitTestData);

 		// Use test data
-		const testList = [];
+		interface TestListItem extends EmojiTestDataItem {
+			// Add it for easier testing
+			sequenceKey: string;
+		}
+		const testList: TestListItem[] = [];
 		for (const sequenceKey in testData) {
 			testList.push({
 				...testData[sequenceKey],