2
0
mirror of https://github.com/iconify/iconify.git synced 2025-01-22 14:48:24 +00:00

feat(utils): function to prepare emoji data for icons list, support multiple regexp

This commit is contained in:
Vjacheslav Trushkin 2022-12-25 11:15:28 +02:00
parent e5dbd00cba
commit b5e9ecd2b4
8 changed files with 505 additions and 46 deletions

View File

@ -3,7 +3,7 @@
"type": "module", "type": "module",
"description": "Common functions for working with Iconify icon sets used by various packages.", "description": "Common functions for working with Iconify icon sets used by various packages.",
"author": "Vjacheslav Trushkin", "author": "Vjacheslav Trushkin",
"version": "2.0.4", "version": "2.0.5",
"license": "MIT", "license": "MIT",
"bugs": "https://github.com/iconify/iconify/issues", "bugs": "https://github.com/iconify/iconify/issues",
"homepage": "https://iconify.design/", "homepage": "https://iconify.design/",
@ -192,6 +192,11 @@
"import": "./lib/emoji/format.mjs", "import": "./lib/emoji/format.mjs",
"types": "./lib/emoji/format.d.ts" "types": "./lib/emoji/format.d.ts"
}, },
"./lib/emoji/parse": {
"require": "./lib/emoji/parse.cjs",
"import": "./lib/emoji/parse.mjs",
"types": "./lib/emoji/parse.d.ts"
},
"./lib/icon-set/convert-info": { "./lib/icon-set/convert-info": {
"require": "./lib/icon-set/convert-info.cjs", "require": "./lib/icon-set/convert-info.cjs",
"import": "./lib/icon-set/convert-info.mjs", "import": "./lib/icon-set/convert-info.mjs",

View File

@ -0,0 +1,109 @@
import type { IconifyJSON } from '@iconify/types';
import {
getEmojiSequenceFromString,
getUnqualifiedEmojiSequence,
} from './cleanup';
import { getEmojiSequenceKeyword } from './format';
import { createOptimisedRegexForEmojiSequences } from './regex/create';
import { findMissingEmojis } from './test/missing';
import { parseEmojiTestFile } from './test/parse';
import { combineSimilarEmojiTestData } from './test/similar';
import { getEmojiTestDataTree } from './test/tree';
import { getQualifiedEmojiVariations } from './test/variations';
/**
* Prepared emoji icon: one entry of the icons list returned by prepareEmojiForIconsList()
*/
export interface PreparedEmojiIcon {
// Icon name, copied from the value of the source icons map
icon: string;
// Emoji sequence as string: keyword generated by getEmojiSequenceKeyword()
// from the unqualified sequence, for example '1f1e6-1f1e8'
sequence: string;
}
/**
* Result of preparing emoji data: list of icons and regular expression for matching them
*/
export interface PreparedEmojiResult {
// List of prepared icons, one entry per emoji sequence
icons: PreparedEmojiIcon[];
// Regular expression as string, created by createOptimisedRegexForEmojiSequences()
regex: string;
}
/**
 * Convert a map of emoji characters to a list of icons and a regular expression
 *
 * Keys of `icons` are emoji sequences as strings, values are icon names.
 *
 * Test data is optional. It should be fetched from
 * 'https://unicode.org/Public/emoji/15.0/emoji-test.txt'
 * When available, it is used to detect missing emojis and optimise regular expression
 */
export function prepareEmojiForIconsList(
	icons: Record<string, string>,
	rawTestData?: string
): PreparedEmojiResult {
	// Internal representation of icon: name + sequence as numbers
	interface IconsListItem {
		icon: string;
		sequence: number[];
	}

	// Test data is parsed only if it was provided
	const testData = rawTestData ? parseEmojiTestFile(rawTestData) : void 0;

	// Convert source map to list of items
	const sourceItems: IconsListItem[] = [];
	for (const char in icons) {
		const sequence = getEmojiSequenceFromString(char);
		sourceItems.push({
			icon: icons[char],
			sequence,
		});
	}

	// Replace sequences with fully-qualified versions
	const qualifiedItems: IconsListItem[] = getQualifiedEmojiVariations(
		sourceItems,
		testData
	);

	// With test data, append emojis that are missing from list
	const allItems: IconsListItem[] = testData
		? qualifiedItems.concat(
				findMissingEmojis(
					qualifiedItems,
					getEmojiTestDataTree(combineSimilarEmojiTestData(testData))
				)
		  )
		: qualifiedItems;

	// Icons list uses keywords generated from unqualified sequences
	const preparedIcons: PreparedEmojiIcon[] = [];
	for (const { icon, sequence } of allItems) {
		const keyword = getEmojiSequenceKeyword(
			getUnqualifiedEmojiSequence(sequence)
		);
		preparedIcons.push({
			icon,
			sequence: keyword,
		});
	}

	// Regular expression is generated from qualified sequences
	const sequences: number[][] = [];
	for (const item of allItems) {
		sequences.push(item.sequence);
	}
	const regex = createOptimisedRegexForEmojiSequences(sequences);

	return {
		regex,
		icons: preparedIcons,
	};
}
/**
 * Same as prepareEmojiForIconsList(), but takes characters map from an icon set
 *
 * Test data is optional. It should be fetched from
 * 'https://unicode.org/Public/emoji/15.0/emoji-test.txt'
 * When available, it is used to detect missing emojis and optimise regular expression
 */
export function prepareEmojiForIconSet(
	iconSet: IconifyJSON,
	rawTestData?: string
): PreparedEmojiResult {
	// Icon set without characters map is handled as an empty map
	const chars = iconSet.chars || {};
	return prepareEmojiForIconsList(chars, rawTestData);
}

View File

@ -21,6 +21,9 @@ export interface EmojiRegexMatch {
// Icon name // Icon name
keyword: string; keyword: string;
// Regex index, used if multiple regular expressions were provided
regexp: number;
} }
/** /**
@ -45,54 +48,56 @@ interface PrevNextMatch extends PrevMatch {
* Returns only one entry per match * Returns only one entry per match
*/ */
export function getEmojiMatchesInText( export function getEmojiMatchesInText(
regexp: string | RegExp, regexp: string | RegExp | (string | RegExp)[],
content: string content: string
): EmojiRegexMatch[] { ): EmojiRegexMatch[] {
const results: EmojiRegexMatch[] = []; const results: EmojiRegexMatch[] = [];
const matches = content.match( const found: Set<string> = new Set();
typeof regexp === 'string' ? createEmojiRegExp(regexp) : regexp (regexp instanceof Array ? regexp : [regexp]).forEach((regexp, index) => {
); const matches = content.match(
typeof regexp === 'string' ? createEmojiRegExp(regexp) : regexp
);
if (matches) { if (matches) {
// Sort matches by length to make sure longest matches get replaced first // Add all matches
matches.sort((a, b) => { for (let i = 0; i < matches.length; i++) {
if (b.length === a.length) { const match = matches[i];
return a.localeCompare(b); if (found.has(match)) {
} continue;
return b.length - a.length;
});
// Add all matches
let lastMatch: EmojiRegexMatch | undefined;
for (let i = 0; i < matches.length; i++) {
const match = matches[i];
if (lastMatch && lastMatch.match === match) {
continue;
}
// Get sequence
const sequence: number[] = [];
for (const codePoint of match) {
const num = codePoint.codePointAt(0) as number;
if (num !== vs16Emoji) {
sequence.push(num);
} }
found.add(match);
// Get sequence
const sequence: number[] = [];
for (const codePoint of match) {
const num = codePoint.codePointAt(0) as number;
if (num !== vs16Emoji) {
sequence.push(num);
}
}
// Add result
results.push({
match,
sequence,
keyword: getEmojiSequenceKeyword(
convertEmojiSequenceToUTF32(sequence)
),
regexp: index,
});
} }
// Get keyword
const keyword = getEmojiSequenceKeyword(
convertEmojiSequenceToUTF32(sequence)
);
lastMatch = {
match,
sequence,
keyword,
};
results.push(lastMatch);
} }
} });
// Sort matches by length to make sure longest matches get replaced first
results.sort((a, b) => {
const match1 = a.match;
const match2 = b.match;
if (match2.length === match1.length) {
return match1.localeCompare(match2);
}
return match2.length - match1.length;
});
return results; return results;
} }

View File

@ -22,7 +22,7 @@ export type FindAndReplaceEmojisInTextCallback = (
* Returns null if nothing was replaced * Returns null if nothing was replaced
*/ */
export function findAndReplaceEmojisInText( export function findAndReplaceEmojisInText(
regexp: string | RegExp, regexp: string | RegExp | (string | RegExp)[],
content: string, content: string,
callback: FindAndReplaceEmojisInTextCallback callback: FindAndReplaceEmojisInTextCallback
): string | null { ): string | null {

View File

@ -104,11 +104,15 @@ export {
} from './emoji/format'; } from './emoji/format';
export { parseEmojiTestFile } from './emoji/test/parse'; export { parseEmojiTestFile } from './emoji/test/parse';
export { getQualifiedEmojiVariations } from './emoji/test/variations'; export { getQualifiedEmojiVariations } from './emoji/test/variations';
// export { getEmojisSequencesToCopy } from './emoji/test/copy'; export { findMissingEmojis } from './emoji/test/missing';
export { export {
createOptimisedRegex, createOptimisedRegex,
createOptimisedRegexForEmojiSequences, createOptimisedRegexForEmojiSequences,
} from './emoji/regex/create'; } from './emoji/regex/create';
export {
prepareEmojiForIconsList,
prepareEmojiForIconSet,
} from './emoji/parse';
export { findAndReplaceEmojisInText } from './emoji/replace/replace'; export { findAndReplaceEmojisInText } from './emoji/replace/replace';
// Misc // Misc

View File

@ -0,0 +1,208 @@
import { readFile, writeFile, unlink } from 'node:fs/promises';
import { emojiVersion } from '../lib/emoji/data';
import { prepareEmojiForIconsList } from '../lib/emoji/parse';
describe('Testing unicode test data', () => {
/**
 * Get content of Unicode emoji test file for current emoji version
 *
 * Uses cached copy from fixtures directory if it can be read and is not empty,
 * otherwise downloads file from unicode.org and writes it to cache.
 *
 * @returns File content, or undefined if content does not contain the expected
 * version header. In that case cached file is removed.
 */
async function fetchEmojiTestData(): Promise<string | undefined> {
	// Fetch emojis, cache it
	const source = `tests/fixtures/download-emoji-${emojiVersion}.txt`;

	let data: string | undefined;
	try {
		data = await readFile(source, 'utf8');
	} catch {
		// Cache could not be read: file is downloaded below
	}

	if (!data) {
		const response = await fetch(
			`https://unicode.org/Public/emoji/${emojiVersion}/emoji-test.txt`
		);
		// Response.text() returns a promise, so it must be awaited. Without
		// await, string '[object Promise]' is cached instead of file content
		data = await response.text();
		await writeFile(source, data, 'utf8');
	}

	// Test content, unlink cache on failure
	if (!data.includes(`# Version: ${emojiVersion}`)) {
		try {
			await unlink(source);
		} catch {
			// Failing to remove cache is not critical
		}
		return;
	}

	return data;
}
let data: string | undefined;
beforeAll(async () => {
data = await fetchEmojiTestData();
});
it('Preparing icon set without test data', () => {
// One emoji
expect(
prepareEmojiForIconsList({
'2615': 'hot-beverage',
})
).toEqual({
icons: [
{
icon: 'hot-beverage',
sequence: '2615',
},
],
regex: '\\u2615\\uFE0F?',
});
// Multiple emojis
expect(
prepareEmojiForIconsList({
'2615': 'hot-beverage',
'1f1e6-1f1e8': 'flag-ascension-island',
'1f1e6-1f1e9': 'flag-andorra',
'1f1e6-1f1ea': 'flag-united-arab-emirates',
})
).toEqual({
icons: [
{
icon: 'hot-beverage',
sequence: '2615',
},
{
icon: 'flag-ascension-island',
sequence: '1f1e6-1f1e8',
},
{
icon: 'flag-andorra',
sequence: '1f1e6-1f1e9',
},
{
icon: 'flag-united-arab-emirates',
sequence: '1f1e6-1f1ea',
},
],
regex: '\\uD83C\\uDDE6\\uD83C[\\uDDE8-\\uDDEA]|\\u2615\\uFE0F?',
});
});
it('Preparing icon set with test data', () => {
if (!data) {
console.warn('Test skipped: test data is not available');
return;
}
// One emoji without variation
expect(
prepareEmojiForIconsList(
{
'2615': 'hot-beverage',
},
data
)
).toEqual({
icons: [
{
icon: 'hot-beverage',
sequence: '2615',
},
],
regex: '\\u2615',
});
// One emoji with variation
expect(
prepareEmojiForIconsList(
{
// Upper case
'263A': 'smiling-face',
},
data
)
).toEqual({
icons: [
{
icon: 'smiling-face',
// Lower case
sequence: '263a',
},
],
regex: '\\u263A\\uFE0F?',
});
// One emoji that has components in test data
expect(
prepareEmojiForIconsList(
{
'270b': 'raised-hand',
},
data
)
).toEqual({
icons: [
{
icon: 'raised-hand',
sequence: '270b',
},
{
icon: 'raised-hand',
sequence: '270b-1f3fb',
},
{
icon: 'raised-hand',
sequence: '270b-1f3fc',
},
{
icon: 'raised-hand',
sequence: '270b-1f3fd',
},
{
icon: 'raised-hand',
sequence: '270b-1f3fe',
},
{
icon: 'raised-hand',
sequence: '270b-1f3ff',
},
],
regex: '\\u270B(?:\\uD83C[\\uDFFB-\\uDFFF])?',
});
// Multiple emojis, all without variations
expect(
prepareEmojiForIconsList(
{
'2615': 'hot-beverage',
'1f1e6-1f1e8': 'flag-ascension-island',
'1f1e6-1f1e9': 'flag-andorra',
'1f1e6-1f1ea': 'flag-united-arab-emirates',
},
data
)
).toEqual({
icons: [
{
icon: 'hot-beverage',
sequence: '2615',
},
{
icon: 'flag-ascension-island',
sequence: '1f1e6-1f1e8',
},
{
icon: 'flag-andorra',
sequence: '1f1e6-1f1e9',
},
{
icon: 'flag-united-arab-emirates',
sequence: '1f1e6-1f1ea',
},
],
regex: '\\uD83C\\uDDE6\\uD83C[\\uDDE8-\\uDDEA]|\\u2615',
});
});
});

View File

@ -44,23 +44,27 @@ describe('Finding emojis in text', () => {
match: '\u263A\uFE0F', match: '\u263A\uFE0F',
sequence: [0x263a], sequence: [0x263a],
keyword: '263a', keyword: '263a',
regexp: 0,
}, },
{ {
// Should be returned only once // Should be returned only once
match: String.fromCodePoint(0x1f600), match: String.fromCodePoint(0x1f600),
sequence: [0x1f600], sequence: [0x1f600],
keyword: '1f600', keyword: '1f600',
regexp: 0,
}, },
{ {
match: String.fromCodePoint(0x1f603), match: String.fromCodePoint(0x1f603),
sequence: [0x1f603], sequence: [0x1f603],
keyword: '1f603', keyword: '1f603',
regexp: 0,
}, },
{ {
// Same as first, but without 'FE0F' // Same as first, but without 'FE0F'
match: '\u263A', match: '\u263A',
sequence: [0x263a], sequence: [0x263a],
keyword: '263a', keyword: '263a',
regexp: 0,
}, },
]); ]);
@ -72,6 +76,7 @@ describe('Finding emojis in text', () => {
match: emoji1, match: emoji1,
sequence: [0x1f600], sequence: [0x1f600],
keyword: '1f600', keyword: '1f600',
regexp: 0,
}, },
prev: text1, prev: text1,
next: text2, next: text2,
@ -81,6 +86,7 @@ describe('Finding emojis in text', () => {
match: emoji2, match: emoji2,
sequence: [0x1f603], sequence: [0x1f603],
keyword: '1f603', keyword: '1f603',
regexp: 0,
}, },
prev: text2, prev: text2,
next: text3, next: text3,
@ -90,6 +96,7 @@ describe('Finding emojis in text', () => {
match: emoji3, match: emoji3,
sequence: [0x1f600], sequence: [0x1f600],
keyword: '1f600', keyword: '1f600',
regexp: 0,
}, },
prev: text3, prev: text3,
next: text4, next: text4,
@ -99,6 +106,7 @@ describe('Finding emojis in text', () => {
match: emoji4, match: emoji4,
sequence: [0x263a], sequence: [0x263a],
keyword: '263a', keyword: '263a',
regexp: 0,
}, },
prev: text4, prev: text4,
next: text5, next: text5,
@ -108,6 +116,122 @@ describe('Finding emojis in text', () => {
match: emoji5, match: emoji5,
sequence: [0x263a], sequence: [0x263a],
keyword: '263a', keyword: '263a',
regexp: 0,
},
prev: text5,
next: text6,
},
]);
});
it('Multiple regex', () => {
const regex0 = createOptimisedRegex(['1F600', '1F603', '1F604']);
const regex1 = createOptimisedRegex(['263A FE0F']);
const text1 = 'E1.0 grinning face: ';
const emoji1 = String.fromCodePoint(0x1f600);
const text2 = '\nE0.6 grinning face with big eyes: ';
const emoji2 = String.fromCodePoint(0x1f603);
const text3 = 'E1.0 grinning face: ';
const emoji3 = emoji1;
const text4 = 'E0.6 smiling face: ';
const emoji4 = '\u263A\uFE0F';
const text5 = '(fully-qualified)\nE0.6 smiling face: ';
const emoji5 = '\u263A';
const text6 = '(unqualified)';
const content =
text1 +
emoji1 +
text2 +
emoji2 +
text3 +
emoji3 +
text4 +
emoji4 +
text5 +
emoji5 +
text6;
const matches = getEmojiMatchesInText([regex0, regex1], content);
expect(matches).toEqual([
{
match: '\u263A\uFE0F',
sequence: [0x263a],
keyword: '263a',
regexp: 1,
},
{
// Should be returned only once
match: String.fromCodePoint(0x1f600),
sequence: [0x1f600],
keyword: '1f600',
regexp: 0,
},
{
match: String.fromCodePoint(0x1f603),
sequence: [0x1f603],
keyword: '1f603',
regexp: 0,
},
{
// Same as first, but without 'FE0F'
match: '\u263A',
sequence: [0x263a],
keyword: '263a',
regexp: 1,
},
]);
const sortedMatches = sortEmojiMatchesInText(content, matches);
expect(sortedMatches).toEqual([
// Same order as in content
{
match: {
match: emoji1,
sequence: [0x1f600],
keyword: '1f600',
regexp: 0,
},
prev: text1,
next: text2,
},
{
match: {
match: emoji2,
sequence: [0x1f603],
keyword: '1f603',
regexp: 0,
},
prev: text2,
next: text3,
},
{
match: {
match: emoji3,
sequence: [0x1f600],
keyword: '1f600',
regexp: 0,
},
prev: text3,
next: text4,
},
{
match: {
match: emoji4,
sequence: [0x263a],
keyword: '263a',
regexp: 1,
},
prev: text4,
next: text5,
},
{
match: {
match: emoji5,
sequence: [0x263a],
keyword: '263a',
regexp: 1,
}, },
prev: text5, prev: text5,
next: text6, next: text6,

View File

@ -8,7 +8,7 @@ import {
minUTF32, minUTF32,
emojiVersion, emojiVersion,
} from '../lib/emoji/data'; } from '../lib/emoji/data';
import { parseEmojiTestFile } from '../lib/emoji/test/parse'; import { EmojiTestDataItem, parseEmojiTestFile } from '../lib/emoji/test/parse';
import { import {
mapEmojiTestDataComponents, mapEmojiTestDataComponents,
replaceEmojiComponentsInCombinedSequence, replaceEmojiComponentsInCombinedSequence,
@ -713,7 +713,11 @@ describe('Testing unicode test data', () => {
const tree = getEmojiTestDataTree(splitTestData); const tree = getEmojiTestDataTree(splitTestData);
// Use test data // Use test data
const testList = []; interface TestListItem extends EmojiTestDataItem {
// Add it for easier testing
sequenceKey: string;
}
const testList: TestListItem[] = [];
for (const sequenceKey in testData) { for (const sequenceKey in testData) {
testList.push({ testList.push({
...testData[sequenceKey], ...testData[sequenceKey],