mirror of
https://github.com/iconify/iconify.git
synced 2025-01-07 15:44:05 +00:00
feat(utils): function to prepare emoji data for icons list, support multiple regexp
This commit is contained in:
parent
e5dbd00cba
commit
b5e9ecd2b4
@ -3,7 +3,7 @@
|
||||
"type": "module",
|
||||
"description": "Common functions for working with Iconify icon sets used by various packages.",
|
||||
"author": "Vjacheslav Trushkin",
|
||||
"version": "2.0.4",
|
||||
"version": "2.0.5",
|
||||
"license": "MIT",
|
||||
"bugs": "https://github.com/iconify/iconify/issues",
|
||||
"homepage": "https://iconify.design/",
|
||||
@ -192,6 +192,11 @@
|
||||
"import": "./lib/emoji/format.mjs",
|
||||
"types": "./lib/emoji/format.d.ts"
|
||||
},
|
||||
"./lib/emoji/parse": {
|
||||
"require": "./lib/emoji/parse.cjs",
|
||||
"import": "./lib/emoji/parse.mjs",
|
||||
"types": "./lib/emoji/parse.d.ts"
|
||||
},
|
||||
"./lib/icon-set/convert-info": {
|
||||
"require": "./lib/icon-set/convert-info.cjs",
|
||||
"import": "./lib/icon-set/convert-info.mjs",
|
||||
|
109
packages/utils/src/emoji/parse.ts
Normal file
109
packages/utils/src/emoji/parse.ts
Normal file
@ -0,0 +1,109 @@
|
||||
import type { IconifyJSON } from '@iconify/types';
|
||||
import {
|
||||
getEmojiSequenceFromString,
|
||||
getUnqualifiedEmojiSequence,
|
||||
} from './cleanup';
|
||||
import { getEmojiSequenceKeyword } from './format';
|
||||
import { createOptimisedRegexForEmojiSequences } from './regex/create';
|
||||
import { findMissingEmojis } from './test/missing';
|
||||
import { parseEmojiTestFile } from './test/parse';
|
||||
import { combineSimilarEmojiTestData } from './test/similar';
|
||||
import { getEmojiTestDataTree } from './test/tree';
|
||||
import { getQualifiedEmojiVariations } from './test/variations';
|
||||
|
||||
/**
 * One icon prepared for emoji lookup
 *
 * Pairs an icon name with the emoji sequence it represents.
 */
export interface PreparedEmojiIcon {
	// Icon name
	icon: string;

	// Emoji sequence as string, such as '1f1e6-1f1e8'
	sequence: string;
}
|
||||
|
||||
/**
 * Result of preparing emojis
 *
 * Contains the list of icons and the regular expression built for their sequences.
 */
export interface PreparedEmojiResult {
	// List of icons
	icons: PreparedEmojiIcon[];

	// Regular expression, as string
	regex: string;
}
|
||||
|
||||
/**
 * Prepare emoji for icons list
 *
 * Test data should be fetched from 'https://unicode.org/Public/emoji/15.0/emoji-test.txt'
 * It is used to detect missing emojis and optimise regular expression
 *
 * @param icons - Map where key is emoji sequence as string (such as '1f1e6-1f1e8'), value is icon name
 * @param rawTestData - Optional content of emoji-test.txt. If missing, missing emojis are not added
 * @returns List of icons with their sequences and regular expression for those sequences
 */
export function prepareEmojiForIconsList(
	icons: Record<string, string>,
	rawTestData?: string
): PreparedEmojiResult {
	// Prepare test data
	const testData = rawTestData ? parseEmojiTestFile(rawTestData) : void 0;

	// Convert icons to object
	interface IconsListItem {
		icon: string;
		sequence: number[];
	}
	let iconsList: IconsListItem[] = [];
	for (const char in icons) {
		const sequence = getEmojiSequenceFromString(char);
		iconsList.push({
			icon: icons[char],
			sequence,
		});
	}

	// Get fully-qualified versions of emojis
	iconsList = getQualifiedEmojiVariations(iconsList, testData);

	// Find and add missing emojis if test data is available
	if (testData) {
		iconsList = iconsList.concat(
			findMissingEmojis(
				iconsList,
				getEmojiTestDataTree(combineSimilarEmojiTestData(testData))
			)
		);
	}

	// Prepare icons list: sequences in result are unqualified and
	// converted to keywords
	const preparedIcons: PreparedEmojiIcon[] = iconsList.map((item) => {
		const sequence = getEmojiSequenceKeyword(
			getUnqualifiedEmojiSequence(item.sequence)
		);
		return {
			icon: item.icon,
			sequence,
		};
	});

	// Prepare regex: uses sequences from iconsList as is, not the
	// unqualified sequences used in preparedIcons
	const regex = createOptimisedRegexForEmojiSequences(
		iconsList.map((item) => item.sequence)
	);

	return {
		regex,
		icons: preparedIcons,
	};
}
|
||||
|
||||
/**
 * Prepare emoji for an icon set
 *
 * Test data should be fetched from 'https://unicode.org/Public/emoji/15.0/emoji-test.txt'
 * It is used to detect missing emojis and optimise regular expression
 *
 * @param iconSet - Icon set. Its `chars` property is used as icons list, empty list is used if it is missing
 * @param rawTestData - Optional content of emoji-test.txt, passed to prepareEmojiForIconsList()
 * @returns Same result as prepareEmojiForIconsList()
 */
export function prepareEmojiForIconSet(
	iconSet: IconifyJSON,
	rawTestData?: string
): PreparedEmojiResult {
	return prepareEmojiForIconsList(iconSet.chars || {}, rawTestData);
}
|
@ -21,6 +21,9 @@ export interface EmojiRegexMatch {
|
||||
|
||||
// Icon name
|
||||
keyword: string;
|
||||
|
||||
// Regex index, used if multiple regular expressions were provided
|
||||
regexp: number;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -45,54 +48,56 @@ interface PrevNextMatch extends PrevMatch {
|
||||
* Returns only one entry per match
|
||||
*/
|
||||
export function getEmojiMatchesInText(
	regexp: string | RegExp | (string | RegExp)[],
	content: string
): EmojiRegexMatch[] {
	const results: EmojiRegexMatch[] = [];

	// Matched strings that were already added, shared between all regular
	// expressions, so each unique match is returned only once
	const found: Set<string> = new Set();

	// Treat single regular expression as list with one entry, so `regexp`
	// property in results is 0 for it
	const expressions = Array.isArray(regexp) ? regexp : [regexp];
	expressions.forEach((expression, index) => {
		// Strings are converted with createEmojiRegExp(), RegExp instances
		// are used as is.
		// NOTE(review): presumably RegExp instances must have the global
		// flag, otherwise match() also returns capture groups - confirm
		const matches = content.match(
			typeof expression === 'string'
				? createEmojiRegExp(expression)
				: expression
		);

		if (matches) {
			// Add all matches
			for (let i = 0; i < matches.length; i++) {
				const match = matches[i];
				if (found.has(match)) {
					continue;
				}
				found.add(match);

				// Get sequence, skipping variation selector
				const sequence: number[] = [];
				for (const codePoint of match) {
					const num = codePoint.codePointAt(0) as number;
					if (num !== vs16Emoji) {
						sequence.push(num);
					}
				}

				// Add result
				results.push({
					match,
					sequence,
					keyword: getEmojiSequenceKeyword(
						convertEmojiSequenceToUTF32(sequence)
					),
					regexp: index,
				});
			}
		}
	});

	// Sort matches by length to make sure longest matches get replaced first
	results.sort((a, b) => {
		const match1 = a.match;
		const match2 = b.match;
		if (match2.length === match1.length) {
			return match1.localeCompare(match2);
		}
		return match2.length - match1.length;
	});

	return results;
}
|
||||
|
@ -22,7 +22,7 @@ export type FindAndReplaceEmojisInTextCallback = (
|
||||
* Returns null if nothing was replaced
|
||||
*/
|
||||
export function findAndReplaceEmojisInText(
|
||||
regexp: string | RegExp,
|
||||
regexp: string | RegExp | (string | RegExp)[],
|
||||
content: string,
|
||||
callback: FindAndReplaceEmojisInTextCallback
|
||||
): string | null {
|
||||
|
@ -104,11 +104,15 @@ export {
|
||||
} from './emoji/format';
|
||||
export { parseEmojiTestFile } from './emoji/test/parse';
|
||||
export { getQualifiedEmojiVariations } from './emoji/test/variations';
|
||||
// export { getEmojisSequencesToCopy } from './emoji/test/copy';
|
||||
export { findMissingEmojis } from './emoji/test/missing';
|
||||
export {
|
||||
createOptimisedRegex,
|
||||
createOptimisedRegexForEmojiSequences,
|
||||
} from './emoji/regex/create';
|
||||
export {
|
||||
prepareEmojiForIconsList,
|
||||
prepareEmojiForIconSet,
|
||||
} from './emoji/parse';
|
||||
export { findAndReplaceEmojisInText } from './emoji/replace/replace';
|
||||
|
||||
// Misc
|
||||
|
208
packages/utils/tests/emoji-parse-test.ts
Normal file
208
packages/utils/tests/emoji-parse-test.ts
Normal file
@ -0,0 +1,208 @@
|
||||
import { readFile, writeFile, unlink } from 'node:fs/promises';
|
||||
import { emojiVersion } from '../lib/emoji/data';
|
||||
import { prepareEmojiForIconsList } from '../lib/emoji/parse';
|
||||
|
||||
describe('Testing unicode test data', () => {
	/**
	 * Get content of emoji-test.txt for current emoji version
	 *
	 * Reads cached copy from fixtures directory. If cache is missing or empty,
	 * downloads file from unicode.org and writes it to cache.
	 *
	 * Returns undefined if file could not be downloaded or if content does
	 * not contain expected version line, in which case cache is removed.
	 */
	async function fetchEmojiTestData(): Promise<string | undefined> {
		// Fetch emojis, cache it
		const source = `tests/fixtures/download-emoji-${emojiVersion}.txt`;

		let data: string | undefined;
		try {
			data = await readFile(source, 'utf8');
		} catch {
			// Cache does not exist yet
		}

		if (!data) {
			try {
				const response = await fetch(
					`https://unicode.org/Public/emoji/${emojiVersion}/emoji-test.txt`
				);
				if (!response.ok) {
					return;
				}

				// text() returns a Promise, it must be awaited to get content
				data = await response.text();
			} catch {
				// Download failed: tests that need data check for
				// undefined and skip themselves
				return;
			}
			await writeFile(source, data, 'utf8');
		}

		// Test content, unlink cache on failure
		if (data.indexOf(`# Version: ${emojiVersion}`) === -1) {
			try {
				await unlink(source);
			} catch {
				// Nothing to remove
			}
			return;
		}
		return data;
	}

	// Content of emoji-test.txt, undefined if not available
	let data: string | undefined;

	beforeAll(async () => {
		data = await fetchEmojiTestData();
	});

	it('Preparing icon set without test data', () => {
		// One emoji
		expect(
			prepareEmojiForIconsList({
				'2615': 'hot-beverage',
			})
		).toEqual({
			icons: [
				{
					icon: 'hot-beverage',
					sequence: '2615',
				},
			],
			regex: '\\u2615\\uFE0F?',
		});

		// Multiple emojis
		expect(
			prepareEmojiForIconsList({
				'2615': 'hot-beverage',
				'1f1e6-1f1e8': 'flag-ascension-island',
				'1f1e6-1f1e9': 'flag-andorra',
				'1f1e6-1f1ea': 'flag-united-arab-emirates',
			})
		).toEqual({
			icons: [
				{
					icon: 'hot-beverage',
					sequence: '2615',
				},
				{
					icon: 'flag-ascension-island',
					sequence: '1f1e6-1f1e8',
				},
				{
					icon: 'flag-andorra',
					sequence: '1f1e6-1f1e9',
				},
				{
					icon: 'flag-united-arab-emirates',
					sequence: '1f1e6-1f1ea',
				},
			],
			regex: '\\uD83C\\uDDE6\\uD83C[\\uDDE8-\\uDDEA]|\\u2615\\uFE0F?',
		});
	});

	it('Preparing icon set with test data', () => {
		if (!data) {
			console.warn('Test skipped: test data is not available');
			return;
		}

		// One emoji without variation
		expect(
			prepareEmojiForIconsList(
				{
					'2615': 'hot-beverage',
				},
				data
			)
		).toEqual({
			icons: [
				{
					icon: 'hot-beverage',
					sequence: '2615',
				},
			],
			regex: '\\u2615',
		});

		// One emoji with variation
		expect(
			prepareEmojiForIconsList(
				{
					// Upper case
					'263A': 'smiling-face',
				},
				data
			)
		).toEqual({
			icons: [
				{
					icon: 'smiling-face',
					// Lower case
					sequence: '263a',
				},
			],
			regex: '\\u263A\\uFE0F?',
		});

		// One emoji that has components in test data
		expect(
			prepareEmojiForIconsList(
				{
					'270b': 'raised-hand',
				},
				data
			)
		).toEqual({
			icons: [
				{
					icon: 'raised-hand',
					sequence: '270b',
				},
				{
					icon: 'raised-hand',
					sequence: '270b-1f3fb',
				},
				{
					icon: 'raised-hand',
					sequence: '270b-1f3fc',
				},
				{
					icon: 'raised-hand',
					sequence: '270b-1f3fd',
				},
				{
					icon: 'raised-hand',
					sequence: '270b-1f3fe',
				},
				{
					icon: 'raised-hand',
					sequence: '270b-1f3ff',
				},
			],
			regex: '\\u270B(?:\\uD83C[\\uDFFB-\\uDFFF])?',
		});

		// Multiple emojis, all without variations
		expect(
			prepareEmojiForIconsList(
				{
					'2615': 'hot-beverage',
					'1f1e6-1f1e8': 'flag-ascension-island',
					'1f1e6-1f1e9': 'flag-andorra',
					'1f1e6-1f1ea': 'flag-united-arab-emirates',
				},
				data
			)
		).toEqual({
			icons: [
				{
					icon: 'hot-beverage',
					sequence: '2615',
				},
				{
					icon: 'flag-ascension-island',
					sequence: '1f1e6-1f1e8',
				},
				{
					icon: 'flag-andorra',
					sequence: '1f1e6-1f1e9',
				},
				{
					icon: 'flag-united-arab-emirates',
					sequence: '1f1e6-1f1ea',
				},
			],
			regex: '\\uD83C\\uDDE6\\uD83C[\\uDDE8-\\uDDEA]|\\u2615',
		});
	});
});
|
@ -44,23 +44,27 @@ describe('Finding emojis in text', () => {
|
||||
match: '\u263A\uFE0F',
|
||||
sequence: [0x263a],
|
||||
keyword: '263a',
|
||||
regexp: 0,
|
||||
},
|
||||
{
|
||||
// Should be returned only once
|
||||
match: String.fromCodePoint(0x1f600),
|
||||
sequence: [0x1f600],
|
||||
keyword: '1f600',
|
||||
regexp: 0,
|
||||
},
|
||||
{
|
||||
match: String.fromCodePoint(0x1f603),
|
||||
sequence: [0x1f603],
|
||||
keyword: '1f603',
|
||||
regexp: 0,
|
||||
},
|
||||
{
|
||||
// Same as first, but without 'FE0F'
|
||||
match: '\u263A',
|
||||
sequence: [0x263a],
|
||||
keyword: '263a',
|
||||
regexp: 0,
|
||||
},
|
||||
]);
|
||||
|
||||
@ -72,6 +76,7 @@ describe('Finding emojis in text', () => {
|
||||
match: emoji1,
|
||||
sequence: [0x1f600],
|
||||
keyword: '1f600',
|
||||
regexp: 0,
|
||||
},
|
||||
prev: text1,
|
||||
next: text2,
|
||||
@ -81,6 +86,7 @@ describe('Finding emojis in text', () => {
|
||||
match: emoji2,
|
||||
sequence: [0x1f603],
|
||||
keyword: '1f603',
|
||||
regexp: 0,
|
||||
},
|
||||
prev: text2,
|
||||
next: text3,
|
||||
@ -90,6 +96,7 @@ describe('Finding emojis in text', () => {
|
||||
match: emoji3,
|
||||
sequence: [0x1f600],
|
||||
keyword: '1f600',
|
||||
regexp: 0,
|
||||
},
|
||||
prev: text3,
|
||||
next: text4,
|
||||
@ -99,6 +106,7 @@ describe('Finding emojis in text', () => {
|
||||
match: emoji4,
|
||||
sequence: [0x263a],
|
||||
keyword: '263a',
|
||||
regexp: 0,
|
||||
},
|
||||
prev: text4,
|
||||
next: text5,
|
||||
@ -108,6 +116,122 @@ describe('Finding emojis in text', () => {
|
||||
match: emoji5,
|
||||
sequence: [0x263a],
|
||||
keyword: '263a',
|
||||
regexp: 0,
|
||||
},
|
||||
prev: text5,
|
||||
next: text6,
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('Multiple regex', () => {
|
||||
const regex0 = createOptimisedRegex(['1F600', '1F603', '1F604']);
|
||||
const regex1 = createOptimisedRegex(['263A FE0F']);
|
||||
|
||||
const text1 = 'E1.0 grinning face: ';
|
||||
const emoji1 = String.fromCodePoint(0x1f600);
|
||||
const text2 = '\nE0.6 grinning face with big eyes: ';
|
||||
const emoji2 = String.fromCodePoint(0x1f603);
|
||||
const text3 = 'E1.0 grinning face: ';
|
||||
const emoji3 = emoji1;
|
||||
const text4 = 'E0.6 smiling face: ';
|
||||
const emoji4 = '\u263A\uFE0F';
|
||||
const text5 = '(fully-qualified)\nE0.6 smiling face: ';
|
||||
const emoji5 = '\u263A';
|
||||
const text6 = '(unqualified)';
|
||||
|
||||
const content =
|
||||
text1 +
|
||||
emoji1 +
|
||||
text2 +
|
||||
emoji2 +
|
||||
text3 +
|
||||
emoji3 +
|
||||
text4 +
|
||||
emoji4 +
|
||||
text5 +
|
||||
emoji5 +
|
||||
text6;
|
||||
const matches = getEmojiMatchesInText([regex0, regex1], content);
|
||||
|
||||
expect(matches).toEqual([
|
||||
{
|
||||
match: '\u263A\uFE0F',
|
||||
sequence: [0x263a],
|
||||
keyword: '263a',
|
||||
regexp: 1,
|
||||
},
|
||||
{
|
||||
// Should be returned only once
|
||||
match: String.fromCodePoint(0x1f600),
|
||||
sequence: [0x1f600],
|
||||
keyword: '1f600',
|
||||
regexp: 0,
|
||||
},
|
||||
{
|
||||
match: String.fromCodePoint(0x1f603),
|
||||
sequence: [0x1f603],
|
||||
keyword: '1f603',
|
||||
regexp: 0,
|
||||
},
|
||||
{
|
||||
// Same as first, but without 'FE0F'
|
||||
match: '\u263A',
|
||||
sequence: [0x263a],
|
||||
keyword: '263a',
|
||||
regexp: 1,
|
||||
},
|
||||
]);
|
||||
|
||||
const sortedMatches = sortEmojiMatchesInText(content, matches);
|
||||
expect(sortedMatches).toEqual([
|
||||
// Same order as in content
|
||||
{
|
||||
match: {
|
||||
match: emoji1,
|
||||
sequence: [0x1f600],
|
||||
keyword: '1f600',
|
||||
regexp: 0,
|
||||
},
|
||||
prev: text1,
|
||||
next: text2,
|
||||
},
|
||||
{
|
||||
match: {
|
||||
match: emoji2,
|
||||
sequence: [0x1f603],
|
||||
keyword: '1f603',
|
||||
regexp: 0,
|
||||
},
|
||||
prev: text2,
|
||||
next: text3,
|
||||
},
|
||||
{
|
||||
match: {
|
||||
match: emoji3,
|
||||
sequence: [0x1f600],
|
||||
keyword: '1f600',
|
||||
regexp: 0,
|
||||
},
|
||||
prev: text3,
|
||||
next: text4,
|
||||
},
|
||||
{
|
||||
match: {
|
||||
match: emoji4,
|
||||
sequence: [0x263a],
|
||||
keyword: '263a',
|
||||
regexp: 1,
|
||||
},
|
||||
prev: text4,
|
||||
next: text5,
|
||||
},
|
||||
{
|
||||
match: {
|
||||
match: emoji5,
|
||||
sequence: [0x263a],
|
||||
keyword: '263a',
|
||||
regexp: 1,
|
||||
},
|
||||
prev: text5,
|
||||
next: text6,
|
||||
|
@ -8,7 +8,7 @@ import {
|
||||
minUTF32,
|
||||
emojiVersion,
|
||||
} from '../lib/emoji/data';
|
||||
import { parseEmojiTestFile } from '../lib/emoji/test/parse';
|
||||
import { EmojiTestDataItem, parseEmojiTestFile } from '../lib/emoji/test/parse';
|
||||
import {
|
||||
mapEmojiTestDataComponents,
|
||||
replaceEmojiComponentsInCombinedSequence,
|
||||
@ -713,7 +713,11 @@ describe('Testing unicode test data', () => {
|
||||
const tree = getEmojiTestDataTree(splitTestData);
|
||||
|
||||
// Use test data
|
||||
const testList = [];
|
||||
interface TestListItem extends EmojiTestDataItem {
|
||||
// Add it for easier testing
|
||||
sequenceKey: string;
|
||||
}
|
||||
const testList: TestListItem[] = [];
|
||||
for (const sequenceKey in testData) {
|
||||
testList.push({
|
||||
...testData[sequenceKey],
|
||||
|
Loading…
Reference in New Issue
Block a user