2
0
mirror of https://github.com/iconify/iconify.git synced 2024-09-19 16:59:02 +00:00

fix: various fixes for emojis, functions to find and replace emojis in text

This commit is contained in:
Vjacheslav Trushkin 2022-12-14 20:49:50 +02:00
parent 0410d73067
commit 9f5be0f10d
13 changed files with 643 additions and 88 deletions

View File

@ -127,6 +127,16 @@
"import": "./lib/emoji/regex/tree.mjs",
"types": "./lib/emoji/regex/tree.d.ts"
},
"./lib/emoji/replace/find": {
"require": "./lib/emoji/replace/find.cjs",
"import": "./lib/emoji/replace/find.mjs",
"types": "./lib/emoji/replace/find.d.ts"
},
"./lib/emoji/replace/replace": {
"require": "./lib/emoji/replace/replace.cjs",
"import": "./lib/emoji/replace/replace.mjs",
"types": "./lib/emoji/replace/replace.d.ts"
},
"./lib/emoji/cleanup": {
"require": "./lib/emoji/cleanup.cjs",
"import": "./lib/emoji/cleanup.mjs",

View File

@ -1,8 +1,14 @@
import { getEmojiCodePoint } from './convert';
import { emojiTones, joinerEmoji, vs16Emoji } from './data';
import { getEmojiSequenceString } from './format';
/**
* Get emoji sequence from string
*
* Examples (shows same emoji sequence formatted differently):
* '1F441 FE0F 200D 1F5E8 FE0F' => [0x1f441, 0xfe0f, 0x200d, 0x1f5e8, 0xfe0f]
* '1f441-fe0f-200d-1f5e8-fe0f' => [0x1f441, 0xfe0f, 0x200d, 0x1f5e8, 0xfe0f]
* '\\uD83D\\uDC41\\uFE0F\\u200D\\uD83D\\uDDE8\\uFE0F' => [0x1f441, 0xfe0f, 0x200d, 0x1f5e8, 0xfe0f]
*/
export function getEmojiSequenceFromString(value: string): number[] {
return value
@ -13,7 +19,12 @@ export function getEmojiSequenceFromString(value: string): number[] {
}
/**
* Split sequence by joiner
* Split emoji sequence by joiner
*
* Result represents one emoji, split in smaller sequences separated by 0x200D
*
* Example:
* [0x1FAF1, 0x1F3FB, 0x200D, 0x1FAF2, 0x1F3FC] => [[0x1FAF1, 0x1F3FB], [0x1FAF2, 0x1F3FC]]
*/
export function splitEmojiSequences(sequence: number[]): number[][] {
const results: number[][] = [];
@ -33,6 +44,11 @@ export function splitEmojiSequences(sequence: number[]): number[][] {
/**
* Join emoji sequences
*
* Parameter represents one emoji, split in smaller sequences
*
* Example:
* [[0x1FAF1, 0x1F3FB], [0x1FAF2, 0x1F3FC]] => [0x1FAF1, 0x1F3FB, 0x200D, 0x1FAF2, 0x1F3FC]
*/
export function joinEmojiSequences(sequences: number[][]): number[] {
let results: number[] = [];
@ -69,7 +85,19 @@ export function removeEmojiTones(sequence: number[]): number[] {
});
}
/**
* Types for mapEmojiSequence()
*/
type MapCallback = (sequence: number[]) => number[];
interface MapOptions {
removeEmpty?: boolean;
removeDuplicates?: boolean;
}
const mapOptions: Required<MapOptions> = {
removeEmpty: true,
removeDuplicates: false,
};
/**
* Run function on sequences
@ -79,10 +107,35 @@ type MapCallback = (sequence: number[]) => number[];
export function mapEmojiSequences(
sequences: number[][],
callback: MapCallback,
removeEmpty = true
options: MapOptions = {}
): number[][] {
const results = sequences.map((sequence) => callback(sequence));
return removeEmpty
? results.filter((sequence) => sequence.length > 0)
: results;
const fullOptions = {
...mapOptions,
...options,
};
const values: Set<string> = new Set();
const results: number[][] = [];
sequences.forEach((sequence) => {
const result = callback(sequence);
// Check for empty sequences
if (fullOptions.removeEmpty && !result.length) {
return;
}
// Check for duplicate
if (fullOptions.removeDuplicates) {
const value = getEmojiSequenceString(result);
if (values.has(value)) {
// duplicate
return;
}
values.add(value);
}
results.push(result);
});
return results;
}

View File

@ -60,6 +60,9 @@ function convert(
/**
* Convert unicode number to string
*
* Example:
* 0x1F600 => '1F600'
*/
export function getEmojiUnicodeString(
code: number,
@ -78,6 +81,9 @@ const defaultSequenceOptions: UnicodeFormattingOptions = {
/**
* Convert unicode numbers sequence to string
*
* Example:
* [0x1f441, 0xfe0f] => '1f441-fe0f'
*/
export function getEmojiSequenceString(
sequence: number[],
@ -88,25 +94,3 @@ export function getEmojiSequenceString(
...options,
});
}
const keywordOptions: UnicodeFormattingOptions = {
prefix: '',
separator: '-',
case: 'lower',
format: 'utf-32',
add0: true,
throwOnError: true,
};
/**
* Merge unicode numbers sequence as icon keyword
*/
export function emojiSequenceToKeyword(
sequence: number[],
throwOnError = true
): string {
return convert(sequence, {
...keywordOptions,
throwOnError,
});
}

View File

@ -1,5 +1,7 @@
import { getEmojiSequenceFromString } from './cleanup';
import { convertEmojiSequenceToUTF32 } from './convert';
import { getEmojiSequenceString } from './format';
import { getUnqualifiedEmojiSequence } from './variations';
// Emoji types
type EmojiType =
@ -62,3 +64,50 @@ export function parseEmojiTestFile(data: string): number[][] {
convertEmojiSequenceToUTF32(getEmojiSequenceFromString(item))
);
}
/**
 * Build map of unqualified => qualified emoji sequences from parsed test file
 *
 * Each key is emoji without `FE0F`, each value is the longest variation of
 * that emoji found in `sequences`.
 *
 * Without `toString` result is a Map of sequences. With `toString` result is
 * a plain object without prototype, where keys and values are strings
 * generated by that callback.
 */
export function getQualifiedEmojiSequencesMap(
	sequences: number[][]
): Map<number[], number[]>;
export function getQualifiedEmojiSequencesMap(
	sequences: number[][],
	toString: (value: number[]) => string
): Record<string, string>;
export function getQualifiedEmojiSequencesMap(
	sequences: number[][],
	toString?: (value: number[]) => string
): Map<number[], number[]> | Record<string, string> {
	const stringify = toString || getEmojiSequenceString;
	const longest = Object.create(null) as Record<string, string>;

	for (const sequence of sequences) {
		const qualified = stringify(sequence);
		const key = stringify(getUnqualifiedEmojiSequence(sequence));

		// Keep stored value if it is at least as long as the new one
		const stored = longest[key];
		if (stored && stored.length >= qualified.length) {
			continue;
		}
		longest[key] = qualified;
	}

	// Custom formatter: return strings as is
	if (toString) {
		return longest;
	}

	// Default formatter: convert strings back to sequences
	const sequencesMap: Map<number[], number[]> = new Map();
	Object.keys(longest).forEach((key) => {
		sequencesMap.set(
			getEmojiSequenceFromString(key),
			getEmojiSequenceFromString(longest[key])
		);
	});
	return sequencesMap;
}

View File

@ -0,0 +1,176 @@
import { convertEmojiSequenceToUTF32 } from '../convert';
import { vs16Emoji } from '../data';
import { getEmojiSequenceString } from '../format';
/**
 * Create RegExp instance from regular expression source
 *
 * Instance is always created with global (`g`) flag
 */
export function createEmojiRegExp(regexp: string): RegExp {
	const flags = 'g';
	return new RegExp(regexp, flags);
}
/**
* Emoji found in text
*
* Created by getEmojiMatchesInText(), one entry per unique matched string
*/
export interface EmojiRegexMatch {
// Matched text, exactly as found in content: this is the string to replace
match: string;
// Code points of matched text, with variation selector (`vs16Emoji`) removed
sequence: number[];
// Icon name: `sequence` converted to UTF-32 and formatted by getEmojiSequenceString(), such as '1f600'
keyword: string;
}
/**
* Match with text around it, returned by sortEmojiMatchesInText()
*/
interface PrevMatch {
// Match
match: EmojiRegexMatch;
// Content between previous emoji (or start of content, for first emoji) and this emoji
prev: string;
}
interface PrevNextMatch extends PrevMatch {
// Content between this emoji and next emoji (or end of content, for last emoji)
next: string;
}
/**
 * Find emojis in text
 *
 * Returns only one entry per unique matched string. Entries are sorted by
 * length of matched string, longest first, so longer emojis can be located
 * in text before shorter emojis that might be a part of them.
 *
 * @param regexp Regular expression source or instance. If instance does not
 *  have global flag, a global copy of it is used, so all emojis are found
 * @param content Text to search
 * @returns List of unique matches, empty list if nothing was found
 */
export function getEmojiMatchesInText(
	regexp: string | RegExp,
	content: string
): EmojiRegexMatch[] {
	// Regular expression must be global: without `g` flag match() returns
	// first match followed by capture groups, which are not emojis and
	// could be undefined
	let globalRegExp: RegExp;
	if (typeof regexp === 'string') {
		globalRegExp = createEmojiRegExp(regexp);
	} else if (regexp.global) {
		globalRegExp = regexp;
	} else {
		globalRegExp = new RegExp(regexp.source, regexp.flags + 'g');
	}

	const results: EmojiRegexMatch[] = [];
	const matches = content.match(globalRegExp);
	if (!matches) {
		return results;
	}

	// Sort matches by length to make sure longest matches get replaced first
	matches.sort((a, b) => {
		if (b.length === a.length) {
			return a.localeCompare(b);
		}
		return b.length - a.length;
	});

	// Add all matches, once per unique string. Set is used instead of
	// comparing with previous entry because localeCompare() is not
	// guaranteed to put identical strings next to each other
	const found: Set<string> = new Set();
	for (const match of matches) {
		// Skip duplicates and empty matches: empty string is not an emoji
		// and cannot be located in text
		if (!match || found.has(match)) {
			continue;
		}
		found.add(match);

		// Get sequence: iterate by code points, not UTF-16 units
		const sequence: number[] = [];
		for (const codePoint of match) {
			const num = codePoint.codePointAt(0) as number;
			if (num !== vs16Emoji) {
				sequence.push(num);
			}
		}

		// Get keyword
		const keyword = getEmojiSequenceString(
			convertEmojiSequenceToUTF32(sequence)
		);

		results.push({
			match,
			sequence,
			keyword,
		});
	}

	return results;
}
/**
 * Sort emojis, get prev and next text
 *
 * Locates every occurrence of every match in content, then returns
 * occurrences in the same order as in content, each with text that
 * surrounds it.
 *
 * Matches are processed in the order they are passed. Occurrence that
 * overlaps with already located occurrence is ignored, so longer matches
 * should be listed first.
 *
 * @param content Text to search
 * @param matches Matches to locate, such as result of getEmojiMatchesInText()
 * @returns Occurrences in content order, empty list if nothing was located
 */
export function sortEmojiMatchesInText(
	content: string,
	matches: EmojiRegexMatch[]
): PrevNextMatch[] {
	// Located occurrence: `start` is inclusive, `end` is exclusive
	interface Range {
		match: EmojiRegexMatch;
		start: number;
		end: number;
	}
	const ranges: Range[] = [];

	// Check if range does not overlap with located ranges. Ranges that only
	// touch (one ends where another starts) do not overlap, otherwise
	// emojis that follow each other without separator would be lost
	const isFree = (start: number, end: number): boolean =>
		!ranges.some((range) => start < range.end && end > range.start);

	// Find all ranges
	for (const match of matches) {
		const search = match.match;
		if (!search) {
			// indexOf() always finds empty string, loop below would never end
			continue;
		}

		let start = content.indexOf(search);
		while (start !== -1) {
			const end = start + search.length;

			// Make sure it doesn't interfere with other replacements
			if (isFree(start, end)) {
				ranges.push({
					start,
					end,
					match,
				});
			}

			start = content.indexOf(search, end);
		}
	}

	if (!ranges.length) {
		// Empty list
		return [];
	}

	// Sort ranges: same order as in content
	ranges.sort((a, b) => a.start - b.start);

	// Add text between ranges
	return ranges.map((range, index) => {
		const prevRange = ranges[index - 1];
		const nextRange = ranges[index + 1];
		return {
			match: range.match,
			prev: content.slice(prevRange ? prevRange.end : 0, range.start),
			next: nextRange
				? content.slice(range.end, nextRange.start)
				: content.slice(range.end),
		};
	});
}

View File

@ -0,0 +1,60 @@
import {
EmojiRegexMatch,
getEmojiMatchesInText,
sortEmojiMatchesInText,
} from './find';
/**
* Callback for replacing emoji in text
*
* Returns text to replace emoji with, undefined to skip replacement (emoji is kept as is)
*/
export type FindAndReplaceEmojisInTextCallback = (
// Match (a copy, so callback can safely change it)
match: EmojiRegexMatch,
// All text generated so far: content before this emoji, with earlier replacements already applied
prev: string
) => string | undefined;
/**
 * Find and replace emojis in text
 *
 * Callback is called for every emoji found in content, in the same order
 * as emojis appear in content. If callback returns string, emoji is replaced
 * with that string. If callback returns undefined, emoji is kept as is.
 *
 * @param regexp Regular expression source or instance that matches emojis
 * @param content Text to search
 * @param callback Function that returns replacement for emoji
 * @returns Content with replaced emojis, null if nothing was replaced
 */
export function findAndReplaceEmojisInText(
	regexp: string | RegExp,
	content: string,
	callback: FindAndReplaceEmojisInTextCallback
): string | null {
	const matches = getEmojiMatchesInText(regexp, content);
	if (!matches.length) {
		return null;
	}

	const sortedMatches = sortEmojiMatchesInText(content, matches);
	if (!sortedMatches.length) {
		// Nothing was located in content: last item is used below
		return null;
	}

	// Replace all matches
	let result = '';
	let replaced = false;
	for (const item of sortedMatches) {
		result += item.prev;

		// Pass a copy of match, so callback cannot change shared object
		const replacement = callback(
			{
				...item.match,
			},
			result
		);

		if (replacement === undefined) {
			// Nothing to replace
			result += item.match.match;
		} else {
			// Replace content
			result += replacement;
			replaced = true;
		}
	}

	// Add content after last emoji
	result += sortedMatches[sortedMatches.length - 1].next;

	return replaced ? result : null;
}

View File

@ -7,6 +7,41 @@ import {
import { convertEmojiSequenceToUTF32 } from './convert';
import { keycapEmoji, vs16Emoji } from './data';
import { getEmojiSequenceString } from './format';
import { getQualifiedEmojiSequencesMap } from './parse-test';
/**
 * Get unqualified sequence: copy of sequence without `FE0F`
 */
export function getUnqualifiedEmojiSequence(sequence: number[]): number[] {
	const unqualified: number[] = [];
	sequence.forEach((code) => {
		if (code !== vs16Emoji) {
			unqualified.push(code);
		}
	});
	return unqualified;
}
/**
 * Guess qualified sequence by adding `FE0F` wherever it could be used
 *
 * Result is not always a valid sequence. That is fine because `FE0F` is
 * treated as optional: regex generated from full sequence matches both
 * qualified and unqualified emojis, so correct sequence is matched anyway.
 * This function only makes sure that emoji that does contain `FE0F` is
 * matched too.
 */
export function guessQualifiedEmojiSequence(sequence: number[]): number[] {
	const qualifyPart = (part: number[]): number[] => {
		// Already has `FE0F`
		if (part.includes(vs16Emoji)) {
			return part;
		}

		switch (part.length) {
			case 1:
				// Add `FE0F` to 1 character emojis
				return [part[0], vs16Emoji];

			case 2:
				// Keycap: `FE0F` goes before keycap character
				return part[1] === keycapEmoji
					? [part[0], vs16Emoji, part[1]]
					: part;

			default:
				return part;
		}
	};

	const parts = splitEmojiSequences(sequence);
	return joinEmojiSequences(parts.map((part) => qualifyPart(part)));
}
/**
* Add optional variations to emojis
@ -19,55 +54,42 @@ import { getEmojiSequenceString } from './format';
export function addOptionalEmojiVariations(
sequences: number[][],
testData?: number[][]
): number[][] {
): number[][];
export function addOptionalEmojiVariations(
sequences: number[][],
testData: number[][],
toString: (value: number[]) => string
): string[];
export function addOptionalEmojiVariations(
sequences: number[][],
testData: number[][] = [],
toString?: (value: number[]) => string
): number[][] | string[] {
const convert = toString || getEmojiSequenceString;
// Map test data
const testDataMap = Object.create(null) as Record<string, string>;
testData?.forEach((sequence) => {
const convertedSequence = convertEmojiSequenceToUTF32(sequence);
// Clean up sequence
const key = getEmojiSequenceString(
removeEmojiVariations(convertedSequence)
);
if (testDataMap[key]?.length > convertedSequence.length) {
// Already got version with more variations
return;
}
testDataMap[key] = getEmojiSequenceString(convertedSequence);
});
const testDataMap = getQualifiedEmojiSequencesMap(testData, convert);
// Parse all sequences
const set: Set<string> = new Set();
sequences.forEach((sequence) => {
// Convert to UTF-32, remove variations
const convertedSequence = convertEmojiSequenceToUTF32(sequence);
const cleanSequence = removeEmojiVariations(convertedSequence);
const mapKey = getEmojiSequenceString(cleanSequence);
// Check test data
const mapKey = convert(cleanSequence);
if (testDataMap[mapKey]) {
// Got item from test data
set.add(testDataMap[mapKey]);
return;
}
// Emoji is missing in test data: add `FE0F` as needed
const parts = splitEmojiSequences(convertedSequence).map((part) => {
// Check for `FE0F`
if (part.indexOf(vs16Emoji) !== -1) {
return part;
}
// Check for keycap
if (part.length === 2 && part[1] === keycapEmoji) {
return [part[0], vs16Emoji, part[1]];
}
// Add `FE0F` to 1 character emojis
return part.length === 1 ? [part[0], vs16Emoji] : part;
});
set.add(getEmojiSequenceString(joinEmojiSequences(parts)));
// Not in test data: guess variations
set.add(convert(guessQualifiedEmojiSequence(cleanSequence)));
});
return Array.from(set).map(getEmojiSequenceFromString);
const results = Array.from(set);
return toString ? results : results.map(getEmojiSequenceFromString);
}

View File

@ -86,8 +86,6 @@ export { loadIcon } from './loader/loader';
// Emojis
export {
getEmojiSequenceFromString,
splitEmojiSequences,
joinEmojiSequences,
removeEmojiVariations,
removeEmojiTones,
mapEmojiSequences,
@ -101,17 +99,20 @@ export {
convertEmojiSequenceToUTF16,
convertEmojiSequenceToUTF32,
} from './emoji/convert';
export { getEmojiUnicodeString, getEmojiSequenceString } from './emoji/format';
export {
getEmojiUnicodeString,
getEmojiSequenceString,
emojiSequenceToKeyword,
} from './emoji/format';
export { parseEmojiTestFile } from './emoji/parse-test';
export { addOptionalEmojiVariations } from './emoji/variations';
parseEmojiTestFile,
getQualifiedEmojiSequencesMap,
} from './emoji/parse-test';
export {
getUnqualifiedEmojiSequence,
addOptionalEmojiVariations,
} from './emoji/variations';
export {
createOptimisedRegex,
createOptimisedRegexForEmojiSequences,
} from './emoji/regex/create';
export { findAndReplaceEmojisInText } from './emoji/replace/replace';
// Misc
export { camelize, camelToKebab, snakelize, pascalize } from './misc/strings';

View File

@ -1,5 +1,4 @@
/* eslint-disable @typescript-eslint/no-non-null-assertion */
import { convertEmojiSequenceToUTF32 } from '../lib';
import { convertEmojiSequenceToUTF32 } from '../lib/emoji/convert';
import {
getEmojiSequenceFromString,
joinEmojiSequences,

View File

@ -1,21 +1,20 @@
/* eslint-disable @typescript-eslint/no-non-null-assertion */
import {
getEmojiSequenceString,
emojiSequenceToKeyword,
} from '../lib/emoji/format';
import { getEmojiSequenceString } from '../lib/emoji/format';
describe('Testing formatting emoji code points', () => {
it('Empty sequence', () => {
const sequence: number[] = [];
expect(getEmojiSequenceString(sequence)).toBe('');
expect(emojiSequenceToKeyword(sequence)).toBe('');
});
it('Keycap sequence', () => {
const sequence: number[] = [0x23, 0xfe0f, 0x20e3];
expect(getEmojiSequenceString(sequence)).toBe('23-fe0f-20e3');
expect(emojiSequenceToKeyword(sequence)).toBe('0023-fe0f-20e3');
expect(
getEmojiSequenceString(sequence, {
add0: true,
})
).toBe('0023-fe0f-20e3');
});
it('UTF-16 sequence', () => {
@ -27,9 +26,6 @@ describe('Testing formatting emoji code points', () => {
expect(getEmojiSequenceString(sequence)).toBe(
'1f441-fe0f-200d-1f5e8-fe0f'
);
expect(emojiSequenceToKeyword(sequence)).toBe(
'1f441-fe0f-200d-1f5e8-fe0f'
);
});
it('UTF-32 sequence', () => {
@ -39,8 +35,5 @@ describe('Testing formatting emoji code points', () => {
expect(getEmojiSequenceString(sequence)).toBe(
'1f441-fe0f-200d-1f5e8-fe0f'
);
expect(emojiSequenceToKeyword(sequence)).toBe(
'1f441-fe0f-200d-1f5e8-fe0f'
);
});
});

View File

@ -1,8 +1,10 @@
/* eslint-disable @typescript-eslint/no-non-null-assertion */
import { readFile, writeFile, unlink } from 'node:fs/promises';
import { getEmojiSequenceFromString } from '../lib/emoji/cleanup';
import { getEmojiSequenceString } from '../lib/emoji/format';
import { parseEmojiTestFile } from '../lib/emoji/parse-test';
import {
getQualifiedEmojiSequencesMap,
parseEmojiTestFile,
} from '../lib/emoji/parse-test';
import { addOptionalEmojiVariations } from '../lib/emoji/variations';
describe('Optional variations of emoji sequences', () => {
@ -88,6 +90,23 @@ describe('Optional variations of emoji sequences', () => {
}
const testData = parseEmojiTestFile(data);
// Make sure testData contains both fully-qualified and unqualified emojis
const testDataStrings = new Set(
testData.map((sequence) => getEmojiSequenceString(sequence))
);
expect(testDataStrings.has('1f600')).toBe(true);
expect(testDataStrings.has('263a')).toBe(true);
expect(testDataStrings.has('263a-fe0f')).toBe(true);
// Test getQualifiedEmojiSequencesMap
const unqualifiedTest = getQualifiedEmojiSequencesMap(
testData,
getEmojiSequenceString
);
expect(unqualifiedTest['1f600']).toBe('1f600');
expect(unqualifiedTest['263a']).toBe('263a-fe0f');
// Sequences to test
const sequences = [
// emoji without variation in test file
'1F601',

View File

@ -0,0 +1,107 @@
import { createOptimisedRegex } from '../lib/emoji/regex/create';
import {
getEmojiMatchesInText,
sortEmojiMatchesInText,
} from '../lib/emoji/replace/find';
describe('Finding emojis in text', () => {
	it('Simple regex', () => {
		const regexValue = createOptimisedRegex([
			'1F600',
			'1F603',
			'1F604',
			'263A FE0F',
		]);

		// Content: text and emojis, in order of appearance
		const text1 = 'E1.0 grinning face: ';
		const emoji1 = String.fromCodePoint(0x1f600);
		const text2 = '\nE0.6 grinning face with big eyes: ';
		const emoji2 = String.fromCodePoint(0x1f603);
		const text3 = 'E1.0 grinning face: ';
		const emoji3 = emoji1;
		const text4 = 'E0.6 smiling face: ';
		const emoji4 = '\u263A\uFE0F';
		const text5 = '(fully-qualified)\nE0.6 smiling face: ';
		const emoji5 = '\u263A';
		const text6 = '(unqualified)';

		const content =
			text1 +
			emoji1 +
			text2 +
			emoji2 +
			text3 +
			emoji3 +
			text4 +
			emoji4 +
			text5 +
			emoji5 +
			text6;

		// Unique matches, longest first
		const matches = getEmojiMatchesInText(regexValue, content);
		expect(matches).toEqual([
			{
				match: '\u263A\uFE0F',
				sequence: [0x263a],
				keyword: '263a',
			},
			{
				// Should be returned only once
				match: String.fromCodePoint(0x1f600),
				sequence: [0x1f600],
				keyword: '1f600',
			},
			{
				match: String.fromCodePoint(0x1f603),
				sequence: [0x1f603],
				keyword: '1f603',
			},
			{
				// Same as first, but without 'FE0F'
				match: '\u263A',
				sequence: [0x263a],
				keyword: '263a',
			},
		]);

		// Each item contains match as nested object, with text around it
		const sortedMatches = sortEmojiMatchesInText(content, matches);
		expect(sortedMatches).toEqual([
			// Same order as in content
			{
				match: {
					match: emoji1,
					sequence: [0x1f600],
					keyword: '1f600',
				},
				prev: text1,
				next: text2,
			},
			{
				match: {
					match: emoji2,
					sequence: [0x1f603],
					keyword: '1f603',
				},
				prev: text2,
				next: text3,
			},
			{
				match: {
					match: emoji3,
					sequence: [0x1f600],
					keyword: '1f600',
				},
				prev: text3,
				next: text4,
			},
			{
				match: {
					match: emoji4,
					sequence: [0x263a],
					keyword: '263a',
				},
				prev: text4,
				next: text5,
			},
			{
				match: {
					match: emoji5,
					sequence: [0x263a],
					keyword: '263a',
				},
				prev: text5,
				next: text6,
			},
		]);
	});
});

View File

@ -0,0 +1,82 @@
import { createOptimisedRegex } from '../lib/emoji/regex/create';
import { findAndReplaceEmojisInText } from '../lib/emoji/replace/replace';
describe('Replacing emojis in text', () => {
	it('Simple and complex regex matches', () => {
		const grinningCatEmoji = String.fromCodePoint(0x1f63a);
		const alienEmoji = String.fromCodePoint(0x1f47d);
		// Eye in speech bubble, without `FE0F` at the end
		const testEmoji = String.fromCodePoint(
			0x1f441,
			0xfe0f,
			0x200d,
			0x1f5e8
		);

		const regex = createOptimisedRegex([
			'1f63a',
			'1f47d',
			// 2 emojis that can be sequences of each other
			'1F441 FE0F',
			'1F441 FE0F 200D 1F5E8 FE0F',
			'1F5E8 FE0F',
		]);

		const text = [
			'Grinning Cat: ',
			grinningCatEmoji,
			', aliens: ',
			alienEmoji.repeat(3),
			', Test: ',
			testEmoji,
			'end!',
		].join('');

		// Number of times callback was called for each emoji
		const calls = {
			grinningCat: 0,
			alien: 0,
			test: 0,
		};

		const replaced = findAndReplaceEmojisInText(
			regex,
			text,
			(match, prev) => {
				const emoji = match.match;

				if (emoji === grinningCatEmoji) {
					expect(prev).toBe('Grinning Cat: ');
					calls.grinningCat++;
					return ':cat:';
				}

				if (emoji === alienEmoji) {
					// After first alien, prev includes previous replacements
					if (calls.alien > 0) {
						expect(prev).toBe(
							'Grinning Cat: :cat:, aliens: ' +
								':alien:'.repeat(calls.alien)
						);
					}
					calls.alien++;
					return ':alien:';
				}

				if (emoji === testEmoji) {
					calls.test++;
					return ':test:';
				}

				throw new Error(`Unexpected match: ${JSON.stringify(match)}`);
			}
		);

		expect(calls.grinningCat).toBe(1);
		expect(calls.alien).toBe(3);
		expect(calls.test).toBe(1);

		expect(replaced).toBe(
			'Grinning Cat: :cat:, aliens: :alien::alien::alien:, Test: :test:end!'
		);
	});
});